In [1]:
import pandas as pd
import numpy as np

## Iteration

In [13]:
df = pd.DataFrame({'col1': np.random.randn(3),
                   'col2': np.random.randn(3)}, index=['a', 'b', 'c'])

In [3]:
df

Unnamed: 0,col1,col2
a,0.089793,-0.79336
b,0.724986,0.827297
c,-0.565989,0.40203


In [4]:
for col in df:
    print(col)

col1
col2


### items

In [16]:
for label, ser in df.items():
    print(label)
    print(ser)

col1
a   -0.462246
b   -1.177790
c   -0.800825
Name: col1, dtype: float64
col2
a   -0.250528
b   -0.426655
c    0.676205
Name: col2, dtype: float64


### iterrows

In [17]:
df = pd.DataFrame({
    "x":[4,5,6],
    "y":["a","b","c"]
})
df

Unnamed: 0,x,y
0,4,a
1,5,b
2,6,c


In [18]:
# iterrows() yields (index label, row-as-Series) pairs, one per DataFrame row.
for index, row in df.iterrows():
    # The f-string already interpolates both values. The former trailing
    # .format(index, row) call was redundant and would raise (or substitute
    # unexpectedly) if the rendered row text ever contained literal braces.
    print(f"{index} \n{row}")

0 
x    4
y    a
Name: 0, dtype: object
1 
x    5
y    b
Name: 1, dtype: object
2 
x    6
y    c
Name: 2, dtype: object


### .dt accessor

In [28]:
s = pd.Series(pd.date_range('20191011 09:10:12', periods=4))
s

0   2019-10-11 09:10:12
1   2019-10-12 09:10:12
2   2019-10-13 09:10:12
3   2019-10-14 09:10:12
dtype: datetime64[ns]

In [29]:
s.dt.hour

0    9
1    9
2    9
3    9
dtype: int64

In [30]:
s.dt.day

0    11
1    12
2    13
3    14
dtype: int64

In [34]:
s[s.dt.day==12]

1   2019-10-12 09:10:12
dtype: datetime64[ns]

In [35]:
s.dt.tz_localize('US/Eastern')

0   2019-10-11 09:10:12-04:00
1   2019-10-12 09:10:12-04:00
2   2019-10-13 09:10:12-04:00
3   2019-10-14 09:10:12-04:00
dtype: datetime64[ns, US/Eastern]

In [37]:
s.dt.tz_localize('UTC')

0   2019-10-11 09:10:12+00:00
1   2019-10-12 09:10:12+00:00
2   2019-10-13 09:10:12+00:00
3   2019-10-14 09:10:12+00:00
dtype: datetime64[ns, UTC]

In [None]:
s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

In [40]:
#Formatting
s.dt.strftime('%Y/%m/%d')

0    2019/10/11
1    2019/10/12
2    2019/10/13
3    2019/10/14
dtype: object

In [41]:
# timedelta
s = pd.Series(pd.timedelta_range('1 day 00:00:05', periods=4, freq='s'))
s

0   1 days 00:00:05
1   1 days 00:00:06
2   1 days 00:00:07
3   1 days 00:00:08
dtype: timedelta64[ns]

In [42]:
s.dt.seconds

0    5
1    6
2    7
3    8
dtype: int64

In [43]:
s.dt.days

0    1
1    1
2    1
3    1
dtype: int64

In [44]:
s.dt.components

Unnamed: 0,days,hours,minutes,seconds,milliseconds,microseconds,nanoseconds
0,1,0,0,5,0,0,0
1,1,0,0,6,0,0,0
2,1,0,0,7,0,0,0
3,1,0,0,8,0,0,0


### Vectorized string methods

In [45]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [49]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

## Sorting
Pandas supports three kinds of sorting: <br>
1. sorting by index labels,
2. sorting by column values, and 
3. sorting by a combination of both.

### By index

In [50]:
df = pd.DataFrame({
       'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
      'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
     'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
df

Unnamed: 0,one,two,three
a,0.979526,-1.419751,
b,-0.218547,0.283523,0.601156
c,-2.316156,0.678264,-0.736922
d,,1.222804,-1.653172


In [51]:
unsorted_df = df.reindex(index=["a","d","c","b"],columns=["three","two","one"])

In [52]:
unsorted_df

Unnamed: 0,three,two,one
a,,-1.419751,0.979526
d,-1.653172,1.222804,
c,-0.736922,0.678264,-2.316156
b,0.601156,0.283523,-0.218547


In [53]:
unsorted_df.sort_index()

Unnamed: 0,three,two,one
a,,-1.419751,0.979526
b,0.601156,0.283523,-0.218547
c,-0.736922,0.678264,-2.316156
d,-1.653172,1.222804,


In [54]:
unsorted_df.sort_index(ascending=False)

Unnamed: 0,three,two,one
d,-1.653172,1.222804,
c,-0.736922,0.678264,-2.316156
b,0.601156,0.283523,-0.218547
a,,-1.419751,0.979526


In [55]:
unsorted_df.sort_index(axis=1)

Unnamed: 0,one,three,two
a,0.979526,,-1.419751
d,,-1.653172,1.222804
c,-2.316156,-0.736922,0.678264
b,-0.218547,0.601156,0.283523


### By values

In [56]:
# Small all-integer frame; column "one" contains repeated values.
df1 = pd.DataFrame(
    {
        'one': [2, 1, 1, 1],
        'two': [1, 3, 2, 4],
        'three': [5, 4, 3, 2],
    }
)

In [57]:
df1

Unnamed: 0,one,two,three
0,2,1,5
1,1,3,4
2,1,2,3
3,1,4,2


In [58]:
df1.sort_values(by="two")

Unnamed: 0,one,two,three
0,2,1,5
2,1,2,3
1,1,3,4
3,1,4,2


In [59]:
df1.sort_values(by=["one","two"])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


In [60]:
s.sort_values(na_position='first')

5     NaN
0       A
3    Aaba
1       B
4    Baca
2       C
6    CABA
8     cat
7     dog
dtype: object

### By indexes and values

In [61]:
# Build a two-level MultiIndex from explicit tuples; the tuples are not in
# sorted order (('b', 2) precedes ('b', 1)).
idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2),
                                 ('b', 2), ('b', 1), ('b', 1)])
# NOTE(review): "fisrt" looks like a typo for "first". It is left unchanged
# here because the level name is runtime data that also shows up in the
# notebook's saved outputs; fix it together with a re-run of those cells.
idx.names=["fisrt","second"]
idx

MultiIndex(levels=[['a', 'b'], [1, 2]],
           codes=[[0, 0, 0, 1, 1, 1], [0, 1, 1, 1, 0, 0]],
           names=['fisrt', 'second'])

In [62]:
df_multi = pd.DataFrame({'A': np.arange(6, 0, -1)},
                        index=idx)
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,A
fisrt,second,Unnamed: 2_level_1
a,1,6
a,2,5
a,2,4
b,2,3
b,1,2
b,1,1


In [65]:
df_multi.sort_values(by=['second',"A"])

Unnamed: 0_level_0,Unnamed: 1_level_0,A
fisrt,second,Unnamed: 2_level_1
b,1,1
b,1,2
a,1,6
b,2,3
a,2,4
a,2,5


### smallest / largest values

In [66]:
s = pd.Series(np.random.permutation(10))
s

0    7
1    2
2    4
3    9
4    8
5    1
6    6
7    5
8    3
9    0
dtype: int64

In [68]:
s = s.sort_values()
s

9    0
5    1
1    2
8    3
2    4
7    5
6    6
0    7
4    8
3    9
dtype: int64

In [69]:
s.nlargest(3)

3    9
4    8
0    7
dtype: int64

In [70]:
s.nsmallest(3)

9    0
5    1
1    2
dtype: int64

In [71]:
df = pd.DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1],
                      'b': list('abdceff'),
                      'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]})
df

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
2,1,d,4.0
3,10,c,3.2
4,8,e,
5,11,f,3.0
6,-1,f,4.0


In [73]:
df.nlargest(3,'a')["a"]

5    11
3    10
4     8
Name: a, dtype: int64

## dtypes

In [74]:
dft = pd.DataFrame({'A': np.random.rand(3),
                        'B': 1,
                       'C': 'foo',
                     'D': pd.Timestamp('20010102'),
                      'E': pd.Series([1.0] * 3).astype('float32'),
                       'F': False,
                    'G': pd.Series([1] * 3, dtype='int8')})

In [75]:
dft

Unnamed: 0,A,B,C,D,E,F,G
0,0.801708,1,foo,2001-01-02,1.0,False,1
1,0.058279,1,foo,2001-01-02,1.0,False,1
2,0.766017,1,foo,2001-01-02,1.0,False,1


In [76]:
dft.dtypes

A           float64
B             int64
C            object
D    datetime64[ns]
E           float32
F              bool
G              int8
dtype: object

In [77]:
dft.dtypes.value_counts()

float32           1
int64             1
int8              1
bool              1
float64           1
object            1
datetime64[ns]    1
dtype: int64

## defaults

By default, integer types are `int64` and float types are `float64`, regardless of platform (32-bit or 64-bit). For example, `pd.DataFrame([1, 2], columns=['a'])`, `pd.DataFrame({'a': [1, 2]})` and `pd.DataFrame({'a': 1}, index=list(range(2)))` all result in `int64` dtypes.<br>
<br>
Note that NumPy will choose platform-dependent types when creating arrays. The following WILL result in `int32` on a 32-bit platform.<br>
`frame = pd.DataFrame(np.array([1, 2]))`

###  astype

In [84]:
df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float16'),
                  'B': pd.Series(np.random.randn(8)),
                   'C': pd.Series(np.array(np.random.randn(8),
                                           dtype='uint8'))})
df2

Unnamed: 0,A,B,C
0,1.771484,-3.273122,0
1,-1.273438,-2.396612,0
2,0.485352,0.510325,0
3,-0.481445,-1.384076,1
4,1.095703,-1.164889,1
5,0.33374,0.833151,0
6,0.570801,-0.000652,1
7,1.384766,1.671266,254


In [85]:
df3 = df1.reindex_like(df2).fillna(value=0.0) + df2
df3

Unnamed: 0,A,B,C
0,1.771484,-3.273122,0.0
1,-1.273438,-2.396612,0.0
2,0.485352,0.510325,0.0
3,-0.481445,-1.384076,1.0
4,1.095703,-1.164889,1.0
5,0.33374,0.833151,0.0
6,0.570801,-0.000652,1.0
7,1.384766,1.671266,254.0


In [86]:
df2.dtypes

A    float16
B    float64
C      uint8
dtype: object

In [87]:
df3.dtypes

A    float64
B    float64
C    float64
dtype: object

In [90]:
## conversion of dtypes
df3.astype('float32').dtypes

A    float32
B    float32
C    float32
dtype: object

#### Convert a subset of columns to a specified type using astype().

In [91]:
dft = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
dft

Unnamed: 0,a,b,c
0,1,4,7
1,2,5,8
2,3,6,9


In [92]:
dft.dtypes

a    int64
b    int64
c    int64
dtype: object

In [94]:
dft.astype('int32').dtypes

a    int32
b    int32
c    int32
dtype: object

In [96]:
dft[["a","b"]] = dft[["a","b"]].astype(np.int32)

In [97]:
dft.dtypes

a    int32
b    int32
c    int64
dtype: object

In [98]:
#Convert certain columns to a specific dtype by passing a dict to astype().
dft1 = pd.DataFrame({'a': [1, 0, 1], 'b': [4, 5, 6], 'c': [7, 8, 9]})
dft1


Unnamed: 0,a,b,c
0,1,4,7
1,0,5,8
2,1,6,9


In [99]:
# Cast column "a" to boolean and column "c" to 32-bit float via a
# column -> dtype mapping; column "b" is not listed and keeps its dtype.
# The builtin `bool` is used instead of `np.bool`: that alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
dft1 = dft1.astype({'a': bool, 'c': np.float32})

In [100]:
dft1.dtypes

a       bool
b      int64
c    float32
dtype: object

### object conversion
pandas offers various functions to try to force conversion of types from the object dtype to other types. 
<br>In cases where the data is already of the correct type, but stored in an object array, the `DataFrame.infer_objects()` and `Series.infer_objects()` methods can be used to soft convert to the correct type.

In [101]:
import datetime

In [104]:
df = pd.DataFrame([[1, 2],
                      ['a', 'b'],
                     [datetime.datetime(2016, 3, 2),
                    datetime.datetime(2016, 3, 2)]])


In [105]:
df=df.T

In [106]:
df

Unnamed: 0,0,1,2
0,1,a,2016-03-02 00:00:00
1,2,b,2016-03-02 00:00:00


In [107]:
df.dtypes

0    object
1    object
2    object
dtype: object

In [110]:
df.infer_objects().dtypes

0             int64
1            object
2    datetime64[ns]
dtype: object

In [113]:
#to_numeric() (conversion to numeric dtypes)
m = [1.1,2,3]
pd.to_numeric(m)

array([1.1, 2. , 3. ])

In [116]:
#to_datetime() (conversion to datetime objects)
m = ['2016-07-09', datetime.datetime(2016, 3, 2)]
pd.to_datetime(m)

DatetimeIndex(['2016-07-09', '2016-03-02'], dtype='datetime64[ns]', freq=None)

In [117]:
#to_timedelta() (conversion to timedelta objects)
m = ['5us', pd.Timedelta('1day')]
pd.to_timedelta(m)

TimedeltaIndex(['0 days 00:00:00.000005', '1 days 00:00:00'], dtype='timedelta64[ns]', freq=None)

**To force a conversion, we can pass in an errors argument, which specifies how pandas should deal with elements that cannot be converted to desired dtype or object.**
<br><br>By default, `errors='raise'`, meaning that any errors encountered will be raised during the conversion process. However, if `errors='coerce'`, these errors will be ignored and pandas will convert problematic elements to `pd.NaT` (for datetime and timedelta) or `np.nan` (for numeric). This might be useful if you are reading in data which is mostly of the desired dtype (e.g. numeric, datetime), but occasionally has non-conforming elements intermixed that you want to represent as missing:

In [118]:
m = ['apple', datetime.datetime(2016, 3, 2)]
pd.to_datetime(m,errors='coerce')

DatetimeIndex(['NaT', '2016-03-02'], dtype='datetime64[ns]', freq=None)

In [119]:
m = ['apple', 2,3]
pd.to_numeric(m,errors='coerce')

array([nan,  2.,  3.])

In [121]:
m = ['apple', datetime.datetime(2016, 3, 2)]
# `errors='ignore'` is deprecated in pandas (since 2.2). The recommended
# replacement is to call to_datetime() without `errors` and handle the
# failure explicitly, returning the input unchanged when it cannot be parsed.
try:
    result = pd.to_datetime(m)
except (ValueError, TypeError):
    # 'apple' is not a parseable date, so fall back to the untouched values
    # as an object-dtype Index (the same thing errors='ignore' used to return).
    result = pd.Index(m, dtype=object)
result

Index(['apple', 2016-03-02 00:00:00], dtype='object')

In addition to object conversion, `to_numeric()` provides another argument `downcast`, which gives the option of downcasting the newly (or already) numeric data to a smaller dtype, which can conserve memory:

In [123]:
m=['1',2,3]
pd.to_numeric(m,downcast='integer')

array([1, 2, 3], dtype=int8)

In [124]:
m=['1',2,3]
pd.to_numeric(m,downcast='unsigned')

array([1, 2, 3], dtype=uint8)

As these methods apply only to one-dimensional arrays, lists or scalars, they cannot be used directly on multi-dimensional objects such as `DataFrames`. However, with `apply()`, we can “apply” the function over each column efficiently:

In [126]:
df = pd.DataFrame([
    ['2016-07-09', datetime.datetime(2016, 3, 2)]] * 2, dtype='O')
df.dtypes

0    object
1    object
dtype: object

In [129]:
df.apply(pd.to_datetime).dtypes

0    datetime64[ns]
1    datetime64[ns]
dtype: object

### gotchas
Performing selection operations on `integer` type data can easily upcast the data to `floating`. The dtype of the input data will be preserved in cases where nans are not introduced. 

In [135]:
dfi = df3.astype('int32')
dfi["E"]=1
dfi

Unnamed: 0,A,B,C,E
0,1,-3,0,1
1,-1,-2,0,1
2,0,0,0,1
3,0,-1,1,1
4,1,-1,1,1
5,0,0,0,1
6,0,0,1,1
7,1,1,254,1


In [136]:
dfi.dtypes

A    int32
B    int32
C    int32
E    int64
dtype: object

In [137]:
(dfi[dfi>0]).dtypes

A    float64
B    float64
C    float64
E      int64
dtype: object

In [138]:
#While float dtypes are unchanged.
dfa = df3.copy()
dfa['A'] = dfa['A'].astype('float32')
dfa.dtypes

A    float32
B    float64
C    float64
dtype: object

In [139]:
(dfa[df2 > 0]).dtypes

A    float32
B    float64
C    float64
dtype: object

## Selecting columns based on dtype

In [140]:
df = pd.DataFrame({'string': list('abc'),'int64': list(range(1, 4)),
                      'uint8': np.arange(3, 6).astype('u1'),
                       'float64': np.arange(4.0, 7.0),
                     'bool1': [True, False, True],
                  'bool2': [False, True, False],
                      'dates': pd.date_range('now', periods=3),
                    'category': pd.Series(list("ABC")).astype('category')})
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2019-11-11 11:36:36.402168,A
1,b,2,4,5.0,False,True,2019-11-12 11:36:36.402168,B
2,c,3,5,6.0,True,False,2019-11-13 11:36:36.402168,C


In [142]:
df.select_dtypes(include=['number',bool,object],exclude=np.unsignedinteger)

Unnamed: 0,string,int64,float64,bool1,bool2
0,a,1,4.0,True,False
1,b,2,5.0,False,True
2,c,3,6.0,True,False


In [143]:
#To see all the child dtypes of a generic dtype like numpy.number you can define a function that returns a tree of child dtypes:
def subdtypes(dtype):
    """Return the tree of subclasses rooted at ``dtype``.

    A class with no direct subclasses is returned as-is. Otherwise the result
    is the two-element list ``[dtype, children]``, where ``children`` holds
    this same structure for every direct subclass, in the order reported by
    ``dtype.__subclasses__()``.
    """
    children = dtype.__subclasses__()
    if children:
        subtree = []
        for child in children:
            # Recurse so each child is expanded into its own (sub)tree.
            subtree.append(subdtypes(child))
        return [dtype, subtree]
    # Leaf: nothing derives from this class.
    return dtype

In [144]:
subdtypes(np.generic)

[numpy.generic,
 [[numpy.number,
   [[numpy.integer,
     [[numpy.signedinteger,
       [numpy.int8,
        numpy.int16,
        numpy.int32,
        numpy.int64,
        numpy.int64,
        numpy.timedelta64]],
      [numpy.unsignedinteger,
       [numpy.uint8,
        numpy.uint16,
        numpy.uint32,
        numpy.uint64,
        numpy.uint64]]]],
    [numpy.inexact,
     [[numpy.floating,
       [numpy.float16, numpy.float32, numpy.float64, numpy.float128]],
      [numpy.complexfloating,
       [numpy.complex64, numpy.complex128, numpy.complex256]]]]]],
  [numpy.flexible,
   [[numpy.character, [numpy.bytes_, numpy.str_]],
    [numpy.void, [numpy.record]]]],
  numpy.bool_,
  numpy.datetime64,
  numpy.object_]]