## Pandas Viewing

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a Series from a plain list; the np.nan entry is why the values are stored as float64.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Six consecutive daily timestamps starting at 2013-01-01; used below as a row index.
dates = pd.date_range('20130101', periods = 6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# 6x4 frame of standard-normal draws, indexed by `dates`, with columns A-D.
# NOTE(review): no random seed is set in the visible cells, so these values differ on every run.
df = pd.DataFrame(np.random.randn(6, 4), index = dates, columns = list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,1.102097,-0.036194,-1.409178,-0.580642
2013-01-02,-0.312553,-0.493394,0.9833,0.122026
2013-01-03,-1.718457,-0.878655,1.614042,0.658006
2013-01-04,0.614871,1.223886,-1.357744,-0.806359
2013-01-05,0.586036,-1.459531,-1.418955,0.376387
2013-01-06,0.912279,1.852195,-2.576505,1.150889


In [62]:
# Frame built from heterogeneous inputs: scalars (A, B, F) are broadcast to the
# four rows implied by the array-like columns (C, D, E), and each column keeps
# its own dtype.
df2 = pd.DataFrame(
    dict(
        A=1,
        B=pd.Timestamp('20130102'),
        C=pd.Series(1, index=list(range(4)), dtype='float32'),
        D=np.array([3] * 4, dtype='int32'),
        E=pd.Categorical(['test', 'train', 'test', 'train']),
        F='foo',
    )
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1,2013-01-02,1.0,3,test,foo
1,1,2013-01-02,1.0,3,train,foo
2,1,2013-01-02,1.0,3,test,foo
3,1,2013-01-02,1.0,3,train,foo


TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type

In [64]:
# Environment diagnostic: installed NumPy version.
np.__version__

'1.20.2'

In [65]:
# Environment diagnostic: installed pandas version.
pd.__version__

'1.2.4'

In [66]:
import six

In [67]:
# Environment diagnostic: installed six version.
six.__version__

'1.15.0'

In [63]:
# Per-column dtypes of df2.
df2.dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [9]:
# Quick look at the frame: first rows, last three rows, row labels, column labels.
# A notebook cell only echoes its final expression, so the earlier results are
# passed to display() (provided by IPython in notebook sessions); otherwise they
# would be computed and silently thrown away.
display(df.head())
display(df.tail(3))
display(df.index)
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [10]:
# Underlying values as a 2-D NumPy array; the index and column labels are not included.
df.to_numpy()

array([[ 1.10209682, -0.03619357, -1.40917755, -0.58064241],
       [-0.31255311, -0.49339363,  0.98329976,  0.1220258 ],
       [-1.71845658, -0.87865497,  1.61404226,  0.65800622],
       [ 0.61487123,  1.22388619, -1.35774364, -0.80635852],
       [ 0.5860362 , -1.45953055, -1.41895532,  0.37638657],
       [ 0.91227898,  1.85219453, -2.57650471,  1.15088875]])

In [11]:
# Summary statistics, the transposed frame, and the frame with its columns
# ordered by label in descending order (axis=1 sorts column labels, not rows).
# Only the last expression of a cell is echoed, so the first two results are
# shown explicitly with display() (provided by IPython in notebook sessions).
display(df.describe())
display(df.T)
df.sort_index(axis = 1, ascending = False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.580642,-1.409178,-0.036194,1.102097
2013-01-02,0.122026,0.9833,-0.493394,-0.312553
2013-01-03,0.658006,1.614042,-0.878655,-1.718457
2013-01-04,-0.806359,-1.357744,1.223886,0.614871
2013-01-05,0.376387,-1.418955,-1.459531,0.586036
2013-01-06,1.150889,-2.576505,1.852195,0.912279


In [12]:
# Reorder rows by the values in column B, largest first.
df.sort_values(by = 'B', ascending = False)

Unnamed: 0,A,B,C,D
2013-01-06,0.912279,1.852195,-2.576505,1.150889
2013-01-04,0.614871,1.223886,-1.357744,-0.806359
2013-01-01,1.102097,-0.036194,-1.409178,-0.580642
2013-01-02,-0.312553,-0.493394,0.9833,0.122026
2013-01-03,-1.718457,-0.878655,1.614042,0.658006
2013-01-05,0.586036,-1.459531,-1.418955,0.376387


## Pandas Filtering

In [14]:
# Two spellings of the same single-column selection: item access and attribute
# access. The first result is shown with display() (provided by IPython in
# notebook sessions) because a cell only echoes its final expression.
display(df['A'])
df.A

2013-01-01    1.102097
2013-01-02   -0.312553
2013-01-03   -1.718457
2013-01-04    0.614871
2013-01-05    0.586036
2013-01-06    0.912279
Freq: D, Name: A, dtype: float64

In [16]:
# Row slicing with []: by position (first three rows), then by index label.
# The label slice includes both endpoints. The positional result is shown with
# display() (provided by IPython in notebook sessions) because a cell only
# echoes its final expression.
display(df[0:3])
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.312553,-0.493394,0.9833,0.122026
2013-01-03,-1.718457,-0.878655,1.614042,0.658006
2013-01-04,0.614871,1.223886,-1.357744,-0.806359


In [19]:
# Label-based lookup of a single row; the result is a Series keyed by column name.
df.loc['2013-01-01']

A    1.102097
B   -0.036194
C   -1.409178
D   -0.580642
Name: 2013-01-01 00:00:00, dtype: float64

In [20]:
# All rows, restricted to columns A and B by label.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,1.102097,-0.036194
2013-01-02,-0.312553,-0.493394
2013-01-03,-1.718457,-0.878655
2013-01-04,0.614871,1.223886
2013-01-05,0.586036,-1.459531
2013-01-06,0.912279,1.852195


In [21]:
# Label slice on rows (both endpoints included) combined with a column list.
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,-0.312553,-0.493394
2013-01-03,-1.718457,-0.878655
2013-01-04,0.614871,1.223886


In [23]:
# .loc with a single row label reduces dimensionality: a row label plus a column
# list gives a Series, a row label plus a single column gives a scalar.
# The first result is shown with display() (provided by IPython in notebook
# sessions) because a cell only echoes its final expression.
display(df.loc['20130104', ['A', 'B']])
df.loc[dates[0], 'A']

1.1020968244877134

In [24]:
# Position-based lookup: the row at integer position 3 (the fourth row), as a Series.
df.iloc[3]

A    0.614871
B    1.223886
C   -1.357744
D   -0.806359
Name: 2013-01-04 00:00:00, dtype: float64

In [25]:
# Positional slices exclude the end: rows at positions 3-4, columns at positions 0-1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.614871,1.223886
2013-01-05,0.586036,-1.459531


In [26]:
# Rows at positions 1-2 with every column.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.312553,-0.493394,0.9833,0.122026
2013-01-03,-1.718457,-0.878655,1.614042,0.658006


In [35]:
# Three-row frame with one column per dtype family; each column is named after
# the dtype it holds. It replaces the earlier random `df`.
# The 'dates' column starts at the current time, so its values change per run.
df = pd.DataFrame(
    dict(
        string=list('abc'),
        int64=list(range(1, 4)),
        uint8=np.arange(3, 6).astype('u1'),
        float64=np.arange(4.0, 7.0),
        bool1=[True, False, True],
        bool2=[False, True, False],
        dates=pd.date_range('now', periods=3),
        category=pd.Series(list("ABC")).astype('category'),
    )
)

df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2021-06-01 16:31:11.429843,A
1,b,2,4,5.0,False,True,2021-06-02 16:31:11.429843,B
2,c,3,5,6.0,True,False,2021-06-03 16:31:11.429843,C


In [43]:
# Keep only the columns whose dtype is bool or object.
df.select_dtypes(include = [bool, object])

Unnamed: 0,string,bool1,bool2
0,a,True,False
1,b,False,True
2,c,True,False


In [44]:
# Boolean-mask filtering: keep the rows where the 'float64' column is at least 5.
df[df['float64'] >= 5]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
1,b,2,4,5.0,False,True,2021-06-02 16:31:11.429843,B
2,c,3,5,6.0,True,False,2021-06-03 16:31:11.429843,C


In [47]:
# Work on a copy so `df` itself is untouched, adding a text column E to filter on.
df2 = df.assign(E=['one', 'one', 'two'])
df2

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
0,a,1,3,4.0,True,False,2021-06-01 16:31:11.429843,A,one
1,b,2,4,5.0,False,True,2021-06-02 16:31:11.429843,B,one
2,c,3,5,6.0,True,False,2021-06-03 16:31:11.429843,C,two


In [51]:
# Membership filter: keep the rows whose E value appears in the given list.
df2[df2['E'].isin(['one'])]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
0,a,1,3,4.0,True,False,2021-06-01 16:31:11.429843,A,one
1,b,2,4,5.0,False,True,2021-06-02 16:31:11.429843,B,one


In [54]:
# Two ways to assign one cell by integer position (row 0, column 1).
# Both lines write to the same cell of `df` in place, so the .iat value (-1) is
# immediately overwritten by the .iloc value (2).
df.iat[0, 1] = -1
df.iloc[0, 1] = 2
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,2,3,4.0,True,False,2021-06-01 16:31:11.429843,A
1,b,2,4,5.0,False,True,2021-06-02 16:31:11.429843,B
2,c,3,5,6.0,True,False,2021-06-03 16:31:11.429843,C


In [58]:
# Two ways to assign one cell by label (row 0, column 'float64').
# Both lines write to the same cell of `df` in place, so the .at value (-10) is
# immediately overwritten by the .loc value (-20).
df.at[0, 'float64'] = -10
df.loc[0, 'float64'] = -20
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,2,3,-20.0,True,False,2021-06-01 16:31:11.429843,A
1,b,2,4,5.0,False,True,2021-06-02 16:31:11.429843,B
2,c,3,5,6.0,True,False,2021-06-03 16:31:11.429843,C


In [61]:
# Overwrite every row of the 'uint8' column with 50, using an array as long as the frame.
df.loc[:, 'uint8'] = np.full(len(df), 50)
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,2,50,-20.0,True,False,2021-06-01 16:31:11.429843,A
1,b,2,50,5.0,False,True,2021-06-02 16:31:11.429843,B
2,c,3,50,6.0,True,False,2021-06-03 16:31:11.429843,C


## Merge

In [70]:
# 10x4 frame of standard-normal draws with default integer row and column labels.
# NOTE(review): no random seed is set in the visible cells, so these values differ on every run.
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,-0.32256,-0.480855,-1.430895,1.396831
1,-0.407604,-1.113968,0.802429,-0.813734
2,-0.113883,-0.395984,-0.073951,0.284515
3,-0.689353,-0.406849,-2.325122,1.423332
4,0.007648,0.46822,-0.870278,1.985262
5,-0.17707,0.232671,0.008777,-1.614942
6,0.956918,0.975226,1.422185,1.036596
7,0.529448,-0.221873,1.265516,-1.950151
8,-1.068436,2.2505,-1.056361,-0.81475
9,2.019197,0.523076,-1.711667,-0.517824


In [72]:
# Cut the frame into three consecutive row blocks: positions 0-2, 3-6 and 7 onward.
pieces = [df[start:stop] for start, stop in ((None, 3), (3, 7), (7, None))]
pieces

[          0         1         2         3
 0 -0.322560 -0.480855 -1.430895  1.396831
 1 -0.407604 -1.113968  0.802429 -0.813734
 2 -0.113883 -0.395984 -0.073951  0.284515,
           0         1         2         3
 3 -0.689353 -0.406849 -2.325122  1.423332
 4  0.007648  0.468220 -0.870278  1.985262
 5 -0.177070  0.232671  0.008777 -1.614942
 6  0.956918  0.975226  1.422185  1.036596,
           0         1         2         3
 7  0.529448 -0.221873  1.265516 -1.950151
 8 -1.068436  2.250500 -1.056361 -0.814750
 9  2.019197  0.523076 -1.711667 -0.517824]

In [73]:
# Stack the row blocks back on top of each other, in list order.
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-0.32256,-0.480855,-1.430895,1.396831
1,-0.407604,-1.113968,0.802429,-0.813734
2,-0.113883,-0.395984,-0.073951,0.284515
3,-0.689353,-0.406849,-2.325122,1.423332
4,0.007648,0.46822,-0.870278,1.985262
5,-0.17707,0.232671,0.008777,-1.614942
6,0.956918,0.975226,1.422185,1.036596
7,0.529448,-0.221873,1.265516,-1.950151
8,-1.068436,2.2505,-1.056361,-0.81475
9,2.019197,0.523076,-1.711667,-0.517824


In [74]:
# Two small frames that share a 'key' column; they are the inputs for the merge below.
left = pd.DataFrame(dict(key=['foo', 'bar'], lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo', 'bar'], rval=[4, 5]))

In [76]:
# Join the two frames on 'key'; an outer join keeps keys found in either input.
pd.merge(left, right, on = 'key', how = 'outer')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


## Grouping

In [80]:
# Frame for the grouping examples: two label columns (A, B) to group by and two
# columns of standard-normal draws (C, D) to aggregate.
df = pd.DataFrame(
    dict(
        A=['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
        B=['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
        C=np.random.randn(8),
        D=np.random.randn(8),
    )
)
df

Unnamed: 0,A,B,C,D
0,foo,one,0.196272,1.029256
1,bar,one,1.871133,-0.084326
2,foo,two,-1.310084,1.006416
3,bar,three,-2.128265,-1.256651
4,foo,two,-0.72208,0.932865
5,bar,two,-0.26853,-0.054436
6,foo,one,-0.464402,0.414485
7,foo,three,0.430975,0.744477


In [79]:
# Sum the numeric columns within each group of A.
# C and D are selected explicitly so the text column B never enters the
# reduction. Relying on pandas to drop it silently only works on versions where
# numeric_only defaults to dropping non-numeric columns.
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.453861,-1.536014
foo,1.833397,0.775569


In [81]:
# Group by the (A, B) pair; the result is indexed by both keys and sums the remaining columns.
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.871133,-0.084326
bar,three,-2.128265,-1.256651
bar,two,-0.26853,-0.054436
foo,one,-0.268129,1.443741
foo,three,0.430975,0.744477
foo,two,-2.032164,1.939281


In [82]:
# Different aggregation per column: total of C and maximum of D within each group of A.
# Aggregations are named by string rather than passed as NumPy callables, since
# handing np.sum / np.max to agg is deprecated in recent pandas releases.
df.groupby('A').agg({'C': 'sum', 'D': 'max'})

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.525662,-0.054436
foo,-1.869318,1.029256


## Stack

In [88]:
# Two-level row index: each of four 'first' labels paired with 'one' and 'two'.
tuples = [
    (first, second)
    for first in ('bar', 'baz', 'foo', 'qux')
    for second in ('one', 'two')
]
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
# 8x2 frame of standard-normal draws on that index; df2 keeps only the first
# four rows (the 'bar' and 'baz' groups).
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=list('AB'))
df2 = df.iloc[:4]

In [94]:
# stack() moves the column labels (A, B) into a new innermost index level,
# turning the frame into a Series with a three-level index.
stacked = df2.stack()
stacked

first  second   
bar    one     A    0.679627
               B   -1.042584
       two     A   -0.384955
               B   -1.310710
baz    one     A    0.266318
               B   -1.618402
       two     A    0.502790
               B   -0.519566
dtype: float64

In [96]:
# With no argument, unstack() pivots the innermost index level back into columns,
# undoing the stack() above.
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.679627,-1.042584
bar,two,-0.384955,-1.31071
baz,one,0.266318,-1.618402
baz,two,0.50279,-0.519566


In [97]:
# Pivot index level 1 ('second') into columns instead of the innermost level.
stacked.unstack(1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,0.679627,-0.384955
bar,B,-1.042584,-1.31071
baz,A,0.266318,0.50279
baz,B,-1.618402,-0.519566


In [100]:
# Pivot index level 0 ('first') into columns.
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.679627,0.266318
one,B,-1.042584,-1.618402
two,A,-0.384955,0.50279
two,B,-1.31071,-0.519566


## Pivot Table

In [103]:
# 12-row frame for the pivot example: three label columns built by repeating
# short patterns (A, B, C) and two columns of standard-normal draws (D, E).
df = pd.DataFrame(
    dict(
        A=['one', 'one', 'two', 'three'] * 3,
        B=['A', 'B', 'C'] * 4,
        C=['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
        D=np.random.randn(12),
        E=np.random.randn(12),
    )
)
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,0.297403,-1.180042
1,one,B,foo,0.116853,0.569746
2,two,C,foo,-0.371657,-0.269455
3,three,A,bar,1.602163,0.590497
4,one,B,bar,-0.661629,-1.329606
5,one,C,bar,0.636433,-1.282651
6,two,A,foo,2.159459,-0.183877
7,three,B,foo,-1.293188,-0.284243
8,one,C,foo,0.318522,0.350604
9,one,A,bar,-0.519081,-2.234324


In [104]:
# Spread the D values into a grid: one row per (A, B) pair, one column per C value.
# Combinations that never occur in the data are left as missing values.
df.pivot_table(values='D', index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.519081,0.297403
one,B,-0.661629,0.116853
one,C,0.636433,0.318522
three,A,1.602163,
three,B,,-1.293188
three,C,0.18553,
two,A,,2.159459
two,B,0.239474,
two,C,,-0.371657
