In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels as sm
%matplotlib inline

In [18]:
# Data Wrangling
# Hierarchical Indexing (MultiIndex)
data = pd.Series(np.random.randn(9), index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'], 
                                           [1, 2, 3, 1, 3, 1, 2, 2, 3]])
print(data)
print(data['b':'c'])
print(data.loc[['b', 'd']])
data.loc[:, 2] # selection from inner level

a  1    0.122137
   2    1.345369
   3   -0.601555
b  1    0.708400
   3   -0.546410
c  1    1.026855
   2   -0.515687
d  2   -0.147987
   3    0.530814
dtype: float64
b  1    0.708400
   3   -0.546410
c  1    1.026855
   2   -0.515687
dtype: float64
b  1    0.708400
   3   -0.546410
d  2   -0.147987
   3    0.530814
dtype: float64


a    1.345369
c   -0.515687
d   -0.147987
dtype: float64

In [10]:
# rearrange data into a DataFrame
data.unstack()
data.unstack().stack() # inverse operation

a  1    0.908548
   2    0.121484
   3    1.149916
b  1    0.059193
   3   -0.693320
c  1    0.273307
   2   -0.964085
d  2   -0.356794
   3    0.491613
dtype: float64

In [22]:
frame = pd.DataFrame(np.random.randn(18).reshape((9,2)), index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'], 
                                           [1, 2, 3, 1, 3, 1, 2, 2, 3]], columns=[['one', 'two']])
frame.index.names = ['key1', 'key2']
frame.swaplevel('key1', 'key2') # reordering Index
frame.sort_index(level=1) # sorting Index
frame.sum(level='key2')

Unnamed: 0_level_0,one,two
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-1.895291,2.785343
2,-0.851407,-2.147817
3,0.2288,1.481973


In [27]:
# Indexing DataFrame
frame = pd.DataFrame({'a':range(7), 'b':range(7, 0, -1),
                     'c':['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                     'd': [0, 1, 2, 0, 1, 2, 3]})
frame2 = frame.set_index(['c', 'd']) # create new DataFrame using one or more columns
print(frame2)
frame2.reset_index() # remove hierarachical index

       a  b
c   d      
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1


Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [28]:
# Combining and Merging
# pandas.merge() implements database JOIN operations
# pandas.concat() concatenates objects along axis
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'datat1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})
pd.merge(df1, df2, on='key')

Unnamed: 0,key,datat1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [29]:
pd.merge(df1, df2, how='outer') # join taking union of keys, default is 'inner'
#pd.merge(df1, df2, how='inner')
#pd.merge(df1, df2, how='left')
#pd.merge(df1, df2, how='right')

Unnamed: 0,key,datat1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


In [None]:
# Merging on Index
pd.merge(left1, left2, left_on=['key1', 'key2'], right_index=True) # specify keys to use or just using Index
left2.join(right2, how='outer')
left2.join([right2, another], how='outer')

In [33]:
# Concatenating
# numpy array concatenate
arr = np.arange(12).reshape((3,4))
np.concatenate([arr, arr], axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [39]:
# pandas concat
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
pd.concat([s1, s2, s3]) # by default along axis = 0 (row)
pd.concat([s1, s2, s3], axis=1) # along columns
s4 = pd.concat([s1, s3])
pd.concat([s1, s4], axis=1, join='inner') # 'outer' by default
pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']]) # specify axes to be used
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three']) # create a hierarchical index
result.unstack()
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if sys.path[0] == '':


Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [41]:
# pandas concat DataFrames
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'], columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'], columns=['three', 'four'])

pd.concat([df1, df2], axis=1, keys=['level1', 'levbel2'], names=['upper', 'lower'], sort=True)

upper,level1,level1,levbel2,levbel2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [42]:
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])

pd.concat([df1, df2], ignore_index=True) # ignore while row index is not relevant

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


Unnamed: 0,a,b,c,d
0,-0.315783,0.036242,0.703619,-1.156933
1,0.641196,-1.129535,1.815749,-0.677096
2,1.001452,-1.481176,-1.734683,-0.072458
3,1.285567,-0.501637,,0.04717
4,-1.7022,0.446279,,-0.324429


In [43]:
# Combining Data with Overlap
df1 = pd.DataFrame({'a': [1., np.nan],
                   'b': [np.nan, 2.]})
df2 = pd.DataFrame({'a': [5., 4.],})
df1.combine_first(df2) # combine column by column, first df1, then df2, else NA

Unnamed: 0,a,b
0,1.0,
1,4.0,2.0


In [47]:
# Reshaping and Pivoting
# data.unstack() # may cause missing values
# data.stack() # pivot columns into rows by default, will filter out missing values
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                   index=pd.Index(['Ohio', 'Colorado'], name='state'),
                   columns=pd.Index(['one', 'two', 'three'], name='number'))
result = data.stack()
df = pd.DataFrame({'left': result, 'right': result + 5},
                 columns=pd.Index(['left', 'right'], name='side'))
print(df)
df.unstack('state') # state becomes the lowest level

side             left  right
state    number             
Ohio     one        0      5
         two        1      6
         three      2      7
Colorado one        3      8
         two        4      9
         three      5     10


side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [51]:
df.unstack('state').stack('side')

Unnamed: 0_level_0,state,Colorado,Ohio
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7


In [55]:
# Pivoting 'long' to 'Wide'
#pivoted = data.pivot(row_index, col_index, 'value') # equivalent to using set_index followed by unstack()
#unstacked = data.set_index([row_index, col_index]).unstack(col_index)

# Pivoting 'Wide' to 'Long'
df = pd.DataFrame({'key': ['foo', 'bar', 'baz'],
                  'A': [1, 2, 3],
                  'B': [4, 5, 6],
                  'C': [7, 8, 9]})
print(df)
melted = pd.melt(df, ['key'])
melted

   key  A  B  C
0  foo  1  4  7
1  bar  2  5  8
2  baz  3  6  9


Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9


In [57]:
reshaped = melted.pivot('key', 'variable', 'value')
reshaped.reset_index()

variable,key,A,B,C
0,bar,2,5,8
1,baz,3,6,9
2,foo,1,4,7


In [58]:
pd.melt(df, id_vars=['key'], value_vars=['A', 'B'])

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
