In [200]:
# Pandas is open source library built on top of NumPy
# It allows for a fast analysis and data cleaning and preparation
# It excels in performance and productivity
# it also has built-in vizualization features 
# it can work with data from a wide variety of sources

In [201]:
import pandas as pd 
import numpy as np
from numpy.random import randn

In [202]:
# Series 

In [203]:
labels = ['a', 'b','c']
my_data = [10,20,30]
arr = np.array(my_data)
d = {'a':10, 'b':20, 'c':30}

In [204]:
pd.Series(data=my_data)

0    10
1    20
2    30
dtype: int64

In [205]:
pd.Series(data=my_data, index=labels)

a    10
b    20
c    30
dtype: int64

In [206]:
pd.Series(my_data,labels)

a    10
b    20
c    30
dtype: int64

In [207]:
pd.Series(arr)

0    10
1    20
2    30
dtype: int64

In [208]:
pd.Series(arr, labels)

a    10
b    20
c    30
dtype: int64

In [209]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [210]:
pd.Series(labels)

0    a
1    b
2    c
dtype: object

In [211]:
ser1 = pd.Series([1,2,3,4],['Serbia','Croatia','USA','England'])
ser1

Serbia     1
Croatia    2
USA        3
England    4
dtype: int64

In [212]:
ser2 = pd.Series([1,2,5,4],['Serbia','Croatia','Brazil','England'])
ser2

Serbia     1
Croatia    2
Brazil     5
England    4
dtype: int64

In [213]:
ser1['USA']

3

In [214]:
ser1[2]

3

In [215]:
ser1 + ser2

Brazil     NaN
Croatia    4.0
England    8.0
Serbia     2.0
USA        NaN
dtype: float64

In [216]:
# DataFrames

In [217]:
np.random.seed(101)

In [218]:
df = pd.DataFrame(randn(5,4),['a', 'b','c','d','e'],['w','x','y','z'])
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [219]:
df['w']

a    2.706850
b    0.651118
c   -2.018168
d    0.188695
e    0.190794
Name: w, dtype: float64

In [220]:
type(df['w'])

pandas.core.series.Series

In [221]:
df['x']

a    0.628133
b   -0.319318
c    0.740122
d   -0.758872
e    1.978757
Name: x, dtype: float64

In [222]:
df[['x','z']]

Unnamed: 0,x,z
a,0.628133,0.503826
b,-0.319318,0.605965
c,0.740122,-0.589001
d,-0.758872,0.955057
e,1.978757,0.683509


In [223]:
df['new'] = df["w"] + df['z']

In [224]:
df

Unnamed: 0,w,x,y,z,new
a,2.70685,0.628133,0.907969,0.503826,3.210676
b,0.651118,-0.319318,-0.848077,0.605965,1.257083
c,-2.018168,0.740122,0.528813,-0.589001,-2.607169
d,0.188695,-0.758872,-0.933237,0.955057,1.143752
e,0.190794,1.978757,2.605967,0.683509,0.874303


In [225]:
df.drop('new', axis=1)

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [226]:
df

Unnamed: 0,w,x,y,z,new
a,2.70685,0.628133,0.907969,0.503826,3.210676
b,0.651118,-0.319318,-0.848077,0.605965,1.257083
c,-2.018168,0.740122,0.528813,-0.589001,-2.607169
d,0.188695,-0.758872,-0.933237,0.955057,1.143752
e,0.190794,1.978757,2.605967,0.683509,0.874303


In [227]:
df.drop('new', axis=1, inplace = True)
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [228]:
df.drop('e', axis=0)

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057


In [229]:
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [230]:
df.shape

(5, 4)

In [231]:
# Rows

In [232]:
df.loc['a']

w    2.706850
x    0.628133
y    0.907969
z    0.503826
Name: a, dtype: float64

In [233]:
df.iloc[0]

w    2.706850
x    0.628133
y    0.907969
z    0.503826
Name: a, dtype: float64

In [234]:
df.loc['b','y']

-0.84807698340363147

In [235]:
df.loc[['b','c'],['x','y']]

Unnamed: 0,x,y
b,-0.319318,-0.848077
c,0.740122,0.528813


In [236]:
booldf = df > 0
booldf

Unnamed: 0,w,x,y,z
a,True,True,True,True
b,True,False,False,True
c,False,True,True,False
d,True,False,False,True
e,True,True,True,True


In [237]:
df[booldf]

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,,,0.605965
c,,0.740122,0.528813,
d,0.188695,,,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [238]:
df[df>0]

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,,,0.605965
c,,0.740122,0.528813,
d,0.188695,,,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [239]:
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [240]:
df[df['z']<0]

Unnamed: 0,w,x,y,z
c,-2.018168,0.740122,0.528813,-0.589001


In [241]:
df[df['w']>0]['x']

a    0.628133
b   -0.319318
d   -0.758872
e    1.978757
Name: x, dtype: float64

In [242]:
df.reset_index()

Unnamed: 0,index,w,x,y,z
0,a,2.70685,0.628133,0.907969,0.503826
1,b,0.651118,-0.319318,-0.848077,0.605965
2,c,-2.018168,0.740122,0.528813,-0.589001
3,d,0.188695,-0.758872,-0.933237,0.955057
4,e,0.190794,1.978757,2.605967,0.683509


In [243]:
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [244]:
newind = 'CA NY WR OR CO'.split()

In [245]:
newind

['CA', 'NY', 'WR', 'OR', 'CO']

In [246]:
df['States'] = newind

In [247]:
df

Unnamed: 0,w,x,y,z,States
a,2.70685,0.628133,0.907969,0.503826,CA
b,0.651118,-0.319318,-0.848077,0.605965,NY
c,-2.018168,0.740122,0.528813,-0.589001,WR
d,0.188695,-0.758872,-0.933237,0.955057,OR
e,0.190794,1.978757,2.605967,0.683509,CO


In [248]:
df.set_index('States')

Unnamed: 0_level_0,w,x,y,z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WR,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


In [249]:
df

Unnamed: 0,w,x,y,z,States
a,2.70685,0.628133,0.907969,0.503826,CA
b,0.651118,-0.319318,-0.848077,0.605965,NY
c,-2.018168,0.740122,0.528813,-0.589001,WR
d,0.188695,-0.758872,-0.933237,0.955057,OR
e,0.190794,1.978757,2.605967,0.683509,CO


In [250]:
# index levels
outside = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2' ]
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside, inside))
hier_index = pd.MultiIndex.from_tuples(hier_index)

In [251]:
inside

[1, 2, 3, 1, 2, 3]

In [252]:
hier_index

MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [253]:
list(zip(outside, inside))

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [254]:
df = pd.DataFrame(randn(6,2),hier_index, ['A','B'])

In [255]:
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [256]:
df.loc['G1'].loc[1]

A    0.302665
B    1.693723
Name: 1, dtype: float64

In [257]:
df.index.names

FrozenList([None, None])

In [258]:
df.index.names = ['Groups', 'Num']

In [259]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [260]:
df.loc['G2'].loc[2]['B']

0.072959675317038689

In [261]:
df.xs

<bound method NDFrame.xs of                    A         B
Groups Num                    
G1     1    0.302665  1.693723
       2   -1.706086 -1.159119
       3   -0.134841  0.390528
G2     1    0.166905  0.184502
       2    0.807706  0.072960
       3    0.638787  0.329646>

In [262]:
df.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [263]:
df.xs(1, level='Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.302665,1.693723
G2,0.166905,0.184502


In [264]:
df.xs(3, level='Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-0.134841,0.390528
G2,0.638787,0.329646


In [265]:
# Missing Data

In [266]:
d = {'A':[1,2,np.nan] , 'B':[5,np.nan,np.nan], 'C':[1,2,3]}
d

{'A': [1, 2, nan], 'B': [5, nan, nan], 'C': [1, 2, 3]}

In [267]:
df1 = pd.DataFrame(d)

In [268]:
df1

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [273]:
# removing everything all with mising item

In [274]:
df1.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


In [275]:
df1.dropna(axis=0)

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [276]:
# trash argumnet
df1.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [277]:
# adding missing values
df1.fillna(value='Add Value')

Unnamed: 0,A,B,C
0,1,5,1
1,2,Add Value,2
2,Add Value,Add Value,3


In [278]:
df1

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [279]:
df1['A']

0    1.0
1    2.0
2    NaN
Name: A, dtype: float64

In [280]:
df1['A'].fillna(value=df['A'].mean())

0    1.000000
1    2.000000
2    0.012523
Name: A, dtype: float64

In [281]:
df1['B'].fillna(value=df['B'].mean())

0    5.00000
1    0.25204
2    0.25204
Name: B, dtype: float64

In [None]:
# GroupBy

In [282]:
data = {'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
         'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
         'Sales': [200,120,340,124,243,350]}

In [285]:
df2 = pd.DataFrame(data)
df2

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


In [287]:
df2.groupby('Company')

<pandas.core.groupby.DataFrameGroupBy object at 0x10b1fcf60>

In [289]:
byComp = df2.groupby('Company')
byComp

<pandas.core.groupby.DataFrameGroupBy object at 0x10b204fd0>