In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small string Series with one missing entry (np.nan) to practise NaN handling on.
data = pd.Series(
    data=['one', 'two', np.nan, 'four'],
)
data

0     one
1     two
2     NaN
3    four
dtype: object

In [3]:
# check whether the series has missing values -> True marks a missing (NaN) entry
data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
# drop missing values; the surviving entries keep their original index labels (0, 1, 3)
data.dropna()

0     one
1     two
3    four
dtype: object

In [5]:
# 4x3 frame with NaNs scattered through it; the last row is entirely NaN.
dframe = pd.DataFrame(
    data=[
        [1, 2, 3],
        [np.nan, 5, 6],
        [7, np.nan, 9],
        [np.nan, np.nan, np.nan],
    ]
)
dframe

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0
3,,,


In [6]:
# Default dropna behaviour spelled out: work row-wise (axis=0) and drop a row
# as soon as it contains any NaN (how='any').
clean_dframe = dframe.dropna(axis=0, how='any')
clean_dframe

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [7]:
# how='all' drops a row only when every value in it is NaN,
# so just the last (all-NaN) row is removed.
dframe.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0


In [8]:
# Drop column-wise instead of row-wise. Every column contains at least one NaN
# (row 3 is all NaN), so no column survives and only the row index is left.
dframe.dropna(axis='columns')

0
1
2
3


In [9]:
# Second example frame with named columns; row 3 has a single non-NaN value.
dframe2 = pd.DataFrame(
    data=[
        [1, 2, 3, np.nan],
        [2, np.nan, 5, 6],
        [6, 7, np.nan, 11],
        [1, np.nan, np.nan, np.nan],
    ],
    columns=['a', 'b', 'c', 'd'],
)
dframe2

Unnamed: 0,a,b,c,d
0,1,2.0,3.0,
1,2,,5.0,6.0
2,6,7.0,,11.0
3,1,,,


In [10]:
# thresh -> keep only the rows that have at least 2 non-NaN values;
# row 3 has a single value and is dropped.
dframe2.dropna(axis=0, thresh=2)

Unnamed: 0,a,b,c,d
0,1,2.0,3.0,
1,2,,5.0,6.0
2,6,7.0,,11.0


In [11]:
# Raising the threshold to 3 gives the same result here: rows 0-2 each hold
# exactly three non-NaN values, while row 3 holds only one.
dframe2.dropna(axis=0, thresh=3)

Unnamed: 0,a,b,c,d
0,1,2.0,3.0,
1,2,,5.0,6.0
2,6,7.0,,11.0


In [12]:
# NaNs can be filled with a constant, or with column statistics such as the
# mean or median; here every NaN is replaced by the constant 1.
dframe.fillna(value=1)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,1.0,5.0,6.0
2,7.0,1.0,9.0
3,1.0,1.0,1.0


In [13]:
dframe2.fillna({0:0, 1:1, 2:2, 3:3})

Unnamed: 0,a,b,c,d
0,1,2.0,3.0,
1,2,,5.0,6.0
2,6,7.0,,11.0
3,1,,,


In [14]:
# dframe2 itself still holds its NaNs: fillna returned a new frame instead of modifying it in place
dframe2

Unnamed: 0,a,b,c,d
0,1,2.0,3.0,
1,2,,5.0,6.0
2,6,7.0,,11.0
3,1,,,


In [15]:
# column-wise mean (axis=0 aggregates down the rows); NaNs are skipped, e.g. b = (2 + 7) / 2 = 4.5
dframe2.mean(axis=0)

a    2.5
b    4.5
c    4.0
d    8.5
dtype: float64

In [16]:
# axis=0 is the default, so this gives the same per-column means as the explicit axis=0 call
dframe2.mean()

a    2.5
b    4.5
c    4.0
d    8.5
dtype: float64

In [17]:
# Fill each column's NaNs with that column's mean: the Series returned by
# mean() is aligned to the columns by label.
dframe2.fillna(value=dframe2.mean())

Unnamed: 0,a,b,c,d
0,1,2.0,3.0,8.5
1,2,4.5,5.0,6.0
2,6,7.0,4.0,11.0
3,1,4.5,4.0,8.5


In [18]:
# column-wise median, again skipping NaNs (a: median of 1, 1, 2, 6 = 1.5)
dframe2.median()

a    1.5
b    4.5
c    4.0
d    8.5
dtype: float64

In [19]:
# Same idea with the per-column median. Columns b, c and d each have only two
# non-NaN values, so their median equals their mean and the result matches the mean fill.
dframe2.fillna(value=dframe2.median())

Unnamed: 0,a,b,c,d
0,1,2.0,3.0,8.5
1,2,4.5,5.0,6.0
2,6,7.0,4.0,11.0
3,1,4.5,4.0,8.5


## Index Hierarchy 

In [20]:
from numpy.random import randn

In [22]:
# Passing a list of two equal-length label lists as the index builds a
# two-level MultiIndex (outer level 1/2, inner level 'a'/'b'/'c').
# No random seed is set in the cells above, so the values change on every run.
ser = pd.Series(
    data=randn(6),
    index=[
        [1, 1, 1, 2, 2, 2],
        ['a', 'b', 'c', 'a', 'b', 'c'],
    ],
)
ser

1  a   -1.052152
   b   -2.166826
   c   -1.605112
2  a    0.539672
   b   -0.128014
   c   -1.142945
dtype: float64

In [24]:
# inspect the index: a MultiIndex whose entries are (outer, inner) label tuples
ser.index

MultiIndex([(1, 'a'),
            (1, 'b'),
            (1, 'c'),
            (2, 'a'),
            (2, 'b'),
            (2, 'c')],
           )

In [25]:
# Select everything under outer label 1; the result is indexed by the inner level only.
ser.loc[1]

a   -1.052152
b   -2.166826
c   -1.605112
dtype: float64

In [26]:
# Select everything under outer label 2; the result is indexed by the inner level only.
ser.loc[2]

a    0.539672
b   -0.128014
c   -1.142945
dtype: float64

In [28]:
# show the full series again for reference before selecting on the inner level
ser

1  a   -1.052152
   b   -2.166826
   c   -1.605112
2  a    0.539672
   b   -0.128014
   c   -1.142945
dtype: float64

In [29]:
# get all 'a': ':' keeps every outer label, 'a' picks the inner label,
# and the result is indexed by the outer level only
ser[:,'a']

1   -1.052152
2    0.539672
dtype: float64

In [30]:
# Pivot the innermost index level (level=-1, the default) into columns, turning
# the two-level Series into a DataFrame: outer labels as rows, inner labels as columns.
# Note: this rebinds `dframe`, replacing the NaN example frame built earlier.
dframe = ser.unstack(level=-1)
dframe

Unnamed: 0,a,b,c
1,-1.052152,-2.166826,-1.605112
2,0.539672,-0.128014,-1.142945
