# Missing Data

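This notebook shows a few convenient methods for dealing with missing data (`NaN`) in pandas: `dropna` to remove rows or columns that contain missing values, and `fillna` to replace them.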

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Small example frame with missing values:
# column A has one NaN, column B has two, column C has none.
df = pd.DataFrame(
    {
        'A': [1, 2, np.nan],
        'B': [5, np.nan, np.nan],
        'C': [1, 2, 3],
    }
)

In [5]:
df  # show the frame; the empty entries in columns A and B are the NaN values

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [7]:
# Defaults written out: look along the index and drop every row that has any NaN.
df.dropna(axis='index', how='any')

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [9]:
# Same operation along the other axis: drop every column that has any NaN.
df.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


In [10]:
df  # unchanged: the dropna calls in the earlier cells returned new frames and did not overwrite df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [12]:
# With inplace=True the columns containing NaN are removed from df itself,
# so displaying df afterwards shows only column C.
df.dropna(axis='columns', inplace=True)
df

Unnamed: 0,C
0,1
1,2
2,3


In [13]:
# Rebuild the original example frame (A: one NaN, B: two NaN, C: none)
# and display it.
df = pd.DataFrame(
    {
        'A': [1, 2, np.nan],
        'B': [5, np.nan, np.nan],
        'C': [1, 2, 3],
    }
)
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [15]:
# Keep only rows with at least 2 non-NaN values; row 2 has just one (C) and is dropped.
df.dropna(axis='index', thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [17]:
# Keep only columns with at least 2 non-NaN values; column B has just one and is dropped.
df.dropna(axis='columns', thresh=2)

Unnamed: 0,A,C
0,1.0,1
1,2.0,2
2,,3


In [18]:
df  # the thresh-based dropna calls also left df as it was

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [19]:
# Replace every NaN in the frame with the scalar 0.
df.fillna(0)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,0.0,2
2,0.0,0.0,3


In [20]:
df  # still contains NaN: fillna returned a new frame

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [21]:
# Fill a single column: select A as a Series and replace its NaN with 0.
df['A'].fillna(0)

0    1.0
1    2.0
2    0.0
Name: A, dtype: float64

In [22]:
# Same for column B, this time using 10 as the replacement value.
df['B'].fillna(10)

0     5.0
1    10.0
2    10.0
Name: B, dtype: float64

In [23]:
df  # filling a selected column returned a new Series; df itself still has its NaN

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [25]:
# Fill column A with its own mean; the mean of the non-NaN values 1 and 2 is 1.5.
df['A'].fillna(df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

In [18]:
df  # column A in df keeps its NaN; only the returned Series was filled

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [26]:
# Fill a single row: row 1 holds A=2, B=NaN, C=2, so its mean over the non-NaN values is 2.0.
df.loc[1].fillna(df.loc[1].mean())

A    2.0
B    2.0
C    2.0
Name: 1, dtype: float64

In [27]:
df  # row 1 of df is untouched; the filled row was returned as a separate Series

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [28]:
# Per-column replacements: each key is a column name, each value fills the NaN in that column.
values = {
    'A': 0,
    'B': 1,
    'C': 2,
}
df.fillna(values)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,1.0,2
2,0.0,1.0,3


In [29]:
df  # unchanged again, because inplace was not set in the fillna call

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [22]:
# Setting inplace=True writes the per-column fills back into df,
# so the frame displayed below no longer contains NaN.
df.fillna(values, inplace=True)
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,1.0,2
2,0.0,1.0,3
