In [12]:
import numpy as np
import pandas as pd
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
# (cells further down rely on this to show several DataFrames from one cell).
InteractiveShell.ast_node_interactivity = 'all'

## Dealing with missing data

### CSV-formatted file with NaN data

In [5]:
from io import StringIO

# Tiny in-memory CSV: the empty fields in column C (row 1) and column D (row 2)
# are the missing values this section works with.
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# Wrap the text in a file-like buffer so read_csv can parse it like a file on disk;
# the empty fields come back as NaN.
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)
display(df)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [7]:
# Number of missing values in each column (sum the NaN mask down the rows)
df.isnull().sum(axis=0)

A    0
B    0
C    1
D    1
dtype: int64

In [24]:
# Count the rows that contain at least one missing value
df.isnull().any(axis=1).sum()

2

In [8]:
# Access the DataFrame's underlying data as a NumPy array
# (the missing cells appear as nan in the array)
df.values

array([[  1.,   2.,   3.,   4.],
       [  5.,   6.,  nan,   8.],
       [ 10.,  11.,  12.,  nan]])

### Eliminating samples or features with missing values

In [10]:
# Remove every row (axis=0) in which any value is missing
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [11]:
# Remove every column that contains a NaN in at least one row
df.dropna(axis='columns', how='any')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [13]:
# Keep every row unless all of its columns are NaN
df.dropna(axis=0, how='all')

# Keep only the rows that hold at least 4 non-NaN values
df.dropna(axis=0, thresh=4)

# Drop a row only when the NaN falls in one of the listed columns (here: 'C')
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [20]:
# Re-count the rows with at least one NaN: the dropna() calls above are never
# assigned back to df, so the original frame is unchanged
df.isnull().any(axis=1).sum()

2

### Imputing missing values

**Mean imputation** estimates the missing values from the observed data:

each missing value is replaced by the mean value of its entire feature column.

In [26]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22; SimpleImputer is its replacement. It always imputes column-wise
# (the old axis=0 behaviour), so it takes no axis argument, and the missing
# value marker is np.nan rather than the string 'NaN'.
from sklearn.impute import SimpleImputer

# Replace each NaN with the mean of its feature column.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# Fit and transform on the same NumPy array so both calls see identical input
# (fitting on the DataFrame and transforming the bare array mixes the two).
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

# strategy can also be 'median' or 'most_frequent';
# 'most_frequent' is the one useful for imputing categorical feature values

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   8. ],
       [ 10. ,  11. ,  12. ,   6. ]])

### under