http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html

In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer

### Create DataFrame with Random Float Numbers

In [35]:
df = pd.DataFrame(np.random.random(size=(100, 4)), columns=list('ABCD'))

Unnamed: 0,A,B,C,D
0,0.30049,0.84741,0.675516,0.641422
1,0.967723,0.125482,0.948713,0.612066
2,0.90495,0.76979,0.179652,0.83973
3,0.559112,0.519487,0.303547,0.143799
4,0.99021,0.545354,0.88006,0.739226


### Randomly Insert Missing values into DataFrame

In [36]:
import random
ix = [(row, col) for row in range(df.shape[0]) for col in range(df.shape[1])]
for row, col in random.sample(ix, int(round(.1*len(ix)))):
    df.iat[row, col] = np.nan
df

Unnamed: 0,A,B,C,D
0,0.300490,0.847410,0.675516,0.641422
1,0.967723,0.125482,0.948713,0.612066
2,0.904950,0.769790,,0.839730
3,,0.519487,,0.143799
4,0.990210,0.545354,0.880060,0.739226
5,0.610543,0.960990,0.488889,0.535773
6,0.537496,0.180877,0.251928,
7,0.881764,0.797916,0.439437,0.749860
8,0.363432,0.842719,0.514104,0.182874
9,0.303383,0.017908,0.899540,0.246292


### Fit Imputer

In [37]:
# Create an imputer object that looks for 'Nan' values, then replaces them with the mean value of the feature by columns (axis=0)
mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=1)

# Train the imputor on the df dataset
mean_imputer = mean_imputer.fit(df)

### Apply Imputer

In [38]:
# Apply the imputer to the df dataset
imputed_df = mean_imputer.transform(df.values)

In [39]:
pd.DataFrame(imputed_df)

Unnamed: 0,0,1,2,3
0,0.300490,0.847410,0.675516,0.641422
1,0.967723,0.125482,0.948713,0.612066
2,0.904950,0.769790,0.838157,0.839730
3,0.331643,0.519487,0.331643,0.143799
4,0.990210,0.545354,0.880060,0.739226
5,0.610543,0.960990,0.488889,0.535773
6,0.537496,0.180877,0.251928,0.323434
7,0.881764,0.797916,0.439437,0.749860
8,0.363432,0.842719,0.514104,0.182874
9,0.303383,0.017908,0.899540,0.246292
