In [1]:
import pandas as pd

In [2]:
# Load the training CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('data/titanic_train.csv')

# --- Detection ---

In [3]:
def get_df_info(df):
    """Print a structural summary and per-column null counts, then return a preview.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        The first rows of ``df`` (the result of ``df.head()``).
    """
    # DataFrame.info() writes its report to stdout itself and returns None, so
    # it is called on its own; passing it to print() would emit a stray "None".
    df.info()
    print('\n\n*** Total Null Elements ***\n\n', df.isnull().sum())
    return df.head()

In [4]:
# Prints the info report and null counts; the returned head() becomes the cell output.
get_df_info(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None 

*** Total Null Elements ***

 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# --- Univariate Imputation ---

In [5]:
import numpy as np

In [6]:
# Toy frame for the univariate examples: two numeric columns and one string
# column, each with at least one missing entry.
df = pd.DataFrame(
    data={
        'col_1': [1, 2, np.nan, 3, 2, 3],
        'col_2': ['A', 'B', 'B', 'C', np.nan, 'C'],
        'col_3': [8, 6, np.nan, 9, np.nan, 1],
    }
)
df

Unnamed: 0,col_1,col_2,col_3
0,1.0,A,8.0
1,2.0,B,6.0
2,,B,
3,3.0,C,9.0
4,2.0,,
5,3.0,C,1.0


### Numerical Data

In [7]:
# Mean imputation. fillna returns a new Series by default, so df is not modified.
df['col_1'].fillna(value=df['col_1'].mean())

0    1.0
1    2.0
2    2.2
3    3.0
4    2.0
5    3.0
Name: col_1, dtype: float64

In [8]:
# Median imputation. fillna returns a new Series by default, so df is not modified.
df['col_1'].fillna(value=df['col_1'].median())

0    1.0
1    2.0
2    2.0
3    3.0
4    2.0
5    3.0
Name: col_1, dtype: float64

In [9]:
# Most-frequent-value imputation. mode() returns a Series, so its first entry is used.
df['col_1'].fillna(value=df['col_1'].mode().iloc[0])

0    1.0
1    2.0
2    2.0
3    3.0
4    2.0
5    3.0
Name: col_1, dtype: float64

### Categorical Data

In [10]:
# For string columns the mode is the applicable statistic; take its first entry.
df['col_2'].fillna(value=df['col_2'].mode().iloc[0])

0    A
1    B
2    B
3    C
4    B
5    C
Name: col_2, dtype: object

### Combining Several Columns

In [11]:
# Per-column fill values: a number for col_1 and a placeholder label for col_2.
dict_to_impute = {
    'col_1': 10,
    'col_2': 'QQ',
}

In [12]:
# Select the two columns, then fill each with its own value from the dict.
df[['col_1', 'col_2']].fillna(value=dict_to_impute)

Unnamed: 0,col_1,col_2
0,1.0,A
1,2.0,B
2,10.0,B
3,3.0,C
4,2.0,QQ
5,3.0,C


In [13]:
# Columns without an entry in the dict (here col_3) keep their missing values.
df.fillna(value=dict_to_impute)

Unnamed: 0,col_1,col_2,col_3
0,1.0,A,8.0
1,2.0,B,6.0
2,10.0,B,
3,3.0,C,9.0
4,2.0,QQ,
5,3.0,C,1.0


### Using Imputer

In [14]:
from sklearn.impute import SimpleImputer

In [15]:
# Same toy frame, except that the gap in col_2 is encoded with the string
# sentinel 'none' instead of NaN.
df = pd.DataFrame(
    data={
        'col_1': [1, 2, np.nan, 3, 2, 3],
        'col_2': ['A', 'B', 'B', 'C', 'none', 'C'],
        'col_3': [8, 6, np.nan, 9, np.nan, 1],
    }
)
df

Unnamed: 0,col_1,col_2,col_3
0,1.0,A,8.0
1,2.0,B,6.0
2,,B,
3,3.0,C,9.0
4,2.0,none,
5,3.0,C,1.0


In [16]:
# Categorical imputer: treats the string sentinel 'none' as missing and
# replaces it with the most frequent label.
# Numerical imputer: replaces NaN with the column median and, via
# add_indicator=True, appends indicator columns to the transformed output.
# The `verbose` argument is intentionally not passed: it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where supplying it raises a TypeError.
categ_imputer = SimpleImputer(missing_values='none', strategy='most_frequent')
num_imputer = SimpleImputer(missing_values=np.nan, strategy='median', add_indicator=True)

In [17]:
# Split the column names by dtype: object columns are treated as categorical,
# everything else as numerical.
categ_cols = df.select_dtypes(include=['object']).columns.tolist()
num_cols = df.select_dtypes(exclude=['object']).columns.tolist()

print(
    'Categorical columns:\t', categ_cols,
    '\nNumerical columns:\t', num_cols,
)

Categorical columns:	 ['col_2'] 
Numerical columns:	 ['col_1', 'col_3']


In [18]:
# Learn the fill values from the data. The result of fit() is assigned back to
# the same names, which also keeps the cell from displaying an estimator repr.
categ_imputer = categ_imputer.fit(df[categ_cols])
num_imputer = num_imputer.fit(df[num_cols])

In [19]:
# Transform the same DataFrame slices the imputers were fitted on, so the
# feature names seen at fit and transform time agree (passing bare arrays
# triggers a feature-name warning on recent scikit-learn).
df[categ_cols] = categ_imputer.transform(df[categ_cols])

# With add_indicator=True the transformed matrix is [imputed columns | indicators],
# and indicators are only produced for features that had missing values at fit
# time. Derive the indicator names from the fitted indicator instead of
# assuming one per numeric column, which would fail on a length mismatch
# whenever a numeric column is complete.
indicator_cols = [num_cols[i] + '_na_indicator'
                  for i in num_imputer.indicator_.features_]
df[num_cols + indicator_cols] = pd.DataFrame(num_imputer.transform(df[num_cols]),
                                             columns=num_cols + indicator_cols,
                                             index=df.index)

In [20]:
# Show the imputed frame, including the appended *_na_indicator columns.
df

Unnamed: 0,col_1,col_2,col_3,col_1_na_indicator,col_3_na_indicator
0,1.0,A,8.0,0.0,0.0
1,2.0,B,6.0,0.0,0.0
2,2.0,B,7.0,1.0,1.0
3,3.0,C,9.0,0.0,0.0
4,2.0,B,7.0,0.0,1.0
5,3.0,C,1.0,0.0,0.0


In [21]:
# statistics_ holds the fill value used for each numerical column, in num_cols order.
print('Imputed Numerical Values:\n', num_imputer.statistics_)

Imputed Numerical Values:
 [2. 7.]


In [22]:
# statistics_ holds the fill value used for each categorical column, in categ_cols order.
print('Imputed Categorical Values:\n', categ_imputer.statistics_)

Imputed Categorical Values:
 ['B']


# --- Multivariate Imputation ---

For details, see the scikit-learn `IterativeImputer` documentation:
https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html

In [23]:
# Alternative library (not executed here): impyute, https://github.com/eltonlaw/impyute
#from impyute.imputation.cs import fast_knn
#imputed_training=fast_knn(df.values, k=3)

In [24]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [25]:
# Toy frame for the multivariate example: two numeric and two string columns,
# all of them containing missing entries.
df = pd.DataFrame(
    data={
        'col_1': [1, 7, np.nan, 3, 2, 3],
        'col_2': ['A', 'B', 'B', 'C', np.nan, 'C'],
        'col_3': [8, 6, np.nan, 9, np.nan, 1],
        'col_4': ['qwe', np.nan, 'rty', 'C', np.nan, 'C'],
    }
)
df

Unnamed: 0,col_1,col_2,col_3,col_4
0,1.0,A,8.0,qwe
1,7.0,B,6.0,
2,,B,,rty
3,3.0,C,9.0,C
4,2.0,,,
5,3.0,C,1.0,C


In [26]:
# Iterative (multivariate) imputer with a fixed seed for reproducible results.
imputer = IterativeImputer(
    max_iter=10,
    imputation_order='descending',
    random_state=0,
)

In [27]:
# Names of the object-dtype (string) columns that need encoding before imputation.
categ_cols = df.select_dtypes(include=['object']).columns.tolist()

In [28]:
# Replace each categorical column with integer codes and remember, per column,
# the label->code mapping plus the smallest and largest code in use. Missing
# entries have no mapping and therefore stay NaN.
mappings = {}
for categ_col in categ_cols:
    # np.unique yields the distinct non-missing labels in sorted order.
    class_mapping = {
        label: code
        for code, label in enumerate(np.unique(df[categ_col].dropna()))
    }

    df[categ_col] = df[categ_col].map(class_mapping)

    mappings.update({
        categ_col: class_mapping,
        categ_col + 'min': df[categ_col].min(),
        categ_col + 'max': df[categ_col].max(),
    })
df

Unnamed: 0,col_1,col_2,col_3,col_4
0,1.0,0.0,8.0,1.0
1,7.0,1.0,6.0,
2,,1.0,,2.0
3,3.0,2.0,9.0,0.0
4,2.0,,,
5,3.0,2.0,1.0,0.0


In [29]:
# Fit the imputer on the fully numeric matrix, then write the imputed values
# back into every column. The encoded categorical columns are imputed like any
# other numeric column, so their filled-in codes can be fractional.
imputer.fit(df.values)
df[df.columns] = imputer.transform(df.values)
df

Unnamed: 0,col_1,col_2,col_3,col_4
0,1.0,0.0,8.0,1.0
1,7.0,1.0,6.0,0.749806
2,3.199933,1.0,6.000503,2.0
3,3.0,2.0,9.0,0.0
4,2.0,1.199074,6.000644,0.750066
5,3.0,2.0,1.0,0.0


In [30]:
# Turn the (possibly fractional) imputed codes back into the original labels.
for categ_col in categ_cols:
    inv_class_mapping = {code: label for label, code in mappings[categ_col].items()}

    # Round to the nearest integer code, then clamp into the range of codes
    # seen during encoding so that every value has a label to map back to.
    df[categ_col] = np.clip(
        [round(value) for value in df[categ_col]],
        mappings[categ_col+'min'],
        mappings[categ_col+'max'],
    )

    df[categ_col] = df[categ_col].map(inv_class_mapping)
df

Unnamed: 0,col_1,col_2,col_3,col_4
0,1.0,A,8.0,qwe
1,7.0,B,6.0,qwe
2,3.199933,B,6.000503,rty
3,3.0,C,9.0,C
4,2.0,B,6.000644,qwe
5,3.0,C,1.0,C
