In [19]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the whole /kaggle/input tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join directory and file name so the printed path can be pasted into pd.read_csv.
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv


In [20]:
# Load the Titanic train and test CSV files from the Kaggle input directory.
train=pd.read_csv('/kaggle/input/titanic/train.csv')
test=pd.read_csv('/kaggle/input/titanic/test.csv')

In [21]:
# Show dtypes and non-null counts per column to see which features have missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [22]:
# Numeric feature columns that are handed to the imputation functions.
# Per train.info(), Age is the only one of these with missing values in train (714 of 891 non-null).
num_cols=[ 'Pclass','SibSp','Parch', 'Age', 'Fare']

### Univariate feature imputation

These methods fill the missing values of a feature using only the non-missing values of that same feature (here, the column mean).


In [23]:
from sklearn.impute import SimpleImputer

In [24]:
def simple_impute(data, cols):
    """Return a copy of ``data`` with the missing values in ``cols`` mean-imputed.

    Univariate imputation: each column in ``cols`` is filled with the mean of
    its own observed values, using ``SimpleImputer(strategy='mean')``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to impute. It is not modified.
    cols : list of str
        Names of the numeric columns to impute.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` whose ``cols`` hold the imputer's output. The
        imputer returns a single numeric array, so integer columns may come
        back as floats.
    """
    data_imp = data.copy()
    imputer = SimpleImputer(strategy='mean')
    imputer.fit(data[cols])
    # Transform the frame that was passed in (not a module-level variable) and
    # write the result into the copy, so the caller's frame keeps its missing
    # values and other imputation methods can still be applied to it.
    data_imp[cols] = imputer.transform(data[cols])
    return data_imp

In [25]:
# Mean-impute the numeric columns, then check with info() that none of them
# reports missing values any more.
train_uni=simple_impute(train, num_cols)
train_uni[num_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  891 non-null    float64
 1   SibSp   891 non-null    float64
 2   Parch   891 non-null    float64
 3   Age     891 non-null    float64
 4   Fare    891 non-null    float64
dtypes: float64(5)
memory usage: 34.9 KB


### Multivariate feature imputation

These methods use all the features to fill in the missing values.

2.1 **Iterative Imputer**

At each step, one column is treated as the output $y$ and the remaining features are treated as the input $X$. A model is fit on the rows of $(X, y)$ where $y$ is known, and is then used to predict the missing values of $y$.

In [26]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [27]:
def iterative_impute(data, cols):
    """Return a copy of ``data`` with ``cols`` imputed by ``IterativeImputer``.

    Multivariate imputation: each column with missing values is modelled from
    the other columns in ``cols``, repeating for up to 10 rounds. Missing
    entries start from the column mean (``initial_strategy='mean'``) and
    columns are visited in ``'ascending'`` order.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to impute. It is not modified.
    cols : list of str
        Names of the numeric columns used both as predictors and as
        imputation targets.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` whose ``cols`` hold the imputer's output.
    """
    data_imp = data.copy()
    iter_imputer = IterativeImputer(max_iter=10, initial_strategy='mean',
                                    imputation_order='ascending')
    # Fit and transform exactly the columns requested by the caller; relying
    # on a global column list would ignore ``cols`` and break the assignment
    # below whenever the two lists differ.
    iter_imputer.fit(data[cols])
    data_imp[cols] = iter_imputer.transform(data[cols])
    return data_imp

In [28]:
# Impute the numeric columns with the iterative imputer, then check the
# non-null counts of the result with info().
train_iter=iterative_impute(train, num_cols)
train_iter[num_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  891 non-null    float64
 1   SibSp   891 non-null    float64
 2   Parch   891 non-null    float64
 3   Age     891 non-null    float64
 4   Fare    891 non-null    float64
dtypes: float64(5)
memory usage: 34.9 KB


2.2 **KNN Imputer**

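This method uses a k-nearest-neighbours (KNN) model to predict the missing values: each missing entry is filled in from the values of the most similar rows.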

In [29]:
from sklearn.impute import KNNImputer
def knn_impute(data, cols):
    """Return a copy of ``data`` with ``cols`` imputed by ``KNNImputer``.

    Each missing entry is filled from the values of the nearest rows, measured
    on the columns in ``cols``; ``weights='distance'`` makes closer neighbours
    count more than distant ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to impute. It is not modified.
    cols : list of str
        Names of the numeric columns used both to find neighbours and as
        imputation targets.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` whose ``cols`` hold the imputer's output.
    """
    data_imp = data.copy()
    imputer = KNNImputer(weights='distance')
    # Fit and transform exactly the columns requested by the caller; relying
    # on a global column list would ignore ``cols`` and break the assignment
    # below whenever the two lists differ.
    imputer.fit(data[cols])
    data_imp[cols] = imputer.transform(data[cols])
    return data_imp


In [30]:
# Impute the numeric columns with the KNN imputer, then check the non-null
# counts of the result with info().
train_knn=knn_impute(train, num_cols)
train_knn[num_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  891 non-null    float64
 1   SibSp   891 non-null    float64
 2   Parch   891 non-null    float64
 3   Age     891 non-null    float64
 4   Fare    891 non-null    float64
dtypes: float64(5)
memory usage: 34.9 KB
