# Missing Data Imputation
1. Mean imputation
2. Mode imputation
3. Model-based imputation: KNN
4. Model-based imputation: regression
5. Multiple imputation: MICE
6. Deep neural network imputation: datawig

In [None]:
%load_ext lab_black

In [None]:
# importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from impyute.imputation.cs import fast_knn, mice
import datawig

In [None]:
# Read ../data/imu.csv into a DataFrame and preview its first two rows.
df = pd.read_csv(filepath_or_buffer="../data/imu.csv")
df.head(n=2)

In [None]:
# Keep only the four quaternion columns; the imputation cells below all
# operate on this frame.
quaternion_columns = ["Quaternion_1", "Quaternion_2", "Quaternion_3", "Quaternion_4"]
data = df.loc[:, quaternion_columns]
data.head(n=2)

## Mean Imputation

In [None]:
# Replace each missing value with the mean of its column, then rebuild a
# DataFrame with the original column names around the imputer's output.
impute_mean = SimpleImputer(strategy="mean")
imputed_mean = pd.DataFrame(impute_mean.fit_transform(data), columns=data.columns)
imputed_mean.head(n=2)

In [None]:
# Overlay the distribution of Quaternion_2 before and after mean imputation.
# Each series is drawn as a 90-bin histogram plus a KDE curve on the same axes.
for series in (data["Quaternion_2"], imputed_mean["Quaternion_2"]):
    sns.distplot(
        series,
        hist=True,
        kde=True,
        bins=90,
        hist_kws={"edgecolor": "black"},
        kde_kws={"linewidth": 4},
    )

# Legend labels are positional, so they must follow the plotting order above.
plt.legend(
    ["Original data", "Imputed Data"], prop={"size": 16}, title="Mean imputation"
)
# Call these as separate statements: joining them into a tuple expression
# makes the notebook echo `(None, None)` as the cell result.
plt.tight_layout()
plt.show()

## Mode imputation

In [None]:
# Replace each missing value with the most frequent value of its column, then
# rebuild a DataFrame with the original column names around the output.
impute_mode = SimpleImputer(strategy="most_frequent")
imputed_mode = pd.DataFrame(impute_mode.fit_transform(data), columns=data.columns)
imputed_mode.head(n=2)

In [None]:
# Overlay the distribution of Quaternion_2 before and after mode imputation.
# Each series is drawn as a 90-bin histogram plus a KDE curve on the same axes.
for series in (data["Quaternion_2"], imputed_mode["Quaternion_2"]):
    sns.distplot(
        series,
        hist=True,
        kde=True,
        bins=90,
        hist_kws={"edgecolor": "black"},
        kde_kws={"linewidth": 4},
    )

# Legend labels are positional, so they must follow the plotting order above.
plt.legend(
    ["Original data", "Imputed Data"], prop={"size": 16}, title="Mode imputation"
)
# Call these as separate statements: joining them into a tuple expression
# makes the notebook echo `(None, None)` as the cell result.
plt.tight_layout()
plt.show()

## Model-based imputation: KNN

In [None]:
# Impute the missing entries with impyute's fast_knn, passing the raw NumPy
# values of `data` and k=30 (presumably the number of neighbours -- confirm
# against the impyute documentation).
impute_knn = fast_knn(data.values, k=30)

In [None]:
# Wrap the KNN output in a DataFrame that carries the original column names.
imputed_knn = pd.DataFrame(data=impute_knn, columns=data.columns)
imputed_knn.head(n=2)

In [None]:
# Overlay the distribution of Quaternion_2 before and after KNN imputation.
# Each series is drawn as a 90-bin histogram plus a KDE curve on the same axes.
for series in (data["Quaternion_2"], imputed_knn["Quaternion_2"]):
    sns.distplot(
        series,
        hist=True,
        kde=True,
        bins=90,
        hist_kws={"edgecolor": "black"},
        kde_kws={"linewidth": 4},
    )

# Legend labels are positional, so they must follow the plotting order above.
plt.legend(["Original data", "Imputed Data"], prop={"size": 16}, title="knn imputation")
# Call these as separate statements: joining them into a tuple expression
# makes the notebook echo `(None, None)` as the cell result.
plt.tight_layout()
plt.show()

## Regression imputation

In [None]:
# Fit a linear regression that predicts Quaternion_2 from the other three
# quaternion columns, then predict it for the rows where it is missing.
predictors = ["Quaternion_1", "Quaternion_3", "Quaternion_4"]

# Training rows: every column observed.
train = data.dropna(axis=0, how="any")

# Rows to impute: all predictors observed, target missing.
predictors_present = data[predictors].notnull().all(axis=1)
target_missing = data["Quaternion_2"].isnull()
test = data[predictors_present & target_missing]

x_train = train.drop(columns="Quaternion_2")
y_train = train["Quaternion_2"]
x_test = test.drop(columns="Quaternion_2")

model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# Overlay the observed Quaternion_2 distribution with the regression
# predictions. Note that `y_pred` holds only the values predicted for rows
# where Quaternion_2 was missing, not a full imputed column.
for series in (data["Quaternion_2"], y_pred):
    sns.distplot(
        series,
        hist=True,
        kde=True,
        bins=90,
        hist_kws={"edgecolor": "black"},
        kde_kws={"linewidth": 4},
    )

# Legend labels are positional, so they must follow the plotting order above.
plt.legend(
    ["Original data", "Imputed Data"], prop={"size": 16}, title="regression imputation"
)
# Call these as separate statements: joining them into a tuple expression
# makes the notebook echo `(None, None)` as the cell result.
plt.tight_layout()
plt.show()

## Multiple imputation-mice

In [None]:
# Run impyute's mice on the raw NumPy values of `data` and display the
# returned array; its column 1 is plotted against Quaternion_2 in the next cell.
impute_mice = mice(data.values)
impute_mice

In [None]:
# Overlay the distribution of Quaternion_2 before and after MICE imputation.
# Column 1 of the MICE output corresponds to Quaternion_2, the second column
# selected into `data`.
for series in (data["Quaternion_2"], impute_mice[:, 1]):
    sns.distplot(
        series,
        hist=True,
        kde=True,
        bins=90,
        hist_kws={"edgecolor": "black"},
        kde_kws={"linewidth": 4},
    )

# Legend labels are positional, so they must follow the plotting order above.
plt.legend(["Original data", "Imputed Data"], prop={"size": 16}, title="mice")
# Call these as separate statements: joining them into a tuple expression
# makes the notebook echo `(None, None)` as the cell result.
plt.tight_layout()
plt.show()

## Deep learning method-datawig

In [None]:
# Train a datawig SimpleImputer that predicts Quaternion_2 from the other
# three quaternion columns, then run it on the rows held in `test`.
input_cols = ["Quaternion_1", "Quaternion_3", "Quaternion_4"]
output_cols = "Quaternion_2"

# NOTE: `model` and `y_pred` are rebound here, replacing the linear-regression
# objects created in the regression-imputation cell.
model = datawig.SimpleImputer(
    input_columns=input_cols,
    output_column=output_cols,
    output_path="datawig_model",  # stores model data and metrics
)
# `train` and `test` are the frames built in the regression-imputation cell.
model.fit(train_df=train, num_epochs=50)
y_pred = model.predict(test)

In [None]:
# Display the datawig predictions; the plot in the next cell reads their
# "Quaternion_2_imputed" column.
y_pred

In [None]:
# Overlay the observed Quaternion_2 distribution with the datawig
# predictions. Note that `y_pred` covers only the rows passed to
# `model.predict`, not a full imputed column.
for series in (data["Quaternion_2"], y_pred["Quaternion_2_imputed"]):
    sns.distplot(
        series,
        hist=True,
        kde=True,
        bins=90,
        hist_kws={"edgecolor": "black"},
        kde_kws={"linewidth": 4},
    )

# Legend labels are positional, so they must follow the plotting order above.
plt.legend(["Original data", "Imputed Data"], prop={"size": 16}, title="datawig")
# Call these as separate statements: joining them into a tuple expression
# makes the notebook echo `(None, None)` as the cell result.
plt.tight_layout()
plt.show()