## MICE: Multiple Imputation by Chained Equations

In [6]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Toy dataset: three numeric columns, each with exactly one missing entry.
data = dict(
    A=[1, 2, np.nan, 4, 5],
    B=[np.nan, 2, 3, 4, 5],
    C=[1, 2, 3, np.nan, 5],
)
df = pd.DataFrame(data)

# MICE-style imputer; the seed is fixed so re-running the cell gives the same result.
imputer = IterativeImputer(random_state=42)

# fit_transform returns a plain NumPy array, so the column labels are lost here...
imputed_data = imputer.fit_transform(df)

# ...and are re-attached by wrapping the array in a DataFrame again.
imputed_df = pd.DataFrame(data=imputed_data, columns=df.columns)

# Show the filled-in table and how many imputation rounds the fit performed.
print(imputed_df)
print(f"Number of iterations: {imputer.n_iter_}")

         A         B         C
0  1.00000  0.999988  1.000000
1  2.00000  2.000000  2.000000
2  3.00005  3.000000  3.000000
3  4.00000  4.000000  3.999993
4  5.00000  5.000000  5.000000
Number of iterations: 3


## KNN Imputer: imputes only numerical columns, but can use one-hot-encoded categorical columns as neighbour features

In [11]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder

# Sample dataset with both numerical and categorical features and missing values
data = {
    'Feature A': [1.0, 2.0, np.nan, 4.0],
    'Categorical Feature': ['A', 'B', 'A', np.nan],
    'Feature B': [5.0, np.nan, 1.0, 3.0]
}
df = pd.DataFrame(data)

# Display the original dataset with missing values
print("Original Dataset:")
print(df)

# One-hot encode the categorical feature so it can be passed to the KNN imputer
# alongside the numeric columns.
# NOTE: the missing category is encoded as its own indicator column
# ('Categorical Feature_nan'), i.e. it is flagged rather than imputed.
categorical_columns = ['Categorical Feature']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_features = encoder.fit_transform(df[categorical_columns])
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)

# Build the encoded columns on df's own index. pd.concat(axis=1) aligns on
# index labels, so a fresh default index would silently misalign rows (and
# introduce spurious NaNs) whenever df carries a non-default index.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoded_feature_names,
    index=df.index,
)
df_encoded = pd.concat([df.drop(columns=categorical_columns), encoded_df], axis=1)

# Initialize KNN Imputer with K=2
imputer = KNNImputer(n_neighbors=2)

# Fit and transform the dataset with the KNN imputer (returns a NumPy array)
imputed_data = imputer.fit_transform(df_encoded)

# Convert the result back to a DataFrame, keeping the original row labels
imputed_df = pd.DataFrame(
    imputed_data,
    columns=df_encoded.columns,
    index=df_encoded.index,
)

# Display the imputed dataset
print("\nImputed Dataset:")
print(imputed_df)


Original Dataset:
   Feature A Categorical Feature  Feature B
0        1.0                   A        5.0
1        2.0                   B        NaN
2        NaN                   A        1.0
3        4.0                 NaN        3.0

Imputed Dataset:
   Feature A  Feature B  Categorical Feature_A  Categorical Feature_B  \
0        1.0        5.0                    1.0                    0.0   
1        2.0        3.0                    0.0                    1.0   
2        3.0        1.0                    1.0                    0.0   
3        4.0        3.0                    0.0                    0.0   

   Categorical Feature_nan  
0                      0.0  
1                      0.0  
2                      0.0  
3                      1.0  
