## Univariate Numerical Imputation

In [None]:
import numpy as np
import pandas as pd

### Mean Imputation

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
data = {
    'Value': [5, 7, np.nan, 10, np.nan, 6]
}

df = pd.DataFrame(data)
df.head(6)

In [None]:
imputer = SimpleImputer(strategy='mean')

df['Transformed_Value'] = imputer.fit_transform(df[['Value']])

In [None]:
# Show all six rows (head() defaults to five and would hide the last one).
df.head(6)

### Median Imputation

In [None]:
data = {
    'Value': [5, 7, np.nan, 10, np.nan, 6]
}

df = pd.DataFrame(data)
df.head(6)


In [None]:
imputer = SimpleImputer(strategy='median')

df['Transformed_Value'] = imputer.fit_transform(df[['Value']])

In [None]:
df.head(6)

### MODE Imputation

In [None]:
data = {
    'X': [5, 7, np.nan, 7, np.nan, 6,7]
}

df = pd.DataFrame(data)
df

In [None]:
imputer = SimpleImputer(strategy='most_frequent')
df['X_transformed'] = imputer.fit_transform(df[['X']])

In [None]:
df

### Custom/Constant value Imputation

In [None]:
data = {
    'X': [5, 7, np.nan, 10, np.nan, 6]
}

df = pd.DataFrame(data)
df

In [None]:
imputer = SimpleImputer(strategy='constant', fill_value=2)

df['X_transformed'] = imputer.fit_transform(df[['X']])

In [None]:
df

### Forward/Backward Fill Imputation

In [None]:
### Forward Fill

In [None]:
data = {
    'X': [5, None, 7, None, 10, None]
}

df = pd.DataFrame(data)
df

In [None]:
# Propagate the last observed value forward into each gap.
# Series.ffill() replaces fillna(method='ffill'), which is deprecated in pandas 2.1+.
df["Forward_Fill"] = df["X"].ffill()

In [None]:
df

In [None]:
### Backward Fill

In [None]:
# Pull the next observed value backward into each gap; a trailing gap with no
# later observation stays NaN.
# Series.bfill() replaces fillna(method='bfill'), which is deprecated in pandas 2.1+.
df["Backward_Fill"] = df["X"].bfill()

In [None]:
df

### Interpolation Imputation

In [None]:
data = {
    'X': [5, None, 7, None, 10, None, 15]
}

df = pd.DataFrame(data)
df

In [None]:
df["X_transformed"] = df["X"].interpolate(method='linear')

In [None]:
df

### Moving Average Imputation 

In [None]:
data = {
    'X': [5, 8, np.nan, 12, 15, np.nan, 20, 22, np.nan, 25, 28, 30]
}

df = pd.DataFrame(data)
df

In [None]:
# Fill each gap with the trailing 3-row rolling mean computed at that position.
# min_periods=1 lets the window yield a value even though it contains the NaN
# being filled (NaNs are skipped when averaging the window).
df['X_filled'] = df['X'].fillna(df['X'].rolling(window=3, min_periods=1).mean())

In [None]:
df

### End of Distribution Imputation

In [None]:
data = {
    'X': [10, 15, 20, 25, 30, 35, np.nan, 45, np.nan, 55]
}

df = pd.DataFrame(data)
df

In [None]:
P_low = df['X'].quantile(0.1)  # 10th percentile
P_high = df['X'].quantile(0.9)  # 90th percentile

In [None]:
P_low , P_high

In [None]:
# Fill the missing entries with the median of the observed values.
# NOTE(review): this section is titled end-of-distribution imputation, which
# would normally fill gaps with a tail value (e.g. P_high) rather than the
# median — confirm which behaviour is intended.
df['X_imputed'] = df['X'].fillna(df['X'].median())


In [None]:
df

In [None]:
# Cap the low tail: rows whose original X is below the 10th percentile are
# set to P_low in the imputed column.
df.loc[df['X'] < P_low, 'X_imputed'] = P_low

# Cap the high tail: rows whose original X is above the 90th percentile are
# set to P_high. Rows where X is NaN match neither comparison, so the values
# already filled into X_imputed are left as they are.
df.loc[df['X'] > P_high, 'X_imputed'] = P_high

In [None]:
df

## Univariate Categorical Imputation

### Mode/Frequent Category Imputation

In [None]:
data = {
    'X': ["yes", "yes", np.nan, "no", np.nan, "yes","no"]
}

df = pd.DataFrame(data)
df

In [None]:
mode = df['X'].mode()[0]

In [None]:
mode

In [None]:
# Replace missing categories with the most frequent one.
# (Column name spelling fixed: "X_transfromed" -> "X_transformed", matching
# the other sections of the notebook.)
df["X_transformed"] = df['X'].fillna(mode)

In [None]:
df

### Custom Imputation

In [None]:
data = {
    'X': ["yes", "yes", np.nan, "no", np.nan, "yes","no"]
}

df = pd.DataFrame(data)
df

In [None]:
df["X_transformed"] = df["X"].fillna("missing")

In [None]:
df

### HOT DECK Imputation

In [None]:
data = {
    'Fruit': ['Apple', 'Banana', 'Apple', 'Orange', 'Banana', np.nan, 'Orange', 'Apple', np.nan, 'Banana']
}

df = pd.DataFrame(data)
df

In [None]:
missing_indices = df[df['Fruit'].isna()].index

In [None]:
missing_indices

In [None]:
non_missing_values = df['Fruit'].dropna().values

In [None]:
non_missing_values

In [None]:
len(missing_indices)

In [None]:
# Seed NumPy's global RNG so the sampled donor values are the same on every run.
np.random.seed(42)

# Draw one donor value per missing row (sampling with replacement) from the
# observed values. non_missing_values keeps duplicates, so categories that
# occur more often are proportionally more likely to be drawn.
imputed_values = np.random.choice(non_missing_values, size=len(missing_indices))

In [None]:
imputed_values

In [None]:
df.loc[missing_indices, 'Fruit'] = imputed_values

In [None]:
df

### Proxy Variable Imputation

In [None]:
data = {
    'Fruit': ['Apple', 'Banana', 'Apple', 'Orange', 'Banana', np.nan, 'Orange', 'Apple', np.nan, 'Banana'],
    'Color': ['Red', 'Yellow', 'Red', 'Orange', 'Yellow', 'Red', 'Orange', 'Red', 'Yellow', 'Yellow']
}

df = pd.DataFrame(data)
df

In [None]:
missing_indices = df[df['Fruit'].isna()].index

In [None]:
missing_indices 

In [None]:
def _most_common(values):
    """Return the most frequent non-missing value, or NaN if there is none."""
    modes = values.mode()  # mode() ignores NaN; empty when every value is missing
    return modes.iloc[0] if not modes.empty else np.nan


# Calculate mode of Fruit for each Color category. A color whose fruits are
# all missing maps to NaN instead of raising IndexError.
fruit_mode_by_color = df.groupby('Color')['Fruit'].agg(_most_common)

In [None]:
fruit_mode_by_color

In [None]:
# Impute missing values using the proxy variable: look up each missing row's
# Color in the per-color mode table and write the result back to Fruit.
# Series.map does the lookup for all rows at once; a color that is absent from
# the table yields NaN (the row stays missing) rather than raising KeyError.
df.loc[missing_indices, 'Fruit'] = df.loc[missing_indices, 'Color'].map(fruit_mode_by_color)

In [None]:
df

## Multivariate Imputation

### MICE 

In [None]:
!pip scikit-learn fancyimpute

In [1]:
import pandas as pd
import numpy as np
from fancyimpute import IterativeImputer

In [None]:
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [3, np.nan, 5, np.nan, 7],
    'C': [np.nan, 2, 3, 4, np.nan],
    'D': [1, np.nan, 3, np.nan, 5]
}

df = pd.DataFrame(data)
df

In [None]:
mice_imputer = IterativeImputer()

In [None]:
imputed_data = mice_imputer.fit_transform(df)

In [None]:
imputed_data

In [None]:
df_imputed = pd.DataFrame(imputed_data, columns=df.columns)
df_imputed

### KNN Imputation

In [None]:
from fancyimpute import KNN

In [None]:
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [3, np.nan, 5, np.nan, 7],
    'C': [np.nan, 2, 3, 4, np.nan],
    'D': [1, np.nan, 3, np.nan, 5]
}

df = pd.DataFrame(data)
df

In [None]:
knn_imputer = KNN()

In [None]:
imputed_data = knn_imputer.fit_transform(df)

In [None]:
imputed_data

In [None]:
df_imputed = pd.DataFrame(imputed_data, columns=df.columns)
df_imputed

In [None]:
# --- The same KNN imputation, done with scikit-learn's KNNImputer ---

from sklearn.impute import KNNImputer

# Each missing entry is estimated from the 3 nearest rows.
knn_imputer = KNNImputer(n_neighbors=3)

# fit_transform hands back a bare array, so wrap it straight into a DataFrame
# that carries the original column labels.
df_imputed = pd.DataFrame(knn_imputer.fit_transform(df), columns=df.columns)


### Regression Imputation

In [None]:
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [3, np.nan, 5, np.nan, 7],
    'C': [np.nan, 2, 3, 4, np.nan],
    'D': [1, np.nan, 3, np.nan, 5]
}

df = pd.DataFrame(data)
df

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer 

In [None]:
def regression_imputation(df, target_column):
    """Fill missing values of ``target_column`` with linear-regression predictions.

    A ``LinearRegression`` model is fitted on the rows where ``target_column``
    is observed, using every other column as a predictor, and then predicts
    the rows where it is missing. Gaps in the predictor columns are mean-filled
    first, with the means learned from the training rows only.

    Args:
        df: DataFrame to impute. It is modified in place.
        target_column: Name of the column whose missing values are filled.

    Returns:
        The same ``df`` object with ``target_column`` imputed. When the column
        has no missing values, ``df`` is returned unchanged.
    """
    missing_mask = df[target_column].isna()

    # Nothing to impute: return early, because the scikit-learn estimators
    # below reject an empty set of rows to predict.
    if not missing_mask.any():
        return df

    # Predictors are all columns except the target; split them into the rows
    # used for training (target observed) and the rows to predict (target missing).
    predictors = df.drop(columns=[target_column])
    X_train = predictors[~missing_mask]
    X_test = predictors[missing_mask]
    y_train = df.loc[~missing_mask, target_column]

    # The predictors may contain NaNs themselves; fill them with column means
    # fitted on the training rows and reuse those means for the rows to predict.
    imputer = SimpleImputer(strategy='mean')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    # Fit the regression model and predict the missing target values.
    model = LinearRegression()
    model.fit(X_train_imputed, y_train)
    predicted_values = model.predict(X_test_imputed)

    # Write the predictions back into the original DataFrame.
    df.loc[missing_mask, target_column] = predicted_values

    return df

In [None]:
for column in df.columns:
    df = regression_imputation(df, column)

In [None]:
df

### Random Forest Imputation

In [None]:
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [3, np.nan, 5, np.nan, 7],
    'C': [np.nan, 2, 3, 4, np.nan],
    'D': [1, np.nan, 3, np.nan, 5]
}

df = pd.DataFrame(data)
df

In [None]:
from sklearn.ensemble import RandomForestRegressor

def random_forest_imputation(df, target_column, random_state=None):
    """Fill missing values of ``target_column`` with random-forest predictions.

    A ``RandomForestRegressor`` is fitted on the rows where ``target_column``
    is observed, using every other column as a predictor, and then predicts
    the rows where it is missing. Gaps in the predictor columns are mean-filled
    first, with the means learned from the training rows only.

    Args:
        df: DataFrame to impute. It is modified in place.
        target_column: Name of the column whose missing values are filled.
        random_state: Seed forwarded to ``RandomForestRegressor``. Pass an int
            to get the same imputed values on every run; the default ``None``
            keeps the original unseeded behaviour.

    Returns:
        The same ``df`` object with ``target_column`` imputed. When the column
        has no missing values, ``df`` is returned unchanged.
    """
    missing_mask = df[target_column].isna()

    # Nothing to impute: return early, because the scikit-learn estimators
    # below reject an empty set of rows to predict.
    if not missing_mask.any():
        return df

    # Predictors are all columns except the target; split them into the rows
    # used for training (target observed) and the rows to predict (target missing).
    predictors = df.drop(columns=[target_column])
    X_train = predictors[~missing_mask]
    X_test = predictors[missing_mask]
    y_train = df.loc[~missing_mask, target_column]

    # The predictors may contain NaNs themselves; fill them with column means
    # fitted on the training rows and reuse those means for the rows to predict.
    imputer = SimpleImputer(strategy='mean')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    # Fit the forest and predict the missing target values.
    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train_imputed, y_train)
    predicted_values = model.predict(X_test_imputed)

    # Write the predictions back into the original DataFrame.
    df.loc[missing_mask, target_column] = predicted_values

    return df

In [None]:
for column in df.columns:
    df = random_forest_imputation(df, column)

In [None]:
df

## MISSING INDICATORS

In [2]:
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [3, np.nan, 5, np.nan, 7],
    'C': [np.nan, 2, 3, 4, np.nan],
    'D': [1, np.nan, 3, np.nan, 5]
}

df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C,D
0,1.0,3.0,,1.0
1,2.0,,2.0,
2,,5.0,3.0,3.0
3,4.0,,4.0,
4,5.0,7.0,,5.0


In [3]:
from sklearn.impute import SimpleImputer, MissingIndicator

In [4]:
# features='all' emits one indicator column per input column. The default
# ('missing-only') drops columns that contain no missing values, which would
# break the one-name-per-df-column labelling applied to this array afterwards.
indicator = MissingIndicator(features='all')
missing_indicators = indicator.fit_transform(df)

In [5]:
missing_indicators

array([[False, False,  True, False],
       [False,  True, False,  True],
       [ True, False, False, False],
       [False,  True, False,  True],
       [False, False,  True, False]])

In [6]:
missing_indicator_df = pd.DataFrame(missing_indicators, columns=[f'{col}_missing' for col in df.columns])

In [7]:
missing_indicator_df

Unnamed: 0,A_missing,B_missing,C_missing,D_missing
0,False,False,True,False
1,False,True,False,True
2,True,False,False,False
3,False,True,False,True
4,False,False,True,False


In [8]:
missing_indicator_df = missing_indicator_df.astype(int)

In [9]:
missing_indicator_df

Unnamed: 0,A_missing,B_missing,C_missing,D_missing
0,0,0,1,0
1,0,1,0,1
2,1,0,0,0
3,0,1,0,1
4,0,0,1,0


In [11]:
imputer = SimpleImputer(strategy='mean')
imputed_data = imputer.fit_transform(df)

In [12]:
df_imputed = pd.DataFrame(imputed_data, columns=df.columns)

In [13]:
df_imputed

Unnamed: 0,A,B,C,D
0,1.0,3.0,3.0,1.0
1,2.0,5.0,2.0,3.0
2,3.0,5.0,3.0,3.0
3,4.0,5.0,4.0,3.0
4,5.0,7.0,3.0,5.0


In [14]:
df_final = pd.concat([df_imputed, missing_indicator_df], axis=1)

In [15]:
df_final

Unnamed: 0,A,B,C,D,A_missing,B_missing,C_missing,D_missing
0,1.0,3.0,3.0,1.0,0,0,1,0
1,2.0,5.0,2.0,3.0,0,1,0,1
2,3.0,5.0,3.0,3.0,1,0,0,0
3,4.0,5.0,4.0,3.0,0,1,0,1
4,5.0,7.0,3.0,5.0,0,0,1,0
