### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

### 1) Continuous Data Imputation

In [3]:
# Small numeric example frame: two continuous columns, each with two gaps (NaN).
df_cont = pd.DataFrame(
    dict(
        Age=[25, 27, np.nan, 30, 29, np.nan],
        Salary=[4000, np.nan, 5000, 5500, np.nan, 4500],
    )
)

print("Original Data:")
df_cont

Original Data:


Unnamed: 0,Age,Salary
0,25.0,4000.0
1,27.0,
2,,5000.0
3,30.0,5500.0
4,29.0,
5,,4500.0


#### Univariate Imputation

(i) Mean Imputation

In [4]:
# One imputer object is re-fitted for each column in turn, so every column is
# filled with its own mean.
imputer_mean = SimpleImputer(strategy='mean')

# Keep the raw columns and add the imputed versions next to them.
df = df_cont.copy()
df['Age_mean'] = imputer_mean.fit_transform(df_cont[['Age']])[:, 0]
df['Salary_mean'] = imputer_mean.fit_transform(df_cont[['Salary']])[:, 0]

# df1 is a second name for the same frame; the median section copies from it.
df1 = df
df1

Unnamed: 0,Age,Salary,Age_mean,Salary_mean
0,25.0,4000.0,25.0,4000.0
1,27.0,,27.0,4750.0
2,,5000.0,27.75,5000.0
3,30.0,5500.0,30.0,5500.0
4,29.0,,29.0,4750.0
5,,4500.0,27.75,4500.0


If you want to update the existing DataFrame instead of adding new columns:

M1: Using Inplace

In [5]:
# Start from a fresh copy so df_cont itself is not modified.
df = df_cont.copy()

# Find the issue here
# (Deliberately flawed example: a single scalar passed to DataFrame.fillna is
# used for every column, so missing Salary values are filled with the mean of
# Age as well.)
df.fillna(df["Age"].mean(), inplace=True)
df

Unnamed: 0,Age,Salary
0,25.0,4000.0
1,27.0,27.75
2,27.75,5000.0
3,30.0,5500.0
4,29.0,27.75
5,27.75,4500.0


In [6]:
# Start from a fresh copy so df_cont itself is not modified.
df = df_cont.copy()

# Fill every column with its own mean, in place, through one DataFrame-level call.
# Writing `df["Age"].fillna(..., inplace=True)` instead is chained assignment on a
# temporary Series: pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write enabled (the default from pandas 3.0) `df` is left unchanged.
df.fillna({"Age": df["Age"].mean(), "Salary": df["Salary"].mean()}, inplace=True)
df

Unnamed: 0,Age,Salary
0,25.0,4000.0
1,27.0,4750.0
2,27.75,5000.0
3,30.0,5500.0
4,29.0,4750.0
5,27.75,4500.0


Reassigning to the same column (recommended for smaller datasets)

In [7]:
# Start from a fresh copy so df_cont itself is not modified.
df = df_cont.copy()

# Fill each column with its own median and assign the result back to that column.
for col in ["Age", "Salary"]:
    df[col] = df[col].fillna(df[col].median())
df

Unnamed: 0,Age,Salary
0,25.0,4000.0
1,27.0,4750.0
2,28.0,5000.0
3,30.0,5500.0
4,29.0,4750.0
5,28.0,4500.0


(ii) Median Imputation

In [8]:
# Re-fitted per column, so each column is filled with its own median.
imputer_median = SimpleImputer(strategy='median')

# Build on the frame that already carries the *_mean columns.
df = df1.copy()
df['Age_median'] = imputer_median.fit_transform(df[['Age']])[:, 0]
df['Salary_median'] = imputer_median.fit_transform(df[['Salary']])[:, 0]

print("\nAfter Mean/Median Imputation:\n")
df


After Mean/Median Imputation:



Unnamed: 0,Age,Salary,Age_mean,Salary_mean,Age_median,Salary_median
0,25.0,4000.0,25.0,4000.0,25.0,4000.0
1,27.0,,27.0,4750.0,27.0,4750.0
2,,5000.0,27.75,5000.0,28.0,5000.0
3,30.0,5500.0,30.0,5500.0,30.0,5500.0
4,29.0,,29.0,4750.0,29.0,4750.0
5,,4500.0,27.75,4500.0,28.0,4500.0


(iii) KNN imputation

In [9]:
# KNN imputation with 2 neighbours, fitted on both columns together.
imputer_knn = KNNImputer(n_neighbors=2)
knn_filled = imputer_knn.fit_transform(df_cont[['Age', 'Salary']])

# fit_transform returns a plain array, so the column labels are attached again.
df_knn = pd.DataFrame(knn_filled, columns=['Age_KNN', 'Salary_KNN'])

print("KNN Imputed Data:")
df_knn

KNN Imputed Data:


Unnamed: 0,Age_KNN,Salary_KNN
0,25.0,4000.0
1,27.0,4750.0
2,27.5,5000.0
3,30.0,5500.0
4,29.0,4750.0
5,27.5,4500.0


(iv) MICE Imputation

In [10]:
# Iterative (MICE-style) imputation; random_state is fixed for repeatable results.
imputer_iter = IterativeImputer(random_state=42)
mice_filled = imputer_iter.fit_transform(df_cont[['Age', 'Salary']])

# fit_transform returns a plain array, so the column labels are attached again.
df_mice = pd.DataFrame(mice_filled, columns=['Age_MICE', 'Salary_MICE'])

print("MICE Imputed Data:\n")
df_mice

MICE Imputed Data:



Unnamed: 0,Age_MICE,Salary_MICE
0,25.0,4000.0
1,27.0,4599.430334
2,28.337391,5000.0
3,30.0,5500.0
4,29.0,5199.412889
5,26.670263,4500.0


### 2) Categorical Data Imputation

In [11]:

# Small categorical example frame with missing labels in both columns.
df_cat = pd.DataFrame(
    dict(
        Gender=['Male', 'Female', np.nan, 'Male', np.nan],
        Region=['East', 'West', 'South', np.nan, 'West'],
    )
)

print("Original Data:\n")
df_cat

Original Data:



Unnamed: 0,Gender,Region
0,Male,East
1,Female,West
2,,South
3,Male,
4,,West


Mode imputation

In [12]:

# Replace each missing label with the most frequent value of its column.
imputer_mode = SimpleImputer(strategy='most_frequent')
mode_filled = imputer_mode.fit_transform(df_cat)

# fit_transform returns a plain array, so the original column labels are restored.
df_cat_mode = pd.DataFrame(mode_filled, columns=df_cat.columns)

print("\nAfter Mode Imputation:\n")
df_cat_mode


After Mode Imputation:



Unnamed: 0,Gender,Region
0,Male,East
1,Female,West
2,Male,South
3,Male,West
4,Male,West


Constant imputer

In [13]:
# Replace each missing label with the fixed placeholder 'Unknown'.
imputer_const = SimpleImputer(strategy='constant', fill_value='Unknown')
const_filled = imputer_const.fit_transform(df_cat)

# fit_transform returns a plain array, so the original column labels are restored.
df_cat_const = pd.DataFrame(const_filled, columns=df_cat.columns)

print("After Constant Imputation:\n")
df_cat_const


After Constant Imputation:



Unnamed: 0,Gender,Region
0,Male,East
1,Female,West
2,Unknown,South
3,Male,Unknown
4,Unknown,West


### 3) Mixed-Type Data Imputation

In [14]:

# Example frame mixing two numeric columns with one categorical column,
# each containing missing entries.
df_mixed = pd.DataFrame(
    dict(
        Age=[25, np.nan, 30, 28, np.nan],
        Income=[4000, 4200, np.nan, 5000, 4800],
        Gender=['M', 'F', np.nan, 'M', 'F'],
    )
)
df_mixed

Unnamed: 0,Age,Income,Gender
0,25.0,4000.0,M
1,,4200.0,F
2,30.0,,
3,28.0,5000.0,M
4,,4800.0,F


In [15]:
# Encode Gender numerically (M -> 1, F -> 0) so a numeric imputer can use it.
# Missing entries have no mapping and stay NaN; the float cast keeps them representable.
gender_as_number = df_mixed['Gender'].map({'M': 1, 'F': 0}).astype(float)

# assign() returns a new frame, so df_mixed keeps its original labels.
df_mixed_enc = df_mixed.assign(Gender=gender_as_number)
df_mixed_enc

Unnamed: 0,Age,Income,Gender
0,25.0,4000.0,1.0
1,,4200.0,0.0
2,30.0,,
3,28.0,5000.0,1.0
4,,4800.0,0.0


In [16]:
# Iterative imputation over all three (now numeric) columns at once;
# random_state is fixed for repeatable results.
imputer_mixed = IterativeImputer(random_state=42)
mixed_filled = imputer_mixed.fit_transform(df_mixed_enc)

# fit_transform returns a plain array, so the column labels are restored.
df_mixed_imp = pd.DataFrame(mixed_filled, columns=df_mixed_enc.columns)
df_mixed_imp

Unnamed: 0,Age,Income,Gender
0,25.0,4000.0,1.0
1,25.60114,4200.0,0.0
2,30.0,5665.938632,0.5
3,28.0,5000.0,1.0
4,27.401663,4800.0,0.0


In [17]:
# Turn the numeric Gender codes back into labels: strictly above 0.5 becomes 'M',
# everything else (including exactly 0.5) becomes 'F'.
above_threshold = df_mixed_imp['Gender'] > 0.5
df_mixed_imp['Gender'] = np.where(above_threshold, 'M', 'F')

print("MICE Imputed Mixed Data:")
df_mixed_imp

MICE Imputed Mixed Data:


Unnamed: 0,Age,Income,Gender
0,25.0,4000.0,M
1,25.60114,4200.0,F
2,30.0,5665.938632,F
3,28.0,5000.0,M
4,27.401663,4800.0,F


### 4) Time Series Data Imputation

In [35]:
# Seven consecutive calendar days starting 2024-01-01.
dates = pd.date_range(start='2024-01-01', periods=7)

# Temperature readings with every second observation missing.
df_time = pd.DataFrame(
    dict(
        Date=dates,
        Temp=[30, np.nan, 32, np.nan, 31, np.nan, 29],
    )
)

print("Original Data:")
print(df_time)

Original Data:
        Date  Temp
0 2024-01-01  30.0
1 2024-01-02   NaN
2 2024-01-03  32.0
3 2024-01-04   NaN
4 2024-01-05  31.0
5 2024-01-06   NaN
6 2024-01-07  29.0


In [36]:
# Inspect the dtype of the Date column.
df_time['Date'].dtype

dtype('<M8[ns]')

In [42]:
# Look at a single Date value (row label 0) to see its Python type.
df_time.loc[0, 'Date']

Timestamp('2024-01-01 00:00:00')

Forward fill

In [38]:
print("\nForward Fill:\n")

# Each missing value takes the last observed value above it.
df_time_ffill = df_time.ffill()
df_time_ffill


Forward Fill:



Unnamed: 0,Date,Temp
0,2024-01-01,30.0
1,2024-01-02,30.0
2,2024-01-03,32.0
3,2024-01-04,32.0
4,2024-01-05,31.0
5,2024-01-06,31.0
6,2024-01-07,29.0


Backward fill

In [39]:
print("\nBackward Fill:\n")

# Each missing value takes the next observed value below it.
df_time_bfill = df_time.bfill()
df_time_bfill


Backward Fill:



Unnamed: 0,Date,Temp
0,2024-01-01,30.0
1,2024-01-02,32.0
2,2024-01-03,32.0
3,2024-01-04,31.0
4,2024-01-05,31.0
5,2024-01-06,29.0
6,2024-01-07,29.0


Interpolation

In [21]:
# Linearly interpolate the Temp gaps between neighbouring observations.
# Date has no missing values, so it is carried over unchanged.
temp_interpolated = df_time['Temp'].interpolate(method='linear')
df_time_interp = df_time.assign(Temp=temp_interpolated)

print("Linear Interpolation:\n")
df_time_interp

Linear Interpolation:



Unnamed: 0_level_0,Temp
Date,Unnamed: 1_level_1
2024-01-01,30.0
2024-01-02,31.0
2024-01-03,32.0
2024-01-04,31.5
2024-01-05,31.0
2024-01-06,30.0
2024-01-07,29.0


### Last Resort: Drop missing values

In [22]:
# Keep only the rows with no missing value; dropna() returns a new frame,
# so df_cont itself is not modified.
df = df_cont.dropna()
df

Unnamed: 0,Age,Salary
0,25.0,4000.0
3,30.0,5500.0
