 Data Imputation in the Housing Dataset

In [6]:
import os

import pandas as pd

# Importing the dataset.
# The location can be overridden with the HOUSING_DATA_PATH environment
# variable so the notebook is not tied to one machine's drive layout; when the
# variable is unset, the original path is used unchanged.
file_path = os.environ.get('HOUSING_DATA_PATH', r'E:\data set\HousingData.csv')
House_df = pd.read_csv(file_path)

# Calculate the percentage of missing values for each column:
# count the missing cells per column, divide by the number of rows, scale to %.
missing_percentage = House_df.isna().sum() / len(House_df) * 100

# Display the missing values percentage
print(missing_percentage)


CRIM       3.952569
ZN         3.952569
INDUS      3.952569
CHAS       3.952569
NOX        0.000000
RM         0.000000
AGE        3.952569
DIS        0.000000
RAD        0.000000
TAX        0.000000
PTRATIO    0.000000
B          0.000000
LSTAT      3.952569
MEDV       0.000000
dtype: float64


K-Nearest Neighbors (KNN) Imputation

In [7]:

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Replacing with mode.
# CHAS is filled with its most frequent value rather than a neighbour average
# (presumably because it is a categorical indicator - confirm against the data
# dictionary). The fill is written back to House_df because later cells read
# House_df again and expect CHAS to be complete.
House_df['CHAS'] = House_df['CHAS'].fillna(House_df['CHAS'].mode()[0])

# Feature matrix: every column except MEDV. Selecting by name instead of by
# position (0:13) keeps the selection correct if the column order changes.
X = House_df.drop(columns=['MEDV'])

# KNNImputer picks neighbours with a NaN-aware Euclidean distance computed on
# the values it is given, so columns measured on a large scale would dominate
# columns measured on a small one. Standardising first gives every feature a
# comparable weight. StandardScaler ignores NaNs when fitting and leaves them
# in place when transforming, so the missing cells reach the imputer intact.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Creating a KNN imputer with k=5
# k=5 refers to the number of nearest neighbors to consider when imputing missing values.
imputer = KNNImputer(n_neighbors=5)

# Fitting the imputer on the scaled data, then mapping the result back to the
# original units of each column.
X_imputed = scaler.inverse_transform(imputer.fit_transform(X_scaled))

# Converting the result back to a DataFrame for easier inspection.
# The original index is kept so rows stay aligned with House_df.
knn_filled = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)

# Only the missing cells take the imputed value; observed cells are kept
# exactly as loaded, because scaling followed by inverse scaling can introduce
# floating-point round-off. Casting to float64 keeps the all-float dtype that
# the imputer output has.
X_imputed_df = X.astype('float64').fillna(knn_filled)
X_imputed = X_imputed_df.to_numpy()

# Distinct name for the KNN result, so that reusing the generic X_imputed_df
# name for another imputation method does not discard it.
X_knn_imputed_df = X_imputed_df.copy()

# verification
assert not X_imputed_df.isna().any().any(), "KNN imputation left missing values"
X_imputed_df.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
dtype: int64

Multiple Imputation by Chained Equations (MICE):

In [9]:

# The first import is required for its side effect: it enables the still
# experimental IterativeImputer, and must run before the second import.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer


# Selecting features for imputation: every column except MEDV, chosen by name
# rather than by position so a change in column order cannot shift the slice.
# CHAS is filled with its mode here, on a copy, so this cell gives the same
# result whether or not an earlier cell already filled it in House_df; without
# the fill, the imputer would regress CHAS and produce fractional values.
X = (
    House_df
    .drop(columns=['MEDV'])
    .assign(CHAS=lambda frame: frame['CHAS'].fillna(frame['CHAS'].mode()[0]))
)

# Initializing the Iterative Imputer (MICE)
# max_iter: number of imputation rounds.
# random_state: fixed seed so the imputation is reproducible.
# min_value / max_value: per-column bounds taken from the observed data
# (NaNs are skipped by min()/max()). The regression-based estimates are
# otherwise unbounded and can fall outside the range seen in a column,
# e.g. below zero for a column that is never negative.
mice_imputer = IterativeImputer(
    max_iter=10,
    random_state=0,
    min_value=X.min().to_numpy(),
    max_value=X.max().to_numpy(),
)

# Fitting the imputer and transform the dataset
X_imputed = mice_imputer.fit_transform(X)

# Converting the result back to a DataFrame for easier inspection.
# The original index is kept so rows stay aligned with House_df.
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)

# Distinct name for the MICE result, so it can be compared with the output of
# other imputation methods that also use the generic X_imputed_df name.
X_mice_imputed_df = X_imputed_df.copy()

# verification
assert not X_imputed_df.isna().any().any(), "MICE imputation left missing values"
X_imputed_df.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
dtype: int64