# **6. How to handle missing values in machine learning? Write the code.**

In [20]:
# Importing necessary libraries.
import numpy as np
import pandas as pd

In [28]:
# Read the raw CSV, discard its "Number" column, and preview the first ten
# rows of the frame that every later cell copies from.
DF = pd.read_csv("/content/Anaemic_Dataset.csv")
original_DF = DF.drop("Number", axis=1)
original_DF.head(10)

Unnamed: 0,Sex,%Red Pixel,%Green pixel,%Blue pixel,Hb,Anaemic
0,M,43.2555,30.8421,25.9025,6.3,Yes
1,F,45.6033,28.19,,13.5,No
2,F,45.0107,28.9677,26.0215,11.7,No
3,F,44.5398,28.9899,26.4703,,
4,M,,30.6972,26.0158,12.4,No
5,,45.0994,27.9645,26.9361,16.2,No
6,F,43.1457,30.1628,26.6915,8.6,Yes
7,F,43.6103,29.1099,27.2798,10.3,No
8,,45.0423,,25.7918,13.0,No
9,F,46.5143,27.4282,26.0575,9.7,Yes


In [29]:
# In this dataset (0-based column positions):
#   Categorical columns: 0 (Sex), 5 (Anaemic)
#   Numerical columns: 1 (%Red Pixel), 2 (%Green pixel), 3 (%Blue pixel), 4 (Hb)

In [30]:
# Filling/Replacing/Removing null values in Categorical attributes.

# We can fill/remove null values by:
# 1. Filling a null value with the previous value in the attribute (forward fill).
# 2. Filling a null value with the successor value in the attribute (backward fill).
# 3. Filling a null value with the mode of the attribute.
# 4. Dropping null values.

In [31]:
# Forward fill on the categorical columns: every missing 'Sex' / 'Anaemic'
# entry takes the nearest preceding non-null value of the same column.
# A missing value in the very first row has no predecessor and stays null.
# ffill() is used instead of fillna(method='pad') because the `method`
# argument of fillna is deprecated in pandas >= 2.1.
DF1 = original_DF.copy()
DF1[['Sex', 'Anaemic']] = DF1[['Sex', 'Anaemic']].ffill()
DF1.head(10)

Unnamed: 0,Sex,%Red Pixel,%Green pixel,%Blue pixel,Hb,Anaemic
0,M,43.2555,30.8421,25.9025,6.3,Yes
1,F,45.6033,28.19,,13.5,No
2,F,45.0107,28.9677,26.0215,11.7,No
3,F,44.5398,28.9899,26.4703,,No
4,M,,30.6972,26.0158,12.4,No
5,M,45.0994,27.9645,26.9361,16.2,No
6,F,43.1457,30.1628,26.6915,8.6,Yes
7,F,43.6103,29.1099,27.2798,10.3,No
8,F,45.0423,,25.7918,13.0,No
9,F,46.5143,27.4282,26.0575,9.7,Yes


In [32]:
# Backward fill on the categorical columns: every missing 'Sex' / 'Anaemic'
# entry takes the nearest following non-null value of the same column.
# A missing value in the very last row has no successor and stays null.
# bfill() is used instead of fillna(method='bfill') because the `method`
# argument of fillna is deprecated in pandas >= 2.1.
DF2 = original_DF.copy()
DF2[['Sex', 'Anaemic']] = DF2[['Sex', 'Anaemic']].bfill()
DF2.head(10)

Unnamed: 0,Sex,%Red Pixel,%Green pixel,%Blue pixel,Hb,Anaemic
0,M,43.2555,30.8421,25.9025,6.3,Yes
1,F,45.6033,28.19,,13.5,No
2,F,45.0107,28.9677,26.0215,11.7,No
3,F,44.5398,28.9899,26.4703,,No
4,M,,30.6972,26.0158,12.4,No
5,F,45.0994,27.9645,26.9361,16.2,No
6,F,43.1457,30.1628,26.6915,8.6,Yes
7,F,43.6103,29.1099,27.2798,10.3,No
8,F,45.0423,,25.7918,13.0,No
9,F,46.5143,27.4282,26.0575,9.7,Yes


In [33]:
# Mode imputation for the categorical columns: each missing value is replaced
# by the most frequent value of its column.
# mode() returns a Series (several values can tie); [0] takes its first entry.
sex_mode = original_DF['Sex'].mode()[0]
anaemic_mode = original_DF['Anaemic'].mode()[0]

DF3 = original_DF.copy()
# fillna() is the dedicated missing-value API, so it is used here rather than
# a generic replace() keyed on np.nan.
DF3['Sex'] = DF3['Sex'].fillna(sex_mode)
DF3['Anaemic'] = DF3['Anaemic'].fillna(anaemic_mode)
DF3.head(10)

Unnamed: 0,Sex,%Red Pixel,%Green pixel,%Blue pixel,Hb,Anaemic
0,M,43.2555,30.8421,25.9025,6.3,Yes
1,F,45.6033,28.19,,13.5,No
2,F,45.0107,28.9677,26.0215,11.7,No
3,F,44.5398,28.9899,26.4703,,No
4,M,,30.6972,26.0158,12.4,No
5,M,45.0994,27.9645,26.9361,16.2,No
6,F,43.1457,30.1628,26.6915,8.6,Yes
7,F,43.6103,29.1099,27.2798,10.3,No
8,M,45.0423,,25.7918,13.0,No
9,F,46.5143,27.4282,26.0575,9.7,Yes


In [34]:
# Dropping null values: keep only the rows that have no missing value in any
# column. dropna() returns a new DataFrame and leaves original_DF untouched,
# so no explicit copy is needed beforehand.
DF4 = original_DF.dropna()
DF4.head(10)

Unnamed: 0,Sex,%Red Pixel,%Green pixel,%Blue pixel,Hb,Anaemic
0,M,43.2555,30.8421,25.9025,6.3,Yes
2,F,45.0107,28.9677,26.0215,11.7,No
6,F,43.1457,30.1628,26.6915,8.6,Yes
7,F,43.6103,29.1099,27.2798,10.3,No
9,F,46.5143,27.4282,26.0575,9.7,Yes
10,F,45.3506,29.1248,25.5246,12.6,No
11,F,44.4062,28.9298,26.664,15.4,No
12,F,44.9642,30.5279,24.5079,4.8,Yes
13,M,45.0484,31.1049,23.8467,9.0,Yes
15,M,45.5842,28.7311,25.6848,14.0,No


In [35]:
# Filling/Replacing/Removing null values in Numerical attributes.

# We can fill/remove null values by:
# 1. Mean/Median/Mode imputation.
# 2. Dropping null values (already shown above for categorical)

In [39]:
# Statistical imputation of the numerical columns on a copy of the original
# frame; the categorical columns are not touched in this cell.
DF5 = original_DF.copy()

# Each filled column is assigned back to DF5 instead of calling
# fillna(..., inplace=True) on the column selection. The in-place form is a
# chained assignment: pandas 2.x warns about it, and with Copy-on-Write
# enabled it no longer updates DF5 at all.

# Filling the '%Red Pixel' null values with its mean
DF5['%Red Pixel'] = DF5['%Red Pixel'].fillna(DF5['%Red Pixel'].mean())

# Filling the '%Green pixel' null values with its median
DF5['%Green pixel'] = DF5['%Green pixel'].fillna(DF5['%Green pixel'].median())

# Filling the '%Blue pixel' null values with its mode
# (mode() returns a Series; [0] takes its first entry)
DF5['%Blue pixel'] = DF5['%Blue pixel'].fillna(DF5['%Blue pixel'].mode()[0])

# Filling the 'Hb' null values with its median
DF5['Hb'] = DF5['Hb'].fillna(DF5['Hb'].median())

DF5.head(10)

Unnamed: 0,Sex,%Red Pixel,%Green pixel,%Blue pixel,Hb,Anaemic
0,M,43.2555,30.8421,25.9025,6.3,Yes
1,F,45.6033,28.19,26.2399,13.5,No
2,F,45.0107,28.9677,26.0215,11.7,No
3,F,44.5398,28.9899,26.4703,12.6,
4,M,45.69188,30.6972,26.0158,12.4,No
5,,45.0994,27.9645,26.9361,16.2,No
6,F,43.1457,30.1628,26.6915,8.6,Yes
7,F,43.6103,29.1099,27.2798,10.3,No
8,,45.0423,28.882,25.7918,13.0,No
9,F,46.5143,27.4282,26.0575,9.7,Yes


In [37]:
# Filling/Replacing/Removing null values using sklearn library.

# 1. Simple Imputer
# 2. KNN Imputer
# 3. Iterative Imputer

In [38]:
# Importing Imputer libraries.
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [40]:
# SimpleImputer: one imputer per column kind, both applied to a copy of the
# original frame.
DF6 = original_DF.copy()

simple_num_cols = ['%Red Pixel', '%Green pixel', '%Blue pixel', 'Hb']
simple_cat_cols = ['Sex', 'Anaemic']

# Numerical attributes: missing entries are replaced by the column mean.
imputer_num = SimpleImputer(strategy='mean')
DF6[simple_num_cols] = imputer_num.fit_transform(DF6[simple_num_cols])

# Categorical attributes: missing entries are replaced by the most frequent
# value of the column.
imputer_cat = SimpleImputer(strategy='most_frequent')
DF6[simple_cat_cols] = imputer_cat.fit_transform(DF6[simple_cat_cols])

DF6.head(10)

Unnamed: 0,Sex,%Red Pixel,%Green pixel,%Blue pixel,Hb,Anaemic
0,M,43.2555,30.8421,25.9025,6.3,Yes
1,F,45.6033,28.19,25.474758,13.5,No
2,F,45.0107,28.9677,26.0215,11.7,No
3,F,44.5398,28.9899,26.4703,12.142574,No
4,M,45.69188,30.6972,26.0158,12.4,No
5,M,45.0994,27.9645,26.9361,16.2,No
6,F,43.1457,30.1628,26.6915,8.6,Yes
7,F,43.6103,29.1099,27.2798,10.3,No
8,M,45.0423,28.862048,25.7918,13.0,No
9,F,46.5143,27.4282,26.0575,9.7,Yes


In [44]:
# Using KNN Imputer. (only works on numeric columns)
# Only the numerical columns are passed to the imputer (n_neighbors=5); the
# categorical columns are carried over unchanged, so their nulls remain.
DF7 = original_DF.copy()

num_cols = ['%Red Pixel', '%Green pixel', '%Blue pixel', 'Hb']
cat_cols = ['Sex', 'Anaemic']

numeric_data = DF7[num_cols]

imputer = KNNImputer(n_neighbors=5)
imputed_numeric_data = imputer.fit_transform(numeric_data)

# The imputer output is rebuilt into a DataFrame that reuses DF7's row labels.
# Without index=DF7.index the new frame would get a fresh 0..n-1 index, and
# concat(axis=1), which aligns rows on index labels, would mis-pair rows
# whenever DF7's index is not the default range (e.g. after dropping rows).
imputed_numeric_DF = pd.DataFrame(
    imputed_numeric_data, columns=num_cols, index=DF7.index
)
final_DF = pd.concat([DF7[cat_cols], imputed_numeric_DF], axis=1)

final_DF.head(10)

Unnamed: 0,Sex,Anaemic,%Red Pixel,%Green pixel,%Blue pixel,Hb
0,M,Yes,43.2555,30.8421,25.9025,6.3
1,F,No,45.6033,28.19,25.99186,13.5
2,F,No,45.0107,28.9677,26.0215,11.7
3,F,,44.5398,28.9899,26.4703,13.56
4,M,No,44.0212,30.6972,26.0158,12.4
5,,No,45.0994,27.9645,26.9361,16.2
6,F,Yes,43.1457,30.1628,26.6915,8.6
7,F,No,43.6103,29.1099,27.2798,10.3
8,,No,45.0423,29.44476,25.7918,13.0
9,F,Yes,46.5143,27.4282,26.0575,9.7


In [45]:
# Using Iterative Imputer.
# Only the numerical columns (num_cols, defined in the KNN cell) are passed to
# the imputer; the categorical columns (cat_cols) are carried over unchanged,
# so their nulls remain.
DF8 = original_DF.copy()

numeric_data = DF8[num_cols]

iterative_imputer = IterativeImputer()
imputed_numeric_data = iterative_imputer.fit_transform(numeric_data)

# The imputer output is rebuilt into a DataFrame that reuses DF8's row labels.
# Without index=DF8.index the new frame would get a fresh 0..n-1 index, and
# concat(axis=1), which aligns rows on index labels, would mis-pair rows
# whenever DF8's index is not the default range (e.g. after dropping rows).
imputed_numeric_DF = pd.DataFrame(
    imputed_numeric_data, columns=num_cols, index=DF8.index
)
final_DF = pd.concat([DF8[cat_cols], imputed_numeric_DF], axis=1)

final_DF.head(10)

Unnamed: 0,Sex,Anaemic,%Red Pixel,%Green pixel,%Blue pixel,Hb
0,M,Yes,43.2555,30.8421,25.9025,6.3
1,F,No,45.6033,28.19,26.206709,13.5
2,F,No,45.0107,28.9677,26.0215,11.7
3,F,,44.5398,28.9899,26.4703,12.11873
4,M,No,43.286992,30.6972,26.0158,12.4
5,,No,45.0994,27.9645,26.9361,16.2
6,F,Yes,43.1457,30.1628,26.6915,8.6
7,F,No,43.6103,29.1099,27.2798,10.3
8,,No,45.0423,29.165855,25.7918,13.0
9,F,Yes,46.5143,27.4282,26.0575,9.7
