# Import Libraries

In [2]:
import pandas as pd
import numpy as np
# Location of the sample dataset CSV that the rest of the notebook works on.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell only
# runs where the file exists at exactly this location.
DATASET_CSV = r"C:\Users\kumar\Downloads\ML-Projects\Datasets\sample_dataset.csv"

# Read the CSV into the DataFrame used by every later cell.
df = pd.read_csv(DATASET_CSV)

# Identify Categorical & Numerical Features

In [3]:
# Dtypes treated as categorical; every other dtype is treated as numerical.
_non_numeric_dtypes = ["object", "category", "bool"]

# Columns whose dtype is in the list above.
categorical = df.select_dtypes(include=_non_numeric_dtypes)
# The complementary set of columns.
numerical = df.select_dtypes(exclude=_non_numeric_dtypes)

# Data Cleaning for Numerical Variables

#### Simple Imputer

In [9]:
from sklearn.impute import SimpleImputer

In [13]:
# Imputer that fills blank values with the mean of the column they belong to.
# Other strategies include "median" and "most_frequent" (the mode), which fill blanks
# with that particular statistic of the column instead.
cleaner = SimpleImputer(
    strategy="mean",
)

In [14]:
# Fit the mean imputer on the numerical columns and fill their blanks in one step,
# then print the resulting array (the array carries no column labels).
mean_filled = cleaner.fit_transform(numerical)
print(mean_filled)

[[1.40595477e+01 1.03800000e+01 1.22800000e+02 ... 4.60100000e-01
  1.18900000e-01 0.00000000e+00]
 [2.05700000e+01 1.77700000e+01 1.32900000e+02 ... 2.75000000e-01
  8.43631702e-02 0.00000000e+00]
 [1.96900000e+01 2.12500000e+01 1.30000000e+02 ... 3.61300000e-01
  8.75800000e-02 0.00000000e+00]
 ...
 [1.66000000e+01 2.80800000e+01 1.08300000e+02 ... 2.21800000e-01
  7.82000000e-02 0.00000000e+00]
 [2.06000000e+01 2.93300000e+01 1.40100000e+02 ... 4.08700000e-01
  1.24000000e-01 0.00000000e+00]
 [7.76000000e+00 1.93118293e+01 4.79200000e+01 ... 2.87100000e-01
  7.03900000e-02 1.00000000e+00]]


# Data Cleaning for Categorical Features

#### Most Frequent Value

In [15]:
# Imputer that fills blank values with the most frequent value of each column.
cleaner = SimpleImputer(
    strategy="most_frequent",
)

In [19]:
# Fill the categorical blanks with the most frequent value and wrap the result in a
# DataFrame (columns are labelled positionally: 0, 1, ...).
mode_filled = pd.DataFrame(cleaner.fit_transform(categorical))

# Count how often each value occurs after imputation.
mode_filled.value_counts()

0
A    564
B      4
C      1
Name: count, dtype: int64

#### Fill Blanks with a Constant Value

In [23]:
# The constant-fill strategy works for both numerical and categorical features.

In [24]:
# Categorical features: replace every blank with the literal label "undefined".
cleaner = SimpleImputer(strategy="constant", fill_value="undefined")
labelled = pd.DataFrame(cleaner.fit_transform(categorical))

# Count each value; the filled blanks show up under "undefined".
labelled.value_counts()

0        
A            489
undefined     75
B              4
C              1
Name: count, dtype: int64

In [27]:
# Numerical features: replace every blank with the constant 0.
cleaner = SimpleImputer(strategy="constant", fill_value=0)
zero_filled = pd.DataFrame(cleaner.fit_transform(numerical))

# Summary statistics of the zero-filled columns (columns are labelled positionally).
zero_filled.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,11.909845,16.698453,82.980703,468.530053,0.065568,0.088181,0.072572,0.032974,0.150161,0.055472,...,18.625677,86.763638,804.32935,0.109446,0.213882,0.238741,0.101418,0.210258,0.069685,0.627417
std,6.002666,7.749625,35.685316,424.990186,0.047081,0.061993,0.083146,0.039049,0.073018,0.021029,...,12.567502,52.147137,618.429622,0.054996,0.174935,0.214909,0.073334,0.142626,0.036234,0.483918
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10.49,14.74,71.49,0.0,0.0,0.05073,0.001487,0.0,0.1487,0.05628,...,0.0,69.86,455.7,0.1021,0.09358,0.07161,0.04815,0.0,0.06589,0.0
50%,12.72,18.14,84.07,449.3,0.08677,0.08061,0.04209,0.0207,0.172,0.06066,...,21.96,89.04,628.5,0.1263,0.1854,0.1856,0.08704,0.2572,0.07676,1.0
75%,15.06,21.46,102.4,664.7,0.1015,0.123,0.1115,0.05397,0.1934,0.0654,...,27.68,114.2,993.6,0.1432,0.3064,0.366,0.1561,0.3038,0.08839,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.2906,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


# Cleaning Data with KNN Blank Filling

In [28]:
from sklearn.impute import KNNImputer

#### Uniform Weights (Default)

In [30]:
# KNN imputer using 5 neighbours; no `weights` argument is passed, so the library
# default weighting is used.
cleaner = KNNImputer(n_neighbors=5)
knn_filled = pd.DataFrame(cleaner.fit_transform(numerical))

# Summary statistics of the KNN-filled numerical columns.
knn_filled.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.019588,19.289276,91.566678,639.997926,0.096604,0.10368,0.088737,0.047341,0.181407,0.06259,...,25.500373,105.551227,878.034446,0.133747,0.254424,0.266494,0.114627,0.288545,0.084273,0.627417
std,3.372633,4.079819,23.261553,321.123294,0.012599,0.049889,0.076259,0.034051,0.025618,0.00684,...,5.514207,31.814476,567.171199,0.021728,0.149508,0.200573,0.06437,0.057876,0.017185,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.49,54.49,185.2,0.07117,0.03432,0.0,0.0,0.1566,0.05504,0.0
25%,11.71,16.52,75.27,432.0,0.08794,0.06797,0.033732,0.02331,0.165,0.0578,...,21.78,83.9,514.0,0.11961,0.155,0.1211,0.0656,0.2554,0.07351,0.0
50%,13.34,18.94,86.192,551.86,0.096344,0.09462,0.06335,0.033278,0.1793,0.06132,...,25.05,97.464,684.6,0.134,0.2164,0.21456,0.09783,0.27946,0.080886,1.0
75%,15.75,21.48,103.7,761.3,0.1039,0.128,0.1242,0.06759,0.1953,0.06576,...,28.668,120.3,1050.0,0.1471,0.33358,0.3786,0.1625,0.3109,0.091024,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.2906,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


#### Distance-Based Weights

In [31]:
# KNN imputer using 5 neighbours with weights="distance".
cleaner = KNNImputer(
    n_neighbors=5,
    weights="distance",
)
knn_distance_filled = pd.DataFrame(cleaner.fit_transform(numerical))

# Summary statistics, for comparison with the run that passed no `weights` argument.
knn_distance_filled.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.011745,19.301887,91.527132,639.356966,0.096604,0.103896,0.08847,0.047526,0.181566,0.0626,...,25.490294,105.53644,877.565889,0.133727,0.254375,0.266527,0.114529,0.288616,0.084325,0.627417
std,3.372777,4.110971,23.268459,320.94979,0.012815,0.049815,0.076331,0.034094,0.025628,0.006858,...,5.520173,31.934985,567.32561,0.021805,0.14954,0.200249,0.064456,0.057812,0.017219,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.49,54.49,185.2,0.07117,0.03432,0.0,0.0,0.1566,0.05504,0.0
25%,11.71,16.54,75.276516,429.568166,0.08759,0.06779,0.03367,0.02315,0.165453,0.05769,...,21.77,83.74,513.9,0.1194,0.155,0.1206,0.06575,0.256523,0.07343,0.0
50%,13.3,18.89,86.24,551.1,0.09687,0.096219,0.06155,0.03485,0.179624,0.06133,...,25.05,97.33,684.6,0.133843,0.217,0.221764,0.09804,0.2806,0.08113,1.0
75%,15.71,21.54,103.6,766.6,0.1039,0.1279,0.122956,0.06847,0.1953,0.065791,...,28.65,121.2,1050.0,0.1475,0.3345,0.3755,0.1614,0.31,0.09158,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.2906,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


# ColumnTransformers

In [33]:
# Column labels of the numerical and categorical sub-frames, captured together so they
# can be handed to a ColumnTransformer.
numerical_columns, categorical_columns = numerical.columns, categorical.columns
from sklearn.compose import ColumnTransformer

In [35]:

# One transformer that applies a different imputer to each group of columns:
# mean for the numerical columns, most frequent value for the categorical ones.
numeric_step = (
    'numerical features',
    SimpleImputer(strategy="mean"),
    numerical_columns,
)
categorical_step = (
    'categorical features',
    SimpleImputer(strategy="most_frequent"),
    categorical_columns,
)
cleaner = ColumnTransformer([numeric_step, categorical_step])

In [39]:
# Fit the combined transformer on the full frame and wrap the output in a DataFrame
# (columns are labelled positionally).
# NOTE(review): the captured summary reports count/unique/top/freq for every column,
# which suggests the combined output is object-typed. Confirm whether numeric
# statistics were intended here.
imputed = pd.DataFrame(cleaner.fit_transform(df))
imputed.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569
unique,396.0,420.0,473.0,384.0,337.0,460.0,418.0,369.0,375.0,446.0,...,426.0,492.0,358.0,445.0,484.0,435.0,375.0,448.0,2.0,3
top,14.059548,19.311829,92.039025,661.522581,0.097156,0.104531,0.094063,0.049115,0.181405,0.062626,...,107.322848,893.873828,0.133065,0.256748,0.267936,0.116579,0.291797,0.084363,1.0,A
freq,87.0,77.0,56.0,166.0,185.0,89.0,130.0,187.0,98.0,65.0,...,109.0,57.0,101.0,95.0,62.0,74.0,159.0,99.0,357.0,564


#### Select Columns Using a Column Selector

In [40]:
from sklearn.compose import make_column_selector

In [41]:
# Same two-imputer transformer, but the columns are chosen by dtype when the
# transformer is fitted instead of being passed as explicit column lists.
#
# Both selectors use the same dtype list as the categorical/numerical split made
# earlier in the notebook ("object", "category", "bool"). Selecting on "object" alone
# would send "category" and "bool" columns to the mean imputer, which disagrees with
# that split. For a frame whose only non-numeric columns are object dtype, the
# selection is unchanged.
non_numeric_dtypes = ["object", "category", "bool"]

cleaner = ColumnTransformer(
    [
        (
            'numerical features',
            SimpleImputer(strategy="mean"),
            make_column_selector(dtype_exclude=non_numeric_dtypes),
        ),
        (
            'categorical features',
            SimpleImputer(strategy="most_frequent"),
            make_column_selector(dtype_include=non_numeric_dtypes),
        ),
    ]
)

In [42]:
# Fit the selector-based transformer on the full frame and wrap the output in a
# DataFrame (columns are labelled positionally).
# NOTE(review): the captured summary reports count/unique/top/freq for every column,
# which suggests the combined output is object-typed. Confirm whether numeric
# statistics were intended here.
selector_imputed = pd.DataFrame(cleaner.fit_transform(df))
selector_imputed.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569
unique,396.0,420.0,473.0,384.0,337.0,460.0,418.0,369.0,375.0,446.0,...,426.0,492.0,358.0,445.0,484.0,435.0,375.0,448.0,2.0,3
top,14.059548,19.311829,92.039025,661.522581,0.097156,0.104531,0.094063,0.049115,0.181405,0.062626,...,107.322848,893.873828,0.133065,0.256748,0.267936,0.116579,0.291797,0.084363,1.0,A
freq,87.0,77.0,56.0,166.0,185.0,89.0,130.0,187.0,98.0,65.0,...,109.0,57.0,101.0,95.0,62.0,74.0,159.0,99.0,357.0,564
