## Data Cleaning & Preprocessing

### Import Libraries

In [88]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OrdinalEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer

### Load Dataset

In [105]:
# Read the raw dataset from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='./Data.csv')

In [106]:
# Show the freshly loaded frame in full (10 rows; Age and Salary each hold one NaN).
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [53]:
# Pairwise (Pearson) correlation between the numeric columns.
# Country and Purchased hold strings; pandas >= 2.0 raises on non-numeric
# columns instead of silently dropping them, so select the numeric columns
# explicitly. This form also behaves identically on older pandas releases.
df.select_dtypes(include='number').corr()

Unnamed: 0,Age,Salary
Age,1.0,0.982495
Salary,0.982495,1.0


In [54]:
# Count the missing values in each column.
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [67]:
# Imputer that replaces every NaN with the constant value 0.
Imputer = SimpleImputer(
    strategy='constant',
    fill_value=0,
    missing_values=np.nan,
)

In [68]:
# Fit the imputer on columns 1-2 (Age, Salary) and write the filled values
# back into the same positions of df (in-place mutation of the frame).
imputed = Imputer.fit_transform(df.iloc[:, 1:3])
df.iloc[:, 1:3] = imputed

In [69]:
# Show the frame after the constant fill; the formerly missing cells now read 0.0.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,0.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,0.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [29]:
# Imputer that replaces every NaN with the mean of its column.
# NOTE(review): this rebinds `Imputer`, discarding the constant-fill imputer
# defined in an earlier cell.
Imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)

In [36]:
# Fit the current imputer on columns 1-2 (Age, Salary) and write the filled
# values back into df in place.
# NOTE(review): if the constant-fill cell has already run on this same df, no
# NaN is left and this step changes nothing; the output shown for this cell
# implies df was reloaded first. Confirm the intended execution order.
imputed = Imputer.fit_transform(df.iloc[:, 1:3])
df.iloc[:, 1:3] = imputed

In [37]:
# Show the frame after mean imputation of Age and Salary.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [71]:
# Rescale Age and Salary (columns 1-2) with a default MinMaxScaler and store
# the result back into df in place.
# NOTE(review): the displayed result has 0.0 in the previously missing cells,
# i.e. the scaler was fitted on zero-filled data, so 0 became the column
# minimum. Confirm that is intended.
MinMax = MinMaxScaler()
scaled = MinMax.fit_transform(df.iloc[:, 1:3])
df.iloc[:, 1:3] = scaled

In [72]:
# Show the frame after min-max scaling of Age and Salary.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.88,0.86747,No
1,Spain,0.54,0.578313,Yes
2,Germany,0.6,0.650602,No
3,Spain,0.76,0.73494,No
4,Germany,0.8,0.0,Yes
5,France,0.7,0.698795,Yes
6,Spain,0.0,0.626506,No
7,France,0.96,0.951807,Yes
8,Germany,1.0,1.0,No
9,France,0.74,0.807229,Yes


In [73]:
# Standardize Age and Salary (columns 1-2) with a default StandardScaler and
# store the result back into df in place.
# NOTE(review): in document order this runs on the already min-max-scaled
# values, not on the raw ones.
St = StandardScaler()
standardized = St.fit_transform(df.iloc[:, 1:3])
df.iloc[:, 1:3] = standardized

In [74]:
# Show the frame after standardization of Age and Salary.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.673262,0.66197,No
1,Spain,-0.58448,-0.4262,Yes
2,Germany,-0.362526,-0.154157,No
3,Spain,0.229353,0.163225,No
4,Germany,0.377323,-2.602539,Yes
5,France,0.007398,0.027204,Yes
6,Spain,-2.58207,-0.244838,No
7,France,0.969201,0.979353,Yes
8,Germany,1.117171,1.160714,No
9,France,0.155368,0.435268,Yes


In [75]:
# Skewness of each numeric column.
# df still contains string columns (Country, Purchased); pandas >= 2.0 raises
# on them rather than dropping them, so restrict to numeric columns first.
df.select_dtypes(include='number').skew()

Age      -1.753019
Salary   -1.760196
dtype: float64

In [76]:
# Kurtosis of each numeric column.
# df still contains string columns (Country, Purchased); pandas >= 2.0 raises
# on them rather than dropping them, so restrict to numeric columns first.
df.select_dtypes(include='number').kurtosis()

Age       4.014398
Salary    4.301502
dtype: float64

In [77]:
# Replace Country (column 0) and Purchased (column 3) with ordinal codes,
# writing the encoded values back into df in place.
# NOTE(review): Country has no natural order, so ordinal codes impose an
# arbitrary ranking. Confirm this is only meant as a demonstration.
Ordinal = OrdinalEncoder()
ordinal_codes = Ordinal.fit_transform(df.iloc[:, [0, 3]])
df.iloc[:, [0, 3]] = ordinal_codes

In [107]:
# Inspect the row index of df (a default RangeIndex over the 10 rows).
df.index

RangeIndex(start=0, stop=10, step=1)

In [111]:
# One-hot encode Country (column 0) and Purchased (column 3) into a new frame
# that shares df's row index.
# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and removed in 1.4. Keeping the default (sparse) output and densifying it
# with .toarray() gives the same dense float matrix on every version.
ct = OneHotEncoder()
x = pd.DataFrame(
    ct.fit_transform(df.iloc[:, [0, 3]]).toarray(),
    index=df.index,
)
x

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0
2,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,1.0,1.0,0.0
4,0.0,1.0,0.0,0.0,1.0
5,1.0,0.0,0.0,0.0,1.0
6,0.0,0.0,1.0,1.0,0.0
7,1.0,0.0,0.0,0.0,1.0
8,0.0,1.0,0.0,1.0,0.0
9,1.0,0.0,0.0,0.0,1.0


In [87]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (Country) and pass every other column through
# unchanged; the encoded columns come first in the result.
country_encoder = ('encoder', OneHotEncoder(), [0])
ct = ColumnTransformer(transformers=[country_encoder], remainder='passthrough')

# NOTE(review): `df` is rebound from a DataFrame to a NumPy array here, so any
# later cell that expects DataFrame methods on `df` will fail.
df = np.array(ct.fit_transform(df))
print(df)

[[1.0 0.0 0.0 44.0 72000.0 'No']
 [0.0 0.0 1.0 27.0 48000.0 'Yes']
 [0.0 1.0 0.0 30.0 54000.0 'No']
 [0.0 0.0 1.0 38.0 61000.0 'No']
 [0.0 1.0 0.0 40.0 nan 'Yes']
 [1.0 0.0 0.0 35.0 58000.0 'Yes']
 [0.0 0.0 1.0 nan 52000.0 'No']
 [1.0 0.0 0.0 48.0 79000.0 'Yes']
 [0.0 1.0 0.0 50.0 83000.0 'No']
 [1.0 0.0 0.0 37.0 67000.0 'Yes']]
