# Data Preprocessing

# I. Importing the libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.impute import SimpleImputer

In [3]:
from sklearn.datasets import make_blobs
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler

In [4]:
from collections import Counter

# 1. Reading and slicing the data

In [5]:
df_dataset = pd.read_csv('datasets/data.csv')
df_dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [6]:
X = df_dataset.iloc[:, :-1]
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [7]:
y = df_dataset.iloc[:, -1]
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

# 2. Handling missing data

In [8]:
df_dataset.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [9]:
df_dataset.isnull().values.any()

True

In [10]:
df_dataset.isnull().sum().sum()

2

In [11]:
missing = ["n/a", "na", "--", "NaN", "NA"]
df_dataset = pd.read_csv('datasets/data.csv', na_values = missing)
df_dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [12]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer

SimpleImputer()

In [13]:
imputer.fit(X.iloc[:, 1:3])

SimpleImputer()

In [14]:
X.iloc[:, 1:3] = imputer.transform(X.iloc[:, 1:3])

# 3. Splitting Data

## 3.1. Splitting the dataset into the Training set and Test set

In [15]:
X, y = make_blobs(n_samples=1000)

In [16]:
pd.DataFrame(X)

Unnamed: 0,0,1
0,-8.202122,-6.364822
1,-9.814522,-6.514044
2,9.466742,4.808716
3,-2.150367,5.047151
4,-9.291543,-4.390995
...,...,...
995,-10.599423,-5.432539
996,-8.303757,-5.988139
997,-2.180413,6.822627
998,-4.676256,4.985052


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(f'X_train shape {X_train.shape}; X_test shape {X_test.shape}; y_train shape {y_train.shape}; y_test shape {y_test.shape}')
pd.DataFrame(X_train)

X_train shape (800, 2); X_test shape (200, 2); y_train shape (800,); y_test shape (200,)


Unnamed: 0,0,1
0,-3.401700,5.187892
1,-9.946863,-4.093212
2,8.293124,6.118191
3,8.991203,6.266633
4,-4.139008,3.103089
...,...,...
795,-2.961278,3.474305
796,-2.625934,5.498991
797,-9.298297,-5.623292
798,-8.805540,-4.399218


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(f'X_train shape {X_train.shape}; X_test shape {X_test.shape}; y_train shape {y_train.shape}; y_test shape {y_test.shape}')
pd.DataFrame(X_train)

X_train shape (800, 2); X_test shape (200, 2); y_train shape (800,); y_test shape (200,)


Unnamed: 0,0,1
0,9.193707,6.982848
1,-3.079198,3.511793
2,8.012787,7.940447
3,-2.268889,4.434616
4,-3.274894,5.393333
...,...,...
795,-2.752465,6.164367
796,-7.310510,-5.393631
797,10.700605,5.379644
798,-8.459788,-5.202653


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(f'X_train shape {X_train.shape}; X_test shape {X_test.shape}; y_train shape {y_train.shape}; y_test shape {y_test.shape}')
pd.DataFrame(X_train)

X_train shape (800, 2); X_test shape (200, 2); y_train shape (800,); y_test shape (200,)


Unnamed: 0,0,1
0,-3.242155,3.256068
1,-2.851977,4.619913
2,8.602233,6.943180
3,9.181630,6.259699
4,9.994575,6.289159
...,...,...
795,7.786241,4.492133
796,7.618063,5.692832
797,-3.770336,7.361217
798,-3.682737,6.075248


As we can see, the data is split randomly. To avoid this, we can use the random_state parameter to make the split reproducible. This is useful when we want to compare different models.

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(f'X_train shape {X_train.shape}; X_test shape {X_test.shape}; y_train shape {y_train.shape}; y_test shape {y_test.shape}')
pd.DataFrame(X_train)

X_train shape (800, 2); X_test shape (200, 2); y_train shape (800,); y_test shape (200,)


Unnamed: 0,0,1
0,-11.180503,-7.069609
1,-9.590609,-5.060153
2,10.491161,8.252008
3,8.199994,7.306120
4,-8.981454,-5.105297
...,...,...
795,9.450058,5.960306
796,-3.811830,6.491310
797,-3.286949,4.776541
798,-10.463077,-6.916611


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(f'X_train shape {X_train.shape}; X_test shape {X_test.shape}; y_train shape {y_train.shape}; y_test shape {y_test.shape}')
pd.DataFrame(X_train)

X_train shape (800, 2); X_test shape (200, 2); y_train shape (800,); y_test shape (200,)


Unnamed: 0,0,1
0,-11.180503,-7.069609
1,-9.590609,-5.060153
2,10.491161,8.252008
3,8.199994,7.306120
4,-8.981454,-5.105297
...,...,...
795,9.450058,5.960306
796,-3.811830,6.491310
797,-3.286949,4.776541
798,-10.463077,-6.916611


## 3.2. Stratified Splitting

Stratified splitting is a technique that is used to split the data in a way that the proportion of each class is the same in the train and test sets. This is useful when the dataset is imbalanced.

In [22]:
X, y = make_classification(n_samples=1000, weights=[0.94], flip_y=0, random_state=1)

In [23]:
print('Distribution of classes in the original dataset: %s' % Counter(y))

Distribution of classes in the original dataset: Counter({0: 940, 1: 60})


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print('Distribution of classes in the train dataset: %s' % Counter(y_train))
print('Distribution of classes in the test dataset: %s' % Counter(y_test))

Distribution of classes in the train dataset: Counter({0: 755, 1: 45})
Distribution of classes in the test dataset: Counter({0: 185, 1: 15})


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
print('Distribution of classes in the train dataset: %s' % Counter(y_train))
print('Distribution of classes in the test dataset: %s' % Counter(y_test))

Distribution of classes in the train dataset: Counter({0: 752, 1: 48})
Distribution of classes in the test dataset: Counter({0: 188, 1: 12})


# 4. Encoding Categorical data

## 4.1. Label Encoding

In [26]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [27]:
pd.DataFrame(y)

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
995,0
996,0
997,0
998,0


### 4.2. One Hot Encoding

In [28]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,2.221909,0.807802,0.927339,-0.070094,-0.638715,-0.625338,-0.990550,0.269520,-0.352831,0.707078,-0.637334,1.316802,1.958895,-0.206349,1.431720,-0.767452,-0.075525,-2.864634,0.727620,-1.278130
1,-0.183853,0.072078,0.108895,-0.230275,0.126486,-2.640619,0.466686,-0.200674,2.176379,1.431810,-0.001866,-1.160561,1.433353,0.911168,-0.925495,0.577990,0.614711,1.277095,0.397899,0.380486
2,-0.626070,0.812148,-1.531915,-0.380824,-0.540130,0.142317,0.228305,0.063659,0.253463,-1.371024,-0.632939,-1.335798,-2.042800,2.147297,-0.056327,-0.679621,0.114661,0.025155,-0.069091,-1.214660
3,-0.270036,-0.372108,-1.140488,-0.579190,-1.163629,1.168288,1.356020,1.295251,-1.753413,-0.964620,0.174320,-0.296616,-0.401440,-1.586623,-0.096158,-1.048242,0.700129,-0.716063,-0.501382,-0.483549
4,-0.286285,-0.210801,0.212339,-0.446445,-0.803471,-1.306725,-0.167858,-0.857692,-0.491583,-1.330638,0.119190,-2.155325,0.263040,-0.386946,1.806458,-0.353635,0.220540,-0.664314,-0.979698,-0.090187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1.228973,2.009362,-1.337587,-0.315281,-1.153013,-0.319832,0.921625,-1.471066,-1.814726,1.785779,-1.563524,-0.657682,1.281235,-1.255936,-0.281530,-1.652669,0.133020,0.930704,0.000231,-2.983196
996,0.552988,0.839988,-0.553544,2.126596,0.071932,-0.818915,-0.840424,2.249522,-0.509866,-1.469819,-0.599663,0.375807,0.213101,0.816202,-2.258564,-0.056790,0.198285,-0.423507,-0.541073,-0.762027
997,-0.421712,-0.393626,0.951773,0.622981,-0.669641,-2.075664,1.110404,0.305029,-0.773816,-0.938604,0.089215,1.186398,1.905036,-1.244140,0.324049,-2.227633,-1.298183,-1.239348,-0.058853,-1.367342
998,-1.335056,-0.376535,0.816065,-1.747210,-0.813916,-0.851866,-1.141681,-1.015003,-0.614941,0.979415,0.343442,1.071549,-0.313444,0.477677,-0.782453,0.902692,-0.183231,1.728582,1.171279,1.012649


In [29]:
column_transformer = ColumnTransformer([("Name", OneHotEncoder(sparse=False), [0])], remainder='passthrough')

In [30]:
X2 = column_transformer.fit_transform(X)
pd.DataFrame(X2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.637334,1.316802,1.958895,-0.206349,1.431720,-0.767452,-0.075525,-2.864634,0.727620,-1.278130
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.001866,-1.160561,1.433353,0.911168,-0.925495,0.577990,0.614711,1.277095,0.397899,0.380486
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.632939,-1.335798,-2.042800,2.147297,-0.056327,-0.679621,0.114661,0.025155,-0.069091,-1.214660
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.174320,-0.296616,-0.401440,-1.586623,-0.096158,-1.048242,0.700129,-0.716063,-0.501382,-0.483549
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.119190,-2.155325,0.263040,-0.386946,1.806458,-0.353635,0.220540,-0.664314,-0.979698,-0.090187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.563524,-0.657682,1.281235,-1.255936,-0.281530,-1.652669,0.133020,0.930704,0.000231,-2.983196
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.599663,0.375807,0.213101,0.816202,-2.258564,-0.056790,0.198285,-0.423507,-0.541073,-0.762027
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.089215,1.186398,1.905036,-1.244140,0.324049,-2.227633,-1.298183,-1.239348,-0.058853,-1.367342
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.343442,1.071549,-0.313444,0.477677,-0.782453,0.902692,-0.183231,1.728582,1.171279,1.012649


In [31]:
data = np.asarray([["red"], ["green"], ["blue"]])
data

array([['red'],
       ['green'],
       ['blue']], dtype='<U5')

In [32]:
encoder = OneHotEncoder(sparse=False)
encoder.fit(data)
one_hot = encoder.transform(data)
one_hot

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

# 5. Scaling data

## 5.1. Normalization

In [33]:
normalized_dataset = MinMaxScaler().fit_transform(df_dataset[['Age', 'Salary']])
df_normalized_dataset = pd.DataFrame(normalized_dataset, columns=['Age', 'Salary'])
df_normalized_dataset

Unnamed: 0,Age,Salary
0,0.73913,0.685714
1,0.0,0.0
2,0.130435,0.171429
3,0.478261,0.371429
4,0.565217,
5,0.347826,0.285714
6,,0.114286
7,0.913043,0.885714
8,1.0,1.0
9,0.434783,0.542857


## 5.2. Standardization

In [34]:
standardized_dataset = StandardScaler().fit_transform(df_dataset[['Age', 'Salary']])
df_standardized_dataset = pd.DataFrame(standardized_dataset, columns=['Age', 'Salary'])
df_standardized_dataset

Unnamed: 0,Age,Salary
0,0.719931,0.711013
1,-1.623675,-1.364376
2,-1.210098,-0.845529
3,-0.107224,-0.240207
4,0.168495,
5,-0.520801,-0.499631
6,,-1.018478
7,1.271368,1.316334
8,1.547087,1.662233
9,-0.245083,0.27864
