In [130]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split    # train_test_split is imported from model_selection, not from preprocessing

##### Step 1 - Importing Data

In [131]:
# Read the sample CSV into a DataFrame; the path is relative to the working directory.
data_csv = "./Machine-Learning-A-Z-Codes-Datasets/Part 1 - Data Preprocessing/Section 2 -------------------- Part 1 - Data Preprocessing --------------------/Python/Data.csv"
dataset = pd.read_csv(data_csv)

In [132]:
# Preview the first rows of the loaded DataFrame (head() shows 5 rows by default).
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


##### Step 2 - Divide data into Features and Result vector

In [133]:
# Features (X): every column except the last. A slice includes its lower bound
# and excludes its upper bound, so ":-1" stops just before the final column.
# Target (Y): the last column only, picked out by its single index (-1).
X, Y = dataset.iloc[:, :-1], dataset.iloc[:, -1]


In [134]:
# X is still a DataFrame here; both .to_numpy() and .values turn it into a
# NumPy array. Print each type to confirm.
for candidate in (X, X.to_numpy(), X.values):
    print(type(candidate))

<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [135]:
# Convert the features and the target to NumPy arrays for the scikit-learn steps
# that follow.
# np.asarray produces the same array as .to_numpy() for a DataFrame/Series, but it
# also accepts an ndarray. Re-running this cell after X and Y have already been
# converted is therefore a no-op instead of raising
# "AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'".
X = np.asarray(X)
Y = np.asarray(Y)

##### Step 3 - Taking care of missing data

Imputing Numerical Data

In [136]:
# Mean imputation: every NaN is replaced by the mean of its column.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)

# Age and Salary are at column indices 1 and 2. Column 0 (Country) is text and
# has no mean, so only the numeric slice is handed to fit, which learns the
# per-column means.
# NOTE(review): fit sees every row, including the ones train_test_split later
# puts in the test set - confirm this is acceptable for the intended use.
numeric_block = X[:, 1:3]
imputer.fit(numeric_block)


In [137]:
# Replace the NaNs in the numeric columns (indices 1-2) with the means learned
# by fit, then write the filled values back into X in place.
imputed_numeric = imputer.transform(X[:, 1:3])
X[:, 1:3] = imputed_numeric

In [138]:
# Inspect X after imputation: the numeric columns should no longer contain NaN.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

##### Step 4 - Encoding Categorical data

Encoding Categorical Data - One Hot Encoding
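- One-hot encoding replaces a categorical column with one binary column per category; each row gets a 1 in the column for its own category and 0 in the others.
Example: France → [1,0,0], Germany → [0,1,0], Spain → [0,0,1]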

In [139]:
# One-hot encode column 0 (Country) and pass every other column through unchanged.
#
# OneHotEncoder produces a sparse matrix by default, and ColumnTransformer keeps
# the stacked result sparse whenever its overall density falls below
# sparse_threshold (0.3 by default). The later cells slice-assign into
# X_train / X_test and standardise them with StandardScaler, which needs a dense
# ndarray, so sparse_threshold=0 forces dense output regardless of how many
# categories the data has. For this dataset the result is unchanged.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)

In [140]:
# Fit the column transformer and apply it in one step. The result replaces X:
# the one-hot columns come first, followed by the passthrough columns.
# Run this cell once per fresh X - a second run would try to encode the
# already-encoded column 0 again.
X = ct.fit_transform(X)

In [141]:
# Inspect X after encoding: the three leading one-hot columns replace Country.
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

Encoding Labels - Label Encoder

In [142]:
# Encode the target labels as integers ("No" -> 0, "Yes" -> 1).
# The fitted encoder is kept in `le` so the mapping can be reversed later.
le = LabelEncoder().fit(Y)
Y = le.transform(Y)

In [143]:
# Inspect the integer-encoded target vector.
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

##### Step 5 - Splitting the data into training and test set.
<p style="color: yellow;">This is part of the model evaluation process, not data preprocessing, which is why <code>train_test_split</code> lives in scikit-learn's <code>model_selection</code> module rather than in <code>preprocessing</code>.</p>

In [144]:
# Hold out 20% of the rows as the test set (train_size=0.8 would be equivalent).
# Rows are shuffled before splitting; the fixed random_state makes the split
# reproducible from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

In [145]:
# Show the training feature rows produced by the split.
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [146]:
# Show the held-out test feature rows.
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [147]:
# Show the encoded labels that go with the training rows.
print(Y_train)

[0 1 0 0 1 1 0 1]


In [148]:
# Show the encoded labels that go with the test rows.
print(Y_test)

[0 1]


##### Step 6 - Feature Scaling

In [149]:
# Standardise the numeric columns (index 3 onward); the one-hot columns 0-2 are
# left as 0/1. The scaler is fitted on the training rows only.
sc = StandardScaler()
scaled_train = sc.fit_transform(X_train[:, 3:])
X_train[:, 3:] = scaled_train
X_train

array([[0.0, 0.0, 1.0, -0.19159184384578545, -1.0781259408412425],
       [0.0, 1.0, 0.0, -0.014117293757057777, -0.07013167641635372],
       [1.0, 0.0, 0.0, 0.566708506533324, 0.633562432710455],
       [0.0, 0.0, 1.0, -0.30453019390224867, -0.30786617274297867],
       [0.0, 0.0, 1.0, -1.9018011447007988, -1.420463615551582],
       [1.0, 0.0, 0.0, 1.1475343068237058, 1.232653363453549],
       [0.0, 1.0, 0.0, 1.4379472069688968, 1.5749910381638885],
       [1.0, 0.0, 0.0, -0.7401495441200351, -0.5646194287757332]],
      dtype=object)

In [150]:
# Scale the test rows with the scaler already fitted on the training set:
# transform only, no refit.
scaled_test = sc.transform(X_test[:, 3:])
X_test[:, 3:] = scaled_test
X_test

array([[0.0, 1.0, 0.0, -1.4661817944830124, -0.9069571034860727],
       [1.0, 0.0, 0.0, -0.44973664397484414, 0.2056403393225306]],
      dtype=object)