## Data Processing Tools


Importing the libraries

In [32]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Importing the dataset

In [33]:
# Read the raw CSV once; the cells below work from `dataset`, `x` and `y`.
dataset = pd.read_csv('Data.csv')

# Split by position: every column except the last is a feature, and the last
# column is the target. `.values` returns plain NumPy arrays.
x, y = (
    dataset.iloc[:, :-1].values,
    dataset.iloc[:, -1].values,
)

In [34]:
# Display the loaded table to eyeball it before any preprocessing; look for
# missing entries in the numeric columns.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [35]:
# A bare tuple as the last expression displays both arrays: the feature
# matrix first, then the target vector.
x,y

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

Taking care of missing data

In [36]:
from sklearn.impute import SimpleImputer

# Replace each nan in the numeric columns (positions 1 and 2 of x) with the
# mean of the non-missing values in that column. Column 0 holds strings and
# is therefore left out of the slice.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Learn the column means and write the filled values back into x in place.
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

In [37]:
# Re-display x to confirm the imputation: columns 1 and 2 should no longer
# contain nan.
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

Encoding categorical data

Encoding the Independent Variable


In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 and pass the remaining columns through untouched.
# The encoded columns come first in the result, followed by the passthrough
# columns in their original order.
ct = ColumnTransformer(
    remainder="passthrough",
    transformers=[("encoder", OneHotEncoder(), [0])],
)
# Gotcha: x is rebuilt from itself, so running this cell twice would encode the
# first (already encoded) column again. Re-run the cells above first.
x = np.array(ct.fit_transform(x))

In [39]:
# Re-display x to check the encoding: the string column should be gone,
# replaced by 0/1 indicator columns at the front.
x

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

Encoding the Dependent Variable

In [40]:
from sklearn.preprocessing import LabelEncoder

# Map the string class labels in y to integer codes. The fitted encoder stays
# available as `le` for later use.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [41]:
# Re-display y to confirm the labels are now integer codes.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

Splitting the dataset into the Training set and Test set

In [42]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. The fixed random_state makes the
# shuffle, and therefore the split, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [43]:
# Inspect the training features produced by the split.
X_train

array([[0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [44]:
# Inspect the held-out test features produced by the split.
X_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [45]:
# Training labels; they come from the same split call as X_train, so rows line up.
y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

In [46]:
# Test labels; they come from the same split call as X_test, so rows line up.
y_test

array([0, 1])

Feature Scaling

In [47]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns. The scaler is fitted on the training rows
# only and then applied unchanged to the test rows, so no test-set statistics
# leak into the scaling.
# NOTE(review): every column is scaled, including the 0/1 indicator columns.
# Confirm that is intended rather than scaling only the numeric columns.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [48]:
# Training features after standardisation.
X_train

array([[-0.77459667, -0.57735027,  1.29099445, -0.19159184, -1.07812594],
       [-0.77459667,  1.73205081, -0.77459667, -0.01411729, -0.07013168],
       [ 1.29099445, -0.57735027, -0.77459667,  0.56670851,  0.63356243],
       [-0.77459667, -0.57735027,  1.29099445, -0.30453019, -0.30786617],
       [-0.77459667, -0.57735027,  1.29099445, -1.90180114, -1.42046362],
       [ 1.29099445, -0.57735027, -0.77459667,  1.14753431,  1.23265336],
       [-0.77459667,  1.73205081, -0.77459667,  1.43794721,  1.57499104],
       [ 1.29099445, -0.57735027, -0.77459667, -0.74014954, -0.56461943]])

In [49]:
# Test features after being transformed with the scaler fitted on X_train.
X_test

array([[-0.77459667,  1.73205081, -0.77459667, -1.46618179, -0.9069571 ],
       [ 1.29099445, -0.57735027, -0.77459667, -0.44973664,  0.20564034]])