## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Load the raw data from Data.csv (path relative to the notebook's working directory).
# The preview shows Country, Age, Salary and Purchased columns, and row 4
# already has a missing Salary value.
dataset = pd.read_csv('Data.csv')
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [34]:
# Feature matrix: every column except the last one (the target).
# NOTE(review): this is a slice of `dataset`, not an independent copy; writing
# into X later may warn or write through to `dataset`, depending on the pandas version.
X = dataset.iloc[:, :-1]
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [44]:
# Target vector: the last column ('Purchased'), still as 'Yes'/'No' strings.
y = dataset.iloc[:, -1]
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

## Take care of missing data

In [22]:
from sklearn.impute import SimpleImputer

In [35]:
# Replace missing numeric values (NaN) with the mean of their column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# 'Country' holds strings, so only the numeric columns are imputed.
numeric_cols = ['Age', 'Salary']

# X was sliced out of `dataset`; take an independent copy before writing so the
# assignment below neither triggers pandas' SettingWithCopyWarning nor writes
# through to `dataset`.
X = X.copy()

# fit_transform learns the column means and fills the gaps in one call.
X[numeric_cols] = imputer.fit_transform(X[numeric_cols])

In [36]:
# Plain-text dump of the imputed features: the previously missing Salary (row 4)
# and Age (row 6) now hold the respective column means.
print(X)

   Country        Age        Salary
0   France  44.000000  72000.000000
1    Spain  27.000000  48000.000000
2  Germany  30.000000  54000.000000
3    Spain  38.000000  61000.000000
4  Germany  40.000000  63777.777778
5   France  35.000000  58000.000000
6    Spain  38.777778  52000.000000
7   France  48.000000  79000.000000
8  Germany  50.000000  83000.000000
9   France  37.000000  67000.000000


## Encode categorical data

### Encode the independent variables

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [38]:
# One-hot encode 'Country' and pass the remaining columns through unchanged.
# sparse_threshold=0 forces a dense ndarray result no matter how many
# categories appear; the later in-place column-slice scaling
# (X_train[:, 3:] = ...) relies on a dense array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['Country'])],
    remainder='passthrough',
    sparse_threshold=0,
)

In [39]:
# Fit the transformer and replace X with the encoded feature matrix: the
# one-hot country columns come first, followed by the passed-through Age and Salary.
# NOTE(review): X stops being a DataFrame here, so re-running this cell without
# re-creating X will presumably fail because 'Country' is selected by name.
X = ct.fit_transform(X)

In [40]:
# Wrap the encoded array in a DataFrame purely to get a readable text table.
encoded_view = pd.DataFrame(X)
print(encoded_view)

     0    1    2          3             4
0  1.0  0.0  0.0  44.000000  72000.000000
1  0.0  0.0  1.0  27.000000  48000.000000
2  0.0  1.0  0.0  30.000000  54000.000000
3  0.0  0.0  1.0  38.000000  61000.000000
4  0.0  1.0  0.0  40.000000  63777.777778
5  1.0  0.0  0.0  35.000000  58000.000000
6  0.0  0.0  1.0  38.777778  52000.000000
7  1.0  0.0  0.0  48.000000  79000.000000
8  0.0  1.0  0.0  50.000000  83000.000000
9  1.0  0.0  0.0  37.000000  67000.000000


### Encode the dependent variable

In [41]:
from sklearn.preprocessing import LabelEncoder

In [42]:
# Encoder that maps the string class labels of the target to integers.
le = LabelEncoder()

In [45]:
# Encode the target as integers (classes are sorted, so 'No' -> 0, 'Yes' -> 1).
# Always fit on the raw label column rather than on the current `y`: this cell
# overwrites `y`, and re-running it on the already-encoded array would leave
# `le.classes_` as [0, 1] and lose the original label names.
y = le.fit_transform(dataset.iloc[:, -1])

In [46]:
# Encoded target: 0 = 'No', 1 = 'Yes' (compare with the raw labels shown earlier).
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Train/test split

In [47]:
from sklearn.model_selection import train_test_split

In [50]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [53]:
# Training features: 8 rows (one-hot country columns, then Age and Salary), not yet scaled.
X_train

array([[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04]])

In [54]:
# Test features: the 2 held-out rows, not yet scaled.
X_test

array([[0.0e+00, 1.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.7e+01, 6.7e+04]])

In [55]:
# Encoded labels for the training rows.
y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

In [56]:
# Encoded labels for the held-out test rows.
y_test

array([0, 1])

## Feature Scaling

In [57]:
from sklearn.preprocessing import StandardScaler

In [58]:
# Standardiser for the numeric features; it is fitted on the training split only.
sc = StandardScaler()

In [59]:
# Columns 0-2 are the one-hot country dummies and are left alone; only the
# trailing numeric columns (Age, Salary) are standardised.
numeric_part = slice(3, None)

# Learn mean/std from the training rows, then reuse those statistics for the
# test rows so no information from the test set leaks into the scaler.
X_train[:, numeric_part] = sc.fit_transform(X_train[:, numeric_part])
X_test[:, numeric_part] = sc.transform(X_test[:, numeric_part])

In [60]:
# Scaled training features: the last two columns are standardised, the one-hot columns are untouched.
X_train

array([[ 0.        ,  0.        ,  1.        , -0.19159184, -1.07812594],
       [ 0.        ,  1.        ,  0.        , -0.01411729, -0.07013168],
       [ 1.        ,  0.        ,  0.        ,  0.56670851,  0.63356243],
       [ 0.        ,  0.        ,  1.        , -0.30453019, -0.30786617],
       [ 0.        ,  0.        ,  1.        , -1.90180114, -1.42046362],
       [ 1.        ,  0.        ,  0.        ,  1.14753431,  1.23265336],
       [ 0.        ,  1.        ,  0.        ,  1.43794721,  1.57499104],
       [ 1.        ,  0.        ,  0.        , -0.74014954, -0.56461943]])

In [61]:
# Test features after applying the scaler that was fitted on the training split.
X_test

array([[ 0.        ,  1.        ,  0.        , -1.46618179, -0.9069571 ],
       [ 1.        ,  0.        ,  0.        , -0.44973664,  0.20564034]])