In [129]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [130]:
# Read the raw dataset from the CSV file and show it.
data = pd.read_csv(filepath_or_buffer="data/data.csv")
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [131]:
# Count the missing entries in each column.
data.isnull().sum(axis=0)

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [132]:
# Separate the feature columns (x) from the target column (y).
# y is kept as a one-column DataFrame rather than a Series.
x = data.drop(columns=["Purchased"])
y = data.loc[:, ["Purchased"]]

In [133]:
# Display the feature frame (every column except "Purchased").
x

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [134]:
# Display the target frame (the single "Purchased" column).
y

Unnamed: 0,Purchased
0,No
1,Yes
2,No
3,No
4,Yes
5,Yes
6,No
7,Yes
8,No
9,Yes


In [136]:
# Missing values: fill NaNs in the numeric columns with the column mean.
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

numerical_features = ["Age", "Salary"]
numerical_imputer = SimpleImputer(strategy="mean")

# Columns not listed in a transformer are dropped by ColumnTransformer's
# default remainder setting, so the result holds only Age and Salary.
imputer = ColumnTransformer(
    transformers=[("Numerical_Imputer", numerical_imputer, numerical_features)],
)
filled_x = imputer.fit_transform(x)

In [137]:
# Round the imputed values to two decimal places and show the array.
filled_x = np.round(filled_x, decimals=2)
filled_x

array([[4.400000e+01, 7.200000e+04],
       [2.700000e+01, 4.800000e+04],
       [3.000000e+01, 5.400000e+04],
       [3.800000e+01, 6.100000e+04],
       [4.000000e+01, 6.377778e+04],
       [3.500000e+01, 5.800000e+04],
       [3.878000e+01, 5.200000e+04],
       [4.800000e+01, 7.900000e+04],
       [5.000000e+01, 8.300000e+04],
       [3.700000e+01, 6.700000e+04]])

In [138]:
# Wrap the imputed array in a DataFrame so the columns are labelled again.
filled_data_x = pd.DataFrame(data=filled_x, columns=["Age", "Salary"])

In [139]:
# Display the imputed numeric columns as a DataFrame.
filled_data_x

Unnamed: 0,Age,Salary
0,44.0,72000.0
1,27.0,48000.0
2,30.0,54000.0
3,38.0,61000.0
4,40.0,63777.78
5,35.0,58000.0
6,38.78,52000.0
7,48.0,79000.0
8,50.0,83000.0
9,37.0,67000.0


In [140]:
# Re-attach the categorical Country column taken from the raw frame
# (rows are matched on the index).
filled_data_x = filled_data_x.assign(Country=data["Country"])

In [141]:
# Display the imputed frame with the Country column added.
filled_data_x

Unnamed: 0,Age,Salary,Country
0,44.0,72000.0,France
1,27.0,48000.0,Spain
2,30.0,54000.0,Germany
3,38.0,61000.0,Spain
4,40.0,63777.78,Germany
5,35.0,58000.0,France
6,38.78,52000.0,Spain
7,48.0,79000.0,France
8,50.0,83000.0,Germany
9,37.0,67000.0,France


In [142]:
# Dummies for independent variables
from sklearn.preprocessing import OneHotEncoder

categorical_features = ["Country"]
transformer = ColumnTransformer([("one_hot",OneHotEncoder(),categorical_features)],remainder="passthrough")

transformed_data_X = transformer.fit_transform(filled_data_x)
transformed_data_X

array([[1.000000e+00, 0.000000e+00, 0.000000e+00, 4.400000e+01,
        7.200000e+04],
       [0.000000e+00, 0.000000e+00, 1.000000e+00, 2.700000e+01,
        4.800000e+04],
       [0.000000e+00, 1.000000e+00, 0.000000e+00, 3.000000e+01,
        5.400000e+04],
       [0.000000e+00, 0.000000e+00, 1.000000e+00, 3.800000e+01,
        6.100000e+04],
       [0.000000e+00, 1.000000e+00, 0.000000e+00, 4.000000e+01,
        6.377778e+04],
       [1.000000e+00, 0.000000e+00, 0.000000e+00, 3.500000e+01,
        5.800000e+04],
       [0.000000e+00, 0.000000e+00, 1.000000e+00, 3.878000e+01,
        5.200000e+04],
       [1.000000e+00, 0.000000e+00, 0.000000e+00, 4.800000e+01,
        7.900000e+04],
       [0.000000e+00, 1.000000e+00, 0.000000e+00, 5.000000e+01,
        8.300000e+04],
       [1.000000e+00, 0.000000e+00, 0.000000e+00, 3.700000e+01,
        6.700000e+04]])

In [121]:
# Show the target values again.
y

Unnamed: 0,Purchased
0,No
1,Yes
2,No
3,No
4,Yes
5,Yes
6,No
7,Yes
8,No
9,Yes


In [122]:
# Dummies for dependent variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
y

  return f(**kwargs)


array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [123]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    transformed_data_X,
    y,
    test_size=0.2,
    random_state=1,
)

In [124]:
# Print the training-split feature matrix.
print(X_train)

[[0.000000e+00 0.000000e+00 1.000000e+00 3.878000e+01 5.200000e+04]
 [0.000000e+00 1.000000e+00 0.000000e+00 4.000000e+01 6.377778e+04]
 [1.000000e+00 0.000000e+00 0.000000e+00 4.400000e+01 7.200000e+04]
 [0.000000e+00 0.000000e+00 1.000000e+00 3.800000e+01 6.100000e+04]
 [0.000000e+00 0.000000e+00 1.000000e+00 2.700000e+01 4.800000e+04]
 [1.000000e+00 0.000000e+00 0.000000e+00 4.800000e+01 7.900000e+04]
 [0.000000e+00 1.000000e+00 0.000000e+00 5.000000e+01 8.300000e+04]
 [1.000000e+00 0.000000e+00 0.000000e+00 3.500000e+01 5.800000e+04]]


In [144]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [145]:
# Display the training features after the columns from index 3 onward were scaled.
X_train

array([[ 0.        ,  0.        ,  1.        , -0.19131098, -1.07812597],
       [ 0.        ,  1.        ,  0.        , -0.01415774, -0.07013151],
       [ 1.        ,  0.        ,  0.        ,  0.56667255,  0.63356241],
       [ 0.        ,  0.        ,  1.        , -0.30457288, -0.3078662 ],
       [ 0.        ,  0.        ,  1.        , -1.90185617, -1.42046364],
       [ 1.        ,  0.        ,  0.        ,  1.14750283,  1.23265334],
       [ 0.        ,  1.        ,  0.        ,  1.43791798,  1.57499102],
       [ 1.        ,  0.        ,  0.        , -0.7401956 , -0.56461945]])