In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [6]:
# Load the raw data set from Data.csv (path is relative to the working directory).
dataset = pd.read_csv('Data.csv')

In [7]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# Select the independent variable (feature) columns

In [8]:
# Feature matrix: every column except the last one, as a NumPy array.
x = dataset.iloc[:, :-1].values

In [9]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# Select the dependent variable (target) column

In [10]:
# Target vector: the last column only, as a NumPy array.
y = dataset.iloc[:, -1].values

In [11]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

# Deal with missing data
    

In [12]:
# Replace each missing numeric value with the mean of its column.
# To create textbox, use Esc + m
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# `SimpleImputer` is its replacement. It always imputes column-wise, which is
# what the old `axis=0` argument asked for, and the missing-value marker is
# the real `np.nan` rather than the string 'NaN'.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# The slice 1:3 selects columns 1 and 2 (Age and Salary); the upper bound is excluded.
imputer = imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

In [13]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# Encode categorical variable columns to numeric 

In [14]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [15]:
# Replace each country name in column 0 with a unique integer label.
labelencoder_x = LabelEncoder()
labelencoder_x.fit(x[:, 0])
x[:, 0] = labelencoder_x.transform(x[:, 0])
x

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

## Create Dummy variables for Country column

In [16]:
# OneHotEncoder? for Help and Documentations
# Numeric labels for country names are nominal instead of ordinal, so column 0
# is expanded into dummy variables where presence is 1 and absence is 0.
# The `categorical_features` argument was removed from OneHotEncoder in
# scikit-learn 0.22; a ColumnTransformer now selects which column to encode.
from sklearn.compose import ColumnTransformer
# The name `onehotencoder` is kept: it is still the fitted object that maps a
# full feature matrix to [dummy columns..., remaining columns...].
onehotencoder = ColumnTransformer(
    transformers=[('country', OneHotEncoder(), [0])],
    remainder='passthrough',  # keep the other columns, placed after the dummies
    sparse_threshold=0,       # always return a dense array (replaces .toarray())
)
# The passthrough columns come from an object array, so cast the stacked
# result to float to get a purely numeric matrix.
x = onehotencoder.fit_transform(x).astype(float)
x

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

In [17]:
# Encode the target labels as integers ('No' -> 0, 'Yes' -> 1).
labelencoder_y = LabelEncoder()
labelencoder_y.fit(y)
y = labelencoder_y.transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

# Split sample into training and testing sets

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 20% of the rows as the test set (test_size is usually kept small);
# a fixed random_state makes the shuffle, and therefore the split, repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)


# Feature Scaling


<b>Standardization</b>: Z-score:   $x_{stand} = \dfrac{x - mean(x)}{\sigma}$ <br><br>
<b>Normalization</b>: $x_{norm} = \dfrac{x - min(x)}{max(x)-min(x)}$


In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
# Scaler for the independent variables (feature matrix).
sc_x = StandardScaler()
# Training set: learn the scaling parameters from it, then apply them.
sc_x.fit(x_train)
x_train = sc_x.transform(x_train)
# Test set: only transform, reusing the parameters fitted on the training set.
x_test = sc_x.transform(x_test)

In [22]:
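# Open question: should the dummy variables be scaled? (The scaler above is applied to every column of x, dummies included.)
# y is a 0/1 class label here, so it is left unscaled; scale y only when it is a continuous target with a very large range.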