## Data Preprocessing Tools

### Importing the libraries

In [26]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset

In [27]:
# Load the raw table from disk.
dataset = pd.read_csv("data/Data.csv")

# Features: every column except the last one. This stays a DataFrame (no
# conversion to ndarray happens here). The explicit .copy() detaches X from
# `dataset`, so the later in-place edits of X neither touch `dataset` nor
# trigger pandas' SettingWithCopyWarning.
X = dataset.iloc[:, :-1].copy()
# Target: the last column only, kept as a pandas Series.
y = dataset.iloc[:, -1]

In [28]:
# Inspect the feature columns (every column of `dataset` except the last).
print(X)

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0


In [29]:
# Inspect the target column (the last column of `dataset`).
print(y)

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


### Taking care of missing data

In [30]:
# Count the missing values in each column of the dataset
dataset.isna().sum(axis=0)

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [31]:
from sklearn.impute import SimpleImputer

# Numerical columns whose missing entries will be filled in
num_features = ["Age", "Salary"]

# Learn each column's mean; NaN entries are the ones treated as missing
imputer = SimpleImputer(missing_values=np.nan, strategy="mean").fit(X[num_features])

# Write the imputed values back into the same columns of X
X.loc[:, num_features] = imputer.transform(X[num_features])

In [32]:
# Re-inspect X after the numerical columns were imputed.
print(X)

   Country        Age        Salary
0   France  44.000000  72000.000000
1    Spain  27.000000  48000.000000
2  Germany  30.000000  54000.000000
3    Spain  38.000000  61000.000000
4  Germany  40.000000  63777.777778
5   France  35.000000  58000.000000
6    Spain  38.777778  52000.000000
7   France  48.000000  79000.000000
8  Germany  50.000000  83000.000000
9   France  37.000000  67000.000000


### Encoding Categorical Data

#### Encoding the Independent Variable

In [33]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to expand into one indicator column per category.
cat_features = ["Country"]

# Encode the categorical columns.
# The result is densified with .toarray() instead of passing `sparse=False`:
# that keyword was renamed to `sparse_output` in scikit-learn 1.2 and the old
# spelling was later removed, whereas default sparse output + .toarray() works
# regardless of which spelling the installed version accepts.
encoder = OneHotEncoder()
X_encoded = pd.DataFrame(
    encoder.fit_transform(X[cat_features]).toarray(),
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is its replacement (available since 1.0).
    columns=encoder.get_feature_names_out(cat_features),
    # Reuse X's index so the concat below aligns row-for-row even if X does
    # not carry a default 0..n-1 index.
    index=X.index,
)

# Replace the raw categorical columns with their encoded counterparts,
# placing the encoded columns first. X is rebound to a new frame rather than
# mutated in place.
X = pd.concat([X_encoded, X.drop(columns=cat_features)], axis=1)

In [34]:
# Inspect X after the categorical columns were replaced by one-hot columns.
print(X)

   Country_France  Country_Germany  Country_Spain        Age        Salary
0             1.0              0.0            0.0  44.000000  72000.000000
1             0.0              0.0            1.0  27.000000  48000.000000
2             0.0              1.0            0.0  30.000000  54000.000000
3             0.0              0.0            1.0  38.000000  61000.000000
4             0.0              1.0            0.0  40.000000  63777.777778
5             1.0              0.0            0.0  35.000000  58000.000000
6             0.0              0.0            1.0  38.777778  52000.000000
7             1.0              0.0            0.0  48.000000  79000.000000
8             0.0              1.0            0.0  50.000000  83000.000000
9             1.0              0.0            0.0  37.000000  67000.000000


#### Encoding the Dependent Variable

In [35]:
from sklearn.preprocessing import LabelEncoder

# Turn the class labels into integer codes; `le` keeps the fitted mapping
le = LabelEncoder()
y = le.fit(y).transform(y)

In [36]:
# Inspect the label-encoded target.
print(y)

[0 1 0 0 1 1 0 1 0 1]


### Splitting the dataset into the Training set and Test set

In [37]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [38]:
# Inspect the training features produced by the split.
print(X_train)

   Country_France  Country_Germany  Country_Spain        Age        Salary
6             0.0              0.0            1.0  38.777778  52000.000000
4             0.0              1.0            0.0  40.000000  63777.777778
0             1.0              0.0            0.0  44.000000  72000.000000
3             0.0              0.0            1.0  38.000000  61000.000000
1             0.0              0.0            1.0  27.000000  48000.000000
7             1.0              0.0            0.0  48.000000  79000.000000
8             0.0              1.0            0.0  50.000000  83000.000000
5             1.0              0.0            0.0  35.000000  58000.000000


In [39]:
# Inspect the held-out test features produced by the split.
print(X_test)

   Country_France  Country_Germany  Country_Spain   Age   Salary
2             0.0              1.0            0.0  30.0  54000.0
9             1.0              0.0            0.0  37.0  67000.0


In [40]:
# Inspect the training labels.
print(y_train)

[0 1 0 0 1 1 0 1]


In [41]:
# Inspect the test labels.
print(y_test)

[0 1]


### Feature Scaling

In [42]:
from sklearn.preprocessing import StandardScaler

# Numerical columns to standardise; the one-hot columns are left as they are.
scale_features = ["Age", "Salary"]

# Take explicit copies of the split frames before writing into them. This
# removes the cause of SettingWithCopyWarning, so the global
# `pd.options.mode.chained_assignment` setting no longer has to be switched
# off and back on (which overwrote any user setting and left the warning
# disabled if scaling raised).
X_train = X_train.copy()
X_test = X_test.copy()

sc = StandardScaler()
# Fit on the training set only, then apply those same statistics to the test
# set, so nothing about the test data leaks into the scaler.
X_train.loc[:, scale_features] = sc.fit_transform(X_train[scale_features])
X_test.loc[:, scale_features] = sc.transform(X_test[scale_features])

In [43]:
# Inspect the training features after scaling.
print(X_train)

   Country_France  Country_Germany  Country_Spain       Age    Salary
6             0.0              0.0            1.0 -0.191592 -1.078126
4             0.0              1.0            0.0 -0.014117 -0.070132
0             1.0              0.0            0.0  0.566709  0.633562
3             0.0              0.0            1.0 -0.304530 -0.307866
1             0.0              0.0            1.0 -1.901801 -1.420464
7             1.0              0.0            0.0  1.147534  1.232653
8             0.0              1.0            0.0  1.437947  1.574991
5             1.0              0.0            0.0 -0.740150 -0.564619


In [44]:
# Inspect the test features after scaling.
print(X_test)

   Country_France  Country_Germany  Country_Spain       Age    Salary
2             0.0              1.0            0.0 -1.466182 -0.906957
9             1.0              0.0            0.0 -0.449737  0.205640
