# Data Preprocessing Template

## Importing the libraries

In [118]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [119]:
# Load the raw table from disk. The last column is treated as the target
# (dependent variable); every other column is treated as a feature.
dataset = pd.read_csv('Data.csv')

# X: every row, every column except the last one.
# y: every row, only the last column.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

## Taking care of the missing data

In [120]:
from sklearn.impute import SimpleImputer

# Replace missing entries (NaN) with the mean of their column.
# Column 0 is skipped; only the columns from index 1 onward are imputed.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split further down -- confirm that is acceptable for this template.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
X[:, 1:] = imputer.fit_transform(X[:, 1:])

## Encoding categorical data

Encoding the independent variable

In [121]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column at index 0 (the country column, which
# is not numeric). Each distinct category becomes its own column of 0s and 1s,
# e.g. 0.0 1.0 0.0 (three columns for the three countries in the dataset).
# The main benefit of one-hot encoding is that no numerical order is imposed
# on the categories (unlike an enum-style france => 1, germany => 2 mapping).
#
# - [0] lists the column index(es) to transform.
# - remainder="passthrough" keeps the columns that are not transformed; they
#   are appended to the right of the encoded columns.
# - sparse_threshold=0 forces a dense result. OneHotEncoder emits a sparse
#   matrix, and with the default threshold ColumnTransformer may return a
#   sparse matrix for low-density outputs; np.array() on a sparse matrix gives
#   a 0-d object array that cannot be sliced by the later cells.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

Encoding the dependent variable. The dependent variable (y) is what we are trying to predict.

In [122]:
from sklearn.preprocessing import LabelEncoder

# Turn the Yes/No target labels into integer codes (0s and 1s).
# The fitted encoder stays available as `le` for later use.
le = LabelEncoder().fit(y)
y = le.transform(y)

## Splitting the dataset into the Training and Test sets

In [123]:
from sklearn.model_selection import train_test_split

# Hold out 20 percent of the rows for testing and keep 80 percent for training.
# The independent (X) and dependent (y) variables are split with the same row
# assignment, so X_train/y_train and X_test/y_test stay aligned.
# random_state pins the shuffle: without it every run draws a different
# partition and the notebook's results cannot be reproduced on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

## Feature scaling

In [124]:
# Normalisation => (x - min(x)) / (max(x) - min(x)) => is recommended when you have a normal distribution in most of the features, would map to 0 <= value <= 1
# Standardisation => (x - mean(x)) / standard_deviation(x) => will work always => mostly between -3 and 3
# Must be done separately AFTER splitting train and test, so the test set does not influence the fitted scaler
# Do NOT apply feature scaling to the one-hot encoded (dummy) columns; they are already 0s and 1s

from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics (mean / standard deviation) from the training
# rows only, using the columns from index 3 onward; the first three columns
# are left untouched.
sc = StandardScaler().fit(X_train[:, 3:])

# Apply the SAME fitted scaler to both sets, so the test set is scaled with
# the statistics learned from the training set.
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])