# Chapter 1: Data Preprocessing

## 1. Importing the libraries and dataset ##

Every project begins by importing the libraries and the dataset. Below are some code snippets that should be found at the beginning of every project.

In [1]:
# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

In [2]:
# Change the current working directory so the dataset can be read by file name.
# The guard keeps the cell safe to re-run: once the kernel is already inside
# 'datasets', a second os.chdir('datasets') would raise FileNotFoundError.
# (os.chdir returns None, so its result is not worth storing.)
if os.path.basename(os.getcwd()) != 'datasets':
    os.chdir('datasets')
working_location = os.getcwd()
print(working_location)

/Users/ulysse/Documents/GitHub/mooc-udemy-mlAtoZ/datasets


In [3]:
# Import the dataset
dataset = pd.read_csv("Part1-Data.csv")
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,3].values

In [4]:
# Display the feature matrix X to inspect its contents, including any nan entries.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## 2. Missing values ##

If values are missing, you can delete the row, delete the column, or replace the missing value with the average of the whole column (or of the whole row).

In [5]:
# Missing values are filled in with an imputer.
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; its replacement
# is sklearn.impute.SimpleImputer (available since 0.20), which always works
# column-wise, so the old `axis = 0` argument is no longer needed.
# Strategies available are: mean, median, most_frequent. Here we pick mean
try:
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
except ImportError:
    # Fallback for scikit-learn < 0.20, where only the legacy class exists.
    from sklearn.preprocessing import Imputer
    imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)

# Fit on columns 1 and 2 of X and overwrite them with the imputed values.
imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])

In [6]:
# Display X again to check the result of the imputation step.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## 3. Categorical data ##

We cannot feed categorical data such as the first column directly to a model; here is how we convert it to numbers.

In [7]:
# Let's import the right tools
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Changes label into numbers
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# Encode column 0 into dummy variables; the remaining columns are kept as they are.
# NB: all dummy columns are kept here. DON'T FORGET TO DROP ONE OF THE COLUMNS
# before fitting a model that is sensitive to collinear features.
try:
    # scikit-learn >= 0.20: OneHotEncoder(categorical_features=...) was removed
    # in 0.22, so the column selection is delegated to ColumnTransformer.
    from sklearn.compose import ColumnTransformer
except ImportError:
    # Fallback for scikit-learn < 0.20, which only has the legacy API.
    onehotencoder = OneHotEncoder(categorical_features = [0])
    X = onehotencoder.fit_transform(X).toarray()
else:
    column_encoder = ColumnTransformer(
        transformers = [('onehot', OneHotEncoder(categories = 'auto'), [0])],
        remainder = 'passthrough',   # keep the non-categorical columns, after the dummies
        sparse_threshold = 0,        # always return a dense array
    )
    # Cast to float so X is a plain numeric array, as the legacy .toarray() produced.
    X = np.asarray(column_encoder.fit_transform(X), dtype = float)
    # ColumnTransformer fits a clone; expose the fitted encoder under the usual name.
    onehotencoder = column_encoder.named_transformers_['onehot']

# Encoding the Dependent Variable
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

In [8]:
# Display the encoded feature matrix X and the encoded target vector y.
print(X)
print(y)

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  4.80000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01
  5.40000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  6.37777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01
  5.80000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01
  5.20000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01
  7.90000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  8.30000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01
  6.70000000e+04]]
[0 1 0 0 1 1 0 1 0 1]


In [None]:
# Template of another way of doing dummy variable
def encode_dummies(df, columns_to_dummy):
    """Return a copy of `df` with the given columns replaced by dummy variables.

    The original template referenced an undefined `df` and placeholder column
    names, so running the cell raised NameError. Wrapping it in a function keeps
    the template reusable without breaking a full run of the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns. It is not modified.
    columns_to_dummy : list of str
        Names of the columns to encode, e.g. ['nanana', 'nana', 'nananana'].

    Returns
    -------
    pandas.DataFrame
        New frame in which each listed column is replaced by its dummy columns;
        the first level of every encoded column is dropped (drop_first=True).
    """
    return pd.get_dummies(df, columns=columns_to_dummy, drop_first=True)

## 4. Splitting the dataset into test and train set ##

Now it is time to split the dataset into a training set and a test set.

In [10]:
# Classic hold-out split: random_state makes the split reproducible, and the
# test_size ratio is a value worth experimenting with.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

## 5. Feature scaling ##

Depending on the algorithm, it is best practice to put the features on the same scale. The classic way to do so is to remove the mean and scale to unit variance. <br>**NB**: The test set should only be transformed; it must not be used to fit the scaler.

In [11]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training set only...
sc_X = StandardScaler()
sc_X.fit(X_train)

# ...then apply that same fitted scaler to both splits.
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)