# 1. Data Preprocessing
## Basic imports

In [57]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image
import sys
sys.path.insert(1, '../..')
from imports.methods import plot_decision_regions

## 1. Identifying missing values in tabular data
- It is sometimes more convenient to preprocess data using a pandas DataFrame instead of a NumPy array (scikit-learn was originally developed to work with NumPy arrays only)
- We can read comma-separated values (CSV) formatted data into a pandas DataFrame
- StringIO allows us to read a string as if it were a regular CSV file on a hard drive

In [58]:
import pandas as pd
from io import StringIO
import sys

# Inline CSV sample: the second data row has no value for column C and the
# third data row has no value for column D.
csv_data = \
'''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# StringIO wraps the string in a file-like object so read_csv can parse it
# without touching the disk; the empty fields are read as NaN.
# (The former Python 2 `unicode` conversion was dropped: the rest of this
# notebook relies on scikit-learn features that are Python 3 only.)
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


## 2. Finding the number of missing values per column

In [59]:
# Flag every missing cell as True, then sum per column to count the NaNs.
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

## 3. Accessing the underlying numpy array of a DataFrame

In [60]:
# Expose the DataFrame contents as a plain NumPy array.
df.to_numpy()

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

# 2. Eliminating training examples or features with missing values
## 1. Remove rows that contain missing values

In [61]:
# Drop every row (index axis) that contains at least one NaN.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


## 2. Remove columns that contain missing values

In [62]:
# Drop every column that contains at least one NaN.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


## 3. Only drop rows where all columns are NaN

In [63]:
# how='all' removes a row only when every one of its values is NaN;
# no row in this frame qualifies, so all three rows are returned.
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


## 4. Drop rows that have fewer than 4 real values

In [64]:
# thresh=4 keeps only the rows holding at least 4 non-NaN values.
df.dropna(axis=0, thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


## 5. Only drop rows where NaN appear in specific columns (here: 'C')

In [65]:
# Only column 'C' is inspected for NaN; missing values elsewhere are ignored.
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


# 3. Imputing missing values
- We can use many interpolation techniques to estimate the missing values from other training examples in our dataset
## 1. Impute missing values via the column mean (using scikit SimpleImputer)

In [66]:
from sklearn.impute import SimpleImputer
import numpy as np

# Replace each NaN with the mean of its column: fit_transform learns the
# per-column means and applies them in a single step.
imr = SimpleImputer(strategy='mean', missing_values=np.nan)
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

## 2. Impute missing values via the column mean (using pandas' fillna method)

In [67]:
# df.mean() yields one mean per column; fillna aligns them by column name.
df.fillna(value=df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


# 4. Handling categorical data
## 1. Nominal and ordinal features
- We can define order on ordinal features (so they can be sorted and ordered), but nominal features do not imply any order at all

In [68]:
import pandas as pd

# Toy dataset with a color, a size, a price and a class label per row.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


## 2. Mapping ordinal features
- In most cases there is no convenient function to derive the correct order of the labels of an ordinal feature, so we have to define the mapping manually

In [69]:
# Hand-written ordinal encoding: a larger size maps to a larger integer.
size_mapping = dict(XL=3, L=2, M=1)

# Overwrite the string sizes with their integer ranks.
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


## 3. Inverse mapping ordinal features

In [70]:
# Swap keys and values so the integer ranks can be translated back to labels.
inv_size_mapping = {rank: label for label, rank in size_mapping.items()}
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

## 4.1. Encoding class labels
- Create a mapping dictionary to convert class labels from strings to integers

In [71]:
# Number the distinct class labels (np.unique returns them sorted) to get a
# string -> integer lookup table.
class_mapping = {
    label: code
    for code, label in enumerate(np.unique(df['classlabel']))
}
class_mapping

{'class1': 0, 'class2': 1}

In [72]:
# Replace the string class labels with their integer codes from class_mapping.
# NOTE: this overwrites the column, so re-running the cell without first
# restoring the string labels would look up integers that are not keys of
# class_mapping.
df['classlabel'] = df['classlabel'].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


## 4.2. Reversing class label mapping

In [73]:
# Invert the lookup table (integer -> string) and restore the original labels.
inv_class_mapping = {code: label for label, code in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


## 5.1. Encoding class labels using LabelEncoder

In [74]:
from sklearn.preprocessing import LabelEncoder

# Let scikit-learn's LabelEncoder learn the distinct class labels, then
# translate the label column into integer codes.
class_le = LabelEncoder().fit(df['classlabel'].values)
y = class_le.transform(df['classlabel'].values)
y

array([1, 0, 1])

## 5.2. Reversing class label mapping using LabelEncoder

In [75]:
# Map the integer codes in y back to the original string class labels.
class_le.inverse_transform(y)

array(['class2', 'class1', 'class2'], dtype=object)

# 5. Performing one-hot encoding on nominal features
- It might seem like encoding nominal features using enumeration without any concern about ordering is a good idea, since we do not care about the order
- The issue is that if we blindly assign different integers to different labels we will introduce an ordering, which will make a learning algorithm assume that, in this case, "green" is larger than "blue", etc.
- The results of the algorithm will not be optimal and might not be useful at all
## 1. Incorrect approach
- blue = 0
- green = 1
- red = 2

In [76]:
# Feature matrix as a NumPy array; column 0 holds the color strings.
X = df[['color', 'size', 'price']].values

# Overwrite the color column with integer codes. This imposes an artificial
# order on a nominal feature, which is why the approach is labelled incorrect.
color_le = LabelEncoder()
X[:, 0] = color_le.fit(X[:, 0]).transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

## 2. One-hot encoding approach
- The idea behind this approach is to introduce a new dummy feature for each unique value in the nominal feature column

In [77]:
from sklearn.preprocessing import OneHotEncoder

X = df[['color', 'size', 'price']].values

# OneHotEncoder expects 2-D input, so select the color column as an (n, 1)
# slice; toarray() turns the sparse result into a dense array.
color_ohe = OneHotEncoder()
color_ohe.fit_transform(X[:, [0]]).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

## 3. Transforming columns selectively using ColumnTransformer

In [78]:
from sklearn.compose import ColumnTransformer

X = df[['color', 'size', 'price']].values

# One-hot encode column 0 (color) and forward columns 1 and 2 (size, price)
# untouched.
c_transf = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), [0]),
        ('nothing', 'passthrough', [1, 2]),
    ]
)
c_transf.fit_transform(X).astype(float)

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

## 4. Creating dummy features using pd.get_dummies()

In [79]:
# get_dummies expands only the string column ('color'); 'price' and 'size'
# are left as they are. dtype=int pins the dummy columns to 0/1, because
# pandas >= 2.0 would otherwise return booleans (True/False).
pd.get_dummies(df[['price', 'color', 'size']], dtype=int)

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0


- Using one-hot encodings introduces multicollinearity (high correlation between features), which can be an issue for certain methods (ones requiring matrix inversion)
- Removing one of the newly created dummy features reduces this multicollinearity without any loss of information: the dropped category is implied whenever all remaining dummy features are 0

In [80]:
# drop_first=True omits the first dummy column of each encoded feature.
# dtype=int pins the remaining dummy columns to 0/1, because pandas >= 2.0
# would otherwise return booleans (True/False).
pd.get_dummies(df[['price', 'color', 'size']], drop_first=True, dtype=int)

Unnamed: 0,price,size,color_green,color_red
0,10.1,1,1,0
1,13.5,2,0,1
2,15.3,3,0,0


## 5. Removing the redundant column via OneHotEncoder

In [82]:
# drop='first' makes the encoder leave out the first one-hot column, which
# removes the redundant dummy feature.
color_ohe = OneHotEncoder(drop='first', categories='auto')

# Encode column 0 with the reduced one-hot scheme; pass columns 1 and 2 through.
c_transf = ColumnTransformer(
    transformers=[
        ('onehot', color_ohe, [0]),
        ('nothing', 'passthrough', [1, 2]),
    ]
)
c_transf.fit_transform(X).astype(float)

array([[ 1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  2. , 13.5],
       [ 0. ,  0. ,  3. , 15.3]])