# 1. Data Preprocessing
## Basic imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image
import sys
# Add the directory two levels above the working directory to the module
# search path (at position 1, right after the first entry) -- presumably
# where the project-local `imports` package lives; verify against the repo layout.
sys.path.insert(1, '../..')
from imports.methods import plot_decision_regions

## 1. Identifying missing values in tabular data
- It is sometimes more convenient to preprocess data using a pandas DataFrame instead of a NumPy array (scikit-learn was originally developed to work with NumPy arrays only)
- We can read comma-separated values (CSV) formatted data into a pandas DataFrame
- StringIO allows us to read a string as if it were a regular CSV file on a hard drive

In [1]:
import pandas as pd
from io import StringIO
import sys

# Small in-memory CSV table: one header line and three data rows.
# The second data row has an empty field for column C and the third
# has an empty trailing field for column D.
csv_data = '\n'.join([
    'A,B,C,D',
    '1.0,2.0,3.0,4.0',
    '5.0,6.0,,8.0',
    '10.0,11.0,12.0,',
])

# On Python 2.7 the string has to be converted to unicode first;
# this branch is never taken on Python 3.
if sys.version_info.major < 3:
    csv_data = unicode(csv_data)

# Wrap the string in a file-like buffer so read_csv can parse it
# exactly as it would parse a CSV file on disk.
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


### 1. Finding the number of missing values per column

In [2]:
# isnull() yields a boolean mask of missing cells; summing down the rows
# (axis=0) counts the missing entries in each column.
df.isnull().sum(axis=0)

A    0
B    0
C    1
D    1
dtype: int64

### 2. Accessing the underlying numpy array of a DataFrame

In [3]:
# Expose the DataFrame's contents as a plain NumPy array.
df.to_numpy()

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

## 2. Eliminating training examples or features with missing values

### 1. Remove rows that contain missing values

In [5]:
# Drop every row (index entry) that holds at least one NaN; df itself is not modified.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


### 2. Remove columns that contain missing values

In [4]:
# Drop every column that holds at least one NaN; df itself is not modified.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


### 3. Only drop rows where all columns are NaN

In [6]:
# how='all' removes a row only when every one of its values is NaN.
df.dropna(axis='index', how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


### 4. Drop rows that have fewer than 4 real values

In [7]:
# thresh=4 keeps a row only if it has at least 4 non-NaN values.
df.dropna(axis='index', thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


### 5. Only drop rows where NaN appear in specific columns (here: 'C')

In [8]:
# Only column 'C' is inspected: rows with NaN there are dropped,
# while NaN values in other columns are left alone.
df.dropna(axis='index', subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


## 3. Imputing missing values
- We can use many interpolation techniques to estimate the missing values from other training examples in our dataset
### 1. Impute missing values via the column mean (using scikit-learn's SimpleImputer)

In [9]:
from sklearn.impute import SimpleImputer
import numpy as np

# Configure an imputer that substitutes each NaN with the mean of its column.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')

# fit_transform learns the per-column means from the data and applies them
# in a single call; imr is left fitted afterwards.
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

### 2. Impute missing values via the column mean (using pandas' fillna method)

In [10]:
# df.mean(axis=0) gives one mean per column (NaN entries are skipped);
# fillna then substitutes each column's mean for its missing values.
df.fillna(value=df.mean(axis=0))

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


## 4. Handling categorical data
### 1. Nominal and ordinal features
- We can define order on ordinal features (so they can be sorted and ordered), but nominal features do not imply any order at all

In [11]:
import pandas as pd

# Toy dataset mixing a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a class label. Column names are supplied
# directly to the constructor.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2
