# Elements of Data Science Part 3: Preprocessing & Feature Engineering
[Course link](https://www.aws.training/Details/eLearning?id=26598)

# Encoding categoricals

In [1]:
# Sample data set: five property listings mixing text and numeric columns
import pandas as pd

df = pd.DataFrame(
    data=[
        ('apt', 3, 2572, 'S', 1372000, 'Y'),
        ('house', 2, 1386, 'N', 699000, 'N'),
        ('house', 3, 1932, 'L', 800000, 'N'),
        ('house', 1, 851, 'M', 451000, 'Y'),
        ('apt', 1, 600, 'N', 325000, 'N'),
    ],
    # Name the columns at construction time instead of patching them on afterwards
    columns=['type', 'bedrooms', 'area', 'garden_size', 'price', 'loan_approved'],
)
df

Unnamed: 0,type,bedrooms,area,garden_size,price,loan_approved
0,apt,3,2572,S,1372000,Y
1,house,2,1386,N,699000,N
2,house,3,1932,L,800000,N
3,house,1,851,M,451000,Y
4,apt,1,600,N,325000,N


In [2]:
# Manual mapping: replace each garden-size letter with a hand-picked number.
# A plain `.map(mapping)` turns every value that is not a key of `mapping`
# into NaN, so re-running this cell (when the column already holds the numeric
# codes) would wipe the column. Falling back to the current value keeps the
# cell safe to run more than once; note that labels missing from `mapping`
# are therefore passed through unchanged rather than becoming NaN.
mapping = {'N': 0, 'S': 5, 'M': 10, 'L': 20}
df['garden_size'] = df['garden_size'].map(lambda size: mapping.get(size, size))
df

Unnamed: 0,type,bedrooms,area,garden_size,price,loan_approved
0,apt,3,2572,5,1372000,Y
1,house,2,1386,0,699000,N
2,house,3,1932,20,800000,N
3,house,1,851,10,451000,Y
4,apt,1,600,0,325000,N


## Encoding ordinals
**Use sklearn's `LabelEncoder` to encode either:**
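1. ordinals (ordered or have some relationship) — note that `LabelEncoder` assigns integers in sorted label order, so check that this matches the intended ranking (otherwise use a manual mapping as above, or `OrdinalEncoder` with an explicit `categories=` order) — or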
1. target variables (since model will not incorrectly learn a relationship between them)

Supports binary and multi-value classes.

In [3]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct loan_approved labels first, then convert the column
# into the matching integer class ids.
loan_enc = LabelEncoder().fit(df['loan_approved'])
target = loan_enc.transform(df['loan_approved'])
target

array([1, 0, 0, 1, 0])

## Encoding nominals (no relationship between values)
Two steps typically needed:
1. convert classes to integers (`LabelEncoder` can safely be used here as an intermediate step)
1. one-hot encode

**`LabelBinarizer()` is a shortcut combining these two steps.** The cell below performs the two steps explicitly:

In [4]:
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({'fruits': ['apple', 'banana', 'banana', 'mango', 'banana']})

# Step 1: turn each fruit name into an integer class id
le = LabelEncoder().fit(df['fruits'])
num_type = le.transform(df['fruits'])
print(num_type)

# Step 2: one-hot encode the ids, reshaped into a single column first
num_typeT = num_type.reshape(-1, 1)
ohe = OneHotEncoder().fit(num_typeT)
print('Integer categories:', ohe.categories_)

# Note: transform() returns a scipy.sparse.csr.csr_matrix, so densify it to print
ohe_cols = ohe.transform(num_typeT)
print(ohe_cols.toarray())

[0 1 1 2 1]
Integer categories: [array([0, 1, 2])]
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


In [5]:
# Alternatively, fit and encode in one call with fit_transform(),
# reusing the column-shaped ids prepared for the encoder earlier.
ohe_cols = ohe.fit_transform(num_typeT)
print(ohe_cols.toarray())

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


In [6]:
# Even easier, use pandas get_dummies() !!
# dtype=int pins the indicator columns to 0/1: from pandas 2.0 onward the
# default dummy dtype is bool, which would display True/False instead.
pd.get_dummies(df, dtype=int)

Unnamed: 0,fruits_apple,fruits_banana,fruits_mango
0,1,0,0
1,0,1,0
2,0,1,0
3,0,0,1
4,0,1,0


## Handling Missing Values

In [7]:
# Add a numeric column in which some entries are missing
df['quantity'] = pd.Series([5, None, 3, None, 1], index=df.index)
df

Unnamed: 0,fruits,quantity
0,apple,5.0
1,banana,
2,banana,3.0
3,mango,
4,banana,1.0


In [8]:
# Flag every cell that holds a missing value (isna is the same check as isnull)
df.isna()

Unnamed: 0,fruits,quantity
0,False,False
1,False,True
2,False,False
3,False,True
4,False,False


In [9]:
# Missing-value count per COLUMN (summing down the rows)
df.isna().sum(axis='index')

fruits      0
quantity    2
dtype: int64

In [10]:
# Missing-value count per ROW (summing across the columns)
df.isna().sum(axis='columns')

0    0
1    1
2    0
3    1
4    0
dtype: int64

In [11]:
# Keep only the ROWS that have no missing values at all
df.dropna(axis='index', how='any')

Unnamed: 0,fruits,quantity
0,apple,5.0
2,banana,3.0
4,banana,1.0


In [12]:
# Keep only the COLUMNS that have no missing values at all
df.dropna(axis='columns', how='any')

Unnamed: 0,fruits
0,apple
1,banana
2,banana
3,mango
4,banana


## Imputing

In [13]:
from sklearn.impute import SimpleImputer
import numpy as np

# Mark the gaps with np.nan rather than None: None makes NumPy build an
# object-dtype array, whereas np.nan keeps the data float64 and is the
# placeholder SimpleImputer searches for by default (missing_values=np.nan).
ex_vals = np.array([
    [5, 3, 2, 2],
    [3, np.nan, 1, 9],
    [5, 2, 7, np.nan],
])

# Fill each missing entry with the mean of the observed values in its column
imp = SimpleImputer(strategy='mean')
imp.fit_transform(ex_vals)

array([[5. , 3. , 2. , 2. ],
       [3. , 2.5, 1. , 9. ],
       [5. , 2. , 7. , 5.5]])

In [4]:
# More advanced methods:
# Note: the python fancyimpute library's iterative imputation was merged into
# scikit-learn, where it is available as IterativeImputer.
import sklearn as skl
print(skl.__version__)

# IterativeImputer is flagged experimental: scikit-learn only exposes it once
# the enabling module below has been imported, so a bare
# `from sklearn.impute import IterativeImputer` fails without it.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

0.22.2.post1
