# Categorical Variable Encoding

## Label Encoder:

Label encoding is the simplest approach: every distinct category in a column is replaced by an integer code (in the example below, `Cloudy` → 0, `Rainy` → 1, `Sunny` → 2). 

#### `LabelEncoder()` from scikit-learn Package

In [1]:
# Optional, machine-specific step: change the working directory to the course folder,
# presumably so that the relative path 'data/play.csv' used below resolves.
# Left commented out because this absolute path is specific to one machine;
# adjust it locally (or start Jupyter from the folder that contains `data/`) instead.
# %cd "C:\\Users\\yasin.unlu\\Documents\\Original Docs\\Documents1\\Docs\\Teaching\\PythonForDataScienceSummer2020\\Week-8"

C:\Users\yasin.unlu\Documents\Original Docs\Documents1\Docs\Teaching\PythonForDataScienceSummer2020\Week-8


In [1]:
import pandas as pd

# Read the "play" data set; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='data/play.csv')
df

Unnamed: 0,Day,Weather,Temperature,Humidity,Wind,Play
0,1,Sunny,Hot,90,10,No
1,2,Cloudy,Hot,95,5,Yes
2,3,Sunny,Mild,70,30,Yes
3,4,Cloudy,Mild,89,25,Yes
4,5,Rainy,Mild,85,25,No
5,6,Rainy,Cool,60,30,No
6,7,Rainy,Mild,92,20,Yes
7,8,Sunny,Hot,95,20,No
8,9,Cloudy,Hot,65,12,Yes
9,10,Rainy,Mild,100,25,No


In [2]:
# 'Day' is dropped before modelling; `columns=` already names the axis,
# so no separate `axis` argument is needed.
df_clean = df.drop(columns=['Day'])
df_clean

Unnamed: 0,Weather,Temperature,Humidity,Wind,Play
0,Sunny,Hot,90,10,No
1,Cloudy,Hot,95,5,Yes
2,Sunny,Mild,70,30,Yes
3,Cloudy,Mild,89,25,Yes
4,Rainy,Mild,85,25,No
5,Rainy,Cool,60,30,No
6,Rainy,Mild,92,20,Yes
7,Sunny,Hot,95,20,No
8,Cloudy,Hot,65,12,Yes
9,Rainy,Mild,100,25,No


In [3]:
# Predictors: every column except the target.
features = df_clean.drop(columns='Play')
# Target: kept as a one-column DataFrame (list selector), not as a Series.
response = df_clean.loc[:, ['Play']]

In [5]:
# Labels of the numeric predictors (explicit integer and float widths).
num_cols = features.select_dtypes(
    include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
).columns
# Labels of the categorical predictors (strings, booleans and pandas categoricals).
cat_cols = features.select_dtypes(
    include=['object', 'bool', 'category']
).columns

In [6]:
# Convert the pandas Index of numeric column labels into a plain Python list.
# list(...) is used instead of .tolist() so the cell can be re-run safely:
# once num_cols is already a list, .tolist() would raise AttributeError.
num_cols = list(num_cols)
num_cols

['Humidity', 'Wind']

In [7]:
# Convert the pandas Index of categorical column labels into a plain Python list.
# list(...) is used instead of .tolist() so the cell can be re-run safely:
# once cat_cols is already a list, .tolist() would raise AttributeError.
cat_cols = list(cat_cols)
cat_cols

['Weather', 'Temperature']

In [8]:
# Encode the categorical predictors as integer codes.
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

# Make sure every categorical column has the 'category' dtype before encoding.
features[cat_cols] = features[cat_cols].astype('category')

# One LabelEncoder per column, created on first use and keyed by the column name.
# Keeping the fitted encoders (instead of a single throw-away instance that is
# re-fitted for every column) makes each column's `classes_` and
# `inverse_transform` available after this cell has run.
label_encoders = defaultdict(LabelEncoder)
features[cat_cols] = features[cat_cols].apply(
    lambda column: label_encoders[column.name].fit_transform(column)
)

# The data set is now ready for fitting.

In [9]:
# Show the predictors after label encoding: the columns in cat_cols now hold integer codes.
features

Unnamed: 0,Weather,Temperature,Humidity,Wind
0,2,1,90,10
1,0,1,95,5
2,2,2,70,30
3,0,2,89,25
4,1,2,85,25
5,1,0,60,30
6,1,2,92,20
7,2,1,95,20
8,0,1,65,12
9,1,2,100,25


## One-Hot Encoder:

### `get_dummies()` from Pandas Package

In [10]:
# Start again from the un-encoded predictors: the label-encoding step overwrote
# the categorical columns of `features` in place.
features = df_clean.drop(columns='Play')

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# One-hot encode only the 'Weather' column; the other columns pass through unchanged.
# The result is displayed but not stored, so `features` itself is not modified.
# dtype=int pins the indicator columns to 0/1 integers regardless of the pandas
# version's default indicator dtype (newer releases default to booleans).
pd.get_dummies(features, columns=["Weather"], dtype=int)

Unnamed: 0,Temperature,Humidity,Wind,Weather_Cloudy,Weather_Rainy,Weather_Sunny
0,Hot,90,10,0,0,1
1,Hot,95,5,1,0,0
2,Mild,70,30,0,0,1
3,Mild,89,25,1,0,0
4,Mild,85,25,0,1,0
5,Cool,60,30,0,1,0
6,Mild,92,20,0,1,0
7,Hot,95,20,0,0,1
8,Hot,65,12,1,0,0
9,Mild,100,25,0,1,0


In [13]:
# `features` is unchanged: the get_dummies result above was displayed, not assigned.
features

Unnamed: 0,Weather,Temperature,Humidity,Wind
0,Sunny,Hot,90,10
1,Cloudy,Hot,95,5
2,Sunny,Mild,70,30
3,Cloudy,Mild,89,25
4,Rainy,Mild,85,25
5,Rainy,Cool,60,30
6,Rainy,Mild,92,20
7,Sunny,Hot,95,20
8,Cloudy,Hot,65,12
9,Rainy,Mild,100,25


In [14]:
# One-hot encode every non-numeric column of `features` (here 'Weather' and 'Temperature');
# the numeric columns are kept as they are.
# dtype=int pins the indicator columns to 0/1 integers regardless of the pandas
# version's default indicator dtype (newer releases default to booleans).
features_encoded = pd.get_dummies(features, dtype=int)
features_encoded

Unnamed: 0,Humidity,Wind,Weather_Cloudy,Weather_Rainy,Weather_Sunny,Temperature_Cool,Temperature_Hot,Temperature_Mild
0,90,10,0,0,1,0,1,0
1,95,5,1,0,0,0,1,0
2,70,30,0,0,1,0,0,1
3,89,25,1,0,0,0,0,1
4,85,25,0,1,0,0,0,1
5,60,30,0,1,0,1,0,0
6,92,20,0,1,0,0,0,1
7,95,20,0,0,1,0,1,0
8,65,12,1,0,0,0,1,0
9,100,25,0,1,0,0,0,1


In [15]:
# Hold out 20% of the rows, fit a random forest on the one-hot encoded predictors
# and report the accuracy on the held-out rows.
from sklearn.model_selection import train_test_split
my_result_list = train_test_split(features_encoded, response, test_size=0.20, random_state=0)
features_train, features_test, response_train, response_test = my_result_list

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state = 0)
# `response_train` is a one-column DataFrame; scikit-learn expects a 1-D target and
# emits a DataConversionWarning for a column vector, so flatten it explicitly.
classifier.fit(features_train, response_train.values.ravel())

response_pred = classifier.predict(features_test)
from sklearn.metrics import accuracy_score
print('Accuracy Score on test data: ', accuracy_score(y_true=response_test, y_pred=response_pred))

Accuracy Score on test data:  0.5


  import sys


In [17]:
# Held-out target rows produced by the train/test split above (20% of the data).
response_test

Unnamed: 0,Play
2,Yes
8,Yes


In [166]:
# Hold out 20% of the rows, fit a random forest on the label-encoded predictors
# and report the accuracy on the held-out rows.
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# `features` was reset to the raw string columns before the one-hot section, and a
# random forest cannot be fitted on strings. Build the label-encoded copy here so the
# cell does not depend on the order in which earlier cells were executed.
features_label_encoded = features.copy()
features_label_encoded[cat_cols] = features_label_encoded[cat_cols].apply(LabelEncoder().fit_transform)

my_result_list = train_test_split(features_label_encoded, response, test_size=0.20, random_state=0)
features_train, features_test, response_train, response_test = my_result_list

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state = 0)
# `response_train` is a one-column DataFrame; scikit-learn expects a 1-D target and
# emits a DataConversionWarning for a column vector, so flatten it explicitly.
classifier.fit(features_train, response_train.values.ravel())

response_pred = classifier.predict(features_test)
from sklearn.metrics import accuracy_score
print('Accuracy Score on test data: ', accuracy_score(y_true=response_test, y_pred=response_pred))

Accuracy Score on test data:  0.5


  import sys


In [168]:
# (column name, importance) pairs; `features` has one column per importance value
# when `classifier` was fitted on the four label-encoded predictors.
[
    (column_name, importance)
    for column_name, importance in zip(features.columns, classifier.feature_importances_)
]

[('Weather', 0.42362082928409467),
 ('Temperature', 0.09588597343699383),
 ('Humidity', 0.22124473598963398),
 ('Wind', 0.25924846128927753)]

In [134]:
# Pair every one-hot column with its importance.
# `classifier` is re-bound by several cells and fitted on inputs with different numbers
# of columns, so zipping it with `features_encoded.columns` only lines up if the one-hot
# fit happened to run last (zip silently truncates otherwise). Fit a dedicated model on
# the one-hot features here, using the same split and seed as the other fits.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

onehot_train, onehot_test, onehot_response_train, onehot_response_test = train_test_split(
    features_encoded, response, test_size=0.20, random_state=0
)
onehot_classifier = RandomForestClassifier(random_state=0)
# Flatten the one-column target frame to the 1-D shape scikit-learn expects.
onehot_classifier.fit(onehot_train, onehot_response_train.values.ravel())

list(zip(features_encoded.columns, onehot_classifier.feature_importances_))

[('Humidity', 0.20906624554583736),
 ('Wind', 0.27123582766439913),
 ('Weather_Cloudy', 0.2489480077745383),
 ('Weather_Rainy', 0.05151441528992551),
 ('Weather_Sunny', 0.08648537646351061),
 ('Temperature_Cool', 0.02476745800360962),
 ('Temperature_Hot', 0.05456916099773245),
 ('Temperature_Mild', 0.05341350826044706)]

In [133]:
# Raw importance array of whichever model `classifier` currently refers to.
# NOTE(review): the recorded output has 8 entries, i.e. it came from a model fitted on
# the one-hot features, while the nearest preceding fit in notebook order uses the
# 4-column `features` frame — confirm which model is meant and re-run in order.
classifier.feature_importances_

array([0.20906625, 0.27123583, 0.24894801, 0.05151442, 0.08648538,
       0.02476746, 0.05456916, 0.05341351])

### `DictVectorizer()` from Scikit-Learn Package

When the data comes as a list of dictionaries, we can use Scikit-Learn's `DictVectorizer`.

In [135]:
# Toy housing records: one dictionary per listing, all with the same three keys.
data = [
    dict(price=850000, rooms=4, neighborhood='Queen Anne'),
    dict(price=700000, rooms=3, neighborhood='Fremont'),
    dict(price=650000, rooms=3, neighborhood='Wallingford'),
    dict(price=600000, rooms=2, neighborhood='Fremont'),
]

In [137]:
# View the list of dictionaries as a table: one row per dictionary, one column per key.
pd.DataFrame(data)

Unnamed: 0,price,rooms,neighborhood
0,850000,4,Queen Anne
1,700000,3,Fremont
2,650000,3,Wallingford
3,600000,2,Fremont


In [136]:
from sklearn.feature_extraction import DictVectorizer

# sparse=False returns a dense NumPy array; dtype=int stores the entries as integers.
vec = DictVectorizer(dtype=int, sparse=False)
# Learn the feature layout from `data` and encode it in one step.
vec.fit_transform(X=data)

array([[     0,      1,      0, 850000,      4],
       [     1,      0,      0, 700000,      3],
       [     0,      0,      1, 650000,      3],
       [     1,      0,      0, 600000,      2]])

In [138]:
# Column labels of the encoded array, in column order; string-valued keys appear as 'key=value'.
vec.feature_names_

['neighborhood=Fremont',
 'neighborhood=Queen Anne',
 'neighborhood=Wallingford',
 'price',
 'rooms']

In [140]:
# Rebuild the un-encoded predictors (everything except the 'Play' target) for the
# DictVectorizer example.
features = df_clean.drop(columns='Play')
features

Unnamed: 0,Weather,Temperature,Humidity,Wind
0,Sunny,Hot,90,10
1,Cloudy,Hot,95,5
2,Sunny,Mild,70,30
3,Cloudy,Mild,89,25
4,Rainy,Mild,85,25
5,Rainy,Cool,60,30
6,Rainy,Mild,92,20
7,Sunny,Hot,95,20
8,Cloudy,Hot,65,12
9,Rainy,Mild,100,25


In [151]:
from sklearn.feature_extraction import DictVectorizer

# DictVectorizer expects one dictionary per sample, so turn every row of the
# DataFrame into a {column: value} dictionary.
features_list = features.to_dict(orient='records')

# String-valued columns are one-hot encoded; numeric columns keep their values.
vec = DictVectorizer(sparse=False, dtype=int)
# The result is a NumPy array whose columns follow vec.feature_names_.
features_encoded = vec.fit_transform(features_list)
features_encoded

array([[ 90,   0,   1,   0,   0,   0,   1,  10],
       [ 95,   0,   1,   0,   1,   0,   0,   5],
       [ 70,   0,   0,   1,   0,   0,   1,  30],
       [ 89,   0,   0,   1,   1,   0,   0,  25],
       [ 85,   0,   0,   1,   0,   1,   0,  25],
       [ 60,   1,   0,   0,   0,   1,   0,  30],
       [ 92,   0,   0,   1,   0,   1,   0,  20],
       [ 95,   0,   1,   0,   0,   0,   1,  20],
       [ 65,   0,   1,   0,   1,   0,   0,  12],
       [100,   0,   0,   1,   0,   1,   0,  25]])

In [156]:
# Hold out 20% of the rows, fit a random forest on the DictVectorizer-encoded matrix
# and report the accuracy on the held-out rows.
from sklearn.model_selection import train_test_split
my_result_list = train_test_split(features_encoded, response, test_size=0.20, random_state=0)
features_train, features_test, response_train, response_test = my_result_list

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state = 0)
# `response_train` is a one-column DataFrame; scikit-learn expects a 1-D target and
# emits a DataConversionWarning for a column vector, so flatten it explicitly.
classifier.fit(features_train, response_train.values.ravel())

response_pred = classifier.predict(features_test)
from sklearn.metrics import accuracy_score
print('Accuracy Score on test data: ', accuracy_score(y_true=response_test, y_pred=response_pred))

Accuracy Score on test data:  0.5


  import sys


In [157]:
# Importances of the forest fitted in the previous cell: one value per column of
# the `features_encoded` matrix it was trained on.
classifier.feature_importances_

array([0.20906625, 0.27123583, 0.24894801, 0.05151442, 0.08648538,
       0.02476746, 0.05456916, 0.05341351])

In [158]:
# Column labels of the DictVectorizer output, in column order. Note that the order
# differs from the get_dummies column order used earlier in the notebook.
vec.feature_names_

['Humidity',
 'Temperature=Cool',
 'Temperature=Hot',
 'Temperature=Mild',
 'Weather=Cloudy',
 'Weather=Rainy',
 'Weather=Sunny',
 'Wind']

In [160]:
# (feature name, importance) pairs for the DictVectorizer columns, in vec.feature_names_ order.
# NOTE(review): the recorded output repeats, position for position, the values listed for
# the get_dummies columns earlier in the notebook although the column order differs —
# it looks stale; re-run the notebook top to bottom to confirm the pairing.
[
    (feature_name, importance)
    for feature_name, importance in zip(vec.feature_names_, classifier.feature_importances_)
]

[('Humidity', 0.20906624554583736),
 ('Temperature=Cool', 0.27123582766439913),
 ('Temperature=Hot', 0.2489480077745383),
 ('Temperature=Mild', 0.05151441528992551),
 ('Weather=Cloudy', 0.08648537646351061),
 ('Weather=Rainy', 0.02476745800360962),
 ('Weather=Sunny', 0.05456916099773245),
 ('Wind', 0.05341350826044706)]

## Exercise: Evaluating Multiple Models using Adult Data Set

See the notebook `Week9_EvaluatingMultipleModels-Copy1.ipynb`.