ONE HOT ENCODING

One hot encoding is a technique that we use to represent categorical variables as numerical values in a machine learning model.

In [1]:
import pandas as pd
township_df = pd.read_excel("ML_Algo_Datasets/TownShip.xlsx")
township_df

Unnamed: 0,township,area,price
0,monroe town,2600,550000
1,monroe town,3000,565000
2,monroe town,3200,610000
3,monroe town,3600,680000
4,monroe town,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robbinsville,2600,575000


In [2]:
# creat dummies of each township column
township_df_dummis  = pd.get_dummies(township_df.township)
township_df_dummis

Unnamed: 0,monroe town,robbinsville,west windsor
0,True,False,False
1,True,False,False
2,True,False,False
3,True,False,False
4,True,False,False
5,False,False,True
6,False,False,True
7,False,False,True
8,False,False,True
9,False,True,False


In [3]:
merged_township_df = pd.concat([township_df,township_df_dummis], axis="columns")
merged_township_df

Unnamed: 0,township,area,price,monroe town,robbinsville,west windsor
0,monroe town,2600,550000,True,False,False
1,monroe town,3000,565000,True,False,False
2,monroe town,3200,610000,True,False,False
3,monroe town,3600,680000,True,False,False
4,monroe town,4000,725000,True,False,False
5,west windsor,2600,585000,False,False,True
6,west windsor,2800,615000,False,False,True
7,west windsor,3300,650000,False,False,True
8,west windsor,3600,710000,False,False,True
9,robbinsville,2600,575000,False,True,False


Dummy Variable Trap

Let’s consider the case of gender having two values male (0 or 1) and female (1 or 0). Including both the dummy variable can cause redundancy because if a person is not male in such case that person is a female, hence, we don’t need to use both the variables in regression models. This will protect us from the dummy variable trap.

In [4]:
# therefore we always remove any one colums from dummies before training the model

X = merged_township_df.drop(["township", "price", "west windsor"], axis="columns")
X

Unnamed: 0,area,monroe town,robbinsville
0,2600,True,False
1,3000,True,False
2,3200,True,False
3,3600,True,False
4,4000,True,False
5,2600,False,False
6,2800,False,False
7,3300,False,False
8,3600,False,False
9,2600,False,True


In [5]:
Y = merged_township_df.price
Y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    575000
Name: price, dtype: int64

In [6]:
from sklearn.linear_model import LinearRegression
Model = LinearRegression()

Model.fit(X,Y)

In [7]:
# prdicting price from trained model
Model.predict([[3400, True, False]]) # (house with area-->3400 in township-->monroe town)



array([637821.97376908])

In [8]:
Model.predict([[2800, False, True]]) # (house with area-->2800 in township-->robbinsville)



array([567870.88798108])

In [9]:
Model.predict([[3000, False, False]]) # (house with area-->2800 in township-->west windsor)



array([632611.26639432])

In [10]:
# to check accuracy of your model 
Model.score(X,Y)

0.7329348930157672

you can also use OneHotEncoder directly fron sk learn library

In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [12]:
township_df_le = township_df
township_df_le.township = le.fit_transform(township_df_le.township)
township_df_le

Unnamed: 0,township,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [13]:
X = township_df_le[["township", "area"]].values
X # -> 2D array

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]], dtype=int64)

In [14]:
Y = township_df_le.price
Y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    575000
Name: price, dtype: int64

In [15]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

In [16]:
X = ohe.fit_transform(X).toarray()
X

array([[1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]])