# <font color='Orange'>Exploring Categorical Feature Encoding Techniques</font>

## <font color='green'>Importing Dataset</font> 

In [1]:
pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 18.0MB/s eta 0:00:01[K     |████████▏                       | 20kB 2.2MB/s eta 0:00:01[K     |████████████▏                   | 30kB 2.9MB/s eta 0:00:01[K     |████████████████▎               | 40kB 3.1MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 2.5MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 2.8MB/s eta 0:00:01[K     |████████████████████████████▍   | 71kB 3.1MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 2.6MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [2]:
import pandas as pd
import category_encoders as ce

  import pandas.util.testing as tm


In [3]:
df = pd.read_csv('train.csv')

print(f'Dimension Of Dataset: Row : {df.shape[0]}, Col : {df.shape[1]}')
print()
print(df.head(4))

Dimension Of Dataset: Row : 71301, Col : 25

   id  bin_0  bin_1  bin_2 bin_3 bin_4  ... ord_3 ord_4 ord_5  day month target
0   0      0      0      0     T     Y  ...     h     D    kr  2.0   2.0    0.0
1   1      0      1      0     T     Y  ...     a     A    bF  7.0   8.0    0.0
2   2      0      0      0     F     Y  ...     h     R    Jc  7.0   2.0    0.0
3   3      0      1      0     F     Y  ...     i     D    kW  2.0   1.0    1.0

[4 rows x 25 columns]


## <font color='green'>Basic Data Preprocessing</font>

In [4]:
# Checking Data Type Of Each Column
df.dtypes

id          int64
bin_0       int64
bin_1       int64
bin_2       int64
bin_3      object
bin_4      object
nom_0      object
nom_1      object
nom_2      object
nom_3      object
nom_4      object
nom_5      object
nom_6      object
nom_7      object
nom_8      object
nom_9      object
ord_0     float64
ord_1      object
ord_2      object
ord_3      object
ord_4      object
ord_5      object
day       float64
month     float64
target    float64
dtype: object

In [5]:
# Extracting All Categorical Columns into One DataFrame Using List Comprehension
cat_col = df[[x for x in df.columns if(df[x].dtypes != 'int64')]]
cat_col.shape[1]

21

In [6]:
cat_col.head(12)

Unnamed: 0,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,T,Y,Green,Triangle,Snake,Finland,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,2.0,Grandmaster,Cold,h,D,kr,2.0,2.0,0.0
1,T,Y,Green,Trapezoid,Hamster,Russia,Piano,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,1.0,Grandmaster,Hot,a,A,bF,7.0,8.0,0.0
2,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,1.0,Expert,Lava Hot,h,R,Jc,7.0,2.0,0.0
3,F,Y,Red,Trapezoid,Snake,Canada,Oboe,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71,1.0,Grandmaster,Boiling Hot,i,D,kW,2.0,1.0,1.0
4,F,N,Red,Trapezoid,Lion,Canada,Oboe,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7,1.0,Grandmaster,Freezing,a,R,qP,7.0,8.0,0.0
5,T,N,Blue,Polygon,Lion,Costa Rica,Oboe,46cab09da,29a854620,ff5b35098,b7e6f8e6f,51e27c16d,1.0,Novice,Freezing,j,E,PZ,2.0,2.0,0.0
6,T,N,Green,Trapezoid,Cat,China,Piano,be5592604,3393a0f78,c6587685d,06f5ae149,7e3d79a0d,2.0,Grandmaster,Lava Hot,g,P,wy,5.0,4.0,0.0
7,T,Y,Red,Triangle,Dog,Russia,Oboe,72f8028dc,55eed5058,2dd9daf45,98addc2c9,feb72ecc2,1.0,Novice,Lava Hot,j,K,Ed,4.0,2.0,0.0
8,T,Y,Blue,Square,Hamster,Canada,Bassoon,4.60E+13,3e44d44eb,3f0057c9b,a2d110837,34a7273bf,2.0,Novice,Boiling Hot,e,V,qo,3.0,4.0,0.0
9,F,Y,Red,Trapezoid,Lion,China,Piano,ad95dc0ee,8ed6221ae,4fbfe4a84,2c15d0173,0ece7a511,1.0,Expert,Freezing,h,Q,CZ,3.0,2.0,0.0


In [7]:
# Counting Number Of Unique Values in each Column, So rather than preparing a list, we can create a dictionary !
l = {}           
for x in cat_col.columns:
    l[x]=cat_col[x].nunique()
print(l)

{'bin_3': 2, 'bin_4': 2, 'nom_0': 3, 'nom_1': 7, 'nom_2': 6, 'nom_3': 6, 'nom_4': 4, 'nom_5': 222, 'nom_6': 521, 'nom_7': 1214, 'nom_8': 2196, 'nom_9': 11142, 'ord_0': 3, 'ord_1': 5, 'ord_2': 6, 'ord_3': 15, 'ord_4': 26, 'ord_5': 192, 'day': 7, 'month': 12, 'target': 2}


# <font color='Green'>I have Prepared the Data And Its Information For You All, Now You Need to apply different Encoding Technique On This Data.</font>

In [8]:
encoder= ce.OrdinalEncoder(cols=['nom_2'],return_df=True,                               ##  Ordinalencoding or Label Encoding
                           mapping=[{'col':'nom_2',                      
'mapping':{'None':0,'Snake':1,'Hamster':2,'Lion':3,'Cat':4,'Dog':5}}])

df

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,2.0,Grandmaster,Cold,h,D,kr,2.0,2.0,0.0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,1.0,Grandmaster,Hot,a,A,bF,7.0,8.0,0.0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,1.0,Expert,Lava Hot,h,R,Jc,7.0,2.0,0.0
3,3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71,1.0,Grandmaster,Boiling Hot,i,D,kW,2.0,1.0,1.0
4,4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7,1.0,Grandmaster,Freezing,a,R,qP,7.0,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71296,71296,0,1,1,T,Y,Red,Star,Cat,Canada,Piano,df51b96fc,e10910468,bc9fd0175,127af6795,b69c6d05f,2.0,Novice,Hot,j,F,aM,4.0,2.0,0.0
71297,71297,0,0,0,T,Y,Green,Trapezoid,Lion,Canada,Piano,06eeaf0aa,25d3a1087,6ded0aa57,c74b05c9a,aa1796b81,1.0,Novice,Freezing,l,R,SB,2.0,1.0,0.0
71298,71298,0,0,0,T,Y,Green,Trapezoid,Cat,India,Piano,6cd0619e0,a028c07af,2428baa29,c874a058e,cbb2e2721,2.0,Master,Freezing,a,J,Dx,1.0,11.0,0.0
71299,71299,0,0,0,T,Y,Blue,Trapezoid,Lion,Canada,Bassoon,3685a0904,fd2344f16,ad8263c35,03398d68f,f5cb73b1c,1.0,Grandmaster,Cold,i,K,dO,5.0,8.0,0.0


In [9]:
train = encoder.fit_transform(df)                 #fit and transform train data
train.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,1.0,Finland,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,2.0,Grandmaster,Cold,h,D,kr,2.0,2.0,0.0
1,1,0,1,0,T,Y,Green,Trapezoid,2.0,Russia,Piano,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,1.0,Grandmaster,Hot,a,A,bF,7.0,8.0,0.0
2,2,0,0,0,F,Y,Blue,Trapezoid,3.0,Russia,Theremin,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,1.0,Expert,Lava Hot,h,R,Jc,7.0,2.0,0.0
3,3,0,1,0,F,Y,Red,Trapezoid,1.0,Canada,Oboe,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71,1.0,Grandmaster,Boiling Hot,i,D,kW,2.0,1.0,1.0
4,4,0,0,0,F,N,Red,Trapezoid,3.0,Canada,Oboe,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7,1.0,Grandmaster,Freezing,a,R,qP,7.0,8.0,0.0


In [10]:
# One hot encoding or Nominal encoding
teju=ce.OneHotEncoder(cols='nom_2',handle_unknown='return_nan',return_df=True,use_cat_names=True)
train = encoder.fit_transform(df)
train


Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,1.0,Finland,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,2.0,Grandmaster,Cold,h,D,kr,2.0,2.0,0.0
1,1,0,1,0,T,Y,Green,Trapezoid,2.0,Russia,Piano,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,1.0,Grandmaster,Hot,a,A,bF,7.0,8.0,0.0
2,2,0,0,0,F,Y,Blue,Trapezoid,3.0,Russia,Theremin,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,1.0,Expert,Lava Hot,h,R,Jc,7.0,2.0,0.0
3,3,0,1,0,F,Y,Red,Trapezoid,1.0,Canada,Oboe,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71,1.0,Grandmaster,Boiling Hot,i,D,kW,2.0,1.0,1.0
4,4,0,0,0,F,N,Red,Trapezoid,3.0,Canada,Oboe,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7,1.0,Grandmaster,Freezing,a,R,qP,7.0,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71296,71296,0,1,1,T,Y,Red,Star,4.0,Canada,Piano,df51b96fc,e10910468,bc9fd0175,127af6795,b69c6d05f,2.0,Novice,Hot,j,F,aM,4.0,2.0,0.0
71297,71297,0,0,0,T,Y,Green,Trapezoid,3.0,Canada,Piano,06eeaf0aa,25d3a1087,6ded0aa57,c74b05c9a,aa1796b81,1.0,Novice,Freezing,l,R,SB,2.0,1.0,0.0
71298,71298,0,0,0,T,Y,Green,Trapezoid,4.0,India,Piano,6cd0619e0,a028c07af,2428baa29,c874a058e,cbb2e2721,2.0,Master,Freezing,a,J,Dx,1.0,11.0,0.0
71299,71299,0,0,0,T,Y,Blue,Trapezoid,3.0,Canada,Bassoon,3685a0904,fd2344f16,ad8263c35,03398d68f,f5cb73b1c,1.0,Grandmaster,Cold,i,K,dO,5.0,8.0,0.0


In [11]:
# Binary Encoder
encoder= ce.BinaryEncoder(cols=['nom_2'],return_df=True)
train = encoder.fit_transform(df) 
train

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2_0,nom_2_1,nom_2_2,nom_2_3,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,0,0,0,1,Finland,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,2.0,Grandmaster,Cold,h,D,kr,2.0,2.0,0.0
1,1,0,1,0,T,Y,Green,Trapezoid,0,0,1,0,Russia,Piano,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,1.0,Grandmaster,Hot,a,A,bF,7.0,8.0,0.0
2,2,0,0,0,F,Y,Blue,Trapezoid,0,0,1,1,Russia,Theremin,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,1.0,Expert,Lava Hot,h,R,Jc,7.0,2.0,0.0
3,3,0,1,0,F,Y,Red,Trapezoid,0,0,0,1,Canada,Oboe,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71,1.0,Grandmaster,Boiling Hot,i,D,kW,2.0,1.0,1.0
4,4,0,0,0,F,N,Red,Trapezoid,0,0,1,1,Canada,Oboe,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7,1.0,Grandmaster,Freezing,a,R,qP,7.0,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71296,71296,0,1,1,T,Y,Red,Star,0,1,0,0,Canada,Piano,df51b96fc,e10910468,bc9fd0175,127af6795,b69c6d05f,2.0,Novice,Hot,j,F,aM,4.0,2.0,0.0
71297,71297,0,0,0,T,Y,Green,Trapezoid,0,0,1,1,Canada,Piano,06eeaf0aa,25d3a1087,6ded0aa57,c74b05c9a,aa1796b81,1.0,Novice,Freezing,l,R,SB,2.0,1.0,0.0
71298,71298,0,0,0,T,Y,Green,Trapezoid,0,1,0,0,India,Piano,6cd0619e0,a028c07af,2428baa29,c874a058e,cbb2e2721,2.0,Master,Freezing,a,J,Dx,1.0,11.0,0.0
71299,71299,0,0,0,T,Y,Blue,Trapezoid,0,0,1,1,Canada,Bassoon,3685a0904,fd2344f16,ad8263c35,03398d68f,f5cb73b1c,1.0,Grandmaster,Cold,i,K,dO,5.0,8.0,0.0


In [12]:
train.columns

Index(['id', 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
       'nom_2_0', 'nom_2_1', 'nom_2_2', 'nom_2_3', 'nom_3', 'nom_4', 'nom_5',
       'nom_6', 'nom_7', 'nom_8', 'nom_9', 'ord_0', 'ord_1', 'ord_2', 'ord_3',
       'ord_4', 'ord_5', 'day', 'month', 'target'],
      dtype='object')

In [15]:
# Hash Encoding
encoder=ce.HashingEncoder(cols='nom_1',n_components=6)
train = encoder.fit_transform(df)
train


Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,0,1,0,0,0,0,T,Y,Green,Snake,Finland,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,2.0,Grandmaster,Cold,h,D,kr,2.0,2.0,0.0
1,0,0,0,0,1,0,1,0,1,0,T,Y,Green,Hamster,Russia,Piano,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,1.0,Grandmaster,Hot,a,A,bF,7.0,8.0,0.0
2,0,0,0,0,1,0,2,0,0,0,F,Y,Blue,Lion,Russia,Theremin,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,1.0,Expert,Lava Hot,h,R,Jc,7.0,2.0,0.0
3,0,0,0,0,1,0,3,0,1,0,F,Y,Red,Snake,Canada,Oboe,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71,1.0,Grandmaster,Boiling Hot,i,D,kW,2.0,1.0,1.0
4,0,0,0,0,1,0,4,0,0,0,F,N,Red,Lion,Canada,Oboe,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7,1.0,Grandmaster,Freezing,a,R,qP,7.0,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71296,1,0,0,0,0,0,71296,0,1,1,T,Y,Red,Cat,Canada,Piano,df51b96fc,e10910468,bc9fd0175,127af6795,b69c6d05f,2.0,Novice,Hot,j,F,aM,4.0,2.0,0.0
71297,0,0,0,0,1,0,71297,0,0,0,T,Y,Green,Lion,Canada,Piano,06eeaf0aa,25d3a1087,6ded0aa57,c74b05c9a,aa1796b81,1.0,Novice,Freezing,l,R,SB,2.0,1.0,0.0
71298,0,0,0,0,1,0,71298,0,0,0,T,Y,Green,Cat,India,Piano,6cd0619e0,a028c07af,2428baa29,c874a058e,cbb2e2721,2.0,Master,Freezing,a,J,Dx,1.0,11.0,0.0
71299,0,0,0,0,1,0,71299,0,0,0,T,Y,Blue,Lion,Canada,Bassoon,3685a0904,fd2344f16,ad8263c35,03398d68f,f5cb73b1c,1.0,Grandmaster,Cold,i,K,dO,5.0,8.0,0.0


In [16]:
# Base-N-Encoding
encoder= ce.BaseNEncoder(cols=['nom_3'],return_df=True,base=5)
train = encoder.fit_transform(df)
train

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3_0,nom_3_1,nom_3_2,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,0,0,1,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,2.0,Grandmaster,Cold,h,D,kr,2.0,2.0,0.0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,0,0,2,Piano,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,1.0,Grandmaster,Hot,a,A,bF,7.0,8.0,0.0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,0,0,2,Theremin,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,1.0,Expert,Lava Hot,h,R,Jc,7.0,2.0,0.0
3,3,0,1,0,F,Y,Red,Trapezoid,Snake,0,0,3,Oboe,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71,1.0,Grandmaster,Boiling Hot,i,D,kW,2.0,1.0,1.0
4,4,0,0,0,F,N,Red,Trapezoid,Lion,0,0,3,Oboe,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7,1.0,Grandmaster,Freezing,a,R,qP,7.0,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71296,71296,0,1,1,T,Y,Red,Star,Cat,0,0,3,Piano,df51b96fc,e10910468,bc9fd0175,127af6795,b69c6d05f,2.0,Novice,Hot,j,F,aM,4.0,2.0,0.0
71297,71297,0,0,0,T,Y,Green,Trapezoid,Lion,0,0,3,Piano,06eeaf0aa,25d3a1087,6ded0aa57,c74b05c9a,aa1796b81,1.0,Novice,Freezing,l,R,SB,2.0,1.0,0.0
71298,71298,0,0,0,T,Y,Green,Trapezoid,Cat,0,1,1,Piano,6cd0619e0,a028c07af,2428baa29,c874a058e,cbb2e2721,2.0,Master,Freezing,a,J,Dx,1.0,11.0,0.0
71299,71299,0,0,0,T,Y,Blue,Trapezoid,Lion,0,0,3,Bassoon,3685a0904,fd2344f16,ad8263c35,03398d68f,f5cb73b1c,1.0,Grandmaster,Cold,i,K,dO,5.0,8.0,0.0


In [23]:
#Target Encoding
data=pd.DataFrame({'ord_4':['A','D','R','F','J','K'],'day':[7,2,7,4,1,5]})
encoder=ce.TargetEncoder(cols='ord_4') 
train = encoder.fit_transform(data['ord_4'],data['day'])
train




Unnamed: 0,ord_4
0,4.333333
1,4.333333
2,4.333333
3,4.333333
4,4.333333
5,4.333333
