# Exploring categorical encodings

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Getting data from Kaggle using the API

1. pip install kaggle 
2. read https://www.kaggle.com/docs/api
3. accept the rules at https://www.kaggle.com/c/cat-in-the-dat/data
4. kaggle competitions download -c cat-in-the-dat
5. unzip cat-in-the-dat.zip 

In [36]:
# Load the competition training set; expects `train.csv` (extracted from
# cat-in-the-dat.zip) in the current working directory.
df = pd.read_csv("train.csv")

In [37]:
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


"The data contains binary features (bin_*), nominal features (nom_*), ordinal features (ord_*) as well as (potentially cyclical) day (of the week) and month features. The string ordinal features ord_{3-5} are lexically ordered according to string.ascii_letters."

In [38]:
df.shape

(300000, 25)

### EDA

In [39]:
# Separate the features from the label, then hold out 20% of the rows for
# validation. The fixed random_state keeps the split reproducible.
X = df.drop(columns=["target"])
Y = df["target"].values
x_train, x_val, y_train, y_val = train_test_split(
    X, Y, test_size=0.2, random_state=3
)

In [40]:
x_train.shape

(240000, 24)

In [41]:
x_train.describe()

Unnamed: 0,id,bin_0,bin_1,bin_2,ord_0,day,month
count,240000.0,240000.0,240000.0,240000.0,240000.0,240000.0,240000.0
mean,150067.711688,0.127337,0.256221,0.383258,1.479213,3.005562,5.77185
std,86601.303141,0.333351,0.436546,0.486181,0.712299,1.81638,3.843278
min,1.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,75061.5,0.0,0.0,0.0,1.0,2.0,2.0
50%,150142.5,0.0,0.0,0.0,1.0,3.0,4.0
75%,225034.25,0.0,1.0,1.0,2.0,4.0,9.0
max,299998.0,1.0,1.0,1.0,3.0,7.0,12.0


`id` is not useful for prediction

In [42]:
# `id` is only a row identifier, so remove it from both splits.
# `errors="ignore"` keeps this cell re-runnable: the frames are reassigned to
# themselves, so without it a second execution raises KeyError once `id` is
# already gone.
x_train = x_train.drop(columns=["id"], errors="ignore")
x_val = x_val.drop(columns=["id"], errors="ignore")
x_train.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
247601,0,0,1,F,Y,Blue,Triangle,Snake,Russia,Oboe,...,4a54418e6,c15b9aff6,1,Grandmaster,Lava Hot,l,W,RG,2,1
225468,0,1,0,T,Y,Blue,Square,Hamster,Canada,Bassoon,...,7b082c8a0,5a2461910,1,Novice,Cold,i,Z,Fd,3,9
276268,0,0,0,F,N,Blue,Triangle,Lion,China,Oboe,...,1c6ab243e,f106759a1,2,Master,Freezing,f,Z,jS,2,4
148600,0,0,0,T,Y,Blue,Circle,Snake,Russia,Oboe,...,714157447,4cbc1c1d1,2,Contributor,Lava Hot,l,E,qX,2,8
221915,0,0,0,T,Y,Blue,Square,Hamster,India,Piano,...,fe6b03253,4c59ba502,1,Novice,Boiling Hot,e,S,zU,1,9


`bin_0` has a 75% quantile of 0. Let's look more closely.

In [46]:
# Upper-tail quantiles of bin_0, to see where its values switch from 0 to 1.
np.quantile(x_train["bin_0"].values, q=[0.75, 0.80, 0.85, 0.88, 0.90, 0.99])

array([0., 0., 0., 1., 1., 1.])

In [49]:
# Share of training rows taken by each bin_0 value.
x_train["bin_0"].value_counts() / len(x_train)

0    0.872663
1    0.127337
Name: bin_0, dtype: float64

Approximately 87% of the `bin_0` values are 0. I will keep the feature for now, but it may not be very useful. Let's now look at the target.

In [53]:
# Distinct target labels in the training split and how many rows carry each.
unique, counts = np.unique(y_train, return_counts=True)
unique, counts

(array([0, 1]), array([166519,  73481]))

In [54]:
# Turn the per-class counts into proportions of the training split.
counts/counts.sum()

array([0.69382917, 0.30617083])

About 30% of the targets are 1s. That is a slight imbalance, but not a big deal. We will use AUC as the metric instead of accuracy.

Now let's look at how many unique values each feature has.

In [85]:
# Number of distinct values in each training column.
{name: len(values.unique()) for name, values in x_train.items()}

{'bin_0': 2,
 'bin_1': 2,
 'bin_2': 2,
 'bin_3': 2,
 'bin_4': 2,
 'nom_0': 3,
 'nom_1': 6,
 'nom_2': 6,
 'nom_3': 6,
 'nom_4': 4,
 'nom_5': 222,
 'nom_6': 522,
 'nom_7': 1220,
 'nom_8': 2214,
 'nom_9': 11916,
 'ord_0': 3,
 'ord_1': 5,
 'ord_2': 6,
 'ord_3': 15,
 'ord_4': 26,
 'ord_5': 192,
 'day': 7,
 'month': 12}

Some of these cardinalities are large. That is not a big deal for label encoding, but it would be for one-hot encoding.

### Solution #1: 
Encode categorical features with label encoding and use a tree based model

In [81]:
from sklearn.preprocessing import LabelEncoder
def label_encoding_with_UNK(col_train, UNK=True):
    """Fit a LabelEncoder on the distinct values of a training column.

    Parameters
    ----------
    col_train : array-like
        Values of one column from the training split.
    UNK : bool, default True
        When true, the literal string "UNK" is added to the fitted classes
        so that it has an integer code of its own.

    Returns
    -------
    LabelEncoder
        The fitted encoder. Its classes are sorted, so "UNK" is not
        necessarily assigned code 0.
    """
    classes = np.unique(col_train)
    if UNK:
        # Reserve a label for the "UNK" placeholder alongside the seen values.
        classes = np.concatenate((np.array(["UNK"]), classes))
    return LabelEncoder().fit(classes)

In [77]:
le = label_encoding_with_UNK(x_train.nom_0.values)

In [78]:
le.classes_

array(['Blue', 'Green', 'Red', 'UNK'], dtype=object)

In [79]:
le.transform(le.classes_)

array([0, 1, 2, 3])

In [80]:
le.transform(x_train.nom_0.values)

array([0, 0, 0, ..., 1, 1, 2])

In [91]:
# Boolean mask over the columns: True where the column is already numeric,
# False where it still holds non-numeric values.
# The dtype check covers every row (rather than sampling the first 20) and
# avoids the deprecated element-wise `DataFrame.applymap`.
num_cols = x_train.dtypes.apply(pd.api.types.is_numeric_dtype)

In [92]:
num_cols

bin_0     True
bin_1     True
bin_2     True
bin_3    False
bin_4    False
nom_0    False
nom_1    False
nom_2    False
nom_3    False
nom_4    False
nom_5    False
nom_6    False
nom_7    False
nom_8    False
nom_9    False
ord_0     True
ord_1    False
ord_2    False
ord_3    False
ord_4    False
ord_5    False
day       True
month     True
dtype: bool