In [1]:
import functools

import numpy as np
import pandas as pd

In [2]:
# Shell escape: print the first lines of the raw training CSV to inspect the header and sample rows.
!head train.csv

id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0
5,0,1,1,T,N,Blue,Polygon,Lion,Costa Rica,Oboe,46cab09da,29a854620,ff5b35098,b7e6f8e6f,51e27c16d,1,Novice,Freezing,j,E,PZ,2,2,0
6,0,1,1,T,N,Green,Trapezoid,Cat,China,Piano,be5592604,3393a0f78,c6587685d,0

In [3]:
# Read the raw train and test tables.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Split the label off the training frame; `pop` removes the column and returns it
# in one step, so df_train is left with the same feature columns as df_test.
y_train = df_train.pop("target")

# Number of training rows (presumably used later to split `combine` back into
# train and test parts -- the training rows come first).
N_train = len(df_train)

# Stack train on top of test so feature engineering is applied to both at once.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it the
# test rows would reuse the training rows' index labels, and any label-based
# lookup or index alignment on `combine` would be ambiguous.
combine = pd.concat([df_train, df_test], ignore_index=True)

# Free the per-split frames; only `combine` and `y_train` are needed from here on.
del df_train, df_test
print(combine.shape)

(500000, 24)


## Binary Features

In [4]:
# bin_3 is stored as "T"/"F" and bin_4 as "Y"/"N"; encode "T" and "Y" as 1 and
# every other value as 0.
# The columns are assigned by name rather than through `.loc[:, col] = ...`:
# on pandas >= 2.0 the `.loc` form writes into the existing object-dtype column
# in place, which would leave the 0/1 values stored as Python objects instead of
# an integer dtype.
combine["bin_3"] = combine["bin_3"].eq("T").astype(int)
combine["bin_4"] = combine["bin_4"].eq("Y").astype(int)

In [5]:
# Show the first rows of the binary columns (bin_0 through bin_4).
combine.head().loc[:, "bin_0":"bin_4"]

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4
0,0,0,0,1,1
1,0,1,0,1,1
2,0,0,0,0,1
3,0,1,0,0,1
4,0,0,0,0,0


## Nominal Features

Simple EDA shows that nom_0 ... nom_4 are low-cardinality features, so we one-hot encode them.

In [6]:
# Names of the nominal columns to expand: nom_0 .. nom_4.
nominal_low_card = ["nom_{}".format(idx) for idx in range(5)]

# Replace each listed column with indicator columns; drop_first=True omits the
# indicator for the first level of every feature.
combine = pd.get_dummies(
    combine,
    columns=nominal_low_card,
    drop_first=True,
)

For columns nom_5 to nom_9, we treat the values as hexadecimal numbers (they look like hashes), convert them to integers, and then encode each value by its relative frequency in the combined train and test data.

In [7]:
# Parser that reads a string as a base-16 integer.
base16 = functools.partial(int, base=16)

# Frequency-encode nom_5 .. nom_9: each value is replaced (in a new *_freq column)
# by the fraction of rows in `combine` that share that value.
for i in range(5, 10):
    col_name = "nom_%d" % i
    freq_col_name = "nom_%d_freq" % i

    # Convert the hex strings to integer keys.
    nom_hex = combine[col_name].astype(str).apply(base16)
    # Share of rows per key; equal to value_counts() / len(combine) because the
    # integer conversion leaves no missing values.
    freq = nom_hex.value_counts(normalize=True)
    # Series.map performs the key -> frequency lookup in one vectorised pass
    # instead of calling a Python lambda (and a label lookup) for every row.
    combine[freq_col_name] = nom_hex.map(freq)

In [9]:
combine.columns.tolist()

['id',
 'bin_0',
 'bin_1',
 'bin_2',
 'bin_3',
 'bin_4',
 'nom_5',
 'nom_6',
 'nom_7',
 'nom_8',
 'nom_9',
 'ord_0',
 'ord_1',
 'ord_2',
 'ord_3',
 'ord_4',
 'ord_5',
 'day',
 'month',
 'nom_0_Green',
 'nom_0_Red',
 'nom_1_Polygon',
 'nom_1_Square',
 'nom_1_Star',
 'nom_1_Trapezoid',
 'nom_1_Triangle',
 'nom_2_Cat',
 'nom_2_Dog',
 'nom_2_Hamster',
 'nom_2_Lion',
 'nom_2_Snake',
 'nom_3_China',
 'nom_3_Costa Rica',
 'nom_3_Finland',
 'nom_3_India',
 'nom_3_Russia',
 'nom_4_Oboe',
 'nom_4_Piano',
 'nom_4_Theremin',
 'nom_5_freq',
 'nom_6_freq',
 'nom_7_freq',
 'nom_8_freq',
 'nom_9_freq']