In [1]:
import pandas as pd
import numpy as np

In [2]:
!head train.csv

id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0
5,0,1,1,T,N,Blue,Polygon,Lion,Costa Rica,Oboe,46cab09da,29a854620,ff5b35098,b7e6f8e6f,51e27c16d,1,Novice,Freezing,j,E,PZ,2,2,0
6,0,1,1,T,N,Green,Trapezoid,Cat,China,Piano,be5592604,3393a0f78,c6587685d,0

In [3]:
# Read the raw training and test sets.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Split the label off so the training and test frames share the same columns.
y_train = df_train.loc[:, "target"]
df_train = df_train.drop(columns="target")

# Length of training set; used at the end to split `combine` back apart.
N_train = len(df_train)

# Stack train on top of test so every encoding below sees both sets.
# ignore_index=True gives the stacked frame a unique 0..n-1 index; without it
# the test rows would reuse index labels that already belong to training rows.
combine = pd.concat((df_train, df_test), ignore_index=True)

del df_train, df_test
print(combine.shape)

(500000, 24)


## Binary features

In [4]:
# bin_3 holds "T"/"F" and bin_4 holds "Y"/"N"; encode both as 1/0.
# Assigning through combine[col] replaces the whole column, so the result is
# stored with an integer dtype (an in-place `.loc[:, col] = ...` write can keep
# the original object dtype on recent pandas versions).
combine["bin_3"] = (combine["bin_3"] == "T").astype(int)
combine["bin_4"] = (combine["bin_4"] == "Y").astype(int)

In [5]:
# Preview the first rows of the binary columns after encoding.
combine.loc[:, "bin_0":"bin_4"].head(n=5)

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4
0,0,0,0,1,1
1,0,1,0,1,1
2,0,0,0,0,1
3,0,1,0,0,1
4,0,0,0,0,0


## Nominal Features

Simple EDA shows that nom_0 ... nom_4 are low-cardinality features, so we one-hot encode them.

In [6]:
# One-hot encode nom_0 .. nom_4; drop_first removes one redundant level per feature.
nominal_low_card = ["nom_%d" % i for i in range(0, 5)]

# The dtype is pinned explicitly: the get_dummies default differs between
# pandas versions (uint8 before 2.0, bool afterwards), and the dummies should
# be 0/1 integers regardless of the installed version.
combine = pd.get_dummies(
    combine,
    columns=nominal_low_card,
    drop_first=True,
    dtype=np.uint8,
)

For columns nom_5 to nom_9, we parse the hash-like values as hexadecimal numbers and then compute the relative frequency of each value. After this, we drop the original columns.

In [7]:
import functools

# Parses a hexadecimal string into an integer.
base16 = functools.partial(int, base=16)

for i in range(5, 10):
    col_name = "nom_%d" % i
    freq_col_name = "nom_%d_freq" % i

    # Interpret every value of the column as a base-16 number.
    nom_hex = combine[col_name].astype(str).apply(base16)
    # Share of rows (train and test together) that carry each distinct value.
    freq = nom_hex.value_counts() / len(combine)
    # Vectorised lookup of each row's frequency; avoids indexing `freq`
    # once per row from Python.
    combine[freq_col_name] = nom_hex.map(freq)
    # The raw column is no longer needed once its frequency is stored.
    combine = combine.drop(columns=col_name)

## Ordinal features

In [8]:
# Preview the raw ordinal columns before encoding them.
combine.loc[:, "ord_0":"ord_5"].head(n=5)

Unnamed: 0,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5
0,2,Grandmaster,Cold,h,D,kr
1,1,Grandmaster,Hot,a,A,bF
2,1,Expert,Lava Hot,h,R,Jc
3,1,Grandmaster,Boiling Hot,i,D,kW
4,1,Grandmaster,Freezing,a,R,qP


In [9]:
# Distinct levels of ord_0.
combine["ord_0"].unique()

array([2, 1, 3], dtype=int64)

In [10]:
# Shift ord_0 down by one so its levels 1, 2, 3 become 0, 1, 2.
# Note: running this cell more than once shifts the values again.
combine["ord_0"] = combine["ord_0"] - 1

In [11]:
# The ord_1 levels are named ranks with a natural order; list them so they
# can be replaced by integers that follow that order.
combine["ord_1"].unique()

array(['Grandmaster', 'Expert', 'Novice', 'Contributor', 'Master'],
      dtype=object)

In [12]:
# Rank of each ord_1 level, from lowest (0) to highest (4).
repl_dict = {
    "Novice": 0,
    "Contributor": 1,
    "Expert": 2,
    "Master": 3,
    "Grandmaster": 4
}

# Assign the mapped column back to the frame. Calling
# `.replace(..., inplace=True)` on the Series returned by `.loc` is chained
# assignment: it modifies a temporary object and is not guaranteed to update
# `combine` (it never does when pandas Copy-on-Write is active).
# A level without an entry in repl_dict would become NaN here.
combine["ord_1"] = combine["ord_1"].map(repl_dict)

In [13]:
# ord_2 gets the same treatment: inspect its levels first.
combine["ord_2"].unique()

array(['Cold', 'Hot', 'Lava Hot', 'Boiling Hot', 'Freezing', 'Warm'],
      dtype=object)

In [14]:
# Rank of each ord_2 level, from coldest (0) to hottest (5).
repl_dict = {
    "Freezing": 0,
    "Cold": 1,
    "Warm": 2,
    "Hot": 3,
    "Boiling Hot": 4,
    "Lava Hot": 5
}

# Assign the mapped column back to the frame. Calling
# `.replace(..., inplace=True)` on the Series returned by `.loc` is chained
# assignment: it modifies a temporary object and is not guaranteed to update
# `combine` (it never does when pandas Copy-on-Write is active).
# A level without an entry in repl_dict would become NaN here.
combine["ord_2"] = combine["ord_2"].map(repl_dict)

In [15]:
# ord_3 consists of lowercase letters; list them before turning each letter
# into a number derived from its character code.
combine["ord_3"].unique()

array(['h', 'a', 'i', 'j', 'g', 'e', 'd', 'b', 'k', 'f', 'l', 'n', 'o',
       'c', 'm'], dtype=object)

In [16]:
# Encode each lowercase letter as its offset from "a" (a -> 0, b -> 1, ...).
# Direct column assignment replaces the string column, so the result is stored
# as integers rather than being written into the existing object column.
combine["ord_3"] = combine["ord_3"].apply(ord) - ord("a")

In [17]:
# ord_4 is handled like ord_3, but its levels are uppercase letters.
combine["ord_4"].unique()

array(['D', 'A', 'R', 'E', 'P', 'K', 'V', 'Q', 'Z', 'L', 'F', 'T', 'U',
       'S', 'Y', 'B', 'H', 'J', 'N', 'G', 'W', 'I', 'O', 'C', 'X', 'M'],
      dtype=object)

In [18]:
# Encode each uppercase letter as its offset from "A" (A -> 0, B -> 1, ...).
# Direct column assignment replaces the string column, so the result is stored
# as integers rather than being written into the existing object column.
combine["ord_4"] = combine["ord_4"].apply(ord) - ord("A")

In [19]:
# ord_5 is ordered lexicographically, so each value will be encoded as its
# position among the sorted distinct values.
symbols = list(combine["ord_5"].unique())
symbols.sort()
symbols[:5]

['AP', 'Ai', 'Aj', 'BA', 'BE']

In [20]:
# Position of every symbol in the sorted `symbols` list. A dict lookup replaces
# the linear `symbols.index(x)` scan that was previously run once per row.
symbol_rank = {symbol: rank for rank, symbol in enumerate(symbols)}

# Direct column assignment replaces the string column with the integer ranks.
combine["ord_5"] = combine["ord_5"].map(symbol_rank)
combine["ord_5"].head()

0    136
1     93
2     31
3    134
4    158
Name: ord_5, dtype: int64

## Cyclic features

In [21]:
def get_cyclic_feature(col: pd.Series, T):
    """Encode a periodic feature as sine and cosine components.

    Parameters
    ----------
    col : pd.Series
        Values of the periodic feature.
    T : number
        Period of the feature; a value equal to ``T`` maps to a full turn.

    Returns
    -------
    tuple
        ``(sin, cos)`` of ``col * 2 * pi / T``, in that order.
    """
    radians = (2 * np.pi / T) * col
    sine = np.sin(radians)
    cosine = np.cos(radians)
    return sine, cosine

In [22]:
# Both day and month wrap around, so each is encoded with a sin/cos pair.
# Here, day is the day of the week.
combine["day"].unique()

array([2, 7, 5, 4, 3, 1, 6], dtype=int64)

In [23]:
# Period of 7 for the day-of-week feature.
day_sin, day_cos = get_cyclic_feature(combine["day"], T=7)
combine["day_sin"] = day_sin
combine["day_cos"] = day_cos

In [24]:
# The raw day column is superseded by day_sin / day_cos.
combine = combine.drop(columns=["day"])

In [25]:
# month is encoded the same way; check its levels first.
combine["month"].unique()

array([ 2,  8,  1,  4, 10,  3,  7,  9, 12, 11,  5,  6], dtype=int64)

In [26]:
# Period of 12 for the month feature.
month_sin, month_cos = get_cyclic_feature(combine["month"], T=12)
combine["month_sin"] = month_sin
combine["month_cos"] = month_cos

In [27]:
# The raw month column is superseded by month_sin / month_cos.
combine = combine.drop(columns=["month"])

## Convert to integers and save as file

In [28]:
# Downcast the contiguous column range bin_0 .. nom_4_Theremin to uint8.
# NOTE(review): with the column order built above, this range should contain
# the binary, ordinal and one-hot columns only - confirm before relying on it.
# DataFrame.astype is used instead of `combine.loc[:, a:b] = ...astype(...)`:
# a `.loc` write tries to set values into the existing columns, which can leave
# their dtypes unchanged on recent pandas versions.
small_int_cols = combine.loc[:, "bin_0":"nom_4_Theremin"].columns
combine = combine.astype({col: np.uint8 for col in small_int_cols})

In [38]:
# Shape of the combined (train + test) frame after all transformations.
combine.shape

(500000, 41)

In [35]:
# Undo the concatenation: the first N_train rows are the training set and the
# remaining rows are the test set. The target column goes back onto the
# training rows only.
X_train = combine.iloc[:N_train].assign(target=y_train)
X_test = combine.iloc[N_train:].copy()

In [37]:
# Write both processed sets as CSV files without the index column, so the two
# files have the same layout apart from the extra target column in train.
X_train.to_csv("train_processed.csv", index=False)
X_test.to_csv("test_processed.csv", index=False)

(300000, 42)