In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score

# Widen the notebook's DataFrame rendering limits so wide/long frames are not elided as early.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 150)

In [2]:
# Load the labelled training data and shuffle the rows once. The hold-out split
# further down simply cuts the frame in half, so the shuffle is what makes the
# two halves random samples. The seed is fixed so that the split (and therefore
# every reported score) is the same after a kernel restart.
df_train = (
    pd.read_csv("data/competition_data/train.csv")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Unlabelled competition rows that the final submission is scored on.
df_test = pd.read_csv("data/competition_data/test.csv")

df_train

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,256403,0,0,0,T,Y,Red,Star,Lion,Russia,Oboe,416a8f3ab,e3396fbd5,5ad7da858,4329a8c91,9fddc642c,1,Expert,Lava Hot,k,W,ut,2,12,1
1,75557,0,0,0,T,Y,Red,Triangle,Lion,India,Bassoon,8266050c6,d213f48ed,fa311a2a3,551be7857,b0406edeb,2,Novice,Freezing,k,V,QM,2,9,1
2,267915,0,0,1,F,N,Green,Circle,Dog,India,Bassoon,0b3bec656,c8436c277,cb9bbed08,bb215fb49,0e85ebce3,2,Novice,Boiling Hot,i,I,aO,7,1,0
3,106685,0,1,1,T,Y,Red,Trapezoid,Snake,Canada,Oboe,c0534106d,1611d2b45,d888dabd2,f58eea518,ba877ef51,1,Novice,Freezing,a,L,sY,2,11,0
4,281526,0,0,1,F,Y,Blue,Square,Axolotl,Russia,Oboe,b97f51ac4,bd47c2363,b9b5e2958,203ed591a,244f3d156,3,Novice,Boiling Hot,j,K,sV,4,12,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,292532,0,1,0,F,N,Green,Square,Lion,Costa Rica,Bassoon,d1b1709e8,c9eedfbc2,59fe57a1b,eb2dc547b,817e2c7b5,1,Novice,Freezing,l,P,CM,1,3,0
299996,233649,0,0,0,T,N,Red,Square,Cat,Finland,Piano,475e79160,1b36c0b8c,aa148acce,51be79e62,e7dc16f7e,2,Grandmaster,Lava Hot,k,T,yc,3,12,1
299997,188741,0,0,0,T,N,Red,Trapezoid,Hamster,Costa Rica,Piano,76be0b8b1,73dd803d0,b0fad10d7,4d70170c4,67fe1445f,1,Novice,Boiling Hot,h,R,RL,4,11,0
299998,35881,0,0,0,F,Y,Red,Square,Cat,Finland,Piano,3263bdce5,4daee3baf,d18fe040f,b13166170,9c58f3be5,3,Master,Freezing,a,K,be,2,1,0


In [3]:
# Label vector for the (shuffled) training rows.
y_train = df_train.loc[:, "target"]

y_train

0         1
1         1
2         0
3         0
4         1
         ..
299995    0
299996    1
299997    0
299998    0
299999    0
Name: target, Length: 300000, dtype: int64

In [9]:
def process_features(df):
    """Return a copy of ``df`` with the string-valued feature columns integer-encoded.

    * ``bin_3`` ("T"/"F") and ``bin_4`` ("Y"/"N") are mapped to 1/0.
    * ``ord_1`` .. ``ord_5`` and ``nom_0`` .. ``nom_9`` are replaced by their
      pandas categorical codes.

    The input frame is left untouched. Every other column is carried over
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns listed above.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``df``.
    """
    df_out = df.copy()

    # binary features to 0, 1
    df_out["bin_3"] = df_out["bin_3"].map({"T": 1, "F": 0})
    df_out["bin_4"] = df_out["bin_4"].map({"Y": 1, "N": 0})

    # Ordinal and nominal fields are encoded the same way for now; the two
    # lists are kept apart so they can be treated differently later.
    ordinal_fields = ["ord_1", "ord_2", "ord_3", "ord_4", "ord_5"]
    nominal_fields = [
        "nom_0", "nom_1", "nom_2", "nom_3", "nom_4",
        "nom_5", "nom_6", "nom_7", "nom_8", "nom_9",
    ]

    # NOTE(review): the codes are derived from the values present in *this*
    # frame and follow the sorted order of the labels, so (a) two frames with
    # different value sets get different codes for the same label, and (b)
    # "ordinal" labels such as ord_1 are not encoded in rank order.
    for field in ordinal_fields + nominal_fields:
        # Build the categorical from the input column but only write to the
        # copy, so the caller's frame keeps its original dtypes.
        df_out[field] = pd.Categorical(df[field]).codes

    return df_out


# Encode the training frame, then drop the label and the row identifier so that
# only model inputs remain.
df_train_processed = process_features(df_train).drop(columns=["target", "id"])

df_train_processed

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,0,0,0,1,1,2,3,4,5,1,59,461,419,564,7406,1,1,4,10,22,178,2,12
1,0,0,0,1,1,2,5,4,4,0,120,419,1186,727,8182,2,4,2,10,21,55,2,9
2,0,0,1,0,0,1,0,2,4,0,11,399,958,1615,684,2,4,0,8,8,89,7,1
3,0,1,1,1,1,2,4,5,0,1,170,40,1031,2123,8652,1,4,2,0,11,168,2,11
4,0,0,1,0,1,0,2,0,5,1,163,379,849,303,1704,3,4,0,9,10,167,4,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0,1,0,0,0,1,2,4,2,0,176,401,413,2029,6009,1,4,2,11,15,9,1,3
299996,0,0,0,1,0,2,2,1,3,2,67,54,786,696,10858,2,2,4,10,19,190,3,12
299997,0,0,0,1,0,2,4,3,2,2,109,231,819,654,4861,1,4,0,7,17,60,4,11
299998,0,0,0,0,1,2,2,1,3,2,44,159,989,1527,7234,3,3,2,0,10,95,2,1


In [17]:
# AdaBoost over shallow, heavily regularised decision trees.
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, whereas the first positional slot works on either side of
# that change. `random_state` pins the run so scores are repeatable.
model = AdaBoostClassifier(
    DecisionTreeClassifier(
        criterion="entropy",
        min_samples_leaf=600,
        min_samples_split=1000,
        max_depth=3,
    ),
    n_estimators=30,
    random_state=42,
)

# Hold-out validation: first half of the rows for fitting, second half for
# scoring. The rows were shuffled when the training data was loaded.
i = df_train_processed.shape[0]
split_at = i // 2
df_train_processed1 = df_train_processed.iloc[:split_at].copy()
df_train_processed2 = df_train_processed.iloc[split_at:].copy()
y_train1 = y_train.iloc[:split_at]
y_train2 = y_train.iloc[split_at:]

def fit_and_score(model, df_train, y_train, df_test, y_test):
    """Fit ``model`` and print its ROC AUC on the fitting data and on held-out data.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``
    df_train, y_train : features and labels used for fitting
    df_test, y_test : features and labels used only for scoring

    Returns
    -------
    The object returned by ``model.fit`` (the fitted estimator).
    """
    model = model.fit(df_train, y_train)

    # Column 1 of predict_proba holds the probability of the second class;
    # slicing the array avoids looping over every row in Python.
    y_score_train = model.predict_proba(df_train)[:, 1]
    y_score_test = model.predict_proba(df_test)[:, 1]

    auc_train = roc_auc_score(y_true=y_train, y_score=y_score_train)
    auc_test = roc_auc_score(y_true=y_test, y_score=y_score_test)

    print(f"""
        Score on training set: {auc_train}
        Score on test set: {auc_test}
    """)

    return model

# Fit on the first half of the training rows and report AUC on both halves.
model = fit_and_score(
    model=model,
    df_train=df_train_processed1,
    y_train=y_train1,
    df_test=df_train_processed2,
    y_test=y_train2,
)



        Score on training set: 0.7711043196454681
        Score on test set: 0.7626247192834823
    


In [19]:
"""Looks good, let'S submit"""



# Score the competition test set with the fitted model and write the submission file.
df_test_processed = process_features(df_test)

# process_features derives integer codes from the values present in the frame
# it is given, so codes computed from the test set alone do not line up with
# the codes the model was fitted on whenever the two value sets differ.
# Re-encode every categorical field against the training categories; a value
# that never occurred in training gets code -1.
for field in [
    "ord_1", "ord_2", "ord_3", "ord_4", "ord_5",
    "nom_0", "nom_1", "nom_2", "nom_3", "nom_4",
    "nom_5", "nom_6", "nom_7", "nom_8", "nom_9",
]:
    train_categories = pd.Categorical(df_train[field]).categories
    df_test_processed[field] = pd.Categorical(
        df_test[field], categories=train_categories
    ).codes

del df_test_processed["id"]

# Probability of the positive class for every test row.
y_pred = model.predict_proba(df_test_processed)[:, 1]

# Reuse the sample submission as the template for the output file's layout.
submission = pd.read_csv("data/competition_data/sample_submission.csv")

submission["target"] = y_pred

submission.to_csv("data/submissions/submission_adaboost.csv", header=True, index=False)

In [21]:
# Eyeball the first rows of the file that was just written.
submission.head(100)

Unnamed: 0,id,target
0,300000,0.490604
1,300001,0.500727
2,300002,0.485138
3,300003,0.499107
4,300004,0.511549
5,300005,0.492554
6,300006,0.499121
7,300007,0.484965
8,300008,0.500669
9,300009,0.501453


In [23]:
"""kaggle score: ~0.765"""

'kaggle score: ~0.765'