In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score

pd.options.display.max_columns = 100
pd.options.display.max_rows = 150

In [2]:
df_train = pd.read_csv("data/competition_data/train.csv")
df_train = df_train.sample(frac=1).reset_index(drop=True)

df_test = pd.read_csv("data/competition_data/test.csv")

df_train

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,295587,0,1,1,T,Y,Green,Trapezoid,Snake,Russia,Theremin,2cc9e16b9,c36cea1f7,352282217,d5b52c3c3,4ef405b94,1,Novice,Hot,i,H,Nf,1,11,0
1,296800,0,0,0,T,Y,Green,Polygon,Cat,India,Oboe,5b1a9f841,144279ac1,4cd460e06,382cf7e87,05be0a334,3,Novice,Lava Hot,f,Z,dP,5,9,0
2,191133,0,0,0,T,Y,Blue,Square,Snake,Russia,Bassoon,6fec43dd8,73dd803d0,01d32ba52,e601670bb,7188e7e3e,1,Grandmaster,Freezing,i,V,Bn,4,7,0
3,221013,1,0,1,T,N,Red,Polygon,Lion,Canada,Piano,488406659,113deddc9,a1e0839a7,99b33644d,23bc7322c,2,Master,Lava Hot,f,P,Ed,3,2,1
4,281647,0,0,0,T,N,Blue,Circle,Cat,Russia,Bassoon,91a20b464,3afd0489b,3642c70d4,fbd2354a7,2883f1fde,2,Novice,Lava Hot,k,I,cA,2,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,103530,0,0,1,T,N,Red,Trapezoid,Snake,Canada,Oboe,5a466e166,7d8bc814a,d02a6b0ba,6b972b4e6,401c2455c,2,Novice,Boiling Hot,d,F,ex,3,10,1
299996,268822,0,0,0,F,N,Green,Trapezoid,Snake,Canada,Piano,a93b89fc9,2ed5a94b0,bb8e9193e,b19bb1c28,ec3116ee6,1,Grandmaster,Cold,a,R,Jc,4,2,0
299997,153713,0,0,1,F,N,Green,Trapezoid,Cat,Russia,Oboe,6a2269152,39981f199,c92eb0903,6196dcb02,e2a1ab525,1,Novice,Lava Hot,h,F,MV,1,8,0
299998,90880,0,0,0,T,N,Blue,Square,Lion,Finland,Theremin,f2d59cf51,32b4b12b1,89f4255e4,3691833c1,b29127fef,1,Master,Warm,j,Z,GJ,3,3,0


In [3]:
y_train = df_train["target"]

y_train

0         0
1         0
2         0
3         1
4         0
         ..
299995    1
299996    0
299997    0
299998    0
299999    1
Name: target, Length: 300000, dtype: int64

In [4]:
def process_features(df):
    df_out = df.copy()
    
    # binary features to 0, 1
    df_out["bin_3"] = df_out["bin_3"].map({"T": 1, "F": 0})
    df_out["bin_4"] = df_out["bin_4"].map({"Y": 1, "N": 0})

    # ordinal features to int
    for field in ["ord_1", "ord_2", "ord_3", "ord_4", "ord_5"]:
        df[field] = pd.Categorical(df[field])
        df_out[field] = df[field].cat.codes
        
    # nominal features to int (same as for ordinal, but let's separate to keep us some options)
    for field in ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']:
        df[field] = pd.Categorical(df[field])
        df_out[field] = df[field].cat.codes
        
    return df_out


df_train_processed = process_features(df_train)

del df_train_processed["target"]
del df_train_processed["id"]

df_train_processed

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,0,1,1,1,1,1,4,5,5,3,37,392,229,1852,3677,1,4,3,8,7,46,1,11
1,0,0,0,1,1,1,1,1,4,1,91,34,335,475,292,3,4,4,5,25,105,5,9
2,0,0,0,1,1,0,2,5,5,0,103,231,10,1987,5287,1,2,2,8,21,7,4,7
3,1,0,1,1,0,2,1,4,0,2,70,28,741,1329,1678,2,3,4,5,15,17,3,2
4,0,0,0,1,0,0,0,1,5,0,133,122,234,2178,1902,2,4,4,10,8,96,2,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0,0,1,1,0,2,4,5,0,1,88,250,980,933,2988,2,4,0,3,5,114,3,10
299996,0,0,0,0,0,1,4,5,0,2,152,99,859,1530,11067,1,2,1,0,17,31,4,2
299997,0,0,1,0,0,1,4,1,5,1,99,118,943,851,10596,1,4,4,7,5,41,1,8
299998,0,0,0,1,0,0,2,4,3,3,211,106,634,467,8294,1,3,5,9,25,24,3,3


# Adaboost

In [5]:
model = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(
        criterion="entropy",
        min_samples_leaf = 600,
        min_samples_split = 1000,
        max_depth = 3
    ),
    n_estimators=30
)

i = df_train_processed.shape[0]
df_train_processed1 = df_train_processed.copy()[:i // 2]
df_train_processed2 = df_train_processed.copy()[i // 2:]
y_train1=y_train[:i // 2]
y_train2=y_train[i // 2:]

def fit_and_score(model, df_train, y_train, df_test, y_test):
    model = model.fit(df_train, y_train)
    
    y_score_train = [s[1] for s in model.predict_proba(df_train)]
    y_score_test = [s[1] for s in model.predict_proba(df_test)]

    print(f"""
        Score on training set: {roc_auc_score(y_true=y_train, y_score=y_score_train)}
        Score on test set: {roc_auc_score(y_true=y_test, y_score=y_score_test)}
    """)
    
    return model

model = fit_and_score(model, df_train_processed1, y_train1, df_train_processed2, y_train2)



        Score on training set: 0.7685105166989699
        Score on test set: 0.7634993178224911
    


In [6]:
"""Looks good, let's submit"""

df_test_processed = process_features(df_test)

del df_test_processed["id"]

y_pred = [x[1] for x in model.predict_proba(df_test_processed)]


submission = pd.read_csv("data/competition_data/sample_submission.csv")

submission["target"] = y_pred

submission.to_csv("data/submissions/submission_adaboost.csv", header=True, index=False)

In [7]:
submission[:100]

Unnamed: 0,id,target
0,300000,0.490941
1,300001,0.499982
2,300002,0.489907
3,300003,0.492661
4,300004,0.512394
5,300005,0.49466
6,300006,0.504606
7,300007,0.481701
8,300008,0.499878
9,300009,0.505316


In [8]:
"""kaggle score: ~0.765"""

'kaggle score: ~0.765'

# XGBoost

In [9]:
from xgboost import XGBClassifier

In [22]:
model = XGBClassifier(
    base_estimator=DecisionTreeClassifier(
        criterion="entropy",
        min_samples_leaf = 1,
        min_samples_split = 1,
        max_depth = 10
    ),
    n_estimators=50,
    eval_metric='auc' # is this accurate for roc auc?
)

model = fit_and_score(model, df_train_processed1, y_train1, df_train_processed2, y_train2)



Parameters: { base_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



        Score on training set: 0.8048875240385385
        Score on test set: 0.7632805535795706
    


In [23]:
"""Appears comaprable in resutl to AdaBoost, let's not submit"""

"Appears comaprable in resutl to AdaBoost, let's not submit"