In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score

pd.options.display.max_columns = 100
pd.options.display.max_rows = 150

In [2]:
df_train = pd.read_csv("data/competition_data/train.csv")
df_train = df_train.sample(frac=1).reset_index(drop=True)

df_test = pd.read_csv("data/competition_data/test.csv")

df_train

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,188295,1,0,1,T,Y,Blue,Star,Lion,China,Theremin,39647c92a,0249d8675,bce7d2952,7befd1525,1aeb3a3d3,1,Novice,Lava Hot,f,U,Dc,7,1,1
1,57243,0,1,0,F,N,Red,Star,Snake,Finland,Piano,8266050c6,df5b440ae,097b0f4ce,64335b5e6,4a0e747c8,2,Grandmaster,Freezing,k,Q,dO,1,10,1
2,64132,0,1,0,F,Y,Blue,Trapezoid,Lion,Costa Rica,Oboe,2cadfed8e,6c90b0073,da7ad33d9,d6b5a53d9,9dba93fae,1,Novice,Lava Hot,g,L,eQ,1,1,0
3,225892,0,1,0,F,Y,Red,Square,Lion,Finland,Oboe,586b51342,9b1c3c8b7,af2a1f476,edbd3d8cb,2d53a379a,1,Master,Cold,h,R,qP,7,3,1
4,31390,0,0,1,F,Y,Red,Triangle,Cat,Russia,Oboe,05950689f,3bac0bd8e,c79fbc620,c86f89a21,73e8ef557,3,Novice,Lava Hot,o,Q,Uk,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,264418,0,0,0,T,N,Red,Triangle,Snake,Costa Rica,Piano,e844a1f66,3e44d44eb,2dd9daf45,19d1a424d,fca8b2e1e,2,Contributor,Boiling Hot,i,Z,AP,1,1,0
299996,56014,0,0,0,F,Y,Green,Trapezoid,Snake,India,Oboe,f12246592,6046454de,f31a506ec,19d1a424d,858fce1e7,3,Grandmaster,Boiling Hot,i,L,eb,1,4,1
299997,90532,0,1,0,T,N,Blue,Polygon,Snake,Russia,Bassoon,91c80026f,a148d86df,591491a89,7517f74f0,ca567c89e,1,Novice,Freezing,i,O,RL,7,7,0
299998,61985,0,0,1,T,Y,Green,Triangle,Axolotl,Finland,Theremin,8266050c6,8bd314488,a0be4f3da,68fce9db2,e22f7c603,1,Master,Freezing,o,E,sD,2,7,1


In [3]:
y_train = df_train["target"]

y_train

0         1
1         1
2         0
3         1
4         0
         ..
299995    0
299996    1
299997    0
299998    1
299999    0
Name: target, Length: 300000, dtype: int64

In [4]:
def process_features(df):
    df_out = df.copy()
    
    # binary features to 0, 1
    df_out["bin_3"] = df_out["bin_3"].map({"T": 1, "F": 0})
    df_out["bin_4"] = df_out["bin_4"].map({"Y": 1, "N": 0})

    # ordinal features to int
    for field in ["ord_1", "ord_2", "ord_3", "ord_4", "ord_5"]:
        df[field] = pd.Categorical(df[field])
        df_out[field] = df[field].cat.codes
        
    # nominal features to int (same as for ordinal, but let's separate to keep us some options)
    for field in ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']:
        df[field] = pd.Categorical(df[field])
        df_out[field] = df[field].cat.codes
        
    return df_out


df_train_processed = process_features(df_train)

del df_train_processed["target"]
del df_train_processed["id"]

df_train_processed

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,1,0,1,1,1,0,3,4,1,3,53,4,869,1082,1274,1,4,4,5,20,15,7,1
1,0,1,0,0,0,2,3,5,3,2,120,452,50,871,3434,2,2,2,10,16,104,1,10
2,0,1,0,0,1,0,4,4,2,1,36,213,1044,1858,7297,1,4,4,6,11,110,1,1
3,0,1,0,0,1,2,2,4,3,1,85,318,806,2054,2080,1,3,1,7,17,158,7,3
4,0,0,1,0,1,2,5,1,5,1,2,124,935,1730,5399,3,4,4,14,16,70,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0,0,0,1,0,2,5,5,2,2,199,131,188,243,11831,2,0,0,8,25,0,1,1
299996,0,0,0,0,1,1,4,5,4,1,209,198,1153,243,6197,3,2,0,8,11,111,1,4
299997,0,1,0,1,0,0,1,5,5,0,135,326,408,1031,9377,1,4,2,8,14,60,7,7
299998,0,0,1,1,1,1,5,0,3,3,120,277,736,913,10577,1,3,2,14,4,166,2,7


# Adaboost

In [5]:
model = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(
        criterion="entropy",
        min_samples_leaf = 600,
        min_samples_split = 1000,
        max_depth = 3
    ),
    n_estimators=30
)

i = df_train_processed.shape[0]
df_train_processed1 = df_train_processed.copy()[:i // 2]
df_train_processed2 = df_train_processed.copy()[i // 2:]
y_train1=y_train[:i // 2]
y_train2=y_train[i // 2:]

def fit_and_score(model, df_train, y_train, df_test, y_test):
    model = model.fit(df_train, y_train)
    
    y_score_train = [s[1] for s in model.predict_proba(df_train)]
    y_score_test = [s[1] for s in model.predict_proba(df_test)]

    print(f"""
        Score on training set: {roc_auc_score(y_true=y_train, y_score=y_score_train)}
        Score on test set: {roc_auc_score(y_true=y_test, y_score=y_score_test)}
    """)
    
    return model

model = fit_and_score(model, df_train_processed1, y_train1, df_train_processed2, y_train2)



        Score on training set: 0.7698815186816041
        Score on test set: 0.7609777738293734
    


In [6]:
"""Looks good, let's submit"""

df_test_processed = process_features(df_test)

del df_test_processed["id"]

y_pred = [x[1] for x in model.predict_proba(df_test_processed)]


submission = pd.read_csv("data/competition_data/sample_submission.csv")

submission["target"] = y_pred

submission.to_csv("data/submissions/submission_adaboost.csv", header=True, index=False)

In [7]:
submission[:100]

Unnamed: 0,id,target
0,300000,0.493311
1,300001,0.501472
2,300002,0.490969
3,300003,0.498693
4,300004,0.508872
5,300005,0.497333
6,300006,0.502996
7,300007,0.479057
8,300008,0.49758
9,300009,0.497603


In [8]:
"""kaggle score: ~0.765"""

'kaggle score: ~0.765'

# XGBoost

In [9]:
from xgboost import XGBClassifier

In [10]:
model = XGBClassifier(
    base_estimator=DecisionTreeClassifier(
        criterion="entropy",
        min_samples_leaf = 1,
        min_samples_split = 1,
        max_depth = 10
    ),
    n_estimators=50,
    eval_metric='auc' # is this accurate for roc auc?
)

model = fit_and_score(model, df_train_processed1, y_train1, df_train_processed2, y_train2)



Parameters: { base_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



        Score on training set: 0.8075389037777326
        Score on test set: 0.761342535917932
    


In [11]:
"""Appears comparable in resutl to AdaBoost, let's not submit"""

"Appears comaprable in resutl to AdaBoost, let's not submit"

# Random Forrest

In [29]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators = 100,
    criterion='entropy',
    max_depth = 12,
    min_samples_leaf = 200,
    verbose=1
)

model = fit_and_score(model, df_train_processed1, y_train1, df_train_processed2, y_train2)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   19.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.0s finished



        Score on training set: 0.758259939654648
        Score on test set: 0.7397825821995891
    


In [31]:
"""Appears to be again comparable to Adaboost, potentially a little inferior, no submission"""

'Appears to be again comparable to Adaboost, potentially a little inferior, no submission'

# Conclusion

Generally ensemble trees work much better than single decision trees for this dataset (and in general). None of them appear to stand out (without doing real tuning).