In [20]:
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

from v_time import timeit_out, mesure

from sklearn import metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# 0. Auxiliar functions

In [17]:
plot = lambda x: px.imshow(x, color_continuous_scale="greys").show()

def train_and_evaluate(model, name):
    
    out = {}
    print(f"Evaluating '{name}':")

    # Train it
    _, out["train_time"] = timeit_out(model.fit)(x_train, y_train)
    print(f'- {"Training time:":16} {out["train_time"]:.6f}')
    
    # Predict time
    y_pred, out["predict_time"] = timeit_out(model.predict)(x_test)
    print(f'- {"Predict time:":16} {out["predict_time"]:.6f}')
          
    # AUC
    out["auc"] = metrics.roc_auc_score(y_test, y_pred)
    print(f'- {"AUC:":16} {out["auc"]:.6f}')
          
    return out

# 1. Read data

In [11]:
x_train = pd.read_csv("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
x_test = pd.read_csv("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")

display(x_train.head())

Unnamed: 0,response,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28
0,1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487


## 1.1. Extract features

In [12]:
y_train = x_train.pop("response").apply(int)
y_test = x_test.pop("response").apply(int)

# 2. Train SVC

In [21]:
out = {}
out["SVC_sklearn"] = train_and_evaluate(SVC(), "SVC_sklearn")

Evaluating 'SVC_sklearn':
- Training time:   5.429230
- Predict time:    1.787591
- AUC:             0.642001


# 3. Traing Random Forest

In [22]:
out["RFC_sklearn"] = train_and_evaluate(RandomForestClassifier(n_estimators=100), "RFC_sklearn")

Evaluating 'RFC_sklearn':
- Training time:   3.735469
- Predict time:    0.100549
- AUC:             0.709088


# 4. Traing with GridSearchCV + Random Forest

In [23]:
params = {
    "n_estimators": list(range(50, 250, 50)),
    'max_features': ['auto', 'sqrt', 'log2']
}
params

{'n_estimators': [50, 100, 150, 200], 'max_features': ['auto', 'sqrt', 'log2']}

In [24]:
gs = GridSearchCV(RandomForestClassifier(), params, cv=2)
out["RFC_GS_sklearn"] = train_and_evaluate(gs, "RFC_GS_sklearn")

Evaluating 'RFC_GS_sklearn':
- Training time:   63.880907
- Predict time:    0.236742
- AUC:             0.704552


In [28]:
df = pd.DataFrame(out).T
df["origin"] = "sklearn"
df

Unnamed: 0,train_time,predict_time,auc,origin
SVC_sklearn,5.42923,1.787591,0.642001,sklearn
RFC_sklearn,3.735469,0.100549,0.709088,sklearn
RFC_GS_sklearn,63.880907,0.236742,0.704552,sklearn


In [29]:
df.to_csv("results/sklearn.csv")