In [1]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor

import plotly.express as px

from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from helpers.preprocessing import *

import gc

In [2]:
# Read the three CSVs in the same order as before (train, test, orig), each
# indexed by its "id" column.
train_df, test_df, orig_df = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in ("data/train.csv", "data/test.csv", "data/orig.csv")
)

RESPONSE_COL = "class"
CONT_FEATS = ["cap-diameter", "stem-height", "stem-width"]
# Every test column that is not listed as continuous is treated as categorical.
CAT_FEATS = [col for col in test_df.columns if col not in CONT_FEATS]

# NOTE(review): both helpers presumably come from the star import of
# helpers.preprocessing and appear to modify the frames in place (their return
# values are not re-assigned) - confirm against the helper source.
fix_categories(orig_df, train_df, test_df, CAT_FEATS, nan_as_cat=True)
# Kept as the last expression: its return value is what the cell renders.
fix_that_one_mushroom_in_test(test_df)

Unnamed: 0_level_0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3116945,8.64,x,NaN_cat,n,t,NaN_cat,NaN_cat,w,11.13,17.12,b,NaN_cat,w,u,w,t,g,NaN_cat,d,a
3116946,6.90,o,t,o,f,NaN_cat,c,y,1.27,10.75,NaN_cat,NaN_cat,n,NaN_cat,NaN_cat,f,f,NaN_cat,d,a
3116947,2.00,b,g,n,f,NaN_cat,c,n,6.18,3.14,NaN_cat,NaN_cat,n,NaN_cat,NaN_cat,f,f,NaN_cat,d,s
3116948,3.47,x,t,n,f,s,c,n,4.98,8.51,NaN_cat,NaN_cat,w,NaN_cat,n,t,z,NaN_cat,d,u
3116949,6.17,x,h,y,f,p,NaN_cat,y,6.73,13.70,NaN_cat,NaN_cat,y,NaN_cat,y,t,NaN_cat,NaN_cat,d,u
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5194904,0.88,x,g,w,f,a,d,w,2.67,1.35,NaN_cat,NaN_cat,e,NaN_cat,NaN_cat,f,f,NaN_cat,d,u
5194905,3.12,x,s,w,f,d,c,w,2.69,7.38,NaN_cat,NaN_cat,w,NaN_cat,NaN_cat,f,f,NaN_cat,g,a
5194906,5.73,x,e,e,f,a,NaN_cat,w,6.16,9.74,NaN_cat,NaN_cat,y,NaN_cat,w,t,z,NaN_cat,d,a
5194907,5.03,b,g,n,f,a,d,g,6.00,3.46,NaN_cat,s,g,NaN_cat,NaN_cat,f,f,NaN_cat,d,a


In [3]:
# Set RETRAIN = True to fit a new predictor; otherwise the predictor saved at
# MODEL_PATH is loaded.
RETRAIN = False
# Single source of truth for where the predictor is written to and read from,
# so the fit and load branches cannot drift apart.
MODEL_PATH = "AutogluonModels/nan_cat"

train_data = TabularDataset(train_df)
test_data = TabularDataset(test_df)

if RETRAIN:
    predictor = TabularPredictor(
        label=RESPONSE_COL,  # reuse the notebook-level constant instead of repeating "class"
        path=MODEL_PATH,
        eval_metric="mcc",
    ).fit(train_data=train_data, presets="best_quality")
else:
    predictor = TabularPredictor.load(MODEL_PATH)

In [4]:
# In-sample check: train_data is the same dataset handed to fit() in the
# RETRAIN branch, so these scores are not a held-out estimate.
train_metrics = predictor.evaluate(train_data)
train_metrics

{'mcc': 0.9857494381300482,
 'accuracy': 0.9929353902619392,
 'balanced_accuracy': 0.9929759803197715,
 'roc_auc': 0.998190615989947,
 'f1': 0.9935375680947213,
 'precision': 0.9945316952508534,
 'recall': 0.992545426399499}

In [14]:
# Score the training rows: hard labels plus per-class probabilities.
train_preds = predictor.predict(train_data)
train_pp = predictor.predict_proba(train_data)

# Encode labels with an explicit mapping. Series.map keeps the prediction index
# (the previous list(...) round-trip paired rows purely by position) and avoids
# the downcasting FutureWarning that a str -> int Series.replace emits on
# pandas >= 2.2.
out_pd = pd.DataFrame(index=train_df.index)
out_pd["class"] = train_preds.map({"e": 0, "p": 1})
# The cast raises if any row ended up NaN, i.e. a label outside {"e", "p"} or a
# prediction index that does not line up with train_df.index.
out_pd["class"] = out_pd["class"].astype("int64")

# Append the probabilities, renaming the "e" / "p" columns to pp_0 / pp_1.
out_pd = pd.concat(
    [out_pd, train_pp.rename(columns={"e": "pp_0", "p": "pp_1"})],
    axis=1,
)
out_pd.to_csv("predictions/v2/autogluon_train.csv", index=True)

In [15]:
# Score the test rows: hard labels plus per-class probabilities.
test_preds = predictor.predict(test_data)
test_pp = predictor.predict_proba(test_data)

# Encode labels with an explicit mapping. Series.map keeps the prediction index
# (the previous list(...) round-trip paired rows purely by position) and avoids
# the downcasting FutureWarning that a str -> int Series.replace emits on
# pandas >= 2.2.
out_pd = pd.DataFrame(index=test_df.index)
out_pd["class"] = test_preds.map({"e": 0, "p": 1})
# The cast raises if any row ended up NaN, i.e. a label outside {"e", "p"} or a
# prediction index that does not line up with test_df.index.
out_pd["class"] = out_pd["class"].astype("int64")

# Append the probabilities, renaming the "e" / "p" columns to pp_0 / pp_1.
out_pd = pd.concat(
    [out_pd, test_pp.rename(columns={"e": "pp_0", "p": "pp_1"})],
    axis=1,
)
out_pd.to_csv("predictions/v2/autogluon_test.csv", index=True)

In [16]:
# Compute feature importances on train_data and keep a CSV copy next to the
# notebook; the frame itself stays in `feature_importances` for later cells.
importance_csv = "ag_feature_importances_v2.csv"
feature_importances = predictor.feature_importance(train_data)
feature_importances.to_csv(path_or_buf=importance_csv)

In [17]:
# Render the table returned by predictor.feature_importance(train_data) in the
# previous cell.
feature_importances

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
cap-surface,0.025276,0.003057,2.5e-05,5,0.031571,0.01898
gill-attachment,0.020384,0.002278,1.8e-05,5,0.025075,0.015693
stem-surface,0.019709,0.003091,7e-05,5,0.026072,0.013345
stem-color,0.012543,0.00229,0.000128,5,0.017258,0.007829
stem-width,0.010756,0.002416,0.000286,5,0.015731,0.005781
gill-spacing,0.009869,0.002809,0.000709,5,0.015652,0.004086
ring-type,0.007924,0.002426,0.000935,5,0.012919,0.002928
stem-root,0.00744,0.001618,0.000252,5,0.010772,0.004108
gill-color,0.006071,0.001684,0.000644,5,0.00954,0.002603
stem-height,0.003237,0.001435,0.003627,5,0.006192,0.000283
