In [1]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor

import plotly.express as px

from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from preprocessing import *

import gc

In [2]:
# Load the three CSVs, each indexed by its "id" column.
train_df, test_df, orig_df = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in ("data/train.csv", "data/test.csv", "data/orig.csv")
)

# Columns handled as continuous; every other test column is handled as categorical.
CONT_FEATS = ["cap-diameter", "stem-height", "stem-width"]
# Derived from test_df — presumably because it carries no label column (TODO confirm).
CAT_FEATS = [col for col in test_df.columns if col not in CONT_FEATS]
RESPONSE_COL = "class"

# convert_cols comes from the star-imported `preprocessing` module; presumably it
# casts the listed continuous/categorical columns to suitable dtypes — verify there.
train_df, test_df, orig_df = (
    convert_cols(frame, CONT_FEATS, CAT_FEATS)
    for frame in (train_df, test_df, orig_df)
)

# NOTE(review): judging by its name, this nulls category values that do not occur
# in orig_df — confirm against the `preprocessing` module. Only the train and test
# frames are passed through it; orig_df serves as the reference.
train_df, test_df = (
    null_all_non_original_categories(frame, orig_df, CAT_FEATS)
    for frame in (train_df, test_df)
)

In [5]:
# Train-or-load switch: set RETRAIN to True to fit a fresh AutoGluon predictor,
# leave it False to reuse the model already saved under MODEL_PATH.
RETRAIN = False
# Single source of truth for the model directory so the fit and load branches
# can never point at different locations.
MODEL_PATH = "AutogluonModels/simple"

train_data = TabularDataset(train_df)
test_data = TabularDataset(test_df)

if RETRAIN:
    # The label comes from RESPONSE_COL (defined with the feature lists) rather
    # than a second hard-coded copy of the column name.
    predictor = TabularPredictor(
        label=RESPONSE_COL,
        path=MODEL_PATH,
        eval_metric="mcc",
    ).fit(train_data=train_data, presets="best_quality")
else:
    predictor = TabularPredictor.load(MODEL_PATH)

In [6]:
# Disabled: would predict on train_data and write the result to "train_autogluon.csv".
# train_preds = predictor.predict(train_data)
# out_pd = pd.DataFrame(index=train_df.index)
# out_pd["Response"] = list(train_preds)
# out_pd.to_csv("train_autogluon.csv", columns=["Response"], index=True)

In [6]:
# Predictions for the test frame; kept for the export cell that follows.
test_preds = predictor.predict(data=test_data)

# The scores shown below are computed on train_data — the same frame handed to
# fit() when RETRAIN is True — so they are in-sample figures, not a held-out estimate.
predictor.evaluate(data=train_data)

{'mcc': 0.9856936500065705,
 'accuracy': 0.9929077991430711,
 'balanced_accuracy': 0.9929464938719754,
 'roc_auc': 0.9979671119746238,
 'f1': 0.9935124316202193,
 'precision': 0.9944907417199556,
 'recall': 0.9925360444143179}

In [7]:
# One row per test id; predictions are attached positionally (via list), so they
# rely on test_preds being in the same row order as test_df.
# NOTE(review): the column is written as "Response" although the label column is
# "class" — confirm that whatever consumes autogluon.csv expects this header.
out_pd = pd.DataFrame({"Response": list(test_preds)}, index=test_df.index)
out_pd.to_csv("autogluon.csv", columns=["Response"], index=True)

In [8]:
# Permutation feature importance (see the log below), saved to CSV for later use.
# NOTE(review): importances are computed on train_data, i.e. data the model may
# have been fitted on; scores from held-out rows would likely be less biased —
# confirm this is intended.
feature_importances = predictor.feature_importance(data=train_data)
feature_importances.to_csv("ag_feature_importances.csv")

Computing feature importance via permutation shuffling for 20 features using 5000 rows with 5 shuffle sets...
	268.16s	= Expected runtime (53.63s per shuffle set)
	164.97s	= Actual runtime (Completed 5 of 5 shuffle sets)


In [9]:
# Display the importance table produced by predictor.feature_importance(...) above.
feature_importances

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
gill-attachment,0.028463,0.001075,2.433452e-07,5,0.030676,0.02625
cap-surface,0.02719,0.001999,3.478308e-06,5,0.031305,0.023075
stem-surface,0.017221,0.003278,0.0001502267,5,0.02397,0.010471
stem-color,0.013837,0.002314,9.050767e-05,5,0.018602,0.009072
stem-width,0.011566,0.001995,0.0001022253,5,0.015674,0.007457
gill-spacing,0.008658,0.002523,0.0007753747,5,0.013852,0.003463
ring-type,0.008411,0.002039,0.00038373,5,0.012609,0.004213
gill-color,0.006478,0.001832,0.0006928653,5,0.010251,0.002705
stem-root,0.005823,0.001161,0.0001802587,5,0.008215,0.003432
cap-shape,0.003157,0.001121,0.001625167,5,0.005465,0.000849
