In [7]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor

import plotly.express as px

from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import gc

In [3]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [4]:
categorical_cols = [
    "Gender",
    "Driving_License",
    "Region_Code",
    "Previously_Insured",
    "Vehicle_Age",
    "Vehicle_Damage",
    "Policy_Sales_Channel",
    "Response",
]

for df in [train_df, test_df]:
    for col in categorical_cols:
        try:
            df[col] = df[col].astype("category")
        except:
            continue

    df["Age"] = df["Age"].astype("int8")
    df["Region_Code"] = df["Region_Code"].astype("int8")
    df["Annual_Premium"] = df["Annual_Premium"].astype("int32")
    df["Vintage"] = df["Vintage"].astype("int16")

    df["Gender"] = df["Gender"].cat.rename_categories({"Female":0, "Male":1})
    df["Vehicle_Age"] = df["Vehicle_Age"].cat.rename_categories({"< 1 Year":0, "1-2 Year":1, "> 2 Years":2})
    df["Vehicle_Damage"] = df["Vehicle_Damage"].cat.rename_categories({"No":0, "Yes":1})

    df["Not_Insured_and_Damaged"] = (df["Previously_Insured"] == 0) & (df["Vehicle_Damage"] == 1)
    df["Not_Insured_and_Damaged"] = df["Not_Insured_and_Damaged"].astype("int8")

In [5]:
zeroes = train_df[train_df["Response"] == 0]
ones = train_df[train_df["Response"] == 1]
undersampled_zeroes = zeroes.sample(len(ones))

downsampled_df = pd.concat([ones, undersampled_zeroes])
zeroes = None
ones = None
gc.collect()

0

In [5]:
RETRAIN = True

if RETRAIN:
    train_data = TabularDataset(downsampled_df)
    test_data = TabularDataset(test_df)

    predictor = TabularPredictor(label="Response").fit(train_data=train_data)
else:
    predictor = TabularPredictor.load("AutogluonModels/simple")

No path specified. Models will be saved in: "AutogluonModels\ag-20240725_182647"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240725_182647"
AutoGluon Version:  1.1.0


[1000]	valid_set's binary_error: 0.194368


	0.8061	 = Validation score   (accuracy)
	66.79s	 = Training   runtime
	0.25s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's binary_error: 0.190623
[2000]	valid_set's binary_error: 0.190199


	0.8102	 = Validation score   (accuracy)
	73.18s	 = Training   runtime
	0.3s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.8026	 = Validation score   (accuracy)
	293.35s	 = Training   runtime
	0.16s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.803	 = Validation score   (accuracy)
	360.04s	 = Training   runtime
	0.19s	 = Validation runtime
Fitting model: CatBoost ...
	0.8084	 = Validation score   (accuracy)
	978.36s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.7985	 = Validation score   (accuracy)
	154.03s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.7982	 = Validation score   (accuracy)
	161.47s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.8027	 = Validation score   (accuracy)
	1048.21s	 = Training   runtime
	0.14s	 = Validation runtime
Fitting model: XGBoost ...
	0.8074	 = Validation score   (accuracy)
	30.05s	 = Training   ru

In [6]:
test_preds = predictor.predict(test_df)
predictor.evaluate(train_data)

{'accuracy': 0.8163468802360891,
 'balanced_accuracy': 0.8163468802360891,
 'mcc': 0.6508285933140227,
 'roc_auc': 0.8846388421591463,
 'f1': 0.8356143934185032,
 'precision': 0.7562719869473744,
 'recall': 0.9335561273416868}

In [7]:
out_pd = pd.DataFrame(index=test_df["id"])
out_pd["Response"] = list(test_preds)
out_pd.to_csv("autogluon_downsampled.csv", columns=["Response"], index=True)

In [8]:
feature_importances = predictor.feature_importance(train_data)
feature_importances.to_csv("ag_feature_importances.csv")

Computing feature importance via permutation shuffling for 12 features using 5000 rows with 5 shuffle sets...
	37.33s	= Expected runtime (7.47s per shuffle set)
	11.59s	= Actual runtime (Completed 5 of 5 shuffle sets)


In [11]:
feature_importances

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
Not_Insured_and_Damaged,0.06932,0.009943,4.9e-05,5,0.089794,0.048846
Previously_Insured,0.05712,0.008633,6.1e-05,5,0.074896,0.039344
Policy_Sales_Channel,0.04536,0.005233,2.1e-05,5,0.056136,0.034584
Age,0.0198,0.003826,0.000159,5,0.027678,0.011922
Vintage,0.00936,0.001652,0.000112,5,0.012761,0.005959
Region_Code,0.00864,0.003164,0.001819,5,0.015154,0.002126
Vehicle_Damage,0.00672,0.004272,0.012256,5,0.015517,-0.002077
Vehicle_Age,0.0056,0.001503,0.000568,5,0.008695,0.002505
Annual_Premium,0.004,0.001463,0.001811,5,0.007012,0.000988
Gender,0.00144,0.001108,0.021934,5,0.003722,-0.000842


In [15]:
fig = px.bar(x=feature_importances.index, y=feature_importances["importance"])
fig.show()

In [9]:
age_df = train_df[["Age", "Response"]]

mms = MinMaxScaler()
ss = StandardScaler()
svc_model = LinearSVC(verbose=1)

age_normed = mms.fit_transform(age_df["Age"].values.reshape(-1, 1))
age_ss = ss.fit_transform(age_df["Age"].values.reshape(-1, 1))


In [10]:
svc_model.fit(age_ss, age_df["Response"])



[LibLinear]

In [12]:
svc_preds = svc_model.predict(age_ss)
set(svc_preds)

{0}

In [6]:
from sklearn.utils.class_weight import compute_sample_weight

compute_sample_weight(class_weight="balanced", y=train_df["Response"])

array([0.57012367, 4.06513015, 0.57012367, ..., 0.57012367, 4.06513015,
       0.57012367])