In [7]:
import pandas as pd
import os

from autogluon.tabular import TabularDataset, TabularPredictor


In [8]:
train_df = pd.read_csv("data/train.csv", index_col="id")
test_df = pd.read_csv("data/test.csv", index_col="id")

train_extra_df = pd.read_csv("data/training_extra.csv", index_col="id")

In [9]:
def convert_cols(df, cat_feats):
    for feat in cat_feats:
        df[feat] = df[feat].astype("category")


    return df

CONT_FEATS = ["Compartments", "Weight Capacity (kg)"]
CAT_FEATS = [name for name in train_df.columns.to_list() if name not in CONT_FEATS and name != "Price"]
TARGET_FEAT = "Price"

train_df = convert_cols(train_df, CAT_FEATS)
test_df = convert_cols(test_df, CAT_FEATS)
train_extra_df = convert_cols(train_extra_df, CAT_FEATS)

In [10]:
RETRAIN = True
train_data = TabularDataset(train_df)
test_data = TabularDataset(test_df)

models_dir = "AutogluonModels"
model_name = "base"

metric = "root_mean_squared_error"


if RETRAIN:
    predictor = TabularPredictor(
        label=TARGET_FEAT, 
        path=os.path.join(models_dir, model_name), 
        eval_metric=metric).fit(
            train_data=train_data, 
            presets="best_quality",
            ag_args_fit={"num_gpus":1})
else:
    predictor = TabularPredictor.load(os.path.join(models_dir, model_name))

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.9
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Memory Avail:       18.97 GB / 31.93 GB (59.4%)
Disk Space Avail:   449.10 GB / 1863.00 GB (24.1%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 900s of the 3600s of remaining time (25%).
		Cont

In [11]:
predictor.evaluate(train_data)

{'root_mean_squared_error': -38.49682012537276,
 'mean_squared_error': -1482.005159765305,
 'mean_absolute_error': -33.31073795959358,
 'r2': 0.027597154069714858,
 'pearsonr': 0.28052697509831,
 'median_absolute_error': -33.365060474853514}

In [17]:
ag_data_dir = "AutogluonDataSaves"

if not os.path.isdir(ag_data_dir):
    os.mkdir(ag_data_dir)


In [21]:
test_preds = predictor.predict(test_data)

test_preds.to_csv(os.path.join(ag_data_dir, "base.csv"))
test_preds

id
300000    81.038185
300001    83.754807
300002    83.539169
300003    81.186523
300004    79.426109
            ...    
499995    80.201431
499996    78.026917
499997    82.367645
499998    79.322334
499999    81.349251
Name: Price, Length: 200000, dtype: float32

In [18]:
feature_importances = predictor.feature_importance(train_data)
feature_importances.to_csv(os.path.join(ag_data_dir, "feat_imps.csv"))

feature_importances

Computing feature importance via permutation shuffling for 9 features using 5000 rows with 5 shuffle sets...
	208.59s	= Expected runtime (41.72s per shuffle set)
	88.64s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
Weight Capacity (kg),0.561485,0.017058,1.021065e-07,5,0.596609,0.526362
Compartments,0.3973,0.01858,5.722636e-07,5,0.435556,0.359044
Color,0.081503,0.012299,6.037601e-05,5,0.106826,0.056179
Material,0.077238,0.017175,0.0002749905,5,0.112601,0.041875
Brand,0.04612,0.006538,4.71943e-05,5,0.059582,0.032658
Size,0.028262,0.01249,0.003590287,5,0.053979,0.002546
Style,0.016524,0.002643,7.598027e-05,5,0.021967,0.011081
Waterproof,0.013758,0.02019,0.1011348,5,0.05533,-0.027815
Laptop Compartment,-0.002289,0.009394,0.6926187,5,0.017054,-0.021633
