In [1]:
import os

import numpy as np
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from pipeline import convert_cols, train_kmeans

In [2]:
# Directory that receives every CSV exported by the cells below
# (predictions, probabilities and feature importances).
ag_data_dir = "AutogluonDataSaves"

# makedirs(exist_ok=True) does nothing when the directory is already there and
# avoids the check-then-create race of a separate isdir() + mkdir().
os.makedirs(ag_data_dir, exist_ok=True)

In [3]:
def setup_autogluon(
    train_dset,
    test_dset,
    model_name,
    saves_dir=None,
    metric="roc_auc",
    models_dir="AutogluonModels",
    **kwargs,
):
    """Collect the settings for one AutoGluon experiment into a single dict.

    The output directory is created as a side effect so later cells can write
    into it without checking for its existence.

    Args:
        train_dset: Training data, stored unchanged under ``"train_data"``.
        test_dset: Test data, stored unchanged under ``"test_data"``.
        model_name: Experiment name; also the stem of the generated CSV file
            names.
        saves_dir: Directory to create for this experiment's outputs.
            Defaults to ``model_name`` (relative to the working directory).
        metric: Evaluation metric name stored under ``"metric"``.
        models_dir: Root directory for saved predictors, stored under
            ``"models_dir"``.
        **kwargs: Extra entries merged into the result last, so they override
            any generated key with the same name.

    Returns:
        dict: The settings, including the resolved ``"saves_dir"``.

    Raises:
        OSError: If ``saves_dir`` cannot be created (for example, the path
            exists but is not a directory).
    """
    if saves_dir is None:
        saves_dir = model_name

    # makedirs also creates missing parent directories (os.mkdir would raise
    # FileNotFoundError for a nested path) and tolerates an existing directory.
    os.makedirs(saves_dir, exist_ok=True)

    return {
        "train_data": train_dset,
        "test_data": test_dset,
        "models_dir": models_dir,
        "model_name": model_name,
        "metric": metric,
        "save_preds_name": f"{model_name}.csv",
        "save_probas_name": f"{model_name}_probs.csv",
        "feat_imps_save": f"{model_name}_importances.csv",
        # Expose the resolved directory so callers know where to write.
        "saves_dir": saves_dir,
        **kwargs,
    }

# Base Dataset Model


In [5]:
# Read both data splits from disk; the "id" column becomes the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in ("data/train.csv", "data/test.csv")
)

In [6]:
# Name of the label column handed to the predictor below.
TARGET_FEAT = "rainfall"

# Run the shared column conversion from `pipeline` over train first, then test.
train_df, test_df = (convert_cols(frame) for frame in (train_df, test_df))

WARN: No 'rainfall' column found


In [7]:
# Set to False to load the predictor stored under models_dir/model_name
# instead of fitting a new one.
RETRAIN = True

# Wrap the pandas frames in AutoGluon's tabular container.
train_data = TabularDataset(train_df)
test_data = TabularDataset(test_df)

# Where the predictor lives on disk and which evaluation metric it is given.
models_dir = "AutogluonModels"
model_name = "base"
metric = "roc_auc"

# File names used by the export cells below (joined onto ag_data_dir there).
save_preds_name = "base_ag.csv"
save_probas_name = "base_ag_probas.csv"
feat_imps_save = "base_imps.csv"

model_path = os.path.join(models_dir, model_name)

if not RETRAIN:
    predictor = TabularPredictor.load(model_path)
else:
    predictor = TabularPredictor(
        label=TARGET_FEAT, path=model_path, eval_metric=metric
    )
    predictor = predictor.fit(train_data=train_data, num_gpus=1)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.9
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Memory Avail:       14.29 GB / 31.93 GB (44.8%)
Disk Space Avail:   445.68 GB / 1863.00 GB (23.9%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	presets='good'         : Good accuracy with very fast inference speed.
	presets='medi

In [8]:
# Score the predictor on the training frame. When RETRAIN is True this is the
# same data it was fit on, so these are in-sample metrics.
train_metrics = predictor.evaluate(train_data)
train_metrics

{'roc_auc': 0.9566936026936026,
 'accuracy': 0.9009132420091325,
 'balanced_accuracy': 0.8383164983164983,
 'mcc': 0.7223862462328952,
 'f1': 0.9360070775582424,
 'precision': 0.9115450890292935,
 'recall': 0.9618181818181818}

In [9]:
# Class predictions and probabilities for the test frame
# (as_multiclass=False yields a single Series rather than one column per class).
test_preds = predictor.predict(test_data)
test_probas = predictor.predict_proba(test_data, as_multiclass=False)

# Write both results into the shared export directory.
preds_path = os.path.join(ag_data_dir, save_preds_name)
probas_path = os.path.join(ag_data_dir, save_probas_name)
test_preds.to_csv(preds_path)
test_probas.to_csv(probas_path)

test_probas

id
2190    0.930488
2191    0.939786
2192    0.896915
2193    0.085392
2194    0.062865
          ...   
2915    0.929509
2916    0.791402
2917    0.919580
2918    0.941110
2919    0.872247
Name: rainfall, Length: 730, dtype: float32

In [10]:
# Feature importances computed against the training frame, exported as CSV
# into the shared export directory.
feature_importances = predictor.feature_importance(train_data)
importances_path = os.path.join(ag_data_dir, feat_imps_save)
feature_importances.to_csv(importances_path)

feature_importances

Computing feature importance via permutation shuffling for 11 features using 2190 rows with 5 shuffle sets...
	8.46s	= Expected runtime (1.69s per shuffle set)
	2.47s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
cloud,0.141763,0.007004,7.127421e-07,5,0.156184,0.127341
sunshine,0.042046,0.00364,6.67481e-06,5,0.049542,0.034551
dewpoint,0.028898,0.00331,2.02937e-05,5,0.035713,0.022083
day,0.020211,0.000173,6.48543e-10,5,0.020567,0.019854
humidity,0.014024,0.00269,0.0001547493,5,0.019563,0.008486
winddirection,0.010759,0.000472,4.451354e-07,5,0.011732,0.009786
windspeed,0.010499,0.001289,2.67341e-05,5,0.013154,0.007845
pressure,0.010468,0.001522,5.210308e-05,5,0.013601,0.007335
mintemp,0.009477,0.0006,1.921343e-06,5,0.010713,0.008241
temperature,0.009279,0.000764,5.466742e-06,5,0.010852,0.007706


# Numerical Day Model


In [None]:
# Re-read both splits from disk so this section starts from the files again;
# the "id" column becomes the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in ("data/train.csv", "data/test.csv")
)

In [None]:
# Label column passed to the predictor in the training cell.
TARGET_FEAT = "rainfall"

# Apply the `pipeline` column conversion to train first, then test.
train_df, test_df = (convert_cols(frame) for frame in (train_df, test_df))

WARN: No 'rainfall' column found


In [None]:
# Set to False to load the predictor saved by an earlier run instead of refitting.
RETRAIN = True

train_data = TabularDataset(train_df)
test_data = TabularDataset(test_df)

# Predictor location on disk and the evaluation metric it is given.
models_dir = "AutogluonModels"
model_name = "day_numerical"
metric = "roc_auc"

# Export file names are all derived from model_name.
save_preds_name = f"{model_name}.csv"
save_probas_name = f"{model_name}_probs.csv"
feat_imps_save = f"{model_name}_importances.csv"

model_path = os.path.join(models_dir, model_name)

if not RETRAIN:
    predictor = TabularPredictor.load(model_path)
else:
    predictor = TabularPredictor(
        label=TARGET_FEAT, path=model_path, eval_metric=metric
    )
    predictor = predictor.fit(train_data=train_data, num_gpus=1)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.9
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Memory Avail:       10.56 GB / 31.93 GB (33.1%)
Disk Space Avail:   445.63 GB / 1863.00 GB (23.9%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	presets='good'         : Good accuracy with very fast inference speed.
	presets='medi

In [None]:
# In-sample check: when RETRAIN is True the predictor is scored on the very
# frame it was fit on.
train_metrics = predictor.evaluate(train_data)
train_metrics

{'roc_auc': 0.9825881032547699,
 'accuracy': 0.9374429223744293,
 'balanced_accuracy': 0.888097643097643,
 'mcc': 0.8273167001231257,
 'f1': 0.9595750958984951,
 'precision': 0.9350201265094882,
 'recall': 0.9854545454545455}

In [None]:
# Class predictions and probabilities for the test frame.
test_preds = predictor.predict(test_data)
test_probas = predictor.predict_proba(test_data, as_multiclass=False)

# Export both under ag_data_dir, using the names set in the training cell.
preds_path = os.path.join(ag_data_dir, save_preds_name)
probas_path = os.path.join(ag_data_dir, save_probas_name)
test_preds.to_csv(preds_path)
test_probas.to_csv(probas_path)

test_probas

id
2190    0.993326
2191    0.994826
2192    0.958189
2193    0.207794
2194    0.058902
          ...   
2915    0.992182
2916    0.931664
2917    0.988348
2918    0.993542
2919    0.940876
Name: rainfall, Length: 730, dtype: float32

In [None]:
# Feature importances measured on the training frame, exported next to the
# prediction CSVs.
feature_importances = predictor.feature_importance(train_data)
importances_path = os.path.join(ag_data_dir, feat_imps_save)
feature_importances.to_csv(importances_path)

feature_importances

Computing feature importance via permutation shuffling for 11 features using 2190 rows with 5 shuffle sets...
	2.16s	= Expected runtime (0.43s per shuffle set)
	0.77s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
cloud,0.202862,0.007504,2.242631e-07,5,0.218312,0.187411
day,0.062283,0.003311,9.544971e-07,5,0.0691,0.055466
winddirection,0.057888,0.001558,6.285649e-08,5,0.061096,0.054681
dewpoint,0.034249,0.001911,1.158865e-06,5,0.038184,0.030314
windspeed,0.032945,0.001803,1.072646e-06,5,0.036657,0.029232
humidity,0.026457,0.001546,1.393701e-06,5,0.029641,0.023273
sunshine,0.023745,0.002168,8.249373e-06,5,0.028209,0.019281
pressure,0.023662,0.001351,1.270651e-06,5,0.026444,0.02088
maxtemp,0.020763,0.000966,5.595029e-07,5,0.022751,0.018775
mintemp,0.019367,0.001088,1.191912e-06,5,0.021608,0.017126


# Time Series with KMeans Cluster IDs Model


In [64]:
# train_df = pd.read_csv("data/train.csv", index_col="id")
# test_df = pd.read_csv("data/test.csv", index_col="id")

In [65]:
# # Fix the one test row with a nan value in winddirection
# test_df.loc[test_df.isna().any(axis=1), "winddirection"] = np.median(
#     train_df[train_df["day"] == 153]["winddirection"]
# )

In [None]:
# TARGET_FEAT = "rainfall"
# TIME_COL = "day"

# train_df = convert_cols(train_df)
# test_df = convert_cols(test_df)

# kmeans = train_kmeans(train_df.drop(columns=["rainfall"]), n_clusters=3)

# train_df["cluster_id"] = kmeans.predict(train_df.drop(columns=["rainfall"]))
# test_df["cluster_id"] = kmeans.predict(test_df)


In [67]:
# # Fix some mislabeled days
# train_df["expected_day"] = (train_df.index) % 365 + 1
# train_df["day_mislabeled"] = train_df["day"] != train_df["expected_day"]
# train_df.loc[train_df["day_mislabeled"], "day"] = train_df.loc[
#     train_df["day_mislabeled"], "expected_day"
# ]

# train_df = train_df.drop(columns=["day_mislabeled", "expected_day"])

# # Create artificial years for time series analysis
# train_df["year"] = train_df.index // 365
# train_df["year"] += 1
# test_df["year"] = test_df.index // 365
# test_df["year"] += 1

# # Create artificial datetime col
# train_df["date"] = pd.to_datetime(
#     (train_df["year"] + 1970).astype(str)
# ) + pd.to_timedelta(train_df["day"] - 1, unit="D")
# test_df["date"] = pd.to_datetime(
#     (test_df["year"] + 1970).astype(str)
# ) + pd.to_timedelta(test_df["day"] - 1, unit="D")

In [68]:
# train_df = train_df.drop(columns=["day", "year"])
# test_df = test_df.drop(columns=["day", "year"])

# train_df = train_df.set_index([train_df.index, "date"])
# test_df = test_df.set_index([test_df.index, "date"])

# train_df = train_df.rename(columns={"rainfall":"target"})
# test_df = test_df.rename(columns={"rainfall":"target"})

In [None]:
# train_static_df = train_df["cluster_id"].reset_index().drop(columns=["date"])
# test_static_df = test_df["cluster_id"].reset_index().drop(columns=["date"])
# train_static_df["id"] = 0
# test_static_df["id"] = 0

In [70]:
# train_df = train_df.reset_index()
# test_df = test_df.reset_index()
# train_df["id"] = 0
# test_df["id"] = 0

In [71]:
# train_ts_df = TimeSeriesDataFrame.from_data_frame(
#     train_df.drop(columns=["cluster_id"]),
#     id_column="id",
#     timestamp_column="date",
#     static_features_df=train_static_df,
# )
# test_ts_df = TimeSeriesDataFrame.from_data_frame(
#     test_df.drop(columns=["cluster_id"]),
#     id_column="id",
#     timestamp_column="date",
#     static_features_df=test_static_df,
# )

In [72]:
# prediction_len = 30
# train_data, test_data = train_ts_df[:-prediction_len], train_ts_df

In [None]:
# train_data.convert_frequency(freq="D")

In [73]:
# predictor_params = setup_autogluon(
#     train_df, test_df, "time_series_kmeans", saves_dir=os.path.join(ag_data_dir, "time_series_kmeans")
# )

In [None]:
# NOTE(review): disabled experiment, kept for reference. Before re-enabling,
# make the `else` branch load from the same location the `if` branch writes to
# (predictor_params["models_dir"] / predictor_params["model_name"]). As written
# it joins the bare `models_dir` / `model_name` globals, which the tabular
# sections last set to "AutogluonModels" / "day_numerical".

# RETRAIN = True

# if RETRAIN:
#     predictor = TimeSeriesPredictor(
#         prediction_length=prediction_len,
#         eval_metric="MASE",
#         path=os.path.join(
#             predictor_params["models_dir"], predictor_params["model_name"]
#         ),
#     ).fit(train_data=train_data.convert_frequency(freq="D"))
# else:
#     predictor = TimeSeriesPredictor.load(os.path.join(models_dir, model_name))

In [None]:
# predictor.evaluate(train_data)

In [None]:
# NOTE(review): disabled experiment, kept for reference. `save_preds_name` and
# `save_probas_name` here are the bare globals left over from the tabular
# sections, not the entries stored in `predictor_params`. The
# `predict_proba(..., as_multiclass=False)` call mirrors the tabular cells;
# TODO confirm TimeSeriesPredictor offers an equivalent before re-enabling.

# test_preds = predictor.predict(test_data)
# test_probas = predictor.predict_proba(test_data, as_multiclass=False)

# test_preds.to_csv(os.path.join(ag_data_dir, save_preds_name))
# test_probas.to_csv(os.path.join(ag_data_dir, save_probas_name))
# test_probas

In [None]:
# feature_importances = predictor.feature_importance(train_data)
# feature_importances.to_csv(os.path.join(ag_data_dir, feat_imps_save))

# feature_importances