In [11]:
import pandas as pd
import optuna

import sklearn.linear_model as sklm
import sklearn.model_selection as skms
import sklearn.metrics as metrics
import sklearn.pipeline as skpl
import sklearn.preprocessing as skpp
import sklearn.compose as skcmp
import category_encoders as ce

import lightgbm as lgb
import xgboost as xgb

from preprocessing import *
from ucimlrepo import fetch_ucirepo
from autogluon.tabular import TabularDataset, TabularPredictor

import gc

In [12]:
# Raw inputs: the train/test CSVs (indexed by "id") plus the 'original' table
# of UCI repository dataset id 848.
train_df = pd.read_csv("data/train.csv", index_col="id")
test_df = pd.read_csv("data/test.csv", index_col="id")
orig_df = fetch_ucirepo(id=848)['data']['original']

# Column roles. Every column that is not continuous is treated as categorical;
# note that this list is derived from train_df, so it also contains "class".
CONT_FEATS = ["cap-diameter", "stem-height", "stem-width"]
CAT_FEATS = [col for col in train_df.columns if col not in CONT_FEATS]
RESPONSE_COL = "class"

# Apply the same column conversion to all three frames (train, test, original,
# in that order). convert_cols comes from the local `preprocessing` module.
train_df, test_df, orig_df = (
    convert_cols(frame, CONT_FEATS, CAT_FEATS)
    for frame in (train_df, test_df, orig_df)
)

# NOTE(review): judging by the helper's name, this nulls category values that
# do not occur in orig_df -- confirm against `preprocessing`.
train_df, test_df = (
    null_all_non_original_categories(frame, orig_df, CAT_FEATS)
    for frame in (train_df, test_df)
)



In [13]:
# Re-label the categorical target in the frames that carry it: "e" -> 0, "p" -> 1.
# test_df is not touched here.
label_codes = {"e":0, "p":1}
for labelled_df in (train_df, orig_df):
    recoded = labelled_df["class"].cat.rename_categories(label_codes)
    labelled_df["class"] = recoded

In [14]:
def _ensure_parent_dir(path):
    """Create the directory that will hold ``path`` if it does not exist yet.

    Only filesystem paths (``str`` / ``os.PathLike``) are handled; any other
    object (e.g. an already-open file handle) is left for ``to_csv`` to use
    as-is.
    """
    import os

    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)


def _predict_and_save(model, features, path, save_pred_proba):
    """Predict on ``features`` with ``model`` and write the result to ``path`` as CSV.

    The CSV is indexed like ``features`` and has a "class" column in which the
    predicted codes 0 / 1 are written as "e" / "p" (other values pass through
    unchanged). When ``save_pred_proba`` is truthy, columns 0 and 1 of
    ``model.predict_proba`` are added as "pred_proba_0" / "pred_proba_1".
    """
    print("Getting predictions...")
    preds = model.predict(features)
    pred_proba = model.predict_proba(features) if save_pred_proba else None

    print("Saving predictions")
    out_df = pd.DataFrame({"class": preds}, index=features.index)
    out_df["class"] = out_df["class"].replace({0: "e", 1: "p"})
    if save_pred_proba:
        out_df["pred_proba_0"] = pred_proba[:, 0]
        out_df["pred_proba_1"] = pred_proba[:, 1]

    out_df.to_csv(path)


def fit_predict_save_train(model, path, save_pred_proba=True):
    """Fit ``model`` on the global ``train_df`` and save its train-set predictions.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and (if probabilities are
        requested) ``predict_proba``.
    path : destination CSV path; its parent directory is created if missing.
    save_pred_proba : when True (default) also store the two class
        probabilities.

    The model is fitted in place, so it can afterwards be passed to
    ``predict_save_test``.

    NOTE(review): the saved predictions are in-sample (the model predicts the
    very rows it was fitted on). If these files feed a stacking model,
    out-of-fold predictions are probably what is wanted -- confirm.
    """
    # Create the output directory up front so a missing folder is not
    # discovered only after an expensive fit.
    _ensure_parent_dir(path)

    # Build the feature matrix once instead of re-dropping the target column
    # (a full copy of the frame) for fit, predict and predict_proba.
    features = train_df.drop(columns=["class"])

    print("Fitting model...")
    model.fit(features, train_df["class"])

    _predict_and_save(model, features, path, save_pred_proba)


def predict_save_test(model, path, save_pred_proba=True):
    """Save the predictions of an already fitted ``model`` on the global ``test_df``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and (if probabilities are
        requested) ``predict_proba``.
    path : destination CSV path; its parent directory is created if missing.
    save_pred_proba : when True (default) also store the two class
        probabilities.
    """
    _ensure_parent_dir(path)
    _predict_and_save(model, test_df, path, save_pred_proba)

In [15]:
# Rebuild an XGBoost classifier from the best trial of the "xgb_exploration"
# Optuna study, then export its train-set and test-set predictions.
# NOTE(review): the recorded output of this cell contains an XGBoost hint about
# a device mismatch at prediction time (model on "cuda", input frame on CPU).
study = optuna.load_study(
    study_name="xgb_exploration",
    storage="sqlite:///optuna.sqlite3",
)
fixed_params = dict(
    enable_categorical=True,
    device="cuda",
    random_state=0,
    n_jobs=-1,
)
# Two ** expansions keep the original semantics: a key present in both
# mappings still raises TypeError instead of being silently overridden.
best_model = xgb.XGBClassifier(**fixed_params, **study.best_params)

fit_predict_save_train(best_model, "ensemble_data/train_simple_xgb_100.csv")
predict_save_test(best_model, "test_preds/test_simple_xgb_100.csv")

Fitting model...
Getting predictions...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Saving predictions
Getting predictions...
Saving predictions


In [16]:
# Rebuild a LightGBM DART classifier from the best trial of the "dart_tuning"
# Optuna study, then export its train-set and test-set predictions.
study = optuna.load_study(
    study_name="dart_tuning",
    storage="sqlite:///optuna.sqlite3",
)
fixed_params = dict(boosting_type="dart", n_jobs=-1)
# Two ** expansions keep the original semantics: a key present in both
# mappings still raises TypeError instead of being silently overridden.
best_model = lgb.LGBMClassifier(**fixed_params, **study.best_params)

fit_predict_save_train(best_model, "ensemble_data/train_tuned_dart_100.csv")
predict_save_test(best_model, "test_preds/test_tuned_dart_100.csv")

Fitting model...
[LightGBM] [Info] Number of positive: 1705396, number of negative: 1411549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065769 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 898
[LightGBM] [Info] Number of data points in the train set: 3116945, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547137 -> initscore=0.189110
[LightGBM] [Info] Start training from score 0.189110
Getting predictions...
Saving predictions
Getting predictions...
Saving predictions


In [17]:
# Rebuild a LightGBM GBDT classifier from the best trial of the "lgbm_tuning"
# Optuna study, then export its train-set and test-set predictions.
study = optuna.load_study(
    study_name="lgbm_tuning",
    storage="sqlite:///optuna.sqlite3",
)
fixed_params = dict(boosting_type="gbdt", n_jobs=-1)
# Two ** expansions keep the original semantics: a key present in both
# mappings still raises TypeError instead of being silently overridden.
best_model = lgb.LGBMClassifier(**fixed_params, **study.best_params)

fit_predict_save_train(best_model, "ensemble_data/train_tuned_lgbm_100.csv")
predict_save_test(best_model, "test_preds/test_tuned_lgbm_100.csv")

Fitting model...
[LightGBM] [Info] Number of positive: 1705396, number of negative: 1411549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066205 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 898
[LightGBM] [Info] Number of data points in the train set: 3116945, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547137 -> initscore=0.189110
[LightGBM] [Info] Start training from score 0.189110
Getting predictions...
Saving predictions
Getting predictions...
Saving predictions


In [18]:
# Rebuild an XGBoost classifier from the best trial of the "xgb_tuning"
# Optuna study, then export its train-set and test-set predictions.
study = optuna.load_study(
    study_name="xgb_tuning",
    storage="sqlite:///optuna.sqlite3",
)
fixed_params = dict(
    enable_categorical=True,
    device="cuda",
    random_state=0,
    n_jobs=-1,
)
# Two ** expansions keep the original semantics: a key present in both
# mappings still raises TypeError instead of being silently overridden.
best_model = xgb.XGBClassifier(**fixed_params, **study.best_params)

fit_predict_save_train(best_model, "ensemble_data/train_tuned_xgb_100.csv")
predict_save_test(best_model, "test_preds/test_tuned_xgb_100.csv")

Fitting model...
Getting predictions...
Saving predictions
Getting predictions...
Saving predictions


In [19]:
# Export predictions of the previously trained AutoGluon predictor stored in
# "AutogluonModels/simple" for both the train and the test frame.
train_data = TabularDataset(train_df)
test_data = TabularDataset(test_df)
predictor = TabularPredictor.load("AutogluonModels/simple")

# Probability columns labelled "e"/"p" are renamed to match the column names
# used by the other exported prediction files.
proba_names = {"e":"pred_proba_0", "p":"pred_proba_1"}
export_jobs = (
    (train_df, train_data, "ensemble_data/train_autogluon.csv"),
    (test_df, test_data, "test_preds/test_autogluon.csv"),
)
for source_df, dataset, csv_path in export_jobs:
    labels = predictor.predict(dataset)
    probas = predictor.predict_proba(dataset)
    # list(...) discards the prediction index so values are assigned
    # positionally onto source_df's index.
    out_pd = pd.DataFrame({"class": list(labels)}, index=source_df.index)
    out_pd = pd.concat([out_pd, probas], axis=1).rename(columns=proba_names)
    out_pd.to_csv(csv_path)

# Drop the large intermediates and reclaim memory before the notebook goes on.
del train_data, test_data, predictor, export_jobs, proba_names
del source_df, dataset, csv_path, labels, probas, out_pd
gc.collect()

10145