In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, classification_report

import lightgbm as lgbm
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

import optuna

from preprocessing import agg_over_months
from utils import study_summary


# Default size (width, height) applied to every matplotlib figure created in this notebook.
plt.rcParams["figure.figsize"] = (12, 6)

# Setup

In [2]:
# Load the labelled training table (first CSV column becomes the index)
# and discard exact duplicate rows.
train_df = pd.read_csv("./data/train.csv", index_col=0).drop_duplicates()

# Features are every column except the target; the target is the LABELS column.
X = train_df.drop(columns=["LABELS"])
y = train_df["LABELS"]

# Seeded 70/30 split that keeps the label proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [3]:
# Constants to select pandas columns efficiently
# all months abbreviations: jan, feb, mar, etc.
MONTHS = [m.lower() for m in pd.date_range(0, freq="M", periods=12).strftime("%b").to_list()]
# return all columns based on MONTH: train_df[COL_BY_MONTH["jan"]]
# The month is the 3-letter suffix of a column name (COL_BASE below strips it
# with col[:-3]), so match on the suffix: a plain substring test would also
# pick up columns that merely contain e.g. "mar" or "dec" elsewhere in the name.
COL_BY_MONTH = {}
for month in MONTHS:
    COL_BY_MONTH[month] = [col for col in X.columns if col.endswith(month)]

# all features
FEATURES = [col for col in X.columns if col != "LABELS"]
# all features with the month stripped: S2_B2_
# dict.fromkeys de-duplicates while keeping first-seen column order; a set of
# strings is iterated in a different order from one interpreter run to the next.
COL_BASE = list(dict.fromkeys(col[:-3] for col in FEATURES))
# return all columns based on FEATURE (e.g., across months): train_df[COL_BY_FEATURE["S2_B2_"]]
# alternatively, you can select the same df using: train_df[COL_BY_FEATURE[COL_BASE[0]]]
# Compare the month-stripped name for equality so that one base never matches
# the columns of another base that happens to contain it as a substring.
COL_BY_FEATURE = {}
for feature in COL_BASE:
    COL_BY_FEATURE[feature] = [col for col in X.columns if col[:-3] == feature]

# Feature Generation

In [4]:
# Aggregate features for the training split, computed by the project helper
# (freq=3 presumably means 3-month windows -- confirm in preprocessing.agg_over_months).
# Generated columns that are entirely NaN are discarded.
generated_train_features_df = agg_over_months(
    X_train,
    agg_func=["mean", "std", "min", "max"],
    freq=3,
).dropna(axis=1, how="all")

# Attach the generated columns to the raw features, matching rows on the index.
X_train = pd.merge(
    X_train,
    generated_train_features_df,
    left_index=True,
    right_index=True,
)

In [5]:
# Build the same aggregate features for the hold-out split and attach them on the index.
generated_test_features_df = agg_over_months(X_test, agg_func=["mean", "std", "min", "max"], freq=3)
X_test = X_test.merge(generated_test_features_df, left_index=True, right_index=True)
# Keep exactly the training columns, in the training order; this also removes
# any generated column that was dropped from the training frame.
X_test = X_test[X_train.columns]

In [6]:
# Class balance of the training and hold-out labels (label -> row count).
print(y_train.value_counts(), y_test.value_counts())

1.0    28433
0.0    13641
Name: LABELS, dtype: int64 1.0    12186
0.0     5846
Name: LABELS, dtype: int64


# Optuna Hyperparameter Optimization

In [7]:
def objective(trial, X, y):
    """Optuna objective: mean 5-fold validation log-loss of an XGBoost classifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample one hyperparameter configuration.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Labels aligned with ``X``. ``y.sum()`` is used as the number of
        positive rows, so the labels are presumably 0/1.

    Returns
    -------
    float
        Validation log-loss after the final boosting round, averaged over the
        five folds (lower is better).
    """
    # hyperparameter grid for Optuna to explore
    hyperparams = {
        # structure
        "max_depth": trial.suggest_int("max_depth", 3, 20, step=1),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 15, step=1),
        # accuracy
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "n_estimators": trial.suggest_categorical("n_estimators", [100]),
        # overfitting
        "gamma": trial.suggest_float("gamma", 0.0, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        # XGBoost documents the valid range of subsample as (0, 1]; starting the
        # search at 0.0 allowed trials that train on (almost) no rows.
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
    }

    # validation score of each fold
    fold_scores = []
    # Fixed seed: every trial is scored on the same folds, so trials differ only
    # by their hyperparameters and the study can be reproduced.
    kfold_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for fit_idx, val_idx in kfold_cv.split(X, y):
        # train-validation split for this fold (names kept distinct from the
        # notebook-level X_train / X_test)
        X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
        y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]

        # Weight of the positive class, used to deal with class imbalance.
        # NOTE(review): this is the positive-class *fraction*; the XGBoost docs
        # suggest sum(negative) / sum(positive) as the typical value. Left as is
        # so tuning stays consistent with how the final models are weighted.
        positive_weight = y_fit.sum() / y_fit.shape[0]

        # Train classifier with optuna hyperparameters
        clf = XGBClassifier(
            objective="binary:logistic",
            use_label_encoder=False,
            tree_method="gpu_hist",
            gpu_id=0,
            scale_pos_weight=positive_weight,
            **hyperparams,
        )
        clf.fit(
            X_fit,
            y_fit,
            eval_set=[(X_val, y_val)],
            eval_metric=["logloss"],
            verbose=True,
        )

        # No early stopping is configured, so the fitted model keeps every
        # boosting round: score the fold with the log-loss of the final round.
        # `clf.best_iteration` is deliberately not read -- recent XGBoost
        # releases only define it when early stopping is used.
        logloss_history = clf.evals_result()["validation_0"]["logloss"]
        fold_scores.append(logloss_history[-1])

    # return the average validation score across CV folds
    return np.mean(fold_scores)

In [8]:
# create optuna study (lower validation score is better)
study = optuna.create_study(study_name="lgbm02", direction="minimize")


def optimize(trial):
    """Call ``objective`` on the training split so Optuna only has to pass a trial."""
    return objective(trial, X_train, y_train)

[32m[I 2021-12-11 10:45:15,984][0m A new study created in memory with name: lgbm02[0m


In [1]:
# launch optuna study
# The later cells read `study.best_params`, which raises while the study has no
# finished trial, so the search has to run on a fresh "Restart & Run All".
# The guard keeps a re-run of this cell from piling another 50 trials onto a
# study that has already been optimized in this kernel.
if not study.trials:
    study.optimize(optimize, n_trials=50)

In [17]:
# Print the study report produced by the project helper `utils.study_summary`.
study_summary(study)

Study:  lgbm02

Number of finished trials: 50
Best trial:
  Validation score: 0.33640980000000004
  Params: 
    max_depth: 16
    min_child_weight: 11
    learning_rate: 0.06824116817309857
    n_estimators: 100
    gamma: 0.21860883217784488
    reg_alpha: 0.01918484441539514
    reg_lambda: 0.8214537559144491
    subsample: 0.9121747941410446


In [30]:
# Independent copies of the prepared splits; the modelling cells below work on
# these names (X_df / y_df for fitting, X_test_df / y_test_df for evaluation).
X_df = X_train.copy()
y_df = y_train.copy()
X_test_df = X_test.copy()
y_test_df = y_test.copy()

In [33]:
def train_test_model(X_train, y_train, X_test, y_test, study):
    """Fit an XGBoost classifier with the study's best hyperparameters and predict on ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; the labels are cast to ``int`` for fitting.
    X_test
        Features to predict on.
    y_test
        Labels for ``X_test``; handed back unchanged for the caller's convenience.
    study
        Object exposing ``best_params`` (a mapping of classifier keyword arguments).

    Returns
    -------
    tuple
        ``(fitted classifier, integer class predictions, class probabilities, y_test)``.
    """
    # Share of positive rows in the training labels, used as the positive-class weight.
    positive_fraction = y_train.sum() / y_train.shape[0]

    model = XGBClassifier(
        objective="binary:logistic",
        scale_pos_weight=positive_fraction,
        use_label_encoder=False,
        tree_method="gpu_hist",
        gpu_id=0,
        **study.best_params,
    ).fit(
        X_train,
        y_train.astype(int),
        eval_metric=["logloss"],
        verbose=True,
    )

    predicted_labels = model.predict(X_test).astype(int)
    predicted_probabilities = model.predict_proba(X_test)

    return (model, predicted_labels, predicted_probabilities, y_test)

In [34]:
# Baseline: fit on every feature and evaluate on the hold-out split.
# Note that this rebinds the notebook-level clf, y_pred, y_prob and y_test.
(clf, y_pred, y_prob, y_test) = train_test_model(X_df, y_df, X_test_df, y_test_df, study)
print(classification_report(y_test, y_pred))
# Last expression of the cell: the F1 score is shown as the cell output.
f1_score(y_test, y_pred)

              precision    recall  f1-score   support

         0.0       0.75      0.78      0.76      5846
         1.0       0.89      0.88      0.88     12186

    accuracy                           0.84     18032
   macro avg       0.82      0.83      0.82     18032
weighted avg       0.84      0.84      0.84     18032



0.8828373960187063

In [35]:
# One row per training column with the importance the fitted classifier
# assigned to it, ordered from least to most important.
features = pd.DataFrame(
    {
        "columns": X_df.columns,
        "importances": clf.feature_importances_,
    }
).sort_values(by="importances", ascending=True)

In [46]:
# Names of the columns whose importance is strictly above 0.001.
features_reduced = features.loc[features["importances"] > 0.001, "columns"].tolist()

In [49]:
# Number of features that passed the importance threshold.
len(features_reduced)

259

In [50]:
# Refit on the selected features only and evaluate on the same hold-out rows.
# Rebinds clf, y_pred, y_prob and y_test to the reduced-feature results.
(clf, y_pred, y_prob, y_test) = train_test_model(X_df[features_reduced], y_df, X_test_df[features_reduced], y_test_df, study)
print(classification_report(y_test, y_pred))
# Last expression of the cell: the F1 score is shown as the cell output.
f1_score(y_test, y_pred)

              precision    recall  f1-score   support

         0.0       0.75      0.78      0.76      5846
         1.0       0.89      0.87      0.88     12186

    accuracy                           0.84     18032
   macro avg       0.82      0.83      0.82     18032
weighted avg       0.84      0.84      0.84     18032



0.8825260017403554

In [51]:
# Train on full data
# Rebuild the aggregate features on every labelled row (train + hold-out),
# discarding generated columns that are entirely NaN.
generated_X_features_df = agg_over_months(
    X,
    agg_func=["mean", "std", "min", "max"],
    freq=3,
).dropna(axis=1, how="all")

# Raw features plus generated features, matched on the index.
X_full_train = pd.merge(
    X,
    generated_X_features_df,
    left_index=True,
    right_index=True,
)

In [55]:
# Final model: best hyperparameters from the study, selected features only,
# fitted on all labelled rows. The positive-class weight is the share of
# positive labels in the full label vector.
clf = XGBClassifier(
    objective="binary:logistic",
    scale_pos_weight=y.sum() / y.shape[0],
    use_label_encoder=False,
    tree_method="gpu_hist",
    gpu_id=0,
    **study.best_params,
).fit(
    X_full_train[features_reduced],
    y.astype(int),
    eval_metric=["logloss"],
    verbose=True,
)
    

In [56]:
# Everything needed to rebuild this model's configuration:
# the tuned hyperparameters and the names of the selected features.
output = {
    "hyperparams": study.best_params,
    "features": features_reduced
}


In [65]:
# Unlabelled competition set. Read without index_col, so "S.No" stays a regular
# column (it is copied into the submission further down).
held_out_test = pd.read_csv("data/test_nolabels.csv")
# Same aggregate-feature generation as for the training data, attached on the index.
generated_held_out_test_features_df = agg_over_months(held_out_test, agg_func=["mean", "std", "min", "max"], freq=3)
X_held_out = held_out_test.merge(generated_held_out_test_features_df, left_index=True, right_index=True)
# Keep exactly the columns the final model was fitted on, in the same order.
X_held_out = X_held_out[features_reduced]

In [66]:
# Predictions of the final model on the unlabelled set.
# Rebinds y_pred / y_prob, which previously held the hold-out evaluation results.
y_pred = clf.predict(X_held_out)
y_prob = clf.predict_proba(X_held_out)

In [67]:
# Submission table: the row identifier from the unlabelled file next to the predicted label.
submission = pd.DataFrame(
    {
        "S.No": held_out_test["S.No"],
        "LABELS": y_pred,
    }
)

In [71]:
# Write the submission without the DataFrame index; the target directory must already exist.
submission.to_csv("submissions/submission6.csv", index=False)

In [72]:
import pickle

In [73]:
# Persist the tuned hyperparameters and selected feature names next to the
# submission. The file name now follows the other "submission6" artifacts
# (it was previously misspelled "subimssion6_params.pkl").
with open("submissions/submission6_params.pkl", "wb") as f:
    pickle.dump(output, f)

In [74]:
# Persist the fitted final classifier. Pickle files can execute code when
# loaded, so only unpickle this file from a trusted location.
with open("submissions/submission6_model.pkl", "wb") as f:
    pickle.dump(clf, f)