# Experiment LGBM
Grid Search + Cross Validation

In [None]:
import os
from working_dir import set_wd
# Switch the process working directory through the project's `set_wd` helper
# (presumably to the repository root so the `src.*` imports resolve — TODO confirm).
set_wd()
# Last expression of the cell: echoes the resulting working directory.
os.getcwd()

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used to load the processed datasets.
#   * spark.ui.showConsoleProgress=false keeps stage progress bars out of the cell output.
#   * spark.sql.debug.maxToStringFields=500 raises the field limit used when Spark
#     stringifies wide plans/schemas.
#   * spark.sql.autoBroadcastJoinThreshold=-1 disables automatic broadcast joins.
#     This key lives directly under `spark.sql.`; the former
#     `spark.sql.debug.autoBroadcastJoinThreshold` is not a Spark setting and was ignored.
#   * spark.driver.memory=8g sizes the driver, which later collects the training
#     data with `toPandas()`.
# NOTE(review): the application name still says "XGBoost" although this notebook
# trains LightGBM; left unchanged in case something keys on the name.
spark = (
    SparkSession.builder
    .config("spark.ui.showConsoleProgress", "false")
    .config("spark.sql.debug.maxToStringFields", 500)
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .config("spark.driver.memory", "8g")
    .appName("ExperimentXGBoost")
    .getOrCreate()
)

In [None]:
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

from src.dao import dao_processed, dao_ml
from src.utils import dflib, stats, pretties

In [None]:
# Apply the project's display helper; judging by its name it lifts the pandas
# column-display limit — TODO confirm in src.utils.pretties.
pretties.max_data_frame_columns()

In [None]:
# Basic match columns: identifier, target, league, both team names and the match date.
BASIC_COLS = [
    "id",
    "target",
    "league_id",
    "league_name",
    "home_team_name",
    "away_team_name",
    "match_date",
]

# Number of folds handed to KFold for the grid-search cross validation.
N_FOLDS = 10

In [None]:
def remove_cols(cols, cols_to_remove):
    """Return the entries of ``cols`` that are not listed in ``cols_to_remove``.

    A new list is returned and ``cols`` itself is left untouched, so a list
    that is shared with other objects (e.g. a loaded metadata dict) is not
    silently altered. Every occurrence of an unwanted name is dropped, not
    just the first one, and the relative order of the kept names is preserved.
    Names in ``cols_to_remove`` that do not appear in ``cols`` are ignored.

    Args:
        cols: Iterable of column names to filter.
        cols_to_remove: Iterable of (hashable) column names to exclude.

    Returns:
        list: The filtered column names, in their original order.
    """
    # Materialise once: gives O(1) membership tests and also supports one-shot iterables.
    unwanted = set(cols_to_remove)
    return [col for col in cols if col not in unwanted]

# Loading Data

In [None]:
# Id of the most recent processed data build; the loads and the final save in
# this notebook are all keyed on it.
id_data_build = dao_processed.most_recent_data_build_id()
print(id_data_build)

In [None]:
# Feature-selection results stored for this data build; only the first entry is used.
feature_selection_runs = dao_ml.load_feature_selection(id_data=id_data_build)
feature_selection_data = feature_selection_runs[0]

# Metadata saved alongside the processed data (read below for its "use_features" entry).
metadata_json = dao_processed.load_processed_metadata(id_data=id_data_build)

In [None]:
# Final feature list: the features named in the processed metadata minus the
# columns flagged for removal by feature selection.
use_features = remove_cols(cols=metadata_json["use_features"], cols_to_remove=feature_selection_data["cols_to_remove"])
# Last expression of the cell: displays the resulting feature list.
use_features

In [None]:
# Load the "train_train" split of the selected data build through the Spark session.
df_ttrain = dao_processed.load_processed_data(which_dataset="train_train", id_data=id_data_build, spark=spark)
print(f"df_ttrain shape: {dflib.shape(df_ttrain)}")

# Data Pipeline

In [None]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import VectorAssembler #A feature transformer that merges multiple columns into a vector column.
from pyspark.ml.feature import StringIndexer #A label indexer that maps a string column of labels to an ML column of label indices.
from src.ml.transformers import DropNaTransformer, UndersamplingTransformer, DateFilterTransformer

# import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from src.ml import metrics

### Defining the transformers

In [None]:
# NOTE(review): this transformer is not referenced again in the visible cells.
undersampling_transformer = UndersamplingTransformer(target_colname="target")

# NOTE(review): FILTER_FROM_DT is not defined in any visible cell; unless it is
# defined elsewhere this line raises NameError on a clean run. The resulting
# transformer is also not referenced again in the visible cells.
date_filter_transformer = DateFilterTransformer("match_date", from_dt=FILTER_FROM_DT)

# Fit a label indexer on the train_train split: maps the string "target" column
# to numeric indices in "target_indexed", with labels ordered alphabetically descending.
target_indexer_transformer = StringIndexer(inputCol="target", 
                                           outputCol="target_indexed", 
                                           stringOrderType="alphabetDesc").fit(df_ttrain)

In [None]:
# Wrap the already-fitted target indexer in a PipelineModel so it can be applied
# with a single transform() and passed to the save call at the end of the notebook.
pipeline_train = PipelineModel(stages=[target_indexer_transformer])

In [None]:
# Drop rows that have missing values in any of the selected features and report
# the row count before and after.
# NOTE(review): df_ttrain is reloaded from storage in the next cell, so the rows
# dropped here do not carry over into df_train — confirm whether that is intended.
print(f"df_ttrain shape before: {dflib.shape(df_ttrain)}")
df_ttrain = DropNaTransformer(subset=use_features).transform(df_ttrain)
print(f"df_ttrain shape after: {dflib.shape(df_ttrain)}")

In [None]:
# Load the two training splits (train_train first, then train_valid) for this data build.
df_ttrain, df_tvalid = (
    dao_processed.load_processed_data(which_dataset=split_name, id_data=id_data_build, spark=spark)
    for split_name in ("train_train", "train_valid")
)

print(f"df_ttrain shape: {dflib.shape(df_ttrain)}")
print(f"df_tvalid shape: {dflib.shape(df_tvalid)}")

# Cross validation creates its own folds, so both splits are stacked into one training frame.
df_train = df_ttrain.union(df_tvalid)

In [None]:
# Apply the fitted pipeline: adds the numeric "target_indexed" label column.
df_train = pipeline_train.transform(df_train)

### Model, Params and Cross-Validation

In [None]:
# Base estimator for the search.
# subsample_freq=1 enables bagging at every boosting iteration. LightGBM ignores
# `subsample` while `subsample_freq` is at its default of 0, so without it the two
# `subsample` values searched below would train identical models and double the
# search time for nothing. It is set on the estimator rather than in the grid so
# the keys of `best_params_` stay the same.
lgbmc = LGBMClassifier(subsample_freq=1)

# Exhaustive grid: 4 * 4 * 3 * 2 * 3 * 3 = 864 candidates, each fitted once per fold.
parameters = {
    'num_leaves': [5, 10, 20, 31],
    'max_depth': [3, 5, 8, -1],  # -1 means no depth limit
    'colsample_bytree': [0.5, 0.6, 0.7],
    'subsample': [0.4, 0.7],
    'n_estimators': [20, 40, 60],
    'learning_rate': [0.2, 0.4, 0.6],
}

# Candidates are ranked by negative log loss (higher is better). Train scores are
# kept as well, and all available cores are used for the search.
grid_search = GridSearchCV(estimator=lgbmc,
                           param_grid=parameters,
                           cv=KFold(N_FOLDS),
                           scoring="neg_log_loss",
                           return_train_score=True,
                           n_jobs=-1)

# Cross Validation
For the dataset with no missing values in the features

In [None]:
# Show the frequency of each indexed target class in the combined training frame.
stats.freq(df_train, "target_indexed").show()

In [None]:
# Time the whole step (Spark -> pandas collection plus the grid search); t0 and tf
# are reused when the results are saved.
t0 = datetime.now()

# Collect to pandas and shuffle all rows. KFold is built without shuffling, so
# this is presumably what randomises fold membership. `frac=1` returns every row
# in random order without a separate Spark `count()` action, and cannot fail if a
# count and the collected frame ever disagreed.
df_train = df_train.toPandas().sample(frac=1)
# df_train["is_cup"] = df_train["is_cup"].replace({True: 1, False:0})

# Fit every parameter combination on the selected features against the indexed target.
grid_search_model = grid_search.fit(df_train[use_features], df_train["target_indexed"])
tf = datetime.now()
print((tf - t0).total_seconds(), "seconds")

In [None]:
# Best candidate of the search (GridSearchCV refits it on the full training data by default).
clf = grid_search_model.best_estimator_

In [None]:
# Report the winning cross-validated score (negative log loss, per the scorer
# configured on the search) followed by the parameters that produced it.
for title, value in (
    ("best_score", grid_search_model.best_score_),
    ("best_params", grid_search_model.best_params_),
):
    print(title)
    print(value)

In [None]:
# Build the overfitting-analysis frame from the grid-search results (presumably
# comparing the train and validation scores kept by the search — TODO confirm).
# NOTE(review): the helper is named for XGBoost but receives the LightGBM search here.
of_df = metrics.build_overfitting_analysis_df_xgboost(grid_search_model=grid_search_model)

In [None]:
# Rank the selected features by the importance scores exposed by the fitted
# LightGBM estimator. The frame is built inline because no
# `get_feature_importances` helper is defined or imported in this notebook.
# `clf` was fitted on `df_train[use_features]`, so the scores line up with
# `use_features` position by position.
feature_importances = (
    pd.DataFrame({"feature": use_features, "importance": clf.feature_importances_})
    .set_index("feature")
    .sort_values("importance", ascending=False)
)
display(feature_importances)
feature_importances.plot.bar(figsize=(12, 5), rot=30)
plt.show()

In [None]:
# Hand the experiment artefacts to the project's DAO for this data build: the
# fitted grid search, the feature list, the overfitting analysis, the
# preprocessing pipeline, the fold count and the elapsed search time in seconds.
# NOTE(review): the DAO function is named for XGBoost but is used here for the
# LightGBM experiment — confirm it stores the results where they are expected.
dao_ml.save_modeling_xgboost(id_data=id_data_build, 
                             grid_search_model=grid_search_model, 
                             features=use_features, 
                             overfitting_analysis_df=of_df, 
                             pipeline_train=pipeline_train,
                             n_fold=N_FOLDS,
                             grid_search_time=(tf-t0).total_seconds())