# Experiment XGBoost
Grid Search + Cross Validation

In [1]:
import os
from working_dir import set_wd
# Run the project's working-directory helper, then echo the resulting cwd as the
# cell output. NOTE(review): set_wd() is defined outside this notebook; presumably
# it switches to the repository root — confirm.
set_wd()
os.getcwd()

'/Users/tales.pimentel/ds/kaggle/football-match-prediction'

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used to load and transform the processed data.
spark = (
    SparkSession.builder
    .appName("ExperimentXGBoost")
    # Keep Spark's console progress bars out of the notebook output.
    .config("spark.ui.showConsoleProgress", "false")
    # Allow wide schemas to be printed in full in plans / debug strings.
    .config("spark.sql.debug.maxToStringFields", 500)
    # Disable automatic broadcast joins (-1). The original key was
    # "spark.sql.debug.autoBroadcastJoinThreshold", which is not a Spark setting,
    # so the option was silently ignored.
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

In [3]:
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

from src.dao import dao_processed, dao_ml
from src.utils import dflib, stats, pretties
from src.ml.transformers import DropNaTransformer

In [4]:
# Apply the project's display helper. NOTE(review): judging by its name it presumably
# lifts the pandas column-display limit — confirm in src.utils.pretties.
pretties.max_data_frame_columns()

In [5]:
# Identifier / descriptive columns of a match row.
# NOTE(review): not referenced by any other visible cell of this notebook.
BASIC_COLS = [
    "id",
    "target",
    "league_id",
    "league_name",
    "home_team_name",
    "away_team_name",
    "match_date",
]

# Number of folds for the KFold splitter; also recorded when the experiment is saved.
N_FOLDS = 10

In [6]:
def remove_cols(cols, cols_to_remove):
    """Return a new list holding the entries of ``cols`` not listed in ``cols_to_remove``.

    The input list is left untouched. The previous implementation removed items
    from ``cols`` in place, which silently modified the caller's object (e.g. the
    loaded metadata dict), and only dropped the first occurrence of a duplicated
    name.

    Args:
        cols: Iterable of column names to filter; original order is preserved.
        cols_to_remove: Iterable of column names to exclude. ``None`` or an empty
            iterable means "remove nothing". Names absent from ``cols`` are ignored.

    Returns:
        list: A fresh list with every occurrence of the excluded names dropped.
    """
    # Set membership keeps the filter linear in len(cols).
    excluded = frozenset(cols_to_remove or ())
    return [col for col in cols if col not in excluded]

# Loading Data

In [7]:
# Look up the id of the most recent processed-data build; every load and save
# in this notebook is keyed by it (passed as id_data=...).
id_data_build = dao_processed.most_recent_data_build_id()
print(id_data_build)

04a4d619-00cc-4484-a724-e27e2161c91d


In [8]:
# Feature-selection results stored for this build; only the first element of the
# returned sequence is used. Its "cols_to_remove" entry is read in the next cell.
feature_selection_data = dao_ml.load_feature_selection(id_data=id_data_build)[0]
# Metadata saved alongside the processed data; its "use_features" entry is read in the next cell.
metadata_json = dao_processed.load_processed_metadata(id_data=id_data_build)

In [9]:
# Final model inputs: the build's candidate features minus those rejected by
# feature selection.
use_features = remove_cols(
    cols=metadata_json["use_features"],
    cols_to_remove=feature_selection_data["cols_to_remove"],
)
use_features

['home_mood_diff',
 'home_history_mood_mean',
 'away_history_mood_mean',
 'home_result_history_mean',
 'away_result_history_mean',
 'home_factor',
 'draw_factor']

In [10]:
# Load the "train_train" split of the selected build as a Spark DataFrame.
df_ttrain = dao_processed.load_processed_data(
    which_dataset="train_train",
    id_data=id_data_build,
    spark=spark,
)
print(f"df_ttrain shape: {dflib.shape(df_ttrain)}")

df_ttrain shape: (87470, 15)


# Data Pipeline

In [11]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import VectorAssembler #A feature transformer that merges multiple columns into a vector column.
from pyspark.ml.feature import StringIndexer #A label indexer that maps a string column of labels to an ML column of label indices.
from src.ml.transformers import UndersamplingTransformer

import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from src.ml import metrics

### Defining the transformers

In [12]:
# NOTE(review): this transformer is instantiated but not used by any visible cell.
undersampling_transformer = UndersamplingTransformer(target_colname="target")

# Fit the label indexer on the train split. "alphabetDesc" assigns indices by
# descending alphabetical order of the label strings, so the mapping does not
# depend on class frequencies.
target_indexer_transformer = StringIndexer(
    inputCol="target",
    outputCol="target_indexed",
    stringOrderType="alphabetDesc",
).fit(df_ttrain)

In [13]:
# Wrap the already-fitted target indexer in a PipelineModel so it can be applied with a
# single transform() call and passed to dao_ml.save_modeling_xgboost at the end.
pipeline_train = PipelineModel(stages=[target_indexer_transformer])

In [14]:
# Diagnostic only: count how many train rows have a missing value in any selected feature.
# The filtered frame is bound to its own name instead of overwriting df_ttrain, so this
# cell can be re-run and always reports the same before/after shapes. The training data
# is built in the next cell from freshly loaded splits, so nothing downstream depends on
# this result.
print(f"df_ttrain shape before: {dflib.shape(df_ttrain)}")
df_ttrain_no_na = DropNaTransformer(subset=use_features).transform(df_ttrain)
print(f"df_ttrain shape after: {dflib.shape(df_ttrain_no_na)}")

df_ttrain shape before: (87470, 15)
df_ttrain shape after: (85353, 15)


In [15]:
# Load both processed splits of the build as Spark DataFrames.
df_ttrain = dao_processed.load_processed_data(
    which_dataset="train_train",
    id_data=id_data_build,
    spark=spark,
)
df_tvalid = dao_processed.load_processed_data(
    which_dataset="train_valid",
    id_data=id_data_build,
    spark=spark,
)

print(f"df_ttrain shape: {dflib.shape(df_ttrain)}")
print(f"df_tvalid shape: {dflib.shape(df_tvalid)}")

# Cross-validation runs on train + valid together. Spark's union matches columns by
# position, so both splits are assumed to share one schema.
df_train = df_ttrain.union(df_tvalid)

df_ttrain shape: (87470, 15)
df_tvalid shape: (23468, 15)


In [16]:
# Add the numeric "target_indexed" label column produced by the fitted StringIndexer.
# NOTE(review): this reassigns df_train, so re-running the cell on its own will likely
# fail because "target_indexed" already exists — re-run the previous cell first.
df_train = pipeline_train.transform(df_train)

### Model, Params and Cross-Validation

In [17]:
# Base estimator. The indexed target has three classes (0.0 / 1.0 / 2.0 in the frequency
# table below), so the multiclass metric "mlogloss" is configured; "logloss" is the
# binary-classification metric.
# NOTE(review): n_jobs=10 here together with n_jobs=10 in GridSearchCV can oversubscribe
# the CPU (up to 10 x 10 threads) — consider n_jobs=1 on one of the two.
xgbc = xgb.XGBClassifier(n_jobs=10, use_label_encoder=False, eval_metric="mlogloss")

# 4 * 3 * 3 * 3 = 108 parameter combinations, each evaluated on N_FOLDS folds.
parameters = {
    "max_depth": [1, 2, 4, 6],
    "subsample": [0.6, 0.7, 0.8],
    "colsample_bytree": [0.5, 0.6, 0.7],
    "n_estimators": [20, 40, 60],
}

# KFold is used without shuffling, so folds are contiguous row ranges; the rows are
# shuffled once before fit() is called. Scores are negated log loss (higher is better),
# and train scores are kept for the overfitting analysis.
grid_search = GridSearchCV(
    estimator=xgbc,
    param_grid=parameters,
    cv=KFold(N_FOLDS),
    scoring="neg_log_loss",
    return_train_score=True,
    n_jobs=10,
)

# Cross Validation
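Intended for the dataset with no missing values in the features. Note: as executed, `df_train` is the full union of `train_train` and `train_valid` (87470 + 23468 = 110938 rows, matching the frequency table below); the `DropNaTransformer` filtering from the Data Pipeline section is not applied to it, so rows with missing feature values are still present.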

In [18]:
# Class balance of the indexed target: absolute and relative frequency per class.
stats.freq(df_train, "target_indexed").show()

+--------------+--------+-------------------+
|target_indexed|Absolute|           Relative|
+--------------+--------+-------------------+
|           0.0|   48113| 0.4336926932160306|
|           1.0|   27651|0.24924732733598948|
|           2.0|   35174|   0.31705997944798|
+--------------+--------+-------------------+



In [19]:
# Fixed seed for the row shuffle. KFold above does not shuffle, so the fold contents —
# and therefore the CV scores — are only reproducible if this permutation is.
SHUFFLE_SEED = 42

t0 = datetime.now()
# Collect to pandas and shuffle all rows. frac=1 yields the same full permutation as
# sampling len(df) rows without replacement, but without an extra Spark count() job.
# NOTE(review): df_train changes from a Spark to a pandas DataFrame here, so this cell
# cannot be re-run without first re-running the cells that build df_train.
df_train = df_train.toPandas().sample(frac=1, random_state=SHUFFLE_SEED)

# Exhaustive grid search with cross-validation on the selected features.
grid_search_model = grid_search.fit(df_train[use_features], df_train["target_indexed"])
tf = datetime.now()
print((tf - t0).total_seconds(), "seconds")

KeyError: 'is_cup'

In [None]:
# Estimator refit by GridSearchCV with the best parameter combination found.
clf = grid_search_model.best_estimator_

In [None]:
# Report the winning configuration. best_score_ is the mean cross-validated
# "neg_log_loss" of the best candidate, i.e. a negated log loss.
for label, value in (
    ("best_score", grid_search_model.best_score_),
    ("best_params", grid_search_model.best_params_),
):
    print(label)
    print(value)

In [None]:
# Overfitting-analysis table built by the project helper from the fitted grid search.
# NOTE(review): presumably compares train vs. validation scores (return_train_score=True
# was requested on the grid search) — confirm in src.ml.metrics.
of_df = metrics.build_overfitting_analysis_df_xgboost(grid_search_model=grid_search_model)

In [None]:
# Feature importances of the best estimator for the selected features, computed by the
# project helper (its return is used as a pandas object: display() and .plot.bar()).
feature_importances = metrics.get_feature_importances(clf, use_features)
display(feature_importances)
# Bar chart of the same values; x tick labels rotated 30 degrees for readability.
feature_importances.plot.bar(figsize=(12, 5), rot=30)
plt.show()

In [None]:
# Wall-clock duration of the grid-search cell, in seconds.
grid_search_seconds = (tf - t0).total_seconds()

# Persist the experiment for this data build: fitted grid search, feature list,
# overfitting table, the label-indexing pipeline, fold count and timing.
dao_ml.save_modeling_xgboost(
    id_data=id_data_build,
    grid_search_model=grid_search_model,
    features=use_features,
    overfitting_analysis_df=of_df,
    pipeline_train=pipeline_train,
    n_fold=N_FOLDS,
    grid_search_time=grid_search_seconds,
)