# ResultAnalysis
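This notebook loads the stored grid-search results, compares the best runs per classifier, and plots mean train vs. cross-validation log loss per hyperparameter value to check for overfitting.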

In [1]:
import os
from working_dir import set_wd
# Project helper; presumably switches the working directory to the repository root
# so that the `src.*` imports in the next cell resolve — TODO confirm in working_dir.
set_wd()
# Last expression of the cell: echoes the resulting working directory.
os.getcwd()

'/Users/tales.pimentel/ds/kaggle/football-match-prediction'

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from src.dao import dao_ml
from src.utils import dflib, stats, pretties, plot, plot_domain, palette

In [3]:
# Display helper from src.utils.pretties; presumably lifts pandas' column display
# limit so wide result tables are shown in full — TODO confirm.
pretties.max_data_frame_columns()

# Loading Results

In [6]:
# Fetch every stored modeling run and report how many were found.
all_results = dao_ml.load_all_modeling()
print(len(all_results))

all_results_df = pd.DataFrame(all_results)

# Label each run by whether its training pipeline includes the undersampling stage.
all_results_df["undersampling"] = all_results_df["pipeline_train_stages"].apply(
    lambda stages: "balanced" if "UndersamplingTransformer" in stages else "no"
)

# Feature names are the keys of the "importance" entry stored with each run.
all_results_df["features"] = all_results_df["feature_importances"].apply(
    lambda importances: list(importances["importance"].keys())
)
all_results_df["n_features"] = all_results_df["features"].map(len)

5


In [7]:
# Rank the runs by cross-validation score (ascending), then keep the summary columns.
all_results_df.sort_values(by="best_score_cv", ascending=True)[
    [
        "id_modeling",
        "datetime",
        "clf_name",
        "undersampling",
        "n_features",
        "best_score_cv_train",
        "best_score_cv",
        "clf_params",
        "features",
        "id_data",
    ]
]

Unnamed: 0,id_modeling,datetime,clf_name,undersampling,n_features,best_score_cv_train,best_score_cv,clf_params,features,id_data
2,b261bebf-e056-4c9f-b40e-b5b019613c2b,2022-05-27 05:02:13,RandomForestClassificationModel,balanced,7,0.999287,1.000171,"{'numTrees': 60, 'maxDepth': 10, 'subsamplingR...","[home_mood_diff, draw_factor, away_history_moo...",04a4d619-00cc-4484-a724-e27e2161c91d
3,39f825ce-4edc-4227-b69e-f353357b87d1,2022-05-27 17:27:36,LGBMClassifier,no,7,0.997671,1.010027,"{'colsample_bytree': 0.7, 'learning_rate': 0.1...","[home_mood_diff, home_factor, draw_factor, awa...",04a4d619-00cc-4484-a724-e27e2161c91d
4,7fb951d1-4a95-4210-9d2a-1b34674ff279,2022-05-27 10:42:58,XGBClassifier,no,7,1.005677,1.010452,"{'colsample_bytree': 0.6, 'max_depth': 2, 'n_e...","[home_mood_diff, home_history_mood_mean, draw_...",04a4d619-00cc-4484-a724-e27e2161c91d
0,62e46782-8f32-488e-9fec-19923681d8ea,2022-05-26 21:04:41,RandomForestClassificationModel,no,7,0.981549,1.012594,"{'numTrees': 60, 'maxDepth': 10, 'subsamplingR...","[home_mood_diff, home_history_mood_mean, away_...",04a4d619-00cc-4484-a724-e27e2161c91d
1,dfa5e718-a52e-442d-8c2a-7839f701305d,2022-05-27 10:09:20,XGBClassifier,balanced,7,1.033616,1.039655,"{'colsample_bytree': 0.6, 'max_depth': 2, 'n_e...","[home_mood_diff, draw_factor, away_result_hist...",04a4d619-00cc-4484-a724-e27e2161c91d


# Select id_modeling

In [None]:
# Modeling runs whose per-configuration overfitting analysis is loaded below.
load_ids_modeling = ["62e46782-8f32-488e-9fec-19923681d8ea",
                     "7fb951d1-4a95-4210-9d2a-1b34674ff279",
                     "39f825ce-4edc-4227-b69e-f353357b87d1"]

# Collect one frame per run and concatenate once at the end. DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop re-copies it each time.
overfitting_frames = []

for id_modeling in load_ids_modeling:
    results_temp = dao_ml.load_modeling(id_modeling)

    rdf_id = pd.DataFrame(results_temp["overfitting_analysis_df"])
    # Tag every row with its classifier so later cells can filter by model.
    rdf_id["clf_name"] = results_temp["clf_name"]

    overfitting_frames.append(rdf_id)

# pd.concat raises on an empty list; fall back to an empty frame in that case.
rdf = pd.concat(overfitting_frames) if overfitting_frames else pd.DataFrame()

In [None]:
# Order the combined rows by cross-validation log loss (lowest first)
# and preview the first five.
rdf = rdf.sort_values(by="log_loss_cv", ascending=True)
rdf.head()

In [None]:
def overfitting_analysis_num(metrics_df, param_name, metric_colname):
    """Compare mean train and CV values of a metric for each value of a parameter.

    Groups ``metrics_df`` by ``param_name``, averages the
    ``<metric_colname>_train`` and ``<metric_colname>_cv`` columns, displays the
    aggregated table and draws both means as scatter series against the
    parameter value.

    Args:
        metrics_df: DataFrame holding the parameter column and both metric columns.
        param_name: Column to group by; also used as the x axis.
        metric_colname: Metric prefix, without the ``_train`` / ``_cv`` suffix.
    """
    train_col = metric_colname + "_train"
    cv_col = metric_colname + "_cv"

    of_data = metrics_df.groupby(param_name)[[train_col, cv_col]].mean().reset_index()
    display(of_data)

    _, ax = plt.subplots(figsize=(8, 5))
    # One scatter series per split, sharing the same marker styling.
    for column, label in ((train_col, "train"), (cv_col, "cv")):
        ax.scatter(of_data[param_name], of_data[column], label=label, s=130, alpha=0.6)
    ax.set_title(f"{metric_colname} mean values for {param_name}")
    ax.set_xlabel(param_name)
    ax.set_ylabel(metric_colname)
    ax.legend()
    plt.show()

In [None]:
# Random forest: mean train vs. CV log loss for each tuned hyperparameter.
pretties.md("RandomForestClassificationModel", size="#", color="blue")
rfc_df = rdf.loc[rdf["clf_name"] == "RandomForestClassificationModel"]

for param_name in ("subsamplingRate", "maxDepth", "numTrees"):
    pretties.md(param_name, size="####")
    overfitting_analysis_num(rfc_df, param_name, "log_loss")

In [None]:
# XGBoost: mean train vs. CV log loss for each tuned hyperparameter.
pretties.md("XGBClassifier", size="#", color="blue")
xgbc_df = rdf.loc[rdf["clf_name"] == "XGBClassifier"]

for param_name in ("colsample_bytree", "max_depth", "n_estimators", "subsample"):
    pretties.md(param_name, size="####")
    overfitting_analysis_num(xgbc_df, param_name, "log_loss")

In [None]:
# LightGBM: mean train vs. CV log loss for each tuned hyperparameter.
pretties.md("LGBMClassifier", size="#", color="blue")
# Use a dedicated name for the LightGBM rows: storing them in `xgbc_df` overwrote
# the XGBoost frame with rows from a different model.
lgbmc_df = rdf[rdf["clf_name"] == "LGBMClassifier"]

for param_name in ["num_leaves", "max_depth", "colsample_bytree", "subsample", "n_estimators", "learning_rate"]:
    pretties.md(param_name, size="####")
    overfitting_analysis_num(metrics_df=lgbmc_df,
                             param_name=param_name,
                             metric_colname="log_loss")
