# Build Best Model And Predict
Notebook that gets the best params, builds the models, predicts, and makes the submissions.

In [1]:
import os
from working_dir import set_wd
# Set the process working directory through the project's helper before any
# project-relative import or path is used (presumably the repository root --
# confirm in working_dir.set_wd).
set_wd()
# Last expression of the cell: displays the active working directory.
os.getcwd()

'/Users/tales.pimentel/ds/kaggle/football-match-prediction'

In [2]:
from pyspark.sql import SparkSession

# Spark session shared by every data-loading call in this notebook.
#   - console progress bars are turned off to keep cell outputs readable;
#   - maxToStringFields is raised so wide schemas are not truncated in plans/logs;
#   - autoBroadcastJoinThreshold = -1 disables automatic broadcast joins
#     (the key lives directly under `spark.sql.`; the previous
#     `spark.sql.debug.` prefix is not a Spark setting, so it had no effect);
#   - the driver gets 10g of memory.
spark = (
    SparkSession.builder
    .config("spark.ui.showConsoleProgress", "false")
    .config("spark.sql.debug.maxToStringFields", 500)
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .config("spark.driver.memory", "10g")
    .appName("BuildBestModelAndPredict")
    .getOrCreate()
)

In [3]:
import pandas as pd
import ast
from datetime import datetime
import matplotlib.pyplot as plt

from src.dao import dao_processed, dao_ml
from src.utils import dflib, stats, pretties

In [4]:
# Display option from the project's `pretties` helpers; presumably lifts the pandas
# column-display limit so wide tables render in full -- confirm in src.utils.pretties.
pretties.max_data_frame_columns()

In [5]:
def remove_cols(cols, cols_to_remove):
    """Return a copy of ``cols`` without the entries listed in ``cols_to_remove``.

    The input list is left untouched, so a list owned by the caller (for example
    one stored inside a metadata dict) is not silently modified.

    Args:
        cols: list of column names to filter.
        cols_to_remove: iterable of column names to drop; names that are not
            present in ``cols`` are ignored.

    Returns:
        A new list with the relative order of the remaining columns preserved.
    """
    remaining = list(cols)
    for col_to_remove in cols_to_remove:
        # list.remove drops the first occurrence only, as before.
        if col_to_remove in remaining:
            remaining.remove(col_to_remove)
    return remaining

In [6]:
# Lower date bound passed as `from_dt` to DateFilterTransformer on the "match_date" column.
FILTER_FROM_DT = "2020-04"

# Loading Results

In [7]:
# Every stored modeling run, as returned by the project DAO.
all_results = dao_ml.load_all_modeling()
print(len(all_results))

all_results_df = pd.DataFrame(all_results)

# A run counts as "balanced" when its training pipeline contained the
# UndersamplingTransformer stage, otherwise "no".
used_undersampling = all_results_df["pipeline_train_stages"].apply(
    lambda stages: "UndersamplingTransformer" in stages
)
all_results_df["undersampling"] = used_undersampling.replace({True: "balanced", False: "no"})

# The feature list of a run is taken from the keys of its stored importances.
feature_lists = all_results_df["feature_importances"].apply(
    lambda importances: list(importances["importance"].keys())
)
all_results_df["features"] = feature_lists
all_results_df["n_features"] = feature_lists.apply(len)

6


In [8]:
# Overview of the loaded runs, ordered by cross-validation score (ascending).
summary_columns = [
    "id_modeling", "datetime", "clf_name", "undersampling", "n_features",
    "best_score_cv_train", "best_score_cv", "clf_params", "features", "id_data",
]
all_results_df[summary_columns].sort_values(by="best_score_cv", ascending=True)

Unnamed: 0,id_modeling,datetime,clf_name,undersampling,n_features,best_score_cv_train,best_score_cv,clf_params,features,id_data
3,b261bebf-e056-4c9f-b40e-b5b019613c2b,2022-05-27 05:02:13,RandomForestClassificationModel,balanced,7,0.999287,1.000171,"{'numTrees': 60, 'maxDepth': 10, 'subsamplingR...","[home_mood_diff, draw_factor, away_history_moo...",04a4d619-00cc-4484-a724-e27e2161c91d
4,39f825ce-4edc-4227-b69e-f353357b87d1,2022-05-27 17:27:36,LGBMClassifier,no,7,0.997671,1.010027,"{'colsample_bytree': 0.7, 'learning_rate': 0.1...","[home_mood_diff, home_factor, draw_factor, awa...",04a4d619-00cc-4484-a724-e27e2161c91d
5,7fb951d1-4a95-4210-9d2a-1b34674ff279,2022-05-27 10:42:58,XGBClassifier,no,7,1.005677,1.010452,"{'colsample_bytree': 0.6, 'max_depth': 2, 'n_e...","[home_mood_diff, home_history_mood_mean, draw_...",04a4d619-00cc-4484-a724-e27e2161c91d
0,62e46782-8f32-488e-9fec-19923681d8ea,2022-05-26 21:04:41,RandomForestClassificationModel,no,7,0.981549,1.012594,"{'numTrees': 60, 'maxDepth': 10, 'subsamplingR...","[home_mood_diff, home_history_mood_mean, away_...",04a4d619-00cc-4484-a724-e27e2161c91d
1,a7c53082-93af-4eb7-ab90-f942c94c9dbf,2022-05-27 18:05:41,LGBMClassifier,balanced,7,1.029246,1.039577,"{'colsample_bytree': 0.5, 'learning_rate': 0.2...","[home_mood_diff, home_factor, draw_factor, hom...",04a4d619-00cc-4484-a724-e27e2161c91d
2,dfa5e718-a52e-442d-8c2a-7839f701305d,2022-05-27 10:09:20,XGBClassifier,balanced,7,1.033616,1.039655,"{'colsample_bytree': 0.6, 'max_depth': 2, 'n_e...","[home_mood_diff, draw_factor, away_result_hist...",04a4d619-00cc-4484-a724-e27e2161c91d


# Loading Data

The data loaded below is placed in a dictionary keyed by its id.

<b>Feature Selection</b>

In [9]:
# Per-dataset lookups, all keyed by id_data.
feature_selection = {}
features = {}
metadata = {}

for id_data in all_results_df["id_data"].unique():
    # Most recent feature-selection record for this dataset (ordered by "datetime").
    selection_runs = pd.DataFrame(dao_ml.load_feature_selection(id_data=id_data))
    latest_selection = selection_runs.sort_values("datetime").iloc[-1].to_dict()

    processed_metadata = dao_processed.load_processed_metadata(id_data=id_data)

    # Usable features = metadata's "use_features" minus the columns the
    # feature selection marked for removal.
    features[id_data] = remove_cols(
        cols=processed_metadata["use_features"],
        cols_to_remove=latest_selection["cols_to_remove"],
    )
    feature_selection[id_data] = latest_selection
    metadata[id_data] = processed_metadata

<b>Data</b>

In [10]:
def load_processed_train(id_data):
    """Load the "train_train" and "train_valid" processed datasets and stack them.

    Both parts are read through ``dao_processed.load_processed_data`` with the
    notebook-level ``spark`` session and combined with ``.union``.

    Args:
        id_data: identifier of the processed dataset to load.

    Returns:
        The union of the two parts ("train_train" rows first).
    """
    train_part = dao_processed.load_processed_data(which_dataset="train_train", id_data=id_data, spark=spark)
    valid_part = dao_processed.load_processed_data(which_dataset="train_valid", id_data=id_data, spark=spark)
    return train_part.union(valid_part)

In [11]:
# data[id_data] -> {"train": ..., "test": ...}
data = {}

for id_data in all_results_df["id_data"].unique():
    print(f"id_data: {id_data}")

    # `df_train` keeps its name on purpose: the target indexer defined further
    # down is fit on the `df_train` left over from this loop.
    df_train = load_processed_train(id_data)
    # Sampling as many rows as the frame holds; used here to shuffle the train data.
    df_train = dflib.sample(df_train, n=df_train.count())

    df_test = dao_processed.load_processed_data(which_dataset="test", id_data=id_data, spark=spark)

    data[id_data] = {"train": df_train, "test": df_test}

    print(f"train shape: {dflib.shape(df_train)}")
    print(f"test shape : {dflib.shape(df_test)}")

id_data: 04a4d619-00cc-4484-a724-e27e2161c91d
train shape: (110938, 15)
test shape : (72711, 14)


# Data Pipeline

In [12]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import VectorAssembler, StringIndexer, IndexToString
from src.ml.transformers import DropNaTransformer, UndersamplingTransformer, DateFilterTransformer
from src.ml.estimators import FillProbaEstimator
from src.ml import metrics
from src.ml import missing_values

### Defining

In [13]:
# Stages that do not depend on the feature list.
undersampling_transformer = UndersamplingTransformer(target_colname="target")
date_filter_transformer = DateFilterTransformer("match_date", from_dt=FILTER_FROM_DT)

# The indexer is fit on `df_train`, i.e. the train frame left over from the
# data-loading loop; labels are ordered alphabetically descending.
target_indexer_transformer = (
    StringIndexer(inputCol="target", outputCol="target_indexed", stringOrderType="alphabetDesc")
    .fit(df_train)
)

# Plain list of the fitted labels, in index order; reused as the names of the
# probability columns when predicting.
labels = list(target_indexer_transformer.labels)

# Maps "target_indexed" back to the original label strings.
target_reverter_transformer = IndexToString(
    inputCol="target_indexed",
    outputCol="target",
    labels=labels,
)

### Applying

In [14]:
# For every dataset id: assemble the feature vector, index the target and store
# each train/test variant (complete rows, null rows, undersampled, date-filtered,
# imputed) under its own key in data[id_data].
for id_data in all_results_df["id_data"].unique():
    print(f"id_data: {id_data}")
    # Work on a copy so the shared features[id_data] list is not handed to the stages.
    use_features = features[id_data].copy()
    
    # Packs the selected feature columns into the single "features" vector column.
    feature_assembler_transformer = VectorAssembler(inputCols=use_features, 
                                                    outputCol="features")

    # Built directly as PipelineModel objects (no fit call): the assembler needs
    # no fitting and the target indexer was fit beforehand.
    pipeline_train = PipelineModel(stages=[feature_assembler_transformer, 
                                           target_indexer_transformer])

    # Test data gets no target indexing, only the feature vector.
    pipeline_test = PipelineModel(stages=[feature_assembler_transformer])

    
    df_train = data[id_data]["train"]
    df_test = data[id_data]["test"]
    
    # Split train by nulls in the selected features: `df_train_na` is what
    # dflib.filter_any_null returns (presumably the rows with at least one null
    # -- confirm in dflib), while `df_train` continues through DropNaTransformer.
    df_train_na = dflib.filter_any_null(df_train, subset=use_features)
    df_train = DropNaTransformer(subset=use_features).transform(df_train)
    
    df_train = pipeline_train.transform(df_train)
    # Both variants below are derived from the transformed train frame.
    df_train_u = undersampling_transformer.transform(df_train) #undersampling
    df_train_dt_filtered = date_filter_transformer.transform(df_train)
    
    data[id_data]["train"] = df_train
    data[id_data]["train_balanced"] = df_train_u
    data[id_data]["train_na"] = df_train_na
    data[id_data]["train_dt_filtered"] = df_train_dt_filtered
    
    # NOTE(review): unlike the train split above, no `subset` is passed here, so
    # the helpers run with their default column selection -- confirm this is intended.
    df_test_na = dflib.filter_any_null(df_test)
    df_test = DropNaTransformer().transform(df_test)
    df_test = pipeline_test.transform(df_test) 
    
    data[id_data]["test"] = df_test
    data[id_data]["test_na"] = df_test_na
    
    # Re-reads the frames stored just above; the names already point at them.
    df_train = data[id_data]["train"]
    df_test = data[id_data]["test"]
    
    # Imputed versions of the null rows. missing_values.imputer receives the
    # transformed train frame first and the null rows second (presumably the
    # statistics come from the first and are applied to the second -- confirm
    # in src.ml.missing_values).
    train_na_imputed_median = pipeline_train.transform(
        missing_values.imputer(df_train, df_train_na, use_features, 'median'))
    train_na_imputed_mean = pipeline_train.transform(
        missing_values.imputer(df_train, df_train_na, use_features, 'mean'))
    
    data[id_data]["train_na_imputed_median"] = train_na_imputed_median 
    data[id_data]["train_na_imputed_mean"] = train_na_imputed_mean 
    
    # Test null rows are imputed against the train frame as well.
    df_test_na_imputed_median = pipeline_test.transform(
        missing_values.imputer(df_train, df_test_na, use_features, 'median'))
    df_test_na_imputed_mean = pipeline_test.transform(
        missing_values.imputer(df_train, df_test_na, use_features, 'mean'))
    
    data[id_data]["test_na_imputed_median"] = df_test_na_imputed_median
    data[id_data]["test_na_imputed_mean"] = df_test_na_imputed_mean
    print()

id_data: 04a4d619-00cc-4484-a724-e27e2161c91d



# Fit and Prediction

In [15]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import xgboost as xgb
from sklearn.metrics import log_loss

import xgboost as xgb
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import log_loss

In [16]:
def clean_params(params):
    """Return a copy of ``params`` with integer-valued entries cast to ``int``.

    A value is cast when it equals ``1.0`` or when its key is ``"n_estimators"``
    or ``"max_depth"``; every other entry is copied unchanged.

    The input dict is not modified. The stored hyper-parameters come straight
    from the results table, and writing into them would alter that table.

    Args:
        params: mapping of hyper-parameter name to value.

    Returns:
        A new dict with the same keys in the same order.
    """
    int_param_names = ("n_estimators", "max_depth")
    return {
        name: int(value) if (value == 1.0 or name in int_param_names) else value
        for name, value in params.items()
    }

def get_xgb_algorithm(params, df_train, features):
    """Fit an ``xgb.XGBClassifier`` on the given training frame.

    Args:
        params: hyper-parameters for the classifier; cleaned with
            ``clean_params`` before use.
        df_train: training frame exposing ``toPandas()``; must contain the
            ``features`` columns and the "target_indexed" column.
        features: list of feature column names used for fitting.

    Returns:
        The fitted classifier.
    """
    # Copy before adding the two settings below, so they are not written back
    # into the caller's dict (the hyper-parameters stored in the results table).
    params = dict(clean_params(params))
    params["use_label_encoder"] = False
    params["eval_metric"] = "logloss"

    xgbc = xgb.XGBClassifier(**params)

    # Only the columns needed for fitting are brought into pandas.
    df_train_pd = df_train[features + ["target_indexed"]].toPandas()

    return xgbc.fit(df_train_pd[features], df_train_pd["target_indexed"])

def get_rf_algorithm(params, df_train, features):
    """Fit a Spark ``RandomForestClassifier`` on ``df_train``.

    The classifier reads the "features" vector column and the "target_indexed"
    label column, and writes "prediction" and "proba".

    Args:
        params: hyper-parameters; "numTrees", "maxDepth" and "subsamplingRate"
            are read after passing through ``clean_params``.
        df_train: training frame handed to ``fit``.
        features: not used here; kept so the signature matches the other
            ``get_*_algorithm`` helper.

    Returns:
        The fitted model.
    """
    cleaned = clean_params(params)

    forest = RandomForestClassifier(
        labelCol="target_indexed",
        featuresCol="features",
        predictionCol="prediction",
        probabilityCol="proba",
        numTrees=cleaned["numTrees"],
        maxDepth=cleaned["maxDepth"],
        subsamplingRate=cleaned["subsamplingRate"],
    )

    return forest.fit(df_train)

def fit(result_row, data):
    """Refit the classifier described by one row of the results table.

    Args:
        result_row: mapping/row with the keys "clf_name", "id_data", "features",
            "undersampling" ("balanced" or "no") and "clf_params".
        data: dict keyed by id_data; each entry holds the "train" and
            "train_balanced" training frames.

    Returns:
        The fitted classifier.

    Raises:
        Exception: if "undersampling" or "clf_name" holds an unrecognized value.
    """
    clf_name = result_row["clf_name"]
    id_data = result_row["id_data"]
    features = result_row["features"]

    # Pick the training frame that matches how the run was originally trained.
    if result_row["undersampling"] == "balanced":
        df_train = data[id_data]["train_balanced"]

    elif result_row["undersampling"] == "no":
        df_train = data[id_data]["train"]

    else:
        raise Exception(f'undersampling not recognized: {result_row["undersampling"]}')

    if clf_name == "RandomForestClassificationModel":
        clf = get_rf_algorithm(params=result_row["clf_params"], df_train=df_train, features=features)

    elif clf_name == "XGBClassifier" or clf_name == "LGBMClassifier":
        # NOTE(review): LGBMClassifier runs are refit with the XGBoost helper,
        # so LightGBM-only parameters are passed to XGBoost -- confirm intended.
        clf = get_xgb_algorithm(params=result_row["clf_params"], df_train=df_train, features=features)

    else:
        # Fail with a clear message instead of an UnboundLocalError on `clf`.
        raise Exception(f"clf_name not recognized: {clf_name}")

    return clf

def predict(result_row, data, labels, test_dataset_name):
    """Predict class probabilities with the fitted classifier of one results row.

    Args:
        result_row: mapping/row with the keys "clf_name", "id_data", "features"
            and "clf" (the fitted classifier).
        data: dict keyed by id_data; each entry maps dataset names to frames.
        labels: list of class names, used as the probability column names.
        test_dataset_name: key of the frame inside ``data[id_data]`` to predict on.

    Returns:
        A pandas DataFrame with an "id" column followed by one probability
        column per label.

    Raises:
        Exception: if "clf_name" holds an unrecognized value.
    """
    clf_name = result_row["clf_name"]
    id_data = result_row["id_data"]
    features = result_row["features"]
    clf = result_row["clf"]

    df_test = data[id_data][test_dataset_name]

    if clf_name == "RandomForestClassificationModel":
        preds = clf.transform(df_test)
        # Split the "proba" vector into one column per label before collecting.
        preds = dflib.dense_vector_to_columns(df=preds,
                                              dense_vector_colname="proba",
                                              new_colnames=labels)[["id"] + labels].toPandas()

    elif clf_name == "XGBClassifier" or clf_name == "LGBMClassifier":
        df_test_pd = df_test[["id"] + features].toPandas()
        preds = clf.predict_proba(df_test_pd[features])
        # Re-attach the ids: they become the index and are then moved back to a column.
        preds = pd.DataFrame(preds, columns=labels, index=df_test_pd["id"]).reset_index()

    else:
        # Fail with a clear message instead of an UnboundLocalError on `preds`.
        raise Exception(f"clf_name not recognized: {clf_name}")

    return preds

<b>Fitting</b>

In [17]:
# One fitted classifier per results row, stored next to the row it came from.
all_results_df["clf"] = all_results_df.apply(
    lambda result_row: fit(result_row=result_row, data=data),
    axis=1,
)

Parameters: { "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




<b>Predicting</b>

In [18]:
# Predictions of every fitted classifier on the "test" frame.
all_results_df["preds"] = all_results_df.apply(
    lambda result_row: predict(result_row=result_row, data=data, labels=labels, test_dataset_name="test"),
    axis=1,
)

In [19]:
# Predictions on the test rows that had missing values, after median imputation.
all_results_df["preds_na"] = all_results_df.apply(
    lambda result_row: predict(
        result_row=result_row,
        data=data,
        labels=labels,
        test_dataset_name="test_na_imputed_median",
    ),
    axis=1,
)

In [20]:
# Stack the predictions for complete rows and for imputed rows into one frame per model.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the result is the same (rows of "preds" first, indexes kept).
all_results_df["preds_complete"] = all_results_df[["preds", "preds_na"]].apply(
    lambda row: pd.concat([row["preds"], row["preds_na"]]),
    axis=1,
)

# Build Submission

In [21]:
def save_submission(preds_row):
    """Write the complete predictions of one results row to a CSV file.

    The file is written to ``data/preds/preds_<id_modeling>.csv`` (relative to
    the working directory) without the index; the target directory is created
    when it does not exist yet. The path and the number of rows are printed.

    Args:
        preds_row: mapping/row with the keys "id_modeling" and "preds_complete"
            (a pandas DataFrame).
    """
    filepath = "data/preds/preds_" + preds_row["id_modeling"] + ".csv"
    submission_df = preds_row["preds_complete"]
    print(filepath, len(submission_df))
    # to_csv does not create missing directories, so make sure the folder exists.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    submission_df.to_csv(filepath, index=False)
    
# One submission file per results row.
for _, preds_row in all_results_df.iterrows():
    save_submission(preds_row)
print("done!")

data/preds/preds_62e46782-8f32-488e-9fec-19923681d8ea.csv 72711
data/preds/preds_a7c53082-93af-4eb7-ab90-f942c94c9dbf.csv 72711
data/preds/preds_dfa5e718-a52e-442d-8c2a-7839f701305d.csv 72711
data/preds/preds_b261bebf-e056-4c9f-b40e-b5b019613c2b.csv 72711
data/preds/preds_39f825ce-4edc-4227-b69e-f353357b87d1.csv 72711
data/preds/preds_7fb951d1-4a95-4210-9d2a-1b34674ff279.csv 72711
done!


In [22]:
# Same overview as at the top of the notebook, for reference next to the saved files.
summary_columns = [
    "id_modeling", "datetime", "clf_name", "undersampling", "n_features",
    "best_score_cv_train", "best_score_cv", "clf_params", "features", "id_data",
]
all_results_df[summary_columns].sort_values(by="best_score_cv", ascending=True)

Unnamed: 0,id_modeling,datetime,clf_name,undersampling,n_features,best_score_cv_train,best_score_cv,clf_params,features,id_data
3,b261bebf-e056-4c9f-b40e-b5b019613c2b,2022-05-27 05:02:13,RandomForestClassificationModel,balanced,7,0.999287,1.000171,"{'numTrees': 60, 'maxDepth': 10, 'subsamplingR...","[home_mood_diff, draw_factor, away_history_moo...",04a4d619-00cc-4484-a724-e27e2161c91d
4,39f825ce-4edc-4227-b69e-f353357b87d1,2022-05-27 17:27:36,LGBMClassifier,no,7,0.997671,1.010027,"{'colsample_bytree': 0.7, 'learning_rate': 0.1...","[home_mood_diff, home_factor, draw_factor, awa...",04a4d619-00cc-4484-a724-e27e2161c91d
5,7fb951d1-4a95-4210-9d2a-1b34674ff279,2022-05-27 10:42:58,XGBClassifier,no,7,1.005677,1.010452,"{'colsample_bytree': 0.6, 'max_depth': 2, 'n_e...","[home_mood_diff, home_history_mood_mean, draw_...",04a4d619-00cc-4484-a724-e27e2161c91d
0,62e46782-8f32-488e-9fec-19923681d8ea,2022-05-26 21:04:41,RandomForestClassificationModel,no,7,0.981549,1.012594,"{'numTrees': 60, 'maxDepth': 10, 'subsamplingR...","[home_mood_diff, home_history_mood_mean, away_...",04a4d619-00cc-4484-a724-e27e2161c91d
1,a7c53082-93af-4eb7-ab90-f942c94c9dbf,2022-05-27 18:05:41,LGBMClassifier,balanced,7,1.029246,1.039577,"{'colsample_bytree': 0.5, 'learning_rate': 0.2...","[home_mood_diff, home_factor, draw_factor, hom...",04a4d619-00cc-4484-a724-e27e2161c91d
2,dfa5e718-a52e-442d-8c2a-7839f701305d,2022-05-27 10:09:20,XGBClassifier,balanced,7,1.033616,1.039655,"{'colsample_bytree': 0.6, 'max_depth': 2, 'n_e...","[home_mood_diff, draw_factor, away_result_hist...",04a4d619-00cc-4484-a724-e27e2161c91d
