# Build Best Model And Predict
Notebook that loads the best parameters found for each modeling run, rebuilds the models, predicts on the test set, and writes the submission files.

In [1]:
import os
from working_dir import set_wd
# Let the project helper pick the working directory, then display it so the
# reader can see where the relative paths used in this notebook resolve.
set_wd()
os.getcwd()

'/home/tales/ds/kaggle/football-match-prediction'

In [2]:
from pyspark.sql import SparkSession

# Spark session used by every loader and transformer in this notebook.
# The broadcast-join threshold key is `spark.sql.autoBroadcastJoinThreshold`;
# Spark has no `spark.sql.debug.` variant of it and silently accepts unknown
# keys, so the earlier spelling never disabled broadcast joins (-1 = disabled).
spark = (
    SparkSession.builder
    .config("spark.ui.showConsoleProgress", "false")
    .config("spark.sql.debug.maxToStringFields", 500)
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .config("spark.driver.memory", "10g")
    .appName("BuildBestModelAndPredict")
    .getOrCreate()
)

22/05/26 20:01:48 WARN Utils: Your hostname, tales-samsung resolves to a loopback address: 127.0.1.1; using 192.168.0.102 instead (on interface wlp1s0)
22/05/26 20:01:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/26 20:01:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
import pandas as pd
import ast
from datetime import datetime
import matplotlib.pyplot as plt

from src.dao import dao_processed, dao_ml
from src.utils import dflib, stats, pretties
from src.ml.transformers import DropNaTransformer

In [4]:
# Apply the project's DataFrame display settings
# (presumably lifts the column-display limit — confirm in src.utils.pretties).
pretties.max_data_frame_columns()

In [5]:
def remove_cols(cols, cols_to_remove):
    """Return a new list with the entries of ``cols_to_remove`` taken out of ``cols``.

    The input list is copied first, so the caller's list (for example a list
    that lives inside a loaded metadata dict) is never modified.

    Parameters
    ----------
    cols : list
        Column names to filter.
    cols_to_remove : iterable
        Column names to drop. Names that are not present in ``cols`` are
        ignored. Each entry removes at most one (the first) occurrence.

    Returns
    -------
    list
        The remaining column names, in their original order.
    """
    remaining = list(cols)
    for col_to_remove in cols_to_remove:
        if col_to_remove in remaining:
            remaining.remove(col_to_remove)
    return remaining

# Loading Results

In [6]:
all_results = dao_ml.load_all_modeling()
print(len(all_results))

all_results_df = pd.DataFrame(all_results)

# Label each run by whether its training pipeline listed the undersampling stage.
all_results_df["undersampling"] = [
    "balanced" if "UndersamplingTransformer" in stages else "no"
    for stages in all_results_df["pipeline_train_stages"]
]

# The feature list of a run is the key set of its stored importance mapping.
all_results_df["features"] = all_results_df["feature_importances"].apply(
    lambda importances: list(importances["importance"].keys())
)
all_results_df["n_features"] = all_results_df["features"].apply(len)

# Summary table, lowest cross-validation score first.
all_results_df[
    [
        "id_modeling", "datetime", "clf_name", "undersampling", "n_features",
        "best_score_cv_train", "best_score_cv", "clf_params", "features", "id_data",
    ]
].sort_values("best_score_cv", ascending=True)

4


Unnamed: 0,id_modeling,datetime,clf_name,undersampling,n_features,best_score_cv_train,best_score_cv,clf_params,features,id_data
0,9bd15ac0-889a-4cf5-9012-5675edce5889,2022-05-26 16:37:12,RandomForestClassificationModel,balanced,8,1.001444,1.000331,"{'numTrees': 60, 'maxDepth': 10, 'subsamplingR...","[home_mood_diff, draw_factor, home_history_moo...",b2beffb2-fb6d-4cb4-8869-82b859c2dd3b
3,d6e238c1-c6a8-41be-ac18-3cad3f47a420,2022-05-26 16:44:32,XGBClassifier,no,8,0.996144,1.010365,"{'colsample_bytree': 0.7, 'max_depth': 4, 'n_e...","[home_mood_diff, draw_factor, away_history_moo...",b2beffb2-fb6d-4cb4-8869-82b859c2dd3b
1,6d671cab-ea98-4471-8455-0a8092241123,2022-05-26 15:52:32,RandomForestClassificationModel,no,8,0.98231,1.012607,"{'numTrees': 60, 'maxDepth': 10, 'subsamplingR...","[home_mood_diff, away_history_mood_mean, home_...",b2beffb2-fb6d-4cb4-8869-82b859c2dd3b
2,3f709ede-053d-434d-a212-867de706844b,2022-05-26 17:45:35,XGBClassifier,balanced,8,1.033072,1.039629,"{'colsample_bytree': 0.7, 'max_depth': 2, 'n_e...","[home_mood_diff, home_history_mood_mean, away_...",b2beffb2-fb6d-4cb4-8869-82b859c2dd3b


# Loading Data

The data loaded below is stored in a dictionary keyed by its `id_data`.

<b>Feature Selection</b>

In [7]:
feature_selection = {}
features = {}
metadata = {}

for id_data in all_results_df["id_data"].unique():
    # Keep only the most recent feature-selection record (last row by "datetime").
    feature_selection_runs = pd.DataFrame(dao_ml.load_feature_selection(id_data=id_data))
    feature_selection_data = feature_selection_runs.sort_values("datetime").iloc[-1].to_dict()
    metadata_json = dao_processed.load_processed_metadata(id_data=id_data)

    # Usable features = the metadata's "use_features" minus the columns the selection discarded.
    features[id_data] = remove_cols(
        cols=metadata_json["use_features"],
        cols_to_remove=feature_selection_data["cols_to_remove"],
    )
    feature_selection[id_data] = feature_selection_data
    metadata[id_data] = metadata_json

<b>Data</b>

In [8]:
def load_processed_train(id_data):
    """Load the "train_train" and "train_valid" processed splits and stack them.

    Both splits are read through ``dao_processed.load_processed_data`` with the
    notebook-level ``spark`` session; the result is ``train_train`` followed by
    ``train_valid`` via ``union``.
    """
    train_part, valid_part = (
        dao_processed.load_processed_data(which_dataset=split_name, id_data=id_data, spark=spark)
        for split_name in ("train_train", "train_valid")
    )
    return train_part.union(valid_part)

In [9]:
data = {}

for id_data in all_results_df["id_data"].unique():
    print(f"id_data: {id_data}")

    # Sampling as many rows as the frame holds is used here to shuffle the training data.
    df_train = load_processed_train(id_data)
    df_train = dflib.sample(df_train, n=df_train.count())

    df_test = dao_processed.load_processed_data(which_dataset="test", id_data=id_data, spark=spark)

    data[id_data] = {"train": df_train, "test": df_test}

    print(f"train shape: {dflib.shape(df_train)}")
    print(f"test shape : {dflib.shape(df_test)}")

id_data: b2beffb2-fb6d-4cb4-8869-82b859c2dd3b
train shape: (110938, 16)
test shape : (72711, 15)


# Data Pipeline

In [10]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import VectorAssembler, StringIndexer, IndexToString
from src.ml.transformers import UndersamplingTransformer, ProbaVectorToPrediction
from src.ml.estimators import FillProbaEstimator
from src.ml import metrics
from src.ml import missing_values

### Defining

In [11]:
undersampling_transformer = UndersamplingTransformer(target_colname="target")

# NOTE(review): `df_train` is the variable left over from the data-loading
# loop, i.e. the training frame of the last `id_data` processed — confirm this
# is intended if more than one dataset is ever loaded.
target_indexer_estimator = StringIndexer(
    inputCol="target",
    outputCol="target_indexed",
    stringOrderType="alphabetDesc",
)
target_indexer_transformer = target_indexer_estimator.fit(df_train)

# Plain-list copy of the fitted labels, in index order.
labels = list(target_indexer_transformer.labels)

target_reverter_transformer = IndexToString(
    inputCol="target_indexed",
    outputCol="target",
    labels=labels,
)

### Applying

In [12]:
# For every dataset id: assemble the feature vector, index the target, and
# derive the train / balanced-train / test frames plus imputed variants of the
# rows set aside as having nulls. Everything is stored back into `data[id_data]`.
# NOTE(review): the "train" and "test" entries are overwritten with their
# transformed versions, so re-running this cell without re-running the loading
# cell would feed already-transformed frames back in.
for id_data in all_results_df["id_data"].unique():
    print(f"id_data: {id_data}")
    # Copy so the list stored in `features` is not shared with the assembler.
    use_features = features[id_data].copy()
    
    # Packs the selected feature columns into a single "features" vector column.
    feature_assembler_transformer = VectorAssembler(inputCols=use_features, 
                                                    outputCol="features")

    # Train pipeline = feature vector + target indexing (indexer fitted in the "Defining" cell).
    pipeline_train = PipelineModel(stages=[feature_assembler_transformer, 
                                           target_indexer_transformer])

    # Test pipeline only builds the feature vector; no target stage is applied.
    pipeline_test = PipelineModel(stages=[feature_assembler_transformer])

    
    df_train = data[id_data]["train"]
    df_test = data[id_data]["test"]
    
    # Split train on the selected features: `df_train_na` is what
    # dflib.filter_any_null returns (presumably rows with at least one null
    # feature — confirm), and `df_train` is what DropNaTransformer keeps.
    df_train_na = dflib.filter_any_null(df_train, subset=use_features)
    df_train = DropNaTransformer(subset=use_features).transform(df_train)
    
    df_train = pipeline_train.transform(df_train)
    df_train_u = undersampling_transformer.transform(df_train) # undersampled copy of the transformed train set
    
    data[id_data]["train"] = df_train
    data[id_data]["train_balanced"] = df_train_u
    data[id_data]["train_na"] = df_train_na 
    
    
    # NOTE(review): unlike the train split above, no `subset` is passed here,
    # so presumably a null in any column counts — confirm this is intended.
    df_test_na = dflib.filter_any_null(df_test)
    df_test = DropNaTransformer().transform(df_test)
    df_test = pipeline_test.transform(df_test) 
    
    data[id_data]["test"] = df_test
    data[id_data]["test_na"] = df_test_na
    
    # Re-read the frames just stored (same objects as the locals above).
    df_train = data[id_data]["train"]
    df_test = data[id_data]["test"]
    
    # Impute the null-containing train rows, passing the transformed train
    # frame as the first argument, then run them through the train pipeline.
    # (presumably the first argument supplies the median/mean statistics — confirm in src.ml.missing_values)
    train_na_imputed_median = pipeline_train.transform(
        missing_values.imputer(df_train, df_train_na, use_features, 'median'))
    train_na_imputed_mean = pipeline_train.transform(
        missing_values.imputer(df_train, df_train_na, use_features, 'mean'))
    
    data[id_data]["train_na_imputed_median"] = train_na_imputed_median 
    data[id_data]["train_na_imputed_mean"] = train_na_imputed_mean 
    
    
    # Same imputation for the null-containing test rows; the train frame is again the first argument.
    df_test_na_imputed_median = pipeline_test.transform(
        missing_values.imputer(df_train, df_test_na, use_features, 'median'))
    df_test_na_imputed_mean = pipeline_test.transform(
        missing_values.imputer(df_train, df_test_na, use_features, 'mean'))
    
    data[id_data]["test_na_imputed_median"] = df_test_na_imputed_median
    data[id_data]["test_na_imputed_mean"] = df_test_na_imputed_mean
    print()

id_data: b2beffb2-fb6d-4cb4-8869-82b859c2dd3b



# Fit and Prediction

In [13]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import xgboost as xgb
from sklearn.metrics import log_loss

import xgboost as xgb
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import log_loss

In [17]:
def clean_params(params):
    """Return a copy of ``params`` with integer-like values cast to ``int``.

    A value is cast when it equals ``1.0`` or when its key is
    ``"n_estimators"`` or ``"max_depth"``. ``None`` values are passed through
    untouched instead of failing inside ``int()``.

    The input dict is not modified: the dicts passed in are the ``clf_params``
    cells of the results table, and mutating them would change that table.

    Parameters
    ----------
    params : dict
        Hyper-parameter name -> value.

    Returns
    -------
    dict
        New dict with the same keys and the cleaned values.
    """
    int_only_keys = ("n_estimators", "max_depth")

    cleaned = {}
    for key, value in params.items():
        needs_int = value is not None and (value == 1.0 or key in int_only_keys)
        cleaned[key] = int(value) if needs_int else value
    return cleaned

def get_xgb_algorithm(params, df_train, features):
    """Fit an ``xgb.XGBClassifier`` on the given Spark training frame.

    Parameters
    ----------
    params : dict
        Stored hyper-parameters for the classifier. Left unmodified.
    df_train : Spark DataFrame
        Training data; must contain the ``features`` columns and "target_indexed".
    features : list of str
        Names of the columns used as model inputs.

    Returns
    -------
    The fitted ``XGBClassifier``.
    """
    # Copy before cleaning/extending: `params` is the dict held in the results
    # table, and the extra keys added below must not leak back into it.
    xgb_params = clean_params(dict(params))
    xgb_params["use_label_encoder"] = False
    # NOTE(review): the target looks multi-class (one probability column per
    # label is produced at prediction time); "mlogloss" is the multi-class
    # metric — confirm whether "logloss" is intended here.
    xgb_params["eval_metric"] = "logloss"

    xgbc = xgb.XGBClassifier(**xgb_params)

    # The whole training frame is collected to the driver as a pandas DataFrame.
    df_train_pd = df_train[features + ["target_indexed"]].toPandas()

    return xgbc.fit(df_train_pd[features], df_train_pd["target_indexed"])

def get_rf_algorithm(params, df_train, features):
    """Fit a Spark ``RandomForestClassifier`` on ``df_train``.

    Reads "numTrees", "maxDepth" and "subsamplingRate" from the cleaned
    ``params``. The model uses "target_indexed" as label and the "features"
    vector as input, and writes "prediction" and "proba" columns.
    ``features`` is accepted for signature parity with the XGBoost builder
    and is not used.
    """
    rf_params = clean_params(params)

    forest = RandomForestClassifier(
        numTrees=rf_params["numTrees"],
        maxDepth=rf_params["maxDepth"],
        subsamplingRate=rf_params["subsamplingRate"],
        labelCol="target_indexed",
        featuresCol="features",
        predictionCol="prediction",
        probabilityCol="proba",
    )
    return forest.fit(df_train)

def fit(result_row, data):
    """Fit the classifier described by one row of the results table.

    Parameters
    ----------
    result_row : mapping
        Must provide "clf_name", "id_data", "features", "undersampling" and
        "clf_params".
    data : dict
        ``data[id_data]`` holds the prepared frames; "train_balanced" is used
        when ``undersampling == "balanced"`` and "train" when it is "no".

    Returns
    -------
    The fitted model returned by the matching builder.

    Raises
    ------
    ValueError
        If ``undersampling`` or ``clf_name`` is not one of the known values.
        (Previously an unknown ``clf_name`` fell through to ``return clf`` and
        failed with an UnboundLocalError.)
    """
    clf_name = result_row["clf_name"]
    id_data = result_row["id_data"]
    features = result_row["features"]
    undersampling = result_row["undersampling"]

    if undersampling == "balanced":
        df_train = data[id_data]["train_balanced"]
    elif undersampling == "no":
        df_train = data[id_data]["train"]
    else:
        raise ValueError(f'undersampling not recognized: {undersampling}')

    if clf_name == "RandomForestClassificationModel":
        return get_rf_algorithm(params=result_row["clf_params"], df_train=df_train, features=features)

    if clf_name == "XGBClassifier":
        return get_xgb_algorithm(params=result_row["clf_params"], df_train=df_train, features=features)

    raise ValueError(f"clf_name not recognized: {clf_name}")

def predict(result_row, data, labels, test_dataset_name):
    """Return class probabilities for one fitted model on one prepared test frame.

    Parameters
    ----------
    result_row : mapping
        Must provide "clf_name", "id_data", "features" and the fitted "clf".
    data : dict
        ``data[id_data][test_dataset_name]`` is the Spark frame to score.
    labels : list of str
        Names given to the probability columns, in class-index order.
    test_dataset_name : str
        Key of the frame inside ``data[id_data]`` (e.g. "test").

    Returns
    -------
    pandas.DataFrame
        One row per scored record with an "id" column and one column per label.

    Raises
    ------
    ValueError
        If ``clf_name`` is not a known classifier. (Previously this fell
        through to ``return preds`` and failed with an UnboundLocalError.)
    """
    clf_name = result_row["clf_name"]
    id_data = result_row["id_data"]
    features = result_row["features"]
    clf = result_row["clf"]

    df_test = data[id_data][test_dataset_name]

    if clf_name == "RandomForestClassificationModel":
        # Spark model: score in Spark, split the "proba" vector into one
        # column per label, then collect only "id" + label columns.
        scored = clf.transform(df_test)
        return dflib.dense_vector_to_columns(df=scored,
                                             dense_vector_colname="proba",
                                             new_colnames=labels)[["id"] + labels].toPandas()

    if clf_name == "XGBClassifier":
        # xgboost model: collect to pandas first, then score locally.
        df_test_pd = df_test[["id"] + features].toPandas()
        probas = clf.predict_proba(df_test_pd[features])
        return pd.DataFrame(probas, columns=labels, index=df_test_pd["id"]).reset_index()

    raise ValueError(f"clf_name not recognized: {clf_name}")

<b>Fitting</b>

In [15]:
# Fit one classifier per row of the results table and keep the fitted model
# in a new "clf" column (row-wise apply, one `fit` call per modeling run).
all_results_df["clf"] = all_results_df.apply(lambda row: fit(row, data), axis=1)

22/05/26 20:02:27 WARN DAGScheduler: Broadcasting large task binary with size 1294.1 KiB
22/05/26 20:02:30 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
22/05/26 20:02:33 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB
22/05/26 20:02:36 WARN DAGScheduler: Broadcasting large task binary with size 1206.2 KiB
22/05/26 20:02:38 WARN DAGScheduler: Broadcasting large task binary with size 7.7 MiB
22/05/26 20:02:43 WARN DAGScheduler: Broadcasting large task binary with size 1996.1 KiB
22/05/26 20:03:00 WARN DAGScheduler: Broadcasting large task binary with size 1192.6 KiB
22/05/26 20:03:05 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/05/26 20:03:12 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB
22/05/26 20:03:20 WARN DAGScheduler: Broadcasting large task binary with size 1261.0 KiB
22/05/26 20:03:21 WARN DAGScheduler: Broadcasting large task binary with size 7.9 MiB
22/05/26 20:03:31 WARN DAGScheduler: Br

<b>Predicting</b>

In [18]:
# Score each fitted model on the "test" frame, i.e. the test rows kept by
# DropNaTransformer in the pipeline cell; one pandas frame per row in "preds".
all_results_df["preds"] = all_results_df.apply(lambda row : predict(row, data, labels, "test"), axis=1)

22/05/26 20:03:54 WARN DAGScheduler: Broadcasting large task binary with size 6.0 MiB
22/05/26 20:03:58 WARN DAGScheduler: Broadcasting large task binary with size 6.0 MiB


In [19]:
# Score each fitted model on the median-imputed version of the test rows that
# were set aside as containing nulls; stored in "preds_na".
all_results_df["preds_na"] = all_results_df.apply(lambda row : predict(row, data, labels, "test_na_imputed_median"), axis=1)

22/05/26 20:04:02 WARN DAGScheduler: Broadcasting large task binary with size 6.0 MiB
22/05/26 20:04:04 WARN DAGScheduler: Broadcasting large task binary with size 6.0 MiB


In [26]:
# Per model, stack the predictions for the null-free rows on top of those for
# the imputed rows. `pd.concat` replaces `DataFrame.append`, which was
# deprecated in pandas 1.4 and removed in pandas 2.0; row order and index
# handling are the same as with `append`.
all_results_df["preds_complete"] = all_results_df[["preds", "preds_na"]].apply(
    lambda row: pd.concat([row["preds"], row["preds_na"]]), axis=1
)

# Build Submission

In [39]:
def save_submission(preds_row):
    """Write one model's complete predictions to ``data/preds/preds_<id_modeling>.csv``.

    Parameters
    ----------
    preds_row : mapping
        Must provide "id_modeling" (str) and "preds_complete" (pandas DataFrame).

    The target folder is created when missing, because ``DataFrame.to_csv``
    does not create directories. The path and the number of rows written are
    printed. The CSV is written without the index column.
    """
    filepath = "data/preds/preds_" + preds_row["id_modeling"] + ".csv"
    submission_df = preds_row["preds_complete"]
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    print(filepath, len(submission_df))
    submission_df.to_csv(filepath, index=False)
    
# Write one CSV per modeling run, visiting the rows in frame order.
for row_index, preds_row in all_results_df.iterrows():
    save_submission(preds_row)
print("done!")

data/preds/preds_9bd15ac0-889a-4cf5-9012-5675edce5889.csv 72711
data/preds/preds_6d671cab-ea98-4471-8455-0a8092241123.csv 72711
data/preds/preds_3f709ede-053d-434d-a212-867de706844b.csv 72711
data/preds/preds_d6e238c1-c6a8-41be-ac18-3cad3f47a420.csv 72711
done!


In [38]:
# Show the summary of the modeling runs again (lowest cross-validation score
# first) so the saved prediction files can be matched to their runs by id_modeling.
all_results_df[["id_modeling", "datetime", "clf_name", "undersampling", "n_features",
                 "best_score_cv_train", "best_score_cv", "clf_params", "features", "id_data"]].sort_values("best_score_cv", ascending=True)

Unnamed: 0,id_modeling,datetime,clf_name,undersampling,n_features,best_score_cv_train,best_score_cv,clf_params,features,id_data
0,9bd15ac0-889a-4cf5-9012-5675edce5889,2022-05-26 16:37:12,RandomForestClassificationModel,balanced,8,1.001444,1.000331,"{'numTrees': 60, 'maxDepth': 10, 'subsamplingR...","[home_mood_diff, draw_factor, home_history_moo...",b2beffb2-fb6d-4cb4-8869-82b859c2dd3b
3,d6e238c1-c6a8-41be-ac18-3cad3f47a420,2022-05-26 16:44:32,XGBClassifier,no,8,0.996144,1.010365,"{'colsample_bytree': 0.7, 'max_depth': 4, 'n_e...","[home_mood_diff, draw_factor, away_history_moo...",b2beffb2-fb6d-4cb4-8869-82b859c2dd3b
1,6d671cab-ea98-4471-8455-0a8092241123,2022-05-26 15:52:32,RandomForestClassificationModel,no,8,0.98231,1.012607,"{'numTrees': 60, 'maxDepth': 10, 'subsamplingR...","[home_mood_diff, away_history_mood_mean, home_...",b2beffb2-fb6d-4cb4-8869-82b859c2dd3b
2,3f709ede-053d-434d-a212-867de706844b,2022-05-26 17:45:35,XGBClassifier,balanced,8,1.033072,1.039629,"{'colsample_bytree': 0.7, 'max_depth': 2, 'n_e...","[home_mood_diff, home_history_mood_mean, away_...",b2beffb2-fb6d-4cb4-8869-82b859c2dd3b
