# Model Selection And Prediction
Notebook to experiment different model configurations and store results.

In [1]:
import os
from working_dir import set_wd
set_wd()
os.getcwd()

'/Users/tales.pimentel/ds/kaggle/football-match-prediction'

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.config('spark.ui.showConsoleProgress', 'false') \
                            .config("spark.sql.debug.maxToStringFields", 500) \
                            .config("spark.sql.debug.autoBroadcastJoinThreshold", -1) \
                            .config("spark.driver.memory", "10g") \
                            .appName("ModelSelectionAndPrediction").getOrCreate()

In [3]:
import pandas as pd
import ast
from datetime import datetime
import matplotlib.pyplot as plt

from src.dao import dao_processed, dao_ml
from src.utils import dflib, stats, pretties
from src.ml.transformers import DropNaTransformer

In [4]:
pretties.max_data_frame_columns()

In [5]:
def build_result_df(result):
    result_df = pd.DataFrame(result["overfitting_analysis_df"])
    result_df["params"] = result_df[list(result["clf_params"].keys())].apply(lambda row : row.to_dict(), axis=1)
    result_df["undersampling"] = "UndersamplingTransformer" in result["pipeline_train_stages"]
    result_df["features"] = str(result["feature_importances"]["importance"].keys())
    result_df["clf_name"] = result["clf_name"]
    result_df["features"] = str(list(result["feature_importances"]["importance"].keys()))

    return result_df[["clf_name", "undersampling", "log_loss_cv", "log_loss_train", "params", "features"]].sort_values("log_loss_cv")

def get_top_configs(results, n=3):
    return results.sort_values("log_loss_cv").head(n)

def remove_cols(cols, cols_to_remove):
    for col_to_remove in cols_to_remove:
        if col_to_remove in cols:
            cols.remove(col_to_remove)
    return cols

# Loading Results

In [12]:
all_results = dao_ml.load_all_modeling()
print(len(all_results))

all_results_df = pd.DataFrame(all_results)
all_results_df["undersampling"] = all_results_df["pipeline_train_stages"].apply(lambda ppl : "UndersamplingTransformer" in ppl)

all_results_df["undersampling"] = all_results_df["undersampling"].replace({True: "balanced", False: "no"})

all_results_df["features"] = all_results_df["feature_importances"].apply(lambda fi : list(fi["importance"].keys()))
all_results_df["n_features"] = all_results_df["features"].apply(len)
all_results_df[["id_modeling", "datetime", "clf_name", "undersampling", "n_features",
                 "best_score_cv_train", "best_score_cv", "id_data"]].sort_values("datetime", ascending=False)

3


Unnamed: 0,id_modeling,datetime,clf_name,undersampling,n_features,best_score_cv_train,best_score_cv,id_data
2,a26f990a-12e8-4a8d-9e2e-74e4969f6508,2022-05-26 19:02:06,RandomForestClassificationModel,no,8,0.981899,1.012797,b2beffb2-fb6d-4cb4-8869-82b859c2dd3b
0,d6e238c1-c6a8-41be-ac18-3cad3f47a420,2022-05-26 16:44:32,XGBClassifier,no,8,0.996144,1.010365,b2beffb2-fb6d-4cb4-8869-82b859c2dd3b
1,6d671cab-ea98-4471-8455-0a8092241123,2022-05-26 15:52:32,RandomForestClassificationModel,no,8,0.98231,1.012607,b2beffb2-fb6d-4cb4-8869-82b859c2dd3b


In [7]:
id_modeling_xgb_u = "15a43545-665e-44b8-91ef-d21816c90c4e"
id_modeling_xgb = "3118f21c-2508-4dfd-8a49-890f6d09bd4f"
id_modeling_rf_u = "6846618f-8912-4f18-b8b7-ffb9fff01fc2"
id_modeling_rf = "31949b43-998b-4669-90de-08ec5f75d64f"

results_xgb_u = dao_ml.load_modeling(id_modeling_xgb_u)
results_xgb = dao_ml.load_modeling(id_modeling_xgb)
results_rf_u = dao_ml.load_modeling(id_modeling_rf_u)
results_rf = dao_ml.load_modeling(id_modeling_rf)

results_xgb_u = build_result_df(results_xgb_u).sort_values("log_loss_cv").head(3)
results_xgb = build_result_df(results_xgb).sort_values("log_loss_cv").head(3)
results_rf_u = build_result_df(results_rf_u).sort_values("log_loss_cv").head(3)
results_rf = build_result_df(results_rf).sort_values("log_loss_cv").head(3)

# Loading Data

In [8]:
id_data_build = dao_processed.most_recent_data_build_id()
id_data_build = "dbd64bc0-9cf2-44ad-a5c5-edcce0393bf4"
print(id_data_build)#"dbd64bc0-9cf2-44ad-a5c5-edcce0393bf4"

dbd64bc0-9cf2-44ad-a5c5-edcce0393bf4


In [9]:
feature_selection_data = dao_ml.load_feature_selection(id_data=id_data_build)[0]
metadata_json = dao_processed.load_processed_metadata(id_data=id_data_build)

In [10]:
use_features = remove_cols(cols=metadata_json["use_features"], cols_to_remove=feature_selection_data["cols_to_remove"])

In [11]:
df_ttrain = dao_processed.load_processed_data(which_dataset="train_train", id_data=id_data_build, spark=spark)
df_ttrain = dflib.sample(df_ttrain, n=df_ttrain.count())
df_tvalid = dao_processed.load_processed_data(which_dataset="train_valid", id_data=id_data_build, spark=spark)
df_test = dao_processed.load_processed_data(which_dataset="test", id_data=id_data_build, spark=spark)

print(f"df_ttrain shape: {dflib.shape(df_ttrain)}")
print(f"df_tvalid shape: {dflib.shape(df_tvalid)}")
print(f"df_test shape: {dflib.shape(df_test)}")

df_ttrain shape: (87470, 15)
df_tvalid shape: (23468, 15)
df_test shape: (72711, 14)


# Data Pipeline

In [12]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import VectorAssembler, StringIndexer, IndexToString
from src.ml.transformers import UndersamplingTransformer, ProbaVectorToPrediction
from src.ml.estimators import FillProbaEstimator
from src.ml import metrics
from pyspark.ml.feature import Imputer

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import xgboost as xgb
from sklearn.metrics import log_loss

import xgboost as xgb
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import log_loss

### Defining

In [13]:
undersampling_transformer = UndersamplingTransformer(target_colname="target")

feature_assembler_transformer = VectorAssembler(inputCols=use_features, 
                                                outputCol="features")

target_indexer_transformer = StringIndexer(inputCol="target", 
                                           outputCol="target_indexed", 
                                           stringOrderType="alphabetDesc").fit(df_ttrain)
labels = [target_indexer_transformer.labels[i] for i in range(len(target_indexer_transformer.labels))]

target_reverter_transformer = IndexToString(inputCol="target_indexed", 
                                            outputCol="target",
                                            labels=labels)

In [14]:
pipeline_train = PipelineModel(stages=[feature_assembler_transformer, 
                                       target_indexer_transformer])

pipeline_test = PipelineModel(stages=[feature_assembler_transformer])

### Applying

In [15]:
df_ttrain_na = dflib.filter_any_null(df_ttrain, subset=use_features)
df_ttrain = DropNaTransformer(subset=use_features).transform(df_ttrain)

df_tvalid_na = dflib.filter_any_null(df_tvalid, subset=use_features)
df_tvalid = DropNaTransformer(subset=use_features).transform(df_tvalid)

df_test_na = dflib.filter_any_null(df_test)
df_test = DropNaTransformer().transform(df_test)

print(f"df_ttrain shape: {dflib.shape(df_ttrain)}")
print(f"df_ttrain_na shape: {dflib.shape(df_ttrain_na)}")
print(f"df_tvalid shape: {dflib.shape(df_tvalid)}")
print(f"df_tvalid_na shape: {dflib.shape(df_tvalid_na)}")
print(f"df_test shape: {dflib.shape(df_test)}")
print(f"df_test_na shape: {dflib.shape(df_test_na)}")

df_ttrain shape: (85353, 15)
df_ttrain_na shape: (2117, 15)
df_tvalid shape: (22807, 15)
df_tvalid_na shape: (661, 15)
df_test shape: (65795, 14)
df_test_na shape: (6916, 14)


In [16]:
df_ttrain = pipeline_train.transform(df_ttrain)
df_ttrain_u = undersampling_transformer.transform(df_ttrain)
df_tvalid = pipeline_train.transform(df_tvalid)
df_tvalid_u = undersampling_transformer.transform(df_tvalid)
df_test = pipeline_test.transform(df_test)

df_ttrain_pd = df_ttrain.toPandas()
df_ttrain_u_pd = df_ttrain_u.toPandas()
df_tvalid_pd = df_tvalid.toPandas()
df_test_pd = df_test.toPandas()

In [17]:
df_train = df_ttrain.union(df_tvalid)
df_train_pd = df_train.toPandas()

In [18]:
df_train_u = undersampling_transformer.transform(df_ttrain.union(df_tvalid))
df_train_u_pd = df_train_u.toPandas()

# Checking validation score with `tvalid` dataset

In [19]:
def get_xgb_algorithm(xgb_conf_row):
    params = xgb_conf_row["params"]
    features = ast.literal_eval(xgb_conf_row["features"])
    
    for k in params.keys():
        if params[k] == 1.0 or k in ["n_estimators", "max_depth"]:
            params[k] = int(params[k])
    
    params["use_label_encoder"] = False
    params["eval_metric"] = "logloss"
    xgbc = xgb.XGBClassifier(**params)
    
    return {"clf": xgbc, "features": features}

def get_rf_algorithm(rf_conf_row):
    params = rf_conf_row["params"]
    for k in params.keys():
        if params[k] == 1.0 or k in ["numTrees", "maxDepth"]:
            params[k] = int(params[k])

    rfc = RandomForestClassifier(numTrees=params["numTrees"], 
                                 maxDepth=params["maxDepth"], 
                                 subsamplingRate=params["subsamplingRate"])
    rfc.setLabelCol("target_indexed")
    rfc.setFeaturesCol("features")
    rfc.setPredictionCol("prediction")
    rfc.setProbabilityCol("proba")
    
    return rfc

def predict_proba_xgb(clf, df, features, labels=labels):
    proba = clf.predict_proba(df[features])
    proba = pd.DataFrame(proba, columns=labels, index=df.index)
    return proba.join(df[["target"]], how="inner")

def predict_proba_rf(clf, df, features, target_to_string=target_reverter_transformer):
    proba = clf.transform(df.select(["features", "target_indexed", "target"] + features))
    if not "target" in proba.columns:
        proba = target_to_string.transform(proba)
    
    return proba

<b>Fitting</b>

In [20]:
# rfc_u_clf_df = results_rf_u \
#         .apply(get_rf_algorithm, axis=1) \
#         .apply(lambda rf : rf.fit(df_ttrain_u))

# rfc_clf_df = results_rf \
#         .apply(get_rf_algorithm, axis=1) \
#         .apply(lambda rf : rf.fit(df_ttrain))

In [21]:
xgbc_u_clf_df = results_xgb_u \
        .apply(get_xgb_algorithm, axis=1) \
        .apply(lambda xgbc : xgbc["clf"].fit(df_ttrain_u_pd[xgbc["features"]], df_ttrain_u_pd["target_indexed"]))

xgbc_clf_df = results_xgb \
        .apply(get_xgb_algorithm, axis=1) \
        .apply(lambda xgbc : xgbc["clf"].fit(df_ttrain_pd[xgbc["features"]], df_ttrain_pd["target_indexed"]))

<b>Predicting</b>

In [22]:
# rfc_u_preds_valid_df = rfc_u_clf_df.apply(lambda rfc : predict_proba_rf(rfc, df_tvalid, use_features))

# rfc_preds_valid_df = rfc_clf_df.apply(lambda rfc : predict_proba_rf(rfc, df_tvalid, use_features))

# rfc_u_preds_valid_df = rfc_u_preds_valid_df.apply(lambda df : dflib.dense_vector_to_columns(df=df,
#                                                                dense_vector_colname="proba",
#                                                                new_colnames=labels).toPandas())

# rfc_preds_valid_df = rfc_preds_valid_df.apply(lambda df : dflib.dense_vector_to_columns(df=df,
#                                                                dense_vector_colname="proba",
#                                                                new_colnames=labels).toPandas())

In [23]:
xgbc_u_preds_valid_df = xgbc_u_clf_df.apply(lambda xgbc : 
                                        predict_proba_xgb(clf=xgbc, df=df_tvalid_pd, features=use_features))

xgbc_preds_valid_df = xgbc_clf_df.apply(lambda xgbc : 
                                        predict_proba_xgb(clf=xgbc, df=df_tvalid_pd, features=use_features))

<b>Evaluating</b>

In [24]:
xgbc_u_score_valid_df = xgbc_u_preds_valid_df.apply(lambda preds : log_loss(y_true=preds["target"], y_pred=preds[labels].to_numpy()))
xgbc_score_valid_df = xgbc_preds_valid_df.apply(lambda preds : log_loss(y_true=preds["target"], y_pred=preds[labels].to_numpy()))

# rfc_u_preds_valid_df = rfc_u_preds_valid_df.apply(lambda preds : log_loss(y_true=preds["target"], y_pred=preds[labels].to_numpy()))
# rfc_preds_valid_df = rfc_preds_valid_df.apply(lambda preds : log_loss(y_true=preds["target"], y_pred=preds[labels].to_numpy()))

In [25]:
print("xgbc_u_score_valid_df")
display(xgbc_u_score_valid_df)
print()
print("xgbc_score_valid_df")
display(xgbc_score_valid_df)
print()
# print("rfc_u_preds_valid_df")
# display(rfc_u_preds_valid_df)
# print()
# print("rfc_preds_valid_df")
# display(rfc_preds_valid_df)

xgbc_u_score_valid_df


87    1.374880
51    1.374880
53    1.350037
dtype: float64


xgbc_score_valid_df


88    1.334647
52    1.334647
53    1.397094
dtype: float64




In [26]:
final_clf = results_xgb.loc[["53"]] \
            .apply(get_xgb_algorithm, axis=1) \
            .apply(lambda xgbc : xgbc["clf"].fit(df_ttrain_pd.append(df_tvalid_pd)[xgbc["features"]], df_ttrain_pd.append(df_tvalid_pd)["target_indexed"])).item()

final_clf

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6,
              enable_categorical=False, eval_metric='logloss', gamma=0,
              gpu_id=-1, importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=60, n_jobs=12, num_parallel_tree=1,
              objective='multi:softprob', predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=0.8,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, ...)

# Selecting Missing Values Strategy
For dataset with <b>missing values</b> in features

#### Imputation

In [27]:
def predict_na_imputer(clf, df_train, df_valid, strategy='median'):
    imputer = Imputer(strategy=strategy, inputCols=use_features, outputCols=use_features).fit(df_train)
    df_valid_imputed = imputer.transform(df_valid)
    df_valid_imputed = target_indexer_transformer.transform(df_valid_imputed)
    
    df_valid_imputed = df_valid_imputed.toPandas()
    
    preds_valid_imputed = clf.predict_proba(df_valid_imputed[use_features])
    preds_valid_imputed = pd.DataFrame(preds_valid_imputed, columns=[int(c) for c in clf.classes_], index=df_valid_imputed["id"])
    
    preds_valid_imputed = preds_valid_imputed.join(df_valid_imputed.set_index("id"), how="right")

    
    return preds_valid_imputed

In [28]:
preds_tvalid_na_median_imputed = predict_na_imputer(final_clf, df_ttrain, df_tvalid_na, strategy='median')
preds_tvalid_na_mean_imputed = predict_na_imputer(final_clf, df_ttrain, df_tvalid_na, strategy='mean')

#### Filling Prediction with Global Frequency

In [29]:
def predict_na_filler(df_train, df_valid):
    proba_filler = FillProbaEstimator(strategy="global_frequency", labels=target_indexer_transformer.labels,
                                      proba_vector_col="proba").fit(df_train)

    pred_indexer = ProbaVectorToPrediction(target_transformer=target_indexer_transformer, 
                                           prediction_col="prediction",
                                           dense_vector_colname="proba")

    df_valid_proba_filled = proba_filler.transform(df_valid)

    if not "away" in df_valid_proba_filled.columns:
        df_valid_proba_filled = pred_indexer.transform(df_valid_proba_filled)
        
    df_valid_proba_filled = target_indexer_transformer.transform(df_valid_proba_filled)
    return df_valid_proba_filled

In [30]:
preds_tvalid_na_filled = predict_na_filler(df_ttrain, df_tvalid_na).toPandas()

#### Comparing missing values filling strategy

In [31]:
print(f"score_imputer (median): {len(preds_tvalid_na_median_imputed)}")
print(log_loss(preds_tvalid_na_median_imputed["target_indexed"], preds_tvalid_na_median_imputed[[0,1,2]].to_numpy()))
print()
print(f"score_imputer (mean): {len(preds_tvalid_na_mean_imputed)}")
print(log_loss(preds_tvalid_na_mean_imputed["target_indexed"], preds_tvalid_na_mean_imputed[[0,1,2]].to_numpy()))
print()
print(f"score_filler: {len(preds_tvalid_na_filled)}")
print(log_loss(preds_tvalid_na_mean_imputed["target_indexed"], preds_tvalid_na_filled["proba"].to_list()))

score_imputer (median): 661
1.1572800701238926

score_imputer (mean): 661
1.1566998537139526

score_filler: 661
1.063540489975523


# Prediction

In [71]:
preds_test = final_clf.predict_proba(df_test_pd[use_features])
preds_test = pd.DataFrame(preds_test, columns=[c for c in target_indexer_transformer.labels], index=df_test_pd["id"])

# preds_test_na = predict_na_imputer(final_clf, df_ttrain, df_test_na.toPandas())
preds_test_na = predict_na_imputer(final_clf, df_ttrain.union(df_tvalid), df_test_na, strategy='median')
preds_test_na = preds_test_na[[0,1,2]]
preds_test_na.columns = [c for c in target_indexer_transformer.labels]

In [72]:
cols = list(set(preds_test.columns).intersection(set(preds_test_na.columns)))

preds = preds_test[cols].append(preds_test_na[cols])
preds.columns = [target_indexer_transformer.labels[i] for i in range(3)]
len(preds)

72711

# Build Submission

In [75]:
preds.reset_index().to_csv("data/preds/preds57.csv", index=False, sep=",") # 

In [None]:
preds.reset_index().to_csv("data/preds/preds1313.csv", index=False, sep=",") # 

In [None]:
submission.toPandas().to_csv("data/preds/preds1212.csv", index=False, sep=",") # 

In [None]:
submission.toPandas().to_csv("data/preds/preds1111.csv", index=False, sep=",") # 1.04262 undersampling

In [None]:
submission.toPandas().to_csv("data/preds/preds1010.csv", index=False, sep=",")

In [None]:
submission.toPandas().to_csv("data/preds/preds8888.csv", index=False, sep=",")

In [None]:
# submission.toPandas().to_csv("data/preds/preds7777.csv", index=False, sep=",") #1.02029

In [None]:
# submission.toPandas().to_csv("data/preds/preds6666.csv", index=False, sep=",") #1.02059

In [None]:
# submission.toPandas().to_csv("data/preds/preds5555.csv", index=False, sep=",") #1.02231

In [None]:
missing_values_strategies = ["global_frequency", "league_frequency", "uniform_proba"]

In [None]:
def pick_rf_classifier(params):
    num_trees = params["num_trees"]
    max_depth = params["max_depth"]
    subsampling_rate = params["subsampling_rate"]
    
    rf = RandomForestClassifier(labelCol="target_indexed", 
                                predictionCol="prediction",
                                probabilityCol='proba', 
                                featuresCol="features", 
                                numTrees=num_trees,
                                maxDepth=max_depth,
                                subsamplingRate=subsampling_rate)
    
    return rf
    

def pick_input_data(undersampling):
    if undersampling:
        X = df_ttrain_undersampling
    else:
        X = df_ttrain
    
    return X

def calc_metrics(df_preds, which_dataset):                            
    preds_metrics = metrics.get_metrics(df_preds, 
                                        labelCol="target_indexed", 
                                        predictionCol="prediction", 
                                        probabilityCol="proba")
    
    preds_metrics["which_dataset"] = which_dataset
    return preds_metrics

def build_result(model_params, metrics_train, metrics_valid, data_build_params, id_data_build, labels):
    id_result = str(uuid.uuid4())
    result = model_params
    
    result["metrics_train_train"] = metrics_train
    result["metrics_train_valid"] = metrics_valid
    result["id_result"] = id_result
    result["features"] = data_build_params["use_features"]
    result["id_data_build"] = id_data_build
    result["datetime"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    result["labels"] = labels
    
    return result

missing_preds_dfs = {("ttrain", "global_frequency"): pipeline_missing_preds_global_freq.transform(df_ttrain_na),
                     ("ttrain", "league_frequency"): pipeline_missing_preds_league_freq.transform(df_ttrain_na),
                     ("ttrain", "uniform_proba"): pipeline_missing_preds_uniform.transform(df_ttrain_na),
                     ("tvalid", "global_frequency"): pipeline_missing_preds_global_freq.transform(df_tvalid_na),
                     ("tvalid", "league_frequency"): pipeline_missing_preds_league_freq.transform(df_tvalid_na),
                     ("tvalid", "uniform_proba"): pipeline_missing_preds_uniform.transform(df_tvalid_na)}

def fill_missing_proba_matches(df_label, missing_values_strategy):
    return missing_preds_dfs[(df_label, missing_values_strategy)]