# ModelSelection - Experiment
Notebook to experiment with different model configurations and store the results.

In [1]:
import os
from working_dir import set_wd
# Switch the process working directory through the project helper
# (presumably to the repository root — confirm in working_dir.set_wd).
set_wd()
# Last expression of the cell: echoes the resulting working directory as the cell output.
os.getcwd()

'/home/tales/ds/kaggle/football-match-prediction'

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook:
#   - spark.sql.debug.maxToStringFields = 500
#   - spark.driver.memory = 14g
spark = (
    SparkSession.builder
    .config("spark.sql.debug.maxToStringFields", 500)
    .config("spark.driver.memory", "14g")
    .appName("ModelSelection-Experiment")
    .getOrCreate()
)

# Restrict Spark's console logging to ERROR level for the rest of the session.
spark.sparkContext.setLogLevel("ERROR")

22/05/09 11:31:13 WARN Utils: Your hostname, tales-samsung resolves to a loopback address: 127.0.1.1; using 192.168.0.104 instead (on interface wlxd03745e80dbf)
22/05/09 11:31:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/09 11:31:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from datetime import datetime
import uuid
import pyspark.sql.functions as f
from pyspark.sql.functions import when
from src.dao import dao, dao_processed, dao_ml
from src.utils import dflib, stats, pretties, plot, plot_domain, palette

In [4]:
# Display setup via the project's `pretties` helper — presumably raises the number of
# DataFrame columns shown in cell outputs; confirm in src.utils.pretties.
pretties.max_data_frame_columns()

# Loading Data

In [5]:
# Id of the processed-data build: used below to load the train/valid splits and their
# metadata, and passed to dao_ml.save_result as `id_data_build`.
df_id = "8cf83372-54d4-4bf2-a1ee-5036ab694faa"

In [6]:
# Load both splits of the processed build identified by `df_id`
# (first "train_train", then "train_valid").
df_ttrain, df_tvalid = (
    dao_processed.load_processed_data(which_dataset=split_name, id=df_id, spark=spark)
    for split_name in ("train_train", "train_valid")
)

# Metadata stored with the same build; its "use_features" entry is read further down.
metadata_json = dao_processed.load_processed_metadata(id=df_id)

# Report the shape of each split as computed by dflib.shape.
print(f"df_ttrain shape: {dflib.shape(df_ttrain)}")
print(f"df_tvalid shape: {dflib.shape(df_tvalid)}")

                                                                                

df_ttrain shape: (87470, 14)
df_tvalid shape: (23468, 14)


# Fit Data Pipeline

In [7]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import VectorIndexer #Class for indexing categorical feature columns in a dataset of Vector.
from pyspark.ml.feature import VectorAssembler #A feature transformer that merges multiple columns into a vector column.
from pyspark.ml.feature import StringIndexer #A label indexer that maps a string column of labels to an ML column of label indices.
from pyspark.ml.feature import IndexToString
from src.ml.transformers import DropNaTransformer, UndersamplingTransformer

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from src.ml import metrics

In [8]:
# Feature column names, taken from the processed-data metadata.
use_features = metadata_json["use_features"]

# Merge the individual feature columns into a single vector column named "features".
feature_assembler_transformer = VectorAssembler(
    inputCols=use_features,
    outputCol="features",
)

# Fit the label indexer on the training split: "target" -> "target_indexed",
# with label indices assigned in descending alphabetical order of the label strings.
target_indexer_transformer = StringIndexer(
    inputCol="target",
    outputCol="target_indexed",
    stringOrderType="alphabetDesc",
).fit(df_ttrain)

# Inverse mapping (index -> original label string), using the labels learned above.
inverter_transformer = IndexToString(
    inputCol="target_indexed",
    outputCol="indexed_to_target",
    labels=target_indexer_transformer.labels,
)

In [9]:
# Stages are applied in list order when the pipeline model transforms a DataFrame.
pipeline_stages = [
    DropNaTransformer(),            # project transformer from src.ml.transformers
    feature_assembler_transformer,  # feature columns -> "features" vector
    target_indexer_transformer,     # fitted indexer: "target" -> "target_indexed"
]

# Built directly as a PipelineModel (no Pipeline.fit call): the indexer stage was
# already fitted on df_ttrain when it was defined.
pipeline_model = PipelineModel(stages=pipeline_stages)

In [10]:
# Columns fed to the data pipeline: the label plus the selected features.
pipeline_input_columns = ["target"] + use_features

# Training split: run the data pipeline, then derive an undersampled variant.
# (Statement order is kept as-is: the project transformers print their names when run.)
df_ttrain = pipeline_model.transform(df_ttrain.select(pipeline_input_columns))
df_ttrain_undersampling = UndersamplingTransformer(target_colname="target").transform(df_ttrain)

# Validation split: same treatment, including its own undersampled variant.
df_tvalid = pipeline_model.transform(df_tvalid.select(pipeline_input_columns))
df_tvalid_undersampling = UndersamplingTransformer(target_colname="target").transform(df_tvalid)

DropNaTransformer
UndersamplingTransformer


                                                                                

DropNaTransformer
UndersamplingTransformer


In [11]:
# Full hyper-parameter grid, one dict per configuration.
# Iteration order (outermost -> innermost): missing-values strategy, undersampling,
# number of trees, max depth, subsampling rate — the same order as the dict keys.
experiment_params = [
    {
        'missing_values_strategy': strategy,
        'undersampling': use_undersampling,
        'num_trees': n_trees,
        'max_depth': depth,
        'subsampling_rate': rate,
    }
    for strategy in ["dropna"]
    for use_undersampling in (True, False)
    for n_trees in (60, 25, 10)
    for depth in (15, 13, 10, 8, 5, 1, 0)
    for rate in (0.8, 0.65, 0.5, 0.3)
]

print(f'n params: {len(experiment_params)}')

n params: 168


In [None]:
# Number of leading grid entries to skip. 84 is exactly the `undersampling=True` half of
# the grid (presumably already stored by an earlier run — confirm); set to 0 to run
# the whole grid.
N_SKIP_CONFIGS = 84

# Train, evaluate and persist one random forest per remaining grid entry.
for counter, params in enumerate(experiment_params, start=1):
    if counter <= N_SKIP_CONFIGS:
        continue
    print(f"n: {counter}")
    display(params)

    undersampling = params["undersampling"]
    num_trees = params["num_trees"]
    max_depth = params["max_depth"]
    subsampling_rate = params["subsampling_rate"]

    # Pick the undersampled or the full variant of both splits.
    if undersampling:
        use_df_train = df_ttrain_undersampling
        use_df_valid = df_tvalid_undersampling
    else:
        use_df_train = df_ttrain
        use_df_valid = df_tvalid

    ### TRAINING
    rf = RandomForestClassifier(labelCol="target_indexed",
                                probabilityCol='proba',
                                featuresCol="features",
                                numTrees=num_trees,
                                maxDepth=max_depth,
                                subsamplingRate=subsampling_rate)

    model = rf.fit(use_df_train)

    ### TRAIN METRICS
    train_preds = model.transform(use_df_train)
    metrics_train_train = metrics.get_metrics(train_preds,
                                              labelCol="target_indexed",
                                              predictionCol="prediction",
                                              probabilityCol="proba")
    metrics_train_train["which_dataset"] = "train_train"

    ### VALIDATION METRICS
    valid_preds = model.transform(use_df_valid)
    metrics_train_valid = metrics.get_metrics(valid_preds,
                                              labelCol="target_indexed",
                                              predictionCol="prediction",
                                              probabilityCol="proba")
    metrics_train_valid["which_dataset"] = "train_valid"

    ### BUILDING RESULTS
    id_model = str(uuid.uuid4())
    # Build a NEW dict instead of aliasing `params`: the grid entries in
    # `experiment_params` must stay pure hyper-parameter dicts, so re-running this
    # cell (or inspecting the grid later) does not see metrics/ids from a previous run.
    # Key order matches the previous layout: params first, then the run information.
    result = {
        **params,
        "metrics_train_train": metrics_train_train,
        "metrics_train_valid": metrics_train_valid,
        "id_model": id_model,
        "datetime": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    print("accuracy train:", metrics_train_train["accuracy"])
    print("accuracy valid:", metrics_train_valid["accuracy"])
    print("log loss train:", metrics_train_train["log_loss"])
    print("log loss valid:", metrics_train_valid["log_loss"])

    # Persist the result record together with the fitted model, keyed by the data build id.
    dao_ml.save_result(result=result, id_data_build=df_id, model=model, id_model=id_model)

    pretties.hr()

n: 85


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 15,
 'subsampling_rate': 0.8}

                                                                                

accuracy train: 0.7115096460011238
accuracy valid: 0.492748588797221
log loss train: 0.7743561909621416
log loss valid: 1.0129436194326276


                                                                                

n: 86


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 15,
 'subsampling_rate': 0.65}

                                                                                

accuracy train: 0.6952495785727665
accuracy valid: 0.4932262266608771
log loss train: 0.790303404605915
log loss valid: 1.0119142020585494


                                                                                

n: 87


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 15,
 'subsampling_rate': 0.5}

                                                                                

accuracy train: 0.6773155085221951
accuracy valid: 0.4905775075987842
log loss train: 0.8086759684707814
log loss valid: 1.0120113479539867


                                                                                

n: 88


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 15,
 'subsampling_rate': 0.3}

                                                                                

accuracy train: 0.6390475744521446
accuracy valid: 0.4911419887103778
log loss train: 0.8466097935853455
log loss valid: 1.0113579452063017


                                                                                

n: 89


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 13,
 'subsampling_rate': 0.8}

                                                                                

accuracy train: 0.6104841730661172
accuracy valid: 0.49674337820234477
log loss train: 0.8755432307156645
log loss valid: 1.0047672467639754


                                                                                

n: 90


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 13,
 'subsampling_rate': 0.65}

                                                                                

accuracy train: 0.6029921333583068
accuracy valid: 0.49756838905775075
log loss train: 0.8829362037449108
log loss valid: 1.0053853041601049


                                                                                

n: 91


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 13,
 'subsampling_rate': 0.5}

                                                                                

accuracy train: 0.5943762876943248
accuracy valid: 0.4968302214502822
log loss train: 0.8912097251001955
log loss valid: 1.0054855287585596


                                                                                

n: 92


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 13,
 'subsampling_rate': 0.3}

                                                                                

accuracy train: 0.579240026222139
accuracy valid: 0.49782891880156316
log loss train: 0.9092179912404662
log loss valid: 1.005893665942458


                                                                                

n: 93


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 10,
 'subsampling_rate': 0.8}

                                                                                

accuracy train: 0.5266786851470313
accuracy valid: 0.5001736864958749
log loss train: 0.9720035073644077
log loss valid: 1.000051821516461


n: 94


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 10,
 'subsampling_rate': 0.65}

                                                                                

accuracy train: 0.5249812699007305
accuracy valid: 0.5005644811115936
log loss train: 0.9729724525685246
log loss valid: 1.0001829077146402


n: 95


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 10,
 'subsampling_rate': 0.5}

                                                                                

accuracy train: 0.5248642067802959
accuracy valid: 0.5010855405992184
log loss train: 0.9740169557804197
log loss valid: 0.9999332769683708


n: 96


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 10,
 'subsampling_rate': 0.3}

                                                                                

accuracy train: 0.5215513204719985
accuracy valid: 0.5029960920538428
log loss train: 0.9765634197535521
log loss valid: 0.9997199695290993


n: 97


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 8,
 'subsampling_rate': 0.8}

                                                                                

accuracy train: 0.5038279640382094
accuracy valid: 0.4997394702561876
log loss train: 1.0002293658090156
log loss valid: 1.0020063699528499


n: 98


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 8,
 'subsampling_rate': 0.65}

                                                                                

accuracy train: 0.5035704251732535
accuracy valid: 0.49913156752062526
log loss train: 1.0000330919572826
log loss valid: 1.0018102571967142


n: 99


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 8,
 'subsampling_rate': 0.5}

                                                                                

accuracy train: 0.5038045514141225
accuracy valid: 0.4998697351280938
log loss train: 1.0000015689015753
log loss valid: 1.0017039286702698


n: 100


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 8,
 'subsampling_rate': 0.3}

[Stage 1242:>                                                       (0 + 2) / 2]

accuracy train: 0.5040855029031653
accuracy valid: 0.5003039513677812
log loss train: 1.000053579688342
log loss valid: 1.0013823662204935


                                                                                

n: 101


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 5,
 'subsampling_rate': 0.8}

                                                                                

accuracy train: 0.4934093463195355
accuracy valid: 0.49409465914025186
log loss train: 1.0221916681494858
log loss valid: 1.0190241847411907


n: 102


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 5,
 'subsampling_rate': 0.65}

                                                                                

accuracy train: 0.49420537553849037
accuracy valid: 0.4950933564915328
log loss train: 1.0213502227784483
log loss valid: 1.0178810065791009


n: 103


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 5,
 'subsampling_rate': 0.5}

                                                                                

accuracy train: 0.4938190672410564
accuracy valid: 0.49305254016500216
log loss train: 1.0206513219674194
log loss valid: 1.0170977497747173


n: 104


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 5,
 'subsampling_rate': 0.3}

                                                                                

accuracy train: 0.4932454579509271
accuracy valid: 0.4947459834997829
log loss train: 1.0195460765547575
log loss valid: 1.015452159449197


n: 105


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 1,
 'subsampling_rate': 0.8}

                                                                                

accuracy train: 0.48441889867016297
accuracy valid: 0.48684324793747286
log loss train: 1.0440399491365906
log loss valid: 1.0459055152407757


n: 106


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 1,
 'subsampling_rate': 0.65}

                                                                                

accuracy train: 0.4859290129237685
accuracy valid: 0.48975249674337823
log loss train: 1.0438792638983598
log loss valid: 1.0458322551213963


n: 107


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 1,
 'subsampling_rate': 0.5}

                                                                                

accuracy train: 0.4847583817194231
accuracy valid: 0.48805905340859745
log loss train: 1.0433373578627265
log loss valid: 1.0452202936776689


n: 108


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 1,
 'subsampling_rate': 0.3}

                                                                                

accuracy train: 0.4863504401573328
accuracy valid: 0.4897959183673469
log loss train: 1.0434965769110363
log loss valid: 1.045216478356744


n: 109


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 0,
 'subsampling_rate': 0.8}

                                                                                

accuracy train: 0.4346670724854842
accuracy valid: 0.4258358662613982
log loss train: 1.0721707254550683
log loss valid: 1.0781476904737057


n: 110


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 0,
 'subsampling_rate': 0.65}

                                                                                

accuracy train: 0.4346670724854842
accuracy valid: 0.4258358662613982
log loss train: 1.0721719361550206
log loss valid: 1.0781793613767867


n: 111


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 0,
 'subsampling_rate': 0.5}

                                                                                

accuracy train: 0.4346670724854842
accuracy valid: 0.4258358662613982
log loss train: 1.0721715149310516
log loss valid: 1.0781576905947834


n: 112


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 60,
 'max_depth': 0,
 'subsampling_rate': 0.3}

                                                                                

accuracy train: 0.4346670724854842
accuracy valid: 0.4258358662613982
log loss train: 1.0721706859011626
log loss valid: 1.0781326817804422


n: 113


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 15,
 'subsampling_rate': 0.8}

                                                                                

accuracy train: 0.7071080726727852
accuracy valid: 0.487320885801129
log loss train: 0.7757632434859673
log loss valid: 1.0216908182746947


                                                                                

n: 114


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 15,
 'subsampling_rate': 0.65}

                                                                                

accuracy train: 0.6898295560966473
accuracy valid: 0.4883630047763786
log loss train: 0.7933842113183842
log loss valid: 1.020715043445037


[Stage 2108:>                                                       (0 + 1) / 1]                                                                                

n: 115


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 15,
 'subsampling_rate': 0.5}

                                                                                

accuracy train: 0.6704321970406443
accuracy valid: 0.48554059921841075
log loss train: 0.8116217755870053
log loss valid: 1.0215972286519857


                                                                                

n: 116


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 15,
 'subsampling_rate': 0.3}

                                                                                

accuracy train: 0.6249531747518262
accuracy valid: 0.4830655666521928
log loss train: 0.8565524917056859
log loss valid: 1.0245188623157162


n: 117


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 13,
 'subsampling_rate': 0.8}

                                                                                

accuracy train: 0.6103554036336393
accuracy valid: 0.49400781589231435
log loss train: 0.8771674182587251
log loss valid: 1.0096300837476442


n: 118


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 13,
 'subsampling_rate': 0.65}

                                                                                

accuracy train: 0.6021492788911781
accuracy valid: 0.4941815023881893
log loss train: 0.8861264826110644
log loss valid: 1.0097706030529812


n: 119


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 13,
 'subsampling_rate': 0.5}

                                                                                

accuracy train: 0.5926320471998502
accuracy valid: 0.4920538428137212
log loss train: 0.8935520829361395
log loss valid: 1.0103284528424494


n: 120


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 13,
 'subsampling_rate': 0.3}

                                                                                

accuracy train: 0.5706944184304177
accuracy valid: 0.49161962657403385
log loss train: 0.9160593747861606
log loss valid: 1.013383153810867


n: 121


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 10,
 'subsampling_rate': 0.8}

                                                                                

accuracy train: 0.5255782918149466
accuracy valid: 0.49926183239253147
log loss train: 0.9726487835374642
log loss valid: 1.000705459935408


n: 122


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 10,
 'subsampling_rate': 0.65}

                                                                                

accuracy train: 0.5237404008241243
accuracy valid: 0.5011723838471559
log loss train: 0.9741093306299855
log loss valid: 1.002187986938873


n: 123


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 10,
 'subsampling_rate': 0.5}

                                                                                

accuracy train: 0.5243725416744709
accuracy valid: 0.49848024316109424
log loss train: 0.9751109099028001
log loss valid: 1.002227721308177


n: 124


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 10,
 'subsampling_rate': 0.3}

                                                                                

accuracy train: 0.5182852594118749
accuracy valid: 0.49795918367346936
log loss train: 0.9794191150076506
log loss valid: 1.0039914244144132


n: 125


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 8,
 'subsampling_rate': 0.8}

                                                                                

accuracy train: 0.5040972092152088
accuracy valid: 0.4982631350412505
log loss train: 1.0003990107090286
log loss valid: 1.0023880777335816


n: 126


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 8,
 'subsampling_rate': 0.65}

                                                                                

accuracy train: 0.5047176437535119
accuracy valid: 0.4981762917933131
log loss train: 1.0007578799972232
log loss valid: 1.002785841107004


n: 127


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 8,
 'subsampling_rate': 0.5}

                                                                                

accuracy train: 0.5040035587188612
accuracy valid: 0.49791576204950067
log loss train: 1.00061355849824
log loss valid: 1.0025960684099304


n: 128


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 8,
 'subsampling_rate': 0.3}

                                                                                

accuracy train: 0.5028680464506462
accuracy valid: 0.4989578810247503
log loss train: 1.0010471001637657
log loss valid: 1.0032787747669858


                                                                                

n: 129


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 5,
 'subsampling_rate': 0.8}

                                                                                

accuracy train: 0.49481410376474994
accuracy valid: 0.49491966999565784
log loss train: 1.0213216950054602
log loss valid: 1.0179244672214287


n: 130


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 5,
 'subsampling_rate': 0.65}

                                                                                

accuracy train: 0.4933039895111444
accuracy valid: 0.4928354320451585
log loss train: 1.0210764748270196
log loss valid: 1.0172670857185149


n: 131


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 5,
 'subsampling_rate': 0.5}

                                                                                

accuracy train: 0.4941585502903165
accuracy valid: 0.49331306990881457
log loss train: 1.021000929743481
log loss valid: 1.0172820570809182


n: 132


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 5,
 'subsampling_rate': 0.3}

                                                                                

accuracy train: 0.4943809702191422
accuracy valid: 0.494398610508033
log loss train: 1.019498042802492
log loss valid: 1.015462984388942


n: 133


{'missing_values_strategy': 'dropna',
 'undersampling': False,
 'num_trees': 25,
 'max_depth': 1,
 'subsampling_rate': 0.8}

                                                                                