# BuildSubmission
Notebook to experiment with different model configurations and store the results.

In [1]:
import os
from working_dir import set_wd
# NOTE(review): presumably `set_wd` moves the process to the project root so that the
# relative `data/...` paths used later resolve — the recorded output shows the repo directory.
set_wd()
# Echo the working directory as a sanity check.
os.getcwd()

'/home/tales/ds/kaggle/football-match-prediction'

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook: 10 GB for the driver and a
# raised `spark.sql.debug.maxToStringFields` limit.
spark = (
    SparkSession.builder
    .appName("BuildSubmission")
    .config("spark.driver.memory", "10g")
    .config("spark.sql.debug.maxToStringFields", 500)
    .getOrCreate()
)

# Only ERROR-level Spark log messages are wanted from here on.
spark.sparkContext.setLogLevel("ERROR")

22/05/10 10:53:44 WARN Utils: Your hostname, tales-samsung resolves to a loopback address: 127.0.1.1; using 192.168.0.104 instead (on interface wlxd03745e80dbf)
22/05/10 10:53:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/10 10:53:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from datetime import datetime
import uuid
import pyspark.sql.functions as f
from pyspark.sql.functions import when
from src.dao import dao, dao_processed, dao_ml
from src.utils import dflib, stats, pretties, plot, plot_domain, palette

In [4]:
# Display tweak from the project's `pretties` helpers.
# NOTE(review): presumably lifts the column limit for DataFrame display — confirm in src.utils.pretties.
pretties.max_data_frame_columns()

# Fit Data Pipeline

In [5]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import VectorIndexer #Class for indexing categorical feature columns in a dataset of Vector.
from pyspark.ml.feature import VectorAssembler #A feature transformer that merges multiple columns into a vector column.
from pyspark.ml.feature import StringIndexer #A label indexer that maps a string column of labels to an ML column of label indices.
from pyspark.ml.feature import IndexToString
from src.ml.transformers import DropNaTransformer, UndersamplingTransformer
from src.ml.estimators import FillProbaEstimator

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from src.ml import metrics

In [6]:
# Identifiers of the stored experiment result and of the processed-data build it refers to.
id_result = "22cf967a-5dc2-46fb-af40-1bee1ea7de43"
id_data_build = "1298bdd7-d506-4394-9708-3a9946ce602b"

# NOTE(review): the recorded run of this cell raised FileNotFoundError for
# data/results/result-<id_data_build>-<id_result>.json, so `best_model_params` was not
# defined by it, while a later cell reads best_model_params["undersampling"].
# Confirm the result file exists before running the notebook top to bottom.
best_model_params = dao_ml.load_result(id_data_build, id_result)
best_model_params

FileNotFoundError: [Errno 2] No such file or directory: 'data/results/result-1298bdd7-d506-4394-9708-3a9946ce602b-22cf967a-5dc2-46fb-af40-1bee1ea7de43.json'

In [8]:
# Load the train and validation splits of the processed data for this data build.
df_ttrain, df_tvalid = [
    dao_processed.load_processed_data(which_dataset=split_name, id=id_data_build, spark=spark)
    for split_name in ("train_train", "train_valid")
]

                                                                                

In [9]:
# Identifier / descriptive columns selected alongside the model features.
basic_cols = [
    "id",
    "target",
    "league_id",
    "league_name",
    "home_team_name",
    "away_team_name",
    "match_date",
]

In [10]:
# Columns assembled into the model's feature vector.
use_features = [
    "home_mood_diff",
    "away_mood_diff",
    "home_history_mood_mean",
    "away_history_mood_mean",
    "home_result_history_mean",
    "away_result_history_mean",
    "home_factor",
    "draw_factor",
]

# Packs the feature columns into a single vector column named "features".
feature_assembler_transformer = VectorAssembler(outputCol="features", inputCols=use_features)

# Fitted label indexer ("target" -> "target_indexed"). With descending alphabetical order
# the recorded outputs show home -> 0.0, draw -> 1.0, away -> 2.0.
target_indexer_transformer = StringIndexer(
    inputCol="target",
    outputCol="target_indexed",
    stringOrderType="alphabetDesc",
).fit(df_ttrain)

# Inverse mapping from the label index back to the label string.
inverter_transformer = IndexToString(
    inputCol="target_indexed",
    outputCol="indexed_to_target",
    labels=target_indexer_transformer.labels,
)

                                                                                

In [11]:
# Stage lists, one per kind of frame handled below.
pipeline_stages = [feature_assembler_transformer, target_indexer_transformer]  # features + indexed target
pipeline_stages_na = [target_indexer_transformer]  # indexed target only, no feature vector
pipeline_stages_test = [feature_assembler_transformer]  # features only

# The stages are wrapped in PipelineModel directly (no Pipeline.fit step): the indexer
# was already fitted and the assembler needs no fitting.
pipeline_model = PipelineModel(stages=pipeline_stages)
pipeline_model_na = PipelineModel(stages=pipeline_stages_na)
pipeline_model_test = PipelineModel(stages=pipeline_stages_test)

In [12]:
# Split each frame into the rows returned by `filter_any_null` (`*_na`) and the output of
# DropNaTransformer (presumably the null-free rows — confirm in src.ml.transformers).
# Order matters: each `*_na` frame must be taken BEFORE its source name is rebound on the
# following line. This also makes the cell non-idempotent: re-running it operates on the
# already transformed frames.
df_ttrain_na = dflib.filter_any_null(df_ttrain)
df_ttrain = DropNaTransformer().transform(df_ttrain)
df_tvalid_na = dflib.filter_any_null(df_tvalid)
df_tvalid = DropNaTransformer().transform(df_tvalid)

# Frames without nulls: keep basic + feature columns, add "features" and "target_indexed".
# Frames with nulls: keep only the basic columns and add "target_indexed".
df_ttrain = pipeline_model.transform(df_ttrain.select(basic_cols + use_features))
df_ttrain_na = pipeline_model_na.transform(df_ttrain_na.select(basic_cols))

df_tvalid = pipeline_model.transform(df_tvalid.select(basic_cols + use_features))
df_tvalid_na = pipeline_model_na.transform(df_tvalid_na.select(basic_cols))

DropNaTransformer
DropNaTransformer


In [13]:
# Shapes of the four frames, printed in the order: train, train-with-nulls, valid, valid-with-nulls.
for frame in (df_ttrain, df_ttrain_na, df_tvalid, df_tvalid_na):
    print(dflib.shape(frame))

                                                                                

(85423, 17)


                                                                                

(2047, 8)
(23030, 17)
(438, 8)


In [14]:
### FILLING MISSING MATCHES PROBA

def prediction_numeric(df, labels):
    """Add a numeric "prediction" column derived from the "prediction_str" column.

    Each value of "prediction_str" is mapped to the position of that label in
    `labels`, as a float. This is the same index the fitted label indexer uses when
    `labels` is its label list. Works for any number of labels.

    Parameters
    ----------
    df : DataFrame
        Frame containing a "prediction_str" column.
    labels : sequence of str
        Label strings ordered by their numeric index.

    Returns
    -------
    DataFrame
        `df` with a double "prediction" column (replacing an existing column of
        that name). Rows whose "prediction_str" matches none of the labels get null.
    """
    prediction = None
    for index, label in enumerate(labels):
        is_label = f.col("prediction_str") == label
        if prediction is None:
            prediction = f.when(is_label, float(index))
        else:
            # Chain further branches onto the same CASE WHEN expression.
            prediction = prediction.when(is_label, float(index))

    if prediction is None:
        # No labels at all: nothing can match, so the column is entirely null.
        prediction = f.lit(None).cast("double")

    return df.withColumn("prediction", prediction)

# Filled predictions per strategy: {strategy: {"ttrain": DataFrame, "tvalid": DataFrame}}.
missing_values_strategy_dict = {}

for missing_values_strategy in ["global_frequency", "league_frequency", "uniform_proba"]:
    print(missing_values_strategy)

    # Fit once per strategy and reuse the fitted model for both frames: the estimator
    # settings and the frame it is fitted on are the same for train and validation.
    # NOTE(review): the recorded "global_frequency" output shows different probabilities
    # for the two frames although both come from a fit on df_ttrain — presumably the
    # model derives the frequencies from the frame it transforms; verify in src.ml.estimators.
    fill_proba_model = FillProbaEstimator(strategy=missing_values_strategy,
                                          labels=target_indexer_transformer.labels,
                                          output_col="proba").fit(df_ttrain)

    use_df_na_proba_filled_ttrain = prediction_numeric(fill_proba_model.transform(df_ttrain_na),
                                                       target_indexer_transformer.labels)
    use_df_na_proba_filled_tvalid = prediction_numeric(fill_proba_model.transform(df_tvalid_na),
                                                       target_indexer_transformer.labels)

    # Preview a few filled rows of each frame.
    display(use_df_na_proba_filled_ttrain.limit(5).toPandas())
    display(use_df_na_proba_filled_tvalid.limit(5).toPandas())

    missing_values_strategy_dict[missing_values_strategy] = {
        "ttrain": use_df_na_proba_filled_ttrain,
        "tvalid": use_df_na_proba_filled_tvalid,
    }

global_frequency


                                                                                

Unnamed: 0,id,target,target_indexed,prediction_str,home,draw,away,proba,prediction
0,11931668,away,2.0,home,0.477772,0.186126,0.336102,"[0.4777723497801661, 0.18612603810454323, 0.33...",0.0
1,11931677,home,0.0,home,0.477772,0.186126,0.336102,"[0.4777723497801661, 0.18612603810454323, 0.33...",0.0
2,16689696,away,2.0,home,0.477772,0.186126,0.336102,"[0.4777723497801661, 0.18612603810454323, 0.33...",0.0
3,17135787,home,0.0,home,0.477772,0.186126,0.336102,"[0.4777723497801661, 0.18612603810454323, 0.33...",0.0
4,17561099,home,0.0,home,0.477772,0.186126,0.336102,"[0.4777723497801661, 0.18612603810454323, 0.33...",0.0


Unnamed: 0,id,target,target_indexed,prediction_str,home,draw,away,proba,prediction
0,17871789,draw,1.0,home,0.449772,0.184932,0.365297,"[0.4497716894977169, 0.18493150684931506, 0.36...",0.0
1,17747561,home,0.0,home,0.449772,0.184932,0.365297,"[0.4497716894977169, 0.18493150684931506, 0.36...",0.0
2,17747564,away,2.0,home,0.449772,0.184932,0.365297,"[0.4497716894977169, 0.18493150684931506, 0.36...",0.0
3,17747566,home,0.0,home,0.449772,0.184932,0.365297,"[0.4497716894977169, 0.18493150684931506, 0.36...",0.0
4,17871768,away,2.0,home,0.449772,0.184932,0.365297,"[0.4497716894977169, 0.18493150684931506, 0.36...",0.0


league_frequency


                                                                                

Unnamed: 0,id,target,target_indexed,prediction_str,home,draw,away,proba,prediction
0,11983072,home,0.0,home,0.4,0.4,0.2,"[0.4, 0.4, 0.2]",0.0
1,11993775,away,2.0,away,0.222222,0.333333,0.444444,"[0.2222222222222222, 0.3333333333333333, 0.444...",2.0
2,11993776,draw,1.0,away,0.222222,0.333333,0.444444,"[0.2222222222222222, 0.3333333333333333, 0.444...",2.0
3,11993780,home,0.0,away,0.222222,0.333333,0.444444,"[0.2222222222222222, 0.3333333333333333, 0.444...",2.0
4,17656653,draw,1.0,home,0.625,0.25,0.125,"[0.625, 0.25, 0.125]",0.0


Unnamed: 0,id,target,target_indexed,prediction_str,home,draw,away,proba,prediction
0,17878064,draw,1.0,home,0.428571,0.428571,0.142857,"[0.42857142857142855, 0.42857142857142855, 0.1...",0.0
1,17878070,home,0.0,home,0.428571,0.428571,0.142857,"[0.42857142857142855, 0.42857142857142855, 0.1...",0.0
2,17898913,home,0.0,home,0.428571,0.428571,0.142857,"[0.42857142857142855, 0.42857142857142855, 0.1...",0.0
3,17898918,home,0.0,home,0.428571,0.428571,0.142857,"[0.42857142857142855, 0.42857142857142855, 0.1...",0.0
4,17988395,draw,1.0,home,0.428571,0.428571,0.142857,"[0.42857142857142855, 0.42857142857142855, 0.1...",0.0


uniform_proba


Unnamed: 0,id,target,target_indexed,prediction_str,home,draw,away,proba,prediction
0,11931668,away,2.0,home,0.333333,0.333333,0.333333,"[0.33333334, 0.33333333, 0.33333333]",0.0
1,11931677,home,0.0,home,0.333333,0.333333,0.333333,"[0.33333334, 0.33333333, 0.33333333]",0.0
2,16689696,away,2.0,home,0.333333,0.333333,0.333333,"[0.33333334, 0.33333333, 0.33333333]",0.0
3,17135787,home,0.0,home,0.333333,0.333333,0.333333,"[0.33333334, 0.33333333, 0.33333333]",0.0
4,17561099,home,0.0,home,0.333333,0.333333,0.333333,"[0.33333334, 0.33333333, 0.33333333]",0.0


Unnamed: 0,id,target,target_indexed,prediction_str,home,draw,away,proba,prediction
0,17871789,draw,1.0,home,0.333333,0.333333,0.333333,"[0.33333334, 0.33333333, 0.33333333]",0.0
1,17747561,home,0.0,home,0.333333,0.333333,0.333333,"[0.33333334, 0.33333333, 0.33333333]",0.0
2,17747564,away,2.0,home,0.333333,0.333333,0.333333,"[0.33333334, 0.33333333, 0.33333333]",0.0
3,17747566,home,0.0,home,0.333333,0.333333,0.333333,"[0.33333334, 0.33333333, 0.33333333]",0.0
4,17871768,away,2.0,home,0.333333,0.333333,0.333333,"[0.33333334, 0.33333333, 0.33333333]",0.0


In [15]:
# Undersampled copy of the training frame; it is only used when the selected parameters
# enable undersampling.
# NOTE(review): presumably a random, class-balancing sample on "target" — confirm whether
# UndersamplingTransformer is seeded, otherwise the row count changes between runs.
df_ttrain_undersampling = UndersamplingTransformer(target_colname="target").transform(df_ttrain)

UndersamplingTransformer


                                                                                

# Build Submission

In [16]:
# When True, the final model and the missing-value probabilities are fitted on
# train + validation together; when False, on the training split only.
full_train = True

In [17]:
# best_model_params = {}

# missing_values_strategy = best_model_params["missing_values_strategy"]
# undersampling = best_model_params["undersampling"]
# num_trees = best_model_params["num_trees"]
# max_depth = best_model_params["max_depth"]
# subsampling_rate = best_model_params["subsampling_rate"]



In [18]:
# Choose the training frames according to the selected undersampling setting:
# `use_df_ttrain` is the training split alone, `use_df_train_complete` is train + validation.
if not best_model_params["undersampling"]:
    use_df_ttrain = df_ttrain
    use_df_train_complete = df_ttrain.union(df_tvalid)
else:
    use_df_ttrain = df_ttrain_undersampling
    use_df_train_complete = UndersamplingTransformer(target_colname="target").transform(
        df_ttrain.union(df_tvalid)
    )

use_df_train_complete = DropNaTransformer().transform(use_df_train_complete)

for frame in (use_df_ttrain, use_df_train_complete):
    print(dflib.shape(frame))

UndersamplingTransformer


                                                                                

DropNaTransformer


                                                                                

(63543, 17)




(81567, 17)


                                                                                

In [21]:
df_test = dao_processed.load_processed_data(which_dataset="test", id=id_data_build, spark=spark)

# printSchema() prints the schema itself and returns None, so it is not wrapped in print()
# (the recorded output shows the stray "None").
df_test.printSchema()
df_test.select(use_features).limit(5).show()

# The test set has no label. Derive the column list instead of removing "target" from
# `basic_cols` in place, so cells that select `basic_cols` on labelled data are unaffected.
basic_cols_test = [colname for colname in basic_cols if colname != "target"]

# Rows with any null get frequency-based probabilities; the remaining rows go to the model.
# `df_test_na` must be taken before `df_test` is rebound on the next line.
df_test_na = dflib.filter_any_null(df_test)
df_test = DropNaTransformer().transform(df_test)

if full_train:
    fill_proba_X = df_ttrain.union(df_tvalid)
else:
    fill_proba_X = df_ttrain

# NOTE(review): the recorded run of this cell ended in
# "AnalysisException: cannot resolve 'target'", with a plan that aggregates by `target`
# over the null-filtered test frame. That points at transform(df_test_na) below —
# presumably FillProbaEstimator's model needs a `target` column on the frame it
# transforms; verify in src.ml.estimators before relying on this cell.
print("FillProbaEstimator")
use_df_na_proba_filled_test = FillProbaEstimator(strategy="global_frequency",
                                                 labels=target_indexer_transformer.labels,
                                                 output_col="proba").fit(fill_proba_X).transform(df_test_na)

use_df_na_proba_filled_test = prediction_numeric(use_df_na_proba_filled_test, target_indexer_transformer.labels)

df_test_na = pipeline_model_na.transform(df_test_na.select(basic_cols_test))
# `pipeline_model_test` consists of the feature assembler only, so a single pass builds
# the "features" column (a separate assembler call beforehand would be discarded by the select).
df_test = pipeline_model_test.transform(df_test.select(basic_cols_test + use_features))

root
 |-- id: string (nullable = true)
 |-- home_team_name: string (nullable = true)
 |-- away_team_name: string (nullable = true)
 |-- match_date: date (nullable = true)
 |-- league_name: string (nullable = true)
 |-- league_id: string (nullable = true)
 |-- home_mood_diff: float (nullable = true)
 |-- away_mood_diff: float (nullable = true)
 |-- home_history_mood_mean: float (nullable = true)
 |-- away_history_mood_mean: float (nullable = true)
 |-- home_result_history_mean: float (nullable = true)
 |-- away_result_history_mean: float (nullable = true)
 |-- home_factor: float (nullable = true)
 |-- draw_factor: float (nullable = true)

None
+--------------+--------------+----------------------+----------------------+------------------------+------------------------+-----------+-----------+
|home_mood_diff|away_mood_diff|home_history_mood_mean|away_history_mood_mean|home_result_history_mean|away_result_history_mean|home_factor|draw_factor|
+--------------+--------------+--------------

                                                                                

AnalysisException: cannot resolve 'target' given input columns: [away_history_mood_mean, away_mood_diff, away_result_history_mean, away_team_name, draw_factor, home_factor, home_history_mood_mean, home_mood_diff, home_result_history_mean, home_team_name, id, league_id, league_name, match_date];
'Aggregate ['target], ['target, count(1) AS Absolute#5686L]
+- Filter (((((((((((((isnull(id#5310) OR isnull(home_team_name#5311)) OR isnull(away_team_name#5312)) OR isnull(match_date#5313)) OR isnull(league_name#5314)) OR isnull(league_id#5315)) OR isnull(home_mood_diff#5316)) OR isnull(away_mood_diff#5317)) OR isnull(home_history_mood_mean#5318)) OR isnull(away_history_mood_mean#5319)) OR isnull(home_result_history_mean#5320)) OR isnull(away_result_history_mean#5321)) OR isnull(home_factor#5322)) OR isnull(draw_factor#5323))
   +- Relation [id#5310,home_team_name#5311,away_team_name#5312,match_date#5313,league_name#5314,league_id#5315,home_mood_diff#5316,away_mood_diff#5317,home_history_mood_mean#5318,away_history_mood_mean#5319,home_result_history_mean#5320,away_result_history_mean#5321,home_factor#5322,draw_factor#5323] csv


In [None]:
# Shapes of the test rows with nulls and of the complete test rows, in that order.
for frame in (df_test_na, df_test):
    print(dflib.shape(frame))

In [None]:
# NOTE(review): this cell repeats the shape check of the preceding cell verbatim;
# it can probably be removed.
print(dflib.shape(df_test_na))
print(dflib.shape(df_test))

In [None]:
# NOTE(review): the hyper-parameters are hard-coded here rather than read from
# `best_model_params` — confirm they match the selected result.
rf = RandomForestClassifier(labelCol="target_indexed",
                            probabilityCol='proba',
                            featuresCol="features",
                            numTrees=60,
                            maxDepth=7,
                            subsamplingRate=0.7,
                            # Explicit seed so the classifier's seed parameter does not
                            # depend on a per-session default.
                            seed=42)

# Train on train + validation when `full_train` is set, otherwise on the training split only.
X = use_df_train_complete if full_train else use_df_ttrain

print("training")
model = rf.fit(X)

In [None]:
# Score the model on the same frame it was fitted on (an in-sample check, not a
# hold-out estimate) and compute the project's metrics from the result.
X_scored = model.transform(X)
X_preds = X_scored.select(["id", "prediction", "proba", "target", "target_indexed"])

metrics_X = metrics.get_metrics(
    X_preds,
    labelCol="target_indexed",
    predictionCol="prediction",
    probabilityCol="proba",
)

In [None]:
# Display the in-sample metrics.
metrics_X

In [None]:
# Model predictions for the complete test rows, followed by the frequency-based fill-ins
# for the rows with missing values. `union` matches columns by position, so both sides
# select the same columns in the same order.
submission_cols = ["id", "prediction", "proba"]
preds = model.transform(df_test).select(submission_cols).union(
    use_df_na_proba_filled_test.select(submission_cols)
)

In [None]:
# Shape of the combined predictions (model rows + filled rows).
dflib.shape(preds)

In [None]:
# Check the column types of the combined predictions.
preds.printSchema()

In [None]:
# NOTE(review): same shape check as two cells earlier; probably redundant.
dflib.shape(preds)

In [None]:
# Preview the first rows of the combined predictions.
preds.limit(5).show()

In [None]:
from pyspark.ml.functions import vector_to_array


def build_submission(df, labels, proba_colname):
    """Expand a probability-vector column into one column per label.

    Parameters
    ----------
    df : DataFrame
        Frame with an "id" column and a vector column named `proba_colname`.
    labels : sequence of str
        Label names; the i-th label names the i-th entry of the probability vector.
    proba_colname : str
        Name of the vector column holding the class probabilities.

    Returns
    -------
    DataFrame
        Frame with the columns "id" followed by one column per label, in label order.
    """
    # Read the column named by `proba_colname` (not a fixed "proba") and emit one output
    # column per entry of `labels` (not a fixed three).
    proba_array = vector_to_array(f.col(proba_colname))
    label_columns = [proba_array[position].alias(label)
                     for position, label in enumerate(labels)]
    return df.select("id", *label_columns)

# One probability column per label, named and ordered by the fitted indexer's labels.
submission = build_submission(df=preds, labels=target_indexer_transformer.labels, proba_colname="proba")
# Preview the first rows of the submission frame.
submission.limit(5).show()

In [None]:
# dao_ml.save_preds(preds_df=submission, metadata=[])

In [None]:
# Collect the submission to the driver as a pandas frame and write it as CSV.
# NOTE(review): the output path is hard-coded, so each run overwrites the same file.
submission.toPandas().to_csv("data/preds/preds4444.csv", index=False, sep=",")

In [None]:
# Number of rows in the submission.
submission.count()