# Appendix - FillProba

In [None]:
import os
from working_dir import set_wd
# Call the project-local working_dir helper before any `src.*` import below.
# NOTE(review): presumably this moves the process to the repository root so
# that `src` is importable and data paths resolve — confirm in working_dir.
set_wd()
# Echo the current working directory as the cell output.
os.getcwd()

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook: console progress bars
# are disabled, the maxToStringFields debug limit is set to 500 and the driver
# is given 12g of memory.
spark = (
    SparkSession.builder
    .config('spark.ui.showConsoleProgress', 'false')
    .config("spark.sql.debug.maxToStringFields", 500)
    .config("spark.driver.memory", "12g")
    .appName("FillProba")
    .getOrCreate()
)

In [None]:
import numpy as np
import pyspark.sql.functions as f
from pyspark.ml.feature import VectorAssembler
from src.ml.estimators import FillProbaEstimator
from src.dao import dao, dao_processed, dao_raw, dao_interim
from src.utils import dflib, stats, pretties, plot, plot_domain, palette

In [None]:
# Apply the project's display helper.
# NOTE(review): presumably lifts the column limit when rendering data frames —
# confirm in src.utils.pretties.
pretties.max_data_frame_columns()

# Loading Data

In [None]:
# Train split of the training data: every FillProbaEstimator below is fitted on it.
df_ttrain = dao_interim.load_train_train_data(spark)

# Validation split of the training data, reduced through dflib.sample(..., 2000)
# (presumably to about 2000 rows — confirm in src.utils.dflib) to keep the
# demonstration cells fast.
df_tvalid = dflib.sample(dao_interim.load_train_valid_data(spark), 2000)

# FillProbaEstimator

In [None]:
def dummy_classifier(df, labels, output_col, seed=200):
    """Simulate a classifier whose probability vector is missing for some rows.

    The input is randomly split 80/20. The larger part receives one constant
    fake probability per label (a random value in [0.01, 0.99], not normalised),
    assembled into a single vector column. The smaller part receives a null in
    that column. Both parts are then stacked back together.

    Args:
        df: Spark DataFrame to decorate.
        labels: Names of the classes; one temporary column per label is created
            and dropped again after the vector is assembled.
        output_col: Name of the resulting probability vector column.
        seed: Seed used for both the row split and the fake probabilities, so
            repeated runs give the same output. Pass None for a
            non-reproducible run.

    Returns:
        A DataFrame with the original columns plus ``output_col``, which is
        null for the rows of the 20% split.
    """
    df_scored, df_missing = df.randomSplit(weights=[0.8, 0.2], seed=seed)

    # A dedicated generator keeps the fake probabilities reproducible and
    # leaves the global numpy random state untouched.
    rng = np.random.RandomState(seed)
    for label in labels:
        fake_proba = int(rng.randint(low=1, high=100)) / 100
        df_scored = df_scored.withColumn(label, f.lit(fake_proba))

    feature_assembler_transformer = VectorAssembler(inputCols=labels,
                                                    outputCol=output_col)

    # Collapse the per-label columns into one vector and drop the temporaries,
    # so both parts end up with the same column layout before the union.
    df_scored = feature_assembler_transformer.transform(df_scored).drop(*labels)
    df_missing = df_missing.withColumn(output_col, f.lit(None))

    return df_scored.union(df_missing)

In [None]:
# Replace the validation sample with the dummy classifier's output: a "proba"
# vector column built from the home/draw/away labels, left null for the share
# of rows that dummy_classifier holds out.
df_tvalid = dummy_classifier(df_tvalid, labels=["home", "draw", "away"], output_col="proba")

### Uniform Proba

In [None]:
# Fit the "uniform_proba" strategy on the training split and echo the fitted
# transformer as the cell output.
fill_proba_transfomer = (
    FillProbaEstimator(
        strategy="uniform_proba",
        labels=["home", "draw", "away"],
        proba_vector_col="proba",
    )
    .fit(df_ttrain)
)
fill_proba_transfomer

In [None]:
# Shape of the validation sample before the transform, for comparison with the
# transformed output below.
dflib.shape(df_tvalid)

In [None]:
# Apply the fitted transformer to the validation sample.
# NOTE: despite its name, df_ttrain_transformed holds transformed *validation*
# data (the input is df_tvalid).
df_ttrain_transformed = fill_proba_transfomer.transform(df_tvalid)

In [None]:
# Shape after the uniform_proba transform, to compare with the input shape above.
dflib.shape(df_ttrain_transformed)

In [None]:
# Show stats.freq of the "proba" column after the uniform_proba transform
# (presumably a value-frequency table — confirm in src.utils.stats).
stats.freq(df_ttrain_transformed, "proba").show()

### Global Frequency

In [None]:
# Fit the "global_frequency" strategy on the training split and echo the
# fitted transformer as the cell output.
fill_proba_transfomer = (
    FillProbaEstimator(
        strategy="global_frequency",
        labels=["home", "draw", "away"],
        proba_vector_col="proba",
    )
    .fit(df_ttrain)
)
fill_proba_transfomer

In [None]:
# Shape of the validation sample before the global_frequency transform.
dflib.shape(df_tvalid)

In [None]:
# Apply the global_frequency transformer to the validation sample.
# NOTE: despite its name, df_ttrain_transformed holds transformed *validation*
# data (the input is df_tvalid).
df_ttrain_transformed = fill_proba_transfomer.transform(df_tvalid)

In [None]:
# Shape after the global_frequency transform, to compare with the input shape above.
dflib.shape(df_ttrain_transformed)

In [None]:
# Show stats.freq of the "proba" column after the global_frequency transform
# (presumably a value-frequency table — confirm in src.utils.stats).
stats.freq(df_ttrain_transformed, "proba").show()

### League Frequency

In [None]:
# Fit the "league_frequency" strategy on the training split, passing
# "uniform_proba" as strategy_b (presumably the fallback strategy — confirm in
# FillProbaEstimator), and echo the fitted transformer as the cell output.
fill_proba_transfomer = (
    FillProbaEstimator(
        strategy="league_frequency",
        labels=["home", "draw", "away"],
        proba_vector_col="proba",
        strategy_b="uniform_proba",
    )
    .fit(df_ttrain)
)
fill_proba_transfomer

In [None]:
# Shape of the validation sample before the league_frequency transform.
dflib.shape(df_tvalid)

In [None]:
# Apply the league_frequency transformer to the validation sample.
df_tvalid_transformed = fill_proba_transfomer.transform(df_tvalid)

In [None]:
# Shape of the league_frequency output. This section stores its result in
# df_tvalid_transformed; df_ttrain_transformed would be the stale result of the
# earlier global_frequency section.
dflib.shape(df_tvalid_transformed)

In [None]:
# Show stats.freq of the "proba" column after the league_frequency transform
# (presumably a value-frequency table — confirm in src.utils.stats).
stats.freq(df_tvalid_transformed, "proba").show()

In [None]:
# Summary for the validation sample: row counts before and after the
# league_frequency transform, followed by stats.freq of the "proba" column.
print("tvalid")
print(f"{df_tvalid.count()} df_tvalid")
print(f"{df_tvalid_transformed.count()} df_tvalid_transformed")
stats.freq(df_tvalid_transformed, "proba").show()