# BuildData
Notebook to build and save processed data

In [1]:
import os
from working_dir import set_wd
# set_wd() comes from the project-local `working_dir` module.
# NOTE(review): presumably it changes the process working directory to the
# project root so the `src.*` imports below resolve — confirm in working_dir.
set_wd()
# Echo the current working directory as the cell output for a quick sanity check.
os.getcwd()

'/Users/tales.pimentel/ds/kaggle/football-match-prediction'

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse an already running) Spark session for this notebook.
spark = (
    SparkSession.builder
    .config("spark.sql.debug.maxToStringFields", 500)
    .config("spark.driver.memory", "10g")
    .appName("BuldData")
    .getOrCreate()
)

# Restrict Spark's own logging to ERROR so cell outputs stay readable.
spark.sparkContext.setLogLevel("ERROR")

In [3]:
import pyspark.sql.functions as f
from datetime import datetime
from src.dao import dao_raw, dao_interim, dao_processed, columns
from src.utils import dflib, stats, pretties, plot

from src.ml.transformers import TeamMoodDiffTransformer, TeamHistoryResultTransformer, SelectColumnsTransformer
from src.ml.estimators import HomeFactorEstimator
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline, PipelineModel
import uuid

In [4]:
# Project helper from src.utils.pretties.
# NOTE(review): presumably raises the DataFrame display column limit so the
# wide previews shown further down are not truncated — confirm in pretties.
pretties.max_data_frame_columns()

# Build Data

### Pipeline

In [6]:
# Feature columns kept in the processed data sets. They are passed to
# SelectColumnsTransformer and appended after `basic_cols` when selecting the
# final columns, so this order is the column order of the saved data.
use_features = [
    "home_mood_diff",
    "away_mood_diff",
    "home_history_mood_mean",
    "away_history_mood_mean",
    "home_result_history_mean",
    "away_result_history_mean",
    "home_factor",
    "draw_factor",
    "is_cup",
]

# Identifying / label columns carried alongside the features.
basic_cols = [
    "id",
    "target",
    "home_team_name",
    "away_team_name",
    "match_date",
    "league_name",
    "league_id",
]

In [7]:
# Assemble the feature pipeline one stage at a time, in execution order.
mood_diff_stage = TeamMoodDiffTransformer()
history_result_stage = TeamHistoryResultTransformer()

# The home-factor stage is an estimator: it is fitted here, once, on the
# train_train split, and the fitted result is what goes into the pipeline.
home_factor_estimator = HomeFactorEstimator(n_matches_min=6)
home_factor_stage = home_factor_estimator.fit(
    dao_interim.load_train_train_data(spark=spark)
)

# Final stage receives the list of feature columns declared in `use_features`.
select_columns_stage = SelectColumnsTransformer(subset_colnames=use_features)

pipeline_stages = [
    mood_diff_stage,
    history_result_stage,
    home_factor_stage,
    select_columns_stage,
]

# All stages are already fitted/stateless, so a PipelineModel is built directly
# instead of fitting a Pipeline.
pipeline_model = PipelineModel(stages=pipeline_stages)

### Building

In [8]:
# Fresh random identifier for this build; it is stored in the metadata and
# passed to every save call so all artefacts of one run share the same id.
build_uuid = uuid.uuid4()
id_data = str(build_uuid)

# Render the id as a markdown heading in the cell output.
pretties.md(f'id: {id_data}', size="####")

#### <font color=black>id: b2beffb2-fb6d-4cb4-8869-82b859c2dd3b</font>

In [9]:
def _stage_params(stage):
    """Return ``stage.get_params()``, or None if that raises AttributeError."""
    try:
        return stage.get_params()
    except AttributeError:
        return None


# Class name of every pipeline stage, in pipeline order.
transformer_stages_order = [stage.__class__.__name__ for stage in pipeline_stages]

# One entry per stage class name holding that stage's parameters (or None).
metadata_json = {
    stage.__class__.__name__: _stage_params(stage) for stage in pipeline_stages
}

# Run-level information describing this build.
metadata_json.update(
    {
        "datetime": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "id_data": id_data,
        "use_features": use_features,
        "transformer_stages_order": transformer_stages_order,
    }
)

# Persist the metadata under the build id.
dao_processed.save_processed_metadata(metadata_json, id_data)

In [10]:
# Loader for each data set, keyed by the name it is saved under. Every loader
# is called with the Spark session as its only argument. The two training
# splits come from the interim layer; the test set is parsed from raw data.
dataset_loaders = {
    "train_train": dao_interim.load_train_train_data,
    "train_valid": dao_interim.load_train_valid_data,
    "test": dao_raw.load_parse_test_data,
}

for which_dataset, load_dataset in dataset_loaders.items():
    print(f'which_dataset: {which_dataset}')
    print(f'id_data: {id_data}')
    print()

    print("loading data")
    df = load_dataset(spark)

    print("processing")
    df_processed = pipeline_model.transform(df)
    print()

    # The test set has no "target" column. Build a per-dataset column list
    # instead of calling basic_cols.remove("target"): mutating the shared list
    # made the cell fail with ValueError when run a second time and silently
    # changed `basic_cols` for any later cell.
    if which_dataset == "test":
        select_cols = [col for col in basic_cols if col != "target"]
    else:
        select_cols = list(basic_cols)

    df_processed = df_processed.select(select_cols + use_features)
    # Small preview only; never collect the full frame to the driver here.
    display(df_processed.limit(5).toPandas())

    print(f"df: {dflib.shape(df_processed)}")
    print()

    print("saving")
    dao_processed.save_processed_data(df_processed, which_dataset, id_data)
    pretties.hr()

which_dataset: train_train
id_data: b2beffb2-fb6d-4cb4-8869-82b859c2dd3b

loading data
processing
TeamMoodDiffTransformer



Unnamed: 0,id,target,home_team_name,away_team_name,match_date,league_name,league_id,home_mood_diff,away_mood_diff,home_history_mood_mean,away_history_mood_mean,home_result_history_mean,away_result_history_mean,home_factor,draw_factor,is_cup
0,12009847,home,Matsumoto Yamaga,Fagiano Okayama,2020-11-21,J2-League,1022,0.249791,-0.249791,-0.634455,-0.884245,0.2,0.2,0.3983,0.2662,False
1,12010200,home,Gifu,Fujieda MYFC,2020-11-21,J3-League,1025,0.370443,-0.370443,0.52695,0.156508,-0.1,-0.2,0.3962,0.2358,False
2,12017335,away,Rayong,Nakhon Ratchasima,2020-02-21,Thai Premier League,1064,2.134997,-2.134997,0.904687,-1.23031,0.1,-0.4,0.4867,0.1467,False
3,11915273,away,Drancy JA,Bobigny,2020-01-18,CFA Group A,1177,-0.307759,0.307759,0.506896,0.814655,-0.3,0.4,0.3222,0.4333,False
4,11928219,home,Laredo,Textil Escudo,2020-01-18,Tercera - Group 3,1260,4.529987,-4.529987,3.246695,-1.283292,0.3,0.2,0.4308,0.2077,False


df: (87470, 16)

saving


which_dataset: train_valid
id_data: b2beffb2-fb6d-4cb4-8869-82b859c2dd3b

loading data
processing
TeamMoodDiffTransformer



Unnamed: 0,id,target,home_team_name,away_team_name,match_date,league_name,league_id,home_mood_diff,away_mood_diff,home_history_mood_mean,away_history_mood_mean,home_result_history_mean,away_result_history_mean,home_factor,draw_factor,is_cup
0,17876707,away,Sabah,MISC,2021-03-13,Super League,1052,-1.645766,1.645766,-3.971774,-2.326008,-0.5,0.0,0.4462,0.2462,False
1,17213311,draw,Juventus U23,Pontedera,2021-04-11,Serie C: Girone A,1203,0.111271,-0.111271,0.362912,0.251641,-0.2,0.0,0.3815,0.3237,False
2,17212936,home,Cavese,Teramo,2021-04-11,Serie C: Girone C,1205,-2.652757,2.652757,-2.097581,0.555175,-1.0,-0.2,0.3746,0.307,False
3,17974563,away,Real Murcia II,Racing Murcia,2021-04-11,Tercera - Group 13,1270,0.370224,-0.370224,3.422269,3.052045,0.6,0.2,0.3689,0.3067,False
4,17972591,away,Taiwan CPC,Taichung Futuro,2021-04-11,Taiwan Football Premier League,1340,,,,4.495957,0.0,0.2,0.4605,0.1184,False


df: (23468, 16)

saving


which_dataset: test
id_data: b2beffb2-fb6d-4cb4-8869-82b859c2dd3b

loading data
processing
TeamMoodDiffTransformer



Unnamed: 0,id,home_team_name,away_team_name,match_date,league_name,league_id,home_mood_diff,away_mood_diff,home_history_mood_mean,away_history_mood_mean,home_result_history_mean,away_result_history_mean,home_factor,draw_factor,is_cup
0,18177930,team home,team away,2021-08-07,Club Friendlies,1101,0.325935,-0.325935,0.270253,-0.055682,-0.2,0.3,0.4599,0.2247,False
1,18149919,team home,team away,2021-08-07,Club Friendlies,1101,2.507638,-2.507638,2.929633,0.421995,-0.1,0.1,0.4599,0.2247,False
2,18053847,team home,team away,2021-05-16,CAF Confederations Cup,1108,1.11013,-1.11013,1.766411,0.656281,0.0,0.1,0.4352,0.2778,True
3,18053851,team home,team away,2021-05-16,CAF Confederations Cup,1108,2.426355,-2.426355,2.21138,-0.214976,0.1,0.1,0.4352,0.2778,True
4,17947242,team home,team away,2021-05-16,Tercera - Group 14,1271,-3.757339,3.757339,-2.673534,1.083805,-0.2,-0.2,0.4475,0.1713,False


df: (72711, 15)

saving
