# BuildData
Notebook that builds the processed datasets and saves them together with their build metadata.

In [None]:
import os
from working_dir import set_wd
# Project helper imported from working_dir; presumably switches the process to
# the project's working directory — confirm in working_dir.py.
set_wd()
# Bare expression on the last line of the cell, so the notebook shows the
# current working directory as the cell output.
os.getcwd()

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every cell in this notebook.
#   - spark.sql.debug.maxToStringFields=500 raises the number of fields Spark
#     prints before truncating schema/plan strings.
#   - spark.driver.memory=14g sizes the driver JVM.
# The application name now matches the notebook title (it was misspelled
# "BuldData").
spark = (
    SparkSession.builder
    .appName("BuildData")
    .config("spark.sql.debug.maxToStringFields", 500)
    .config("spark.driver.memory", "14g")
    .getOrCreate()
)

# Keep the notebook output readable: only ERROR-level Spark logs are emitted.
spark.sparkContext.setLogLevel("ERROR")

In [None]:
import pyspark.sql.functions as f
from pyspark.sql.functions import when
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
from src.dao import dao_raw, dao_interim, dao_processed, columns
from src.utils import dflib, stats, pretties, plot

In [None]:
# Project display helper; judging by its name it raises the number of
# DataFrame columns shown in notebook output — confirm in src.utils.pretties.
pretties.max_data_frame_columns()

# Transformers

In [None]:
from src.ml.transformers import TeamMoodDiffTransformer, \
                                TeamHistoryResultTransformer, \
                                HomeFactorTransformer, \
                                SelectColumnsTransformer

# Build Data

In [None]:
from pyspark.ml import Pipeline, PipelineModel
import uuid

In [None]:
# Feature columns kept in the processed output. The list is handed to
# SelectColumnsTransformer and also recorded in the build metadata.
use_features = [
    'home_mood_diff',
    'away_mood_diff',
    'home_history_mood_mean',
    'away_history_mood_mean',
    'home_result_history_mean',
    'away_result_history_mean',
    'home_factor',
    'draw_factor',
]

In [None]:
# Transformation stages, applied in list order. The final stage keeps only the
# columns named in use_features.
pipeline_stages = [
    TeamMoodDiffTransformer(),
    TeamHistoryResultTransformer(),
    HomeFactorTransformer(spark=spark),
    SelectColumnsTransformer(subset_colnames=use_features),
]

# The PipelineModel is built directly from the stage list; no Pipeline.fit
# call is made in this notebook.
pipeline_model = PipelineModel(stages=pipeline_stages)

In [None]:
# Random UUID4 string identifying this build run; it is written into the
# metadata and passed to every save call in the cells below.
# NOTE(review): `id` shadows the builtin of the same name. The name is kept
# because later cells read this global.
id = str(uuid.uuid4())
# Show the identifier in the notebook output via the project helper ("####" is
# presumably a markdown heading level — confirm in src.utils.pretties).
pretties.md(f'id: {id}', size="####")

In [None]:
# Describe this build run: one entry per pipeline stage (class name mapped to
# the parameters the stage reports), followed by the local timestamp, the run
# id and the selected feature list. Stage entries come first so that the fixed
# keys below win on a name clash, as in a sequential assignment.
metadata_json = {
    **{
        stage.__class__.__name__: stage.get_params()
        for stage in pipeline_stages
    },
    "datetime": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "id": id,
    "use_features": use_features,
}

# Persist the metadata under the same id that the datasets are saved with.
dao_processed.save_processed_metadata(metadata_json, id)

In [None]:
# Basic per-match column names.
# NOTE(review): not referenced by the other cells visible in this notebook.
basic_cols = [
    'id',
    'target',
    'home_team_name',
    'away_team_name',
    'match_date',
    'league_name',
]

In [None]:
# Loader for each dataset, listed in processing order. The two training splits
# come from the interim DAO; the test set is loaded and parsed from the raw DAO.
loaders = {
    "train_train": dao_interim.load_train_train_data,
    "train_valid": dao_interim.load_train_valid_data,
    "test": dao_raw.load_parse_test_data,
}

# The coach-history columns are dropped from every dataset before the pipeline
# runs.
drop_cols = columns.team_history_coach_colnames

for which_dataset in loaders:
    print(f'which_dataset: {which_dataset}')
    print(f'id: {id}')
    print()
    print("loading data")

    # Load the dataset and strip the unwanted columns in one step.
    df = loaders[which_dataset](spark).drop(*drop_cols)

    print(f"df: {dflib.shape(df)}")
    print()

    print("processing")
    df_processed = pipeline_model.transform(df)
    print()

    # Each dataset is stored under its own name and the shared run id.
    print("saving")
    dao_processed.save_processed_data(df_processed, which_dataset, id)
    pretties.hr()