# BuildingTransformers
Notebook that wraps the analysis steps into custom `Transformer` classes for use in Spark ML Pipelines.

In [None]:
import os
from working_dir import set_wd
# NOTE(review): presumably points the working directory at the project root
# so the `src.*` imports in later cells resolve — confirm in working_dir.set_wd.
set_wd()
# Last expression of the cell: shows the current working directory.
os.getcwd()

In [None]:
from pyspark.sql import SparkSession

# Get (or reuse) the Spark session named after this notebook.
spark = (
    SparkSession.builder
    .appName("BuildingTransformers")
    .getOrCreate()
)

In [None]:
import pyspark.sql.functions as f
from pyspark.sql.functions import when
from pyspark.ml import Pipeline, Transformer
import matplotlib.pyplot as plt

from src.dao import dao_raw, dao_interim, columns
from src.utils import dflib, stats, pretties, plot, plot_domain, palette

In [None]:
# NOTE(review): presumably widens the DataFrame display so wide frames show
# every column — confirm in src.utils.pretties.
pretties.max_data_frame_columns()

In [None]:
# Colours for the three match outcomes, looked up in palette.PALETTE_TARGET.
COLOR_HOME, COLOR_DRAW, COLOR_AWAY = (
    palette.PALETTE_TARGET[outcome] for outcome in ("home", "draw", "away")
)

# Colours for the three moods, looked up in palette.PALETTE_MOOD.
COLOR_OPTIM, COLOR_NEUTR, COLOR_PESSI = (
    palette.PALETTE_MOOD[mood]
    for mood in ("optimistic", "neutral", "pessimistic")
)

# Loading Data

In [None]:
# Load the training frame through dao_interim and immediately drop the
# columns listed in columns.team_history_coach_colnames.
drop_cols = columns.team_history_coach_colnames
ttrain = dao_interim.load_train_train_data(spark).drop(*drop_cols)

print(f"ttrain: {dflib.shape(ttrain)}")

In [None]:
# Columns selected together with the newly created features when the final
# frame is assembled at the end of the notebook.
basic_cols = [
    "id",
    "target",
    "home_team_name",
    "away_team_name",
    "match_date",
    "league_name",
]

# Transformers

* https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.Transformer.html#pyspark.ml.Transformer.transform
* https://stackoverflow.com/questions/49734374/pyspark-ml-pipelines-are-custom-transformers-necessary-for-basic-preprocessing
* https://www.oreilly.com/content/extend-spark-ml-for-your-own-modeltransformer-types/
* https://www.youtube.com/watch?v=iO4ebMzj7t8&ab_channel=ManningPublications

## Team Mood Diff Transformer

In [None]:
from src.ml_pipeline.transformers_lib import team_mood_diff

class TeamMoodDiffTransformer(Transformer):
    """Pipeline stage that delegates to ``team_mood_diff.build``.

    Args:
        neutral_numeric_threshold: Value forwarded unchanged as the second
            argument of ``team_mood_diff.build``.
        colnames: Column selection handed to ``DataFrame.select`` before the
            build step; the default ``"*"`` keeps every column.
    """

    def __init__(self, neutral_numeric_threshold, colnames="*"):
        # Initialise the pyspark base classes first: they create the uid and
        # param maps that repr(), copy() and Pipeline handling rely on.
        super().__init__()
        self.neutral_numeric_threshold = neutral_numeric_threshold
        self.colnames = colnames

    def _transform(self, df):
        """Select ``colnames`` from ``df`` and return ``team_mood_diff.build``'s result."""
        selected = df.select(self.colnames)
        return team_mood_diff.build(selected, self.neutral_numeric_threshold)

## Team History Result Transformer

In [None]:
from src.ml_pipeline.transformers_lib import team_history_result

class TeamHistoryResultTransformer(Transformer):
    """Pipeline stage that delegates to ``team_history_result.build``.

    Args:
        colnames: Column selection handed to ``DataFrame.select`` before the
            build step; the default ``"*"`` keeps every column.
    """

    def __init__(self, colnames="*"):
        # Initialise the pyspark base classes first: they create the uid and
        # param maps that repr(), copy() and Pipeline handling rely on.
        super().__init__()
        self.colnames = colnames

    def _transform(self, df):
        """Select ``colnames`` from ``df`` and return ``team_history_result.build``'s result."""
        selected = df.select(self.colnames)
        return team_history_result.build(selected)

## Home Factor Transformer

In [None]:
from src.ml_pipeline.transformers_lib import home_factor

class HomeFactorTransformer(Transformer):
    """Pipeline stage that delegates to ``home_factor.build``.

    Args:
        colnames: Column selection handed to ``DataFrame.select`` before the
            build step; the default ``"*"`` keeps every column.
    """

    def __init__(self, colnames="*"):
        # Initialise the pyspark base classes first: they create the uid and
        # param maps that repr(), copy() and Pipeline handling rely on.
        super().__init__()
        self.colnames = colnames

    def _transform(self, df):
        """Select ``colnames`` from ``df`` and return ``home_factor.build``'s result."""
        selected = df.select(self.colnames)
        return home_factor.build(selected)

# Combining

In [None]:
# Remember the input columns so the added ones can be identified afterwards.
print("shape dataframe:", dflib.shape(ttrain))
original_columns = ttrain.columns

# Apply the three transformers one after another, reporting the shape each
# time so the effect of every stage is visible.
for stage in (
    TeamMoodDiffTransformer(neutral_numeric_threshold=0.5),
    TeamHistoryResultTransformer(),
    HomeFactorTransformer(),
):
    ttrain = stage.transform(ttrain)
    print("shape transformed:", dflib.shape(ttrain))

final_columns = ttrain.columns

In [None]:
# Display the column names of ttrain after all three transformers were applied.
final_columns

In [None]:
# Columns present after the transformers ran but absent from the input frame.
# Walking final_columns (instead of taking a set difference) keeps the
# DataFrame's own column order, so the list -- and the column order of
# df_transformed -- is identical on every run; a set difference orders the
# names arbitrarily. dict.fromkeys drops repeated names while keeping order,
# matching the de-duplication the set-based version performed.
known_columns = set(original_columns)
new_features = [
    name for name in dict.fromkeys(final_columns) if name not in known_columns
]
print(new_features)

df_transformed = ttrain.select(basic_cols + new_features)
print("shape transformed:", dflib.shape(df_transformed))

In [None]:
# Preview df_transformed as a pandas DataFrame; dflib.sample is given 5,
# presumably the number of rows to draw — TODO confirm in src.utils.dflib.
dflib.sample(df_transformed, 5).toPandas()