# Feature Selection

In [1]:
import os
from working_dir import set_wd
set_wd()
os.getcwd()

'/Users/tales.pimentel/ds/kaggle/football-match-prediction'

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.config("spark.sql.debug.maxToStringFields", 500) \
                            .config("spark.driver.memory", "14g") \
                            .appName("FeatureSelection").getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

In [3]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
import pyspark.sql.functions as f
from src.dao import dao, dao_processed, dao_raw, dao_interim, dao_ml
from src.utils import dflib, stats, pretties, plot, plot_domain, palette

In [4]:
pretties.max_data_frame_columns()

# Loading Data

In [5]:
id_data = dao_processed.most_recent_data_build_id()
id_data

'04a4d619-00cc-4484-a724-e27e2161c91d'

In [6]:
df_ttrain = dao_processed.load_processed_data(which_dataset="train_train", 
                                              id_data=id_data,
                                              spark=spark)

metadata = dao_processed.load_processed_metadata(id_data)
use_features = metadata["use_features"]

In [7]:
len_features_na = dflib.filter_any_null(df_ttrain, subset=use_features).count()
len_target_na = dflib.filter_any_null(df_ttrain, subset=["target"]).count()

print(f"{len_features_na} rows with any feature null")
print(f"{len_target_na} rows with target null")

2117 rows with any feature null
0 rows with target null


In [8]:
df_ttrain_not_na = df_ttrain.dropna(how="any", subset=use_features + ["target"])
df_ttrain_na = dflib.filter_any_null(df_ttrain, subset=use_features + ["target"])

print("df_ttrain_not_na", df_ttrain_not_na.count())
print("df_ttrain_na", df_ttrain_na.count())

assert df_ttrain_not_na.count() + df_ttrain_na.count() == df_ttrain.count()

df_ttrain_not_na 85353
df_ttrain_na 2117


In [9]:
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

selector = SelectKBest(score_func=f_classif, k="all")
df_ttrain_not_na_pd = df_ttrain_not_na.select(use_features + ["target", "id"]).toPandas()

In [10]:
result = f_classif(df_ttrain_not_na_pd[use_features], df_ttrain_not_na_pd["target"])
result_df_pd = pd.DataFrame(result, 
                            columns=use_features, 
                            index=["f_statistic", "p_value"]).transpose().sort_values("f_statistic", ascending=False)

### ANOVA

In [11]:
print(len(use_features), "variables")
display(result_df_pd)

8 variables


Unnamed: 0,f_statistic,p_value
home_mood_diff,3935.458008,0.0
away_mood_diff,3935.458008,0.0
home_history_mood_mean,1760.184204,0.0
away_history_mood_mean,1654.938354,0.0
home_result_history_mean,1308.731567,0.0
away_result_history_mean,1253.516113,0.0
draw_factor,836.310608,0.0
home_factor,541.662842,1.734726e-234


#### Notes

The `f_statistic` stands for the correlation between the continuous and categorical variables. <br>
The greater is it, the greater is the correlation.
<br>
Furthermore, if the p-value is below a certain threshold (e.g. α = .05), we can reject the null hypothesis of the
ANOVA and conclude that there is a statistically significant difference between group means.<br>
<br>
So, all variables are correlated to the target, even though they differ from one to another in intensity.

Source: https://www.statology.org/what-does-a-high-f-value-mean/#:~:text=The%20F%2Dvalue%20in%20an%20ANOVA%20is%20calculated%20as%3A%20variation,lower%20the%20corresponding%20p%2Dvalue.


# Correlations

In [12]:
corr_df_pd = df_ttrain_not_na_pd.corr(method="pearson")
display(corr_df_pd.style.background_gradient(cmap='bwr'))

Unnamed: 0,home_mood_diff,away_mood_diff,home_history_mood_mean,away_history_mood_mean,home_result_history_mean,away_result_history_mean,home_factor,draw_factor
home_mood_diff,1.0,-1.0,0.673354,-0.676285,0.436569,-0.43404,0.010001,-0.007102
away_mood_diff,-1.0,1.0,-0.673354,0.676285,-0.436569,0.43404,-0.010001,0.007102
home_history_mood_mean,0.673354,-0.673354,1.0,0.089234,0.649905,0.061371,-0.000866,-0.002465
away_history_mood_mean,-0.676285,0.676285,0.089234,1.0,0.059405,0.645886,-0.014336,0.007112
home_result_history_mean,0.436569,-0.436569,0.649905,0.059405,1.0,0.036975,0.003151,-0.017398
away_result_history_mean,-0.43404,0.43404,0.061371,0.645886,0.036975,1.0,0.003426,-0.006218
home_factor,0.010001,-0.010001,-0.000866,-0.014336,0.003151,0.003426,1.0,-0.460194
draw_factor,-0.007102,0.007102,-0.002465,0.007112,-0.017398,-0.006218,-0.460194,1.0


# Dropping vars that are mutually correlated
Of course only one of them will be removed

In [13]:
import numpy as np
def high_correlated_features(corr_df, max_treshold=0.9):
    upper_tri = corr_df.where(np.triu(np.ones(corr_df.shape),k=1).astype(bool))

    to_drop = [column for column in upper_tri.columns if any(abs(upper_tri[column]) > max_treshold)]
    return to_drop

MAX_THRESHOLD = 0.9

vars_to_drop = high_correlated_features(corr_df_pd, max_treshold=MAX_THRESHOLD)
vars_to_drop

['away_mood_diff']

In [14]:
clean_features = list(set(use_features) - set(vars_to_drop))
clean_corr_df_pd = corr_df_pd[clean_features].loc[clean_features]
clean_corr_df_pd

Unnamed: 0,home_factor,home_result_history_mean,draw_factor,home_history_mood_mean,away_result_history_mean,away_history_mood_mean,home_mood_diff
home_factor,1.0,0.003151,-0.460194,-0.000866,0.003426,-0.014336,0.010001
home_result_history_mean,0.003151,1.0,-0.017398,0.649905,0.036975,0.059405,0.436569
draw_factor,-0.460194,-0.017398,1.0,-0.002465,-0.006218,0.007112,-0.007102
home_history_mood_mean,-0.000866,0.649905,-0.002465,1.0,0.061371,0.089234,0.673354
away_result_history_mean,0.003426,0.036975,-0.006218,0.061371,1.0,0.645886,-0.43404
away_history_mood_mean,-0.014336,0.059405,0.007112,0.089234,0.645886,1.0,-0.676285
home_mood_diff,0.010001,0.436569,-0.007102,0.673354,-0.43404,-0.676285,1.0


In [15]:
clean_corr_df_pd.where(np.triu(np.ones(clean_corr_df_pd.shape),k=1).astype(bool))

Unnamed: 0,home_factor,home_result_history_mean,draw_factor,home_history_mood_mean,away_result_history_mean,away_history_mood_mean,home_mood_diff
home_factor,,0.003151,-0.460194,-0.000866,0.003426,-0.014336,0.010001
home_result_history_mean,,,-0.017398,0.649905,0.036975,0.059405,0.436569
draw_factor,,,,-0.002465,-0.006218,0.007112,-0.007102
home_history_mood_mean,,,,,0.061371,0.089234,0.673354
away_result_history_mean,,,,,,0.645886,-0.43404
away_history_mood_mean,,,,,,,-0.676285
home_mood_diff,,,,,,,


In [16]:
dao_ml.save_feature_selection(anova_df=result_df_pd, id_data=id_data, mutual_corr_max_treshold=MAX_THRESHOLD, cols_to_remove=vars_to_drop)

ee2235a7-9d36-40d8-96d6-6d68019536b2


'ee2235a7-9d36-40d8-96d6-6d68019536b2'