In [94]:
# import modules 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os 
import sys
import seaborn as sns
import scipy as sp
%matplotlib inline

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel 
from sklearn.metrics import classification_report

In [95]:
# Path of the input CSV. The default is the original machine-specific location;
# set the LA_LIGA_DATA_PATH environment variable to run the notebook elsewhere
# without editing this cell.
DATA_PATH = os.environ.get(
    "LA_LIGA_DATA_PATH",
    r"C:/Users/Yehonatan/PycharmProject/DS/la_liga_data.csv",
)
df_original = pd.read_csv(DATA_PATH)
#df_original.info()

In [148]:

def deal_nans(df, percantage_nan):
    """Drop or mean-fill every column of ``df`` that contains NaN values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; a cleaned copy is returned.
    percantage_nan : float
        Threshold expressed as a fraction (e.g. ``0.1`` for 10%). A column
        whose share of NaN values is greater than or equal to this threshold
        is dropped; any other column containing NaN has its NaN values
        replaced by the column mean.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The caller must use the return value
        (``df = deal_nans(df, 0.1)``) because the input is left untouched.
    """
    cleaned = df.copy()
    nan_col_names = list(df.columns[df.isnull().any()])
    for col in nan_col_names:
        nan_share = cleaned[col].isnull().sum() / len(cleaned[col])
        if nan_share >= percantage_nan:
            # too many missing values to impute reliably: remove the column
            cleaned = cleaned.drop(columns=[col])
        else:
            cleaned[col] = cleaned[col].fillna(cleaned[col].mean())
    return cleaned


# Function that takes the target and the features, computes the Pearson p-value of each feature against the target, and decides per feature whether the null hypothesis H_0 is rejected or accepted at the given alpha

def pvalue_filter(target, features, alpha):
    """Flag features whose Pearson correlation with the target is not significant.

    Parameters
    ----------
    target : pandas object exposing ``to_numpy()``
        Target values, one per row of ``features``.
    features : pandas.DataFrame
        Candidate feature columns.
    alpha : float
        Significance level; a feature is flagged when its p-value is
        strictly greater than ``alpha``.

    Returns
    -------
    drop_col : list
        Names of the flagged (candidate-to-drop) columns, in column order.
    corr_list : list
        Pearson correlation of EVERY feature with the target, rounded to
        three decimals, in column order.
    p_val_list : list
        p-values of the flagged columns only (aligned with ``drop_col``,
        not with ``corr_list``).
    """
    column_names = list(features)
    feature_matrix = features.to_numpy()
    target_values = target.to_numpy()

    drop_col = []
    corr_list = []
    p_val_list = []
    for position, name in enumerate(column_names):
        corr, p_val = sp.stats.pearsonr(feature_matrix[:, position], target_values)
        corr_list.append(round(corr, 3))
        if p_val > alpha:
            # H_0 is not rejected: no statistically significant correlation
            drop_col.append(name)
            p_val_list.append(p_val)

    return drop_col, corr_list, p_val_list

# create a function that returns the selected and rejected columns by correlation between features 
def features_corr_filter(features, corr_cutoff):
    """Split feature columns into kept and rejected by pairwise correlation.

    A column is rejected when its correlation with any column that appears
    earlier in ``features`` is greater than or equal to ``corr_cutoff``.
    Only the signed value is compared, so strongly negative correlations do
    not cause a rejection.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns to inspect.
    corr_cutoff : float
        Correlation at or above which the later column of a pair is rejected.

    Returns
    -------
    selected_columns : pandas.Index
        Columns that were kept.
    rejected_columns : pandas.Index
        Columns that were rejected.
    """
    corr = features.corr()
    # Strict upper triangle = every pair (i, j) with i < j, the same pairs the
    # nested loop visited. NaN correlations compare as False and never reject.
    too_correlated = np.triu(corr.to_numpy() >= corr_cutoff, k=1)
    keep_mask = ~too_correlated.any(axis=0)

    # Index the columns of the frame that was passed in, not a notebook-level
    # global, so the function works for any DataFrame.
    rejected_columns = corr.columns[~keep_mask]
    selected_columns = corr.columns[keep_mask]
    return selected_columns, rejected_columns


In [149]:
###### EDA and data processing ###############

# turn data into numerical float64 
# prepare the data for feature selection 

df = df_original.copy()
# deal with object type attributes 
col_list = df.select_dtypes(include=['object']).columns.to_list()
df = df.drop(columns=['start_time', 'round', 'dayofweek', 'opponent'])
# venue becomes 1 for 'Home' and 0 for every other value
df['venue'] = (df['venue'] == 'Home').astype(int)

# for now we turn this classifier to be a WIN classifier so OvA strategy
# result becomes 1 for 'W' and 0 for every other value
df['result'] = (df['result'] == 'W').astype(int)

# turn df into float64 
df = df.astype('float64')

# deal with NaN values 
# special columns to deal with
df['gk_save_pct'] = df['gk_save_pct'].fillna(100) # no shots on target means no saves in a way same effect as 100% saves
df['own_goals'] = df['own_goals'].fillna(0) # safe to assume that if there is a NAN there were no owngoals as it is a rare occasion 
df = df.drop(columns=['tackles_interceptions', 'Unnamed: 0']) # all Nan_s in this column

# deal_nans returns the cleaned frame; the result has to be assigned back,
# otherwise the dropped columns stay in df
df = deal_nans(df, 0.1)


#count_nan = df['tackles_interceptions'].isnull().sum()
#print('Number of NaN values present: ' + str(count_nan))

# divide to x,y sets 
y = df['result']
x = df.drop(['result'], axis=1)

# divide into test set and train set   
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9)

train_set = pd.concat([x_train, y_train], axis=1)





In [150]:
######### APPLY FEATURE SELECTION ##########

# drop the statistically insignificant features 
# Features whose p-value against the target exceeds 0.05 are removed from the
# training frame; 'passes.1' is then given a readable name.
drop_col, corr_list, p_val_list = pvalue_filter(target=y_train, features=x_train, alpha=0.05)
#print(drop_col)
x_train = (
    x_train
    .drop(columns=drop_col)
    .rename(columns={'passes.1': 'attempted_passes'})
)




In [151]:
# drop features that are by-products of other circumstances, like the xg-related features
# keep features that are controllable - meaning the team can work on them and influence them

hand_pick_drop = ['goals_for','goals_against','gk_psxg_net', 'assists', 'passes_left_foot', 'passes_right_foot',
                  'passes_head','goals','goals_per_shot', 'goals_per_shot_on_target',
                  'xg', 'npxg', 'npxg_per_shot', 'xg_net', 'npxg_net']

# errors='ignore': the preceding p-value filter is data dependent and may have
# removed some of these columns already, and re-running this cell must not
# fail on columns that are already gone.
x_train = x_train.drop(columns=hand_pick_drop, errors='ignore')

In [152]:
### apply corr filter on x_train #### might want to check performance without it
#might reduce errors and process time and run time 

# Keep only the columns that survive the 0.9 pairwise-correlation cutoff.
selected_columns, rejected_columns = features_corr_filter(features=x_train, corr_cutoff=0.9)
x_train = x_train.loc[:, selected_columns]
#x_train.info()

In [153]:
#### after all feature selection filtering
### now x_train has 84 features (half of the beginning) #### pvalue drop has been made and high corr between features 

In [154]:
##### now we scale the data with StandardScaler ##### prepare it for learning, convert to np array

# Fit the scaler on the training features and keep the scaled values both as
# a NumPy array and as a DataFrame labelled with the selected column names.
scaler = StandardScaler()
x_train_np = scaler.fit_transform(x_train.to_numpy())

x_train_df = pd.DataFrame(data=x_train_np, columns=selected_columns.to_list()) # df after scaling
y_train_np = y_train.to_numpy()

In [155]:
#### apply classifier ####

In [156]:
# Shallow random forest (100 trees, depth 2) with a fixed seed.
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0).fit(x_train_np, y_train_np)
# NOTE: the predictions are made on the same matrix the model was fitted on.
y_predict = clf.predict(x_train_np)

In [157]:
#### check the accuracy, precision and recall ###### all feature slection is applied
## so far it is the best result because I want high precision, 
# if the model classifies a match as a win I want that to be wrong as rarely as possible. I don't care if it misclassifies a match as a loss 

# Binary precision / recall / F1 and accuracy of y_predict against y_train_np,
# each printed rounded to three decimals.
precision, recall, f_1, support = precision_recall_fscore_support(
    y_train_np, y_predict, average='binary'
)
accuracy = accuracy_score(y_train_np, y_predict)
for metric_label, metric_value in (
    ('accuracy :', accuracy),
    ('precision :', precision),
    ('recall :', recall),
    ('f_1 :', f_1),
):
    print(metric_label, np.round(metric_value, 3))


accuracy : 0.824
precision : 0.937
recall : 0.624
f_1 : 0.749


In [158]:
# Per-class precision, recall, F1 and support of y_predict against y_train_np.
print(classification_report(y_train_np, y_predict))

              precision    recall  f1-score   support

         0.0       0.78      0.97      0.87      1147
         1.0       0.94      0.62      0.75       829

    accuracy                           0.82      1976
   macro avg       0.86      0.80      0.81      1976
weighted avg       0.85      0.82      0.82      1976



In [159]:
# Show every hyperparameter of clf, including the ones left at their defaults.
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 2,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [232]:
#x_train_df_scale.hist(bins=50, figsize=(20,15))
#plt.show()

In [149]:
#x_train['passes_switches'].hist(bins=50, figsize=(20,15))
#plt.show()

In [157]:
#x_train.nutmegs.value_counts() 
#x_train['gk_clean_sheets']

In [None]:
#### check model predictions without features elimination 