In [41]:
# import modules 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os 
import sys
import seaborn as sns
import scipy as sp
%matplotlib inline

from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_selection import SelectFromModel 
from sklearn.metrics import classification_report

In [42]:
df_original = pd.read_csv(r"C:/Users/Yehonatan/PycharmProject/DS/la_liga_data.csv")
#df_original.info()

In [43]:

def deal_nans(df, percantage_nan):
    nans = df[df.columns[df.isnull().any()]]
    nan_col_names = list(nans)
    for i in nan_col_names:
        count_nan = df[i].isnull().sum()
        if count_nan/len(df[i]) >= 0.1:       # if nans are more than 10% delete the column 
            df = df.drop(columns=[i])
        else:
            col_avg = df[i].mean()
            df[i] = df[i].fillna(col_avg)
    return df 


# create a function that gets the features and target . calcs the p-values and if it rejects or accepts H_0 null hypothesis 

def pvalue_filter(target, features, alpha): # returns a list of columns that are possible drop, p_val > alpha, corr, pval
    features_columns_names = list(features)
    target_column_name = list(target)
    features_np = features.to_numpy()
    target_np = target.to_numpy()
    drop_index = []
    p_val_list =[]
    corr_list = []
    
    for i in range(len(features_columns_names)):
        corr, p_val = sp.stats.pearsonr(features_np[:,i], target_np)
        corr_list.append(round(corr,3))
        if p_val > alpha:           # accept the null hypothesis, no statisitcal significance  
            drop_index.append(i)
            p_val_list.append(p_val)
            
    drop_col = [features_columns_names[i] for i in drop_index]
    return drop_col, corr_list, p_val_list

# create a function that returns the selected and rejected columns by correlation between features 
def features_corr_filter(features, corr_cutoff): 
    corr = features.corr()
    columns = np.full((corr.shape[0],), True, dtype=bool) # create boolean filter 

    for i in range(corr.shape[0]):
        for j in range(i+1, corr.shape[0]):
            if corr.iloc[i,j] >= corr_cutoff:
                columns[j] = False

    rejected_columns = x_train.columns[np.invert(columns)]
    #print('reject',rejected_columns)
    selected_columns = x_train.columns[columns]
    #print('selected',selected_columns)
    return selected_columns, rejected_columns


In [44]:
###### EDA and data processing ###############

# turn data into numerical float64 
#create Feature selectin 

df = df_original.copy()
# deal with object type attributes 
col_list = df.select_dtypes(include=['object']).columns.to_list()
df = df.drop(columns = ['start_time', 'round', 'dayofweek', 'opponent'])
df['venue']= df['venue'].apply(lambda x : 1 if x== 'Home' else 0) 

# for now we turn this classifier to be a WIN classifier so OvA strategy
df['result']= df['result'].apply(lambda x : 1 if x== 'W' else 0) 

#turn df into float 64 
df=df.astype('float64')

# deal with NAN values 
# special columns to deal with
df['gk_save_pct'] = df['gk_save_pct'].fillna(100) # no shots on target means no saves in a way same effect as 100% saves
df['own_goals'] = df['own_goals'].fillna(0) # safe to assume that if there is a NAN there were no owngoals as it is a rare occasion 
df = df.drop(columns=['tackles_interceptions', 'Unnamed: 0']) # all Nan_s in this column

deal_nans(df, 0.1)


#count_nan = df['tackles_interceptions'].isnull().sum()
#print('Number of NaN values present: ' + str(count_nan))

# divide to x,y sets 
y = df['result']
x = df.drop(['result'], axis=1)

# divide into test set and train set   
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9)

train_set = pd.concat([x_train,y_train],axis=1)





In [45]:
######### APPLY FEATURE SELECTION ##########

# drop the statisticly insignificant features 
drop_col, corr_list, p_val_list = pvalue_filter(y_train, x_train, 0.05)
print(drop_col)
x_train = x_train.drop(drop_col, axis=1)
x_train.rename(columns = {'passes.1':'attempted_passes'}, inplace=True)




['gk_saves', 'gk_pens_saved', 'gk_pens_missed', 'gk_passes_completed_launched', 'gk_passes_launched', 'gk_passes', 'gk_passes_throws', 'gk_pct_passes_launched', 'gk_passes_length_avg', 'gk_goal_kicks', 'gk_crosses_stopped_pct', 'tackles', 'tackles_won', 'tackles_mid_3rd', 'tackles_att_3rd', 'dribble_tackles', 'dribble_tackles_pct', 'dribbled_past', 'pressure_regains', 'pressures_mid_3rd', 'blocked_shots', 'blocked_shots_saves', 'interceptions', 'clearances', 'nutmegs', 'miscontrols', 'dispossessed', 'passes_completed_long', 'passes_into_final_third', 'crosses_into_penalty_area', 'passes_switches', 'corner_kicks', 'corner_kicks_straight', 'passes_low', 'passes_other_body', 'passes_oob', 'passes_intercepted', 'sca_passes_dead', 'sca_fouled', 'shots_free_kicks', 'fouled', 'interceptions.1', 'tackles_won.1']


In [46]:
# drop features that might indicate on loss or win . for instance assists = goals and it is quite obvious that many assists 
# will most likely lead to win. I left features that on a first glance might not indicate a win
# drop features that are by product of different circumstances like xg related features
# leave features that are controllable - meaning can be worked and controlled by the team

hand_pick_drop = ['gk_goals_against', 'goals_for','goals_against','gk_psxg_net', 'dribbles_vs','dribbles_completed',
                  'passes_received_pct', 'assists', 'throw_ins','passes_left_foot', 'passes_right_foot',
                  'sca_shots','passes_head','gca', 'gca_passes_dead', 'gca_dribbles','gca_shots', 'gca_fouled', 
                  'gca_defense','goals','goals_per_shot', 'goals_per_shot_on_target','xg', 'npxg', 'npxg_per_shot', 
                  'pens_made', 'pens_att','xg_net', 'npxg_net','own_goals']

x_train = x_train.drop(hand_pick_drop, axis=1)

In [47]:
### apply corr filter on x_train #### might want to check performance without it
#might reduce errors and process time and run time 

selected_columns, rejected_columns = features_corr_filter(x_train, 0.9)
print(selected_columns)
x_train = x_train[selected_columns]
x_test = x_test[selected_columns]
x_train.info()

Index(['venue', 'gk_shots_on_target_against', 'gk_save_pct', 'gk_clean_sheets',
       'gk_psxg', 'gk_pens_att', 'gk_pens_allowed', 'gk_passes_pct_launched',
       'gk_pct_goal_kicks_launched', 'gk_crosses', 'gk_crosses_stopped',
       'gk_def_actions_outside_pen_area', 'gk_avg_distance_def_actions',
       'tackles_def_3rd', 'pressures', 'pressure_regain_pct',
       'pressures_def_3rd', 'pressures_att_3rd', 'blocks', 'errors',
       'possession', 'touches', 'touches_def_pen_area', 'touches_def_3rd',
       'touches_att_3rd', 'touches_att_pen_area', 'dribbles',
       'dribbles_completed_pct', 'players_dribbled_past',
       'carries_into_final_third', 'carries_into_penalty_area',
       'progressive_passes_received', 'passes_pct',
       'passes_progressive_distance', 'passes_pct_short', 'passes_long',
       'passes_pct_long', 'xa', 'assisted_shots', 'passes_into_penalty_area',
       'passes_dead', 'passes_free_kicks', 'through_balls', 'passes_pressure',
       'crosses', 'corne

In [48]:
#### after all feature selection filtering
### now x_train has 77 features (half of the beginning) #### pvalue drop has been made and high corr between features 


In [49]:
##### now we scale the data with standardscaler ##### prepre it for learning, convert to np array

scaler = StandardScaler()
x_train_np = x_train.to_numpy()   
x_train_np = scaler.fit_transform(x_train_np)

x_test_np = x_test.to_numpy()
x_test_np = scaler.transform(x_test_np)


x_train_df = pd.DataFrame(x_train_np, columns= list(selected_columns)) # df after scaling
y_train_np = y_train.to_numpy()

y_test_np =  y_test.to_numpy()

In [363]:
#### apply classifiers ####

# tried three classifiers ligostic reg gave slightly better rsults . yet it seems to not overfit as the otheres and when I chose 
#newton method the results were great 

In [254]:
#clf = RandomForestClassifier(n_estimators= 100, max_depth=2, random_state=9)
#clf.fit(x_train_np, y_train_np)
#y_train_pred = clf.predict(x_train_np)

In [25]:
clf = SGDClassifier(max_iter=2000, tol=1e-3, random_state=9)
clf.fit(x_train_np, y_train_np)
y_train_pred = clf.predict(x_train_np)

In [50]:
clf = LogisticRegression(random_state=9, max_iter=500, solver = 'newton-cg')
clf.fit(x_train_np, y_train_np)
y_train_pred = clf.predict(x_train_np)

In [337]:
#clf = SVC(C=0.001, gamma='auto', random_state=9)
#clf.fit(x_train_np, y_train_np)
#y_predict = clf.predict(x_train_np)

In [51]:
#### check the accuracy, precision and recall ###### all feature slection is applied
## so far it is the best result because I want high precision , 
# if the model classified it as a win I want it to missclassify it as much as it can. i dont care if it missclassfies it as a lose 

precision, recall, f_1, support = precision_recall_fscore_support(y_train_np, y_train_pred, average='binary')
accuracy = accuracy_score(y_train_np, y_train_pred,)
print('accuracy :', np.round(accuracy,3))
print('precision :', np.round(precision,3))
print('recall :', np.round(recall,3))
print('f_1 :', np.round(f_1,3))


accuracy : 0.886
precision : 0.871
recall : 0.854
f_1 : 0.862


In [52]:
cv_3_accuracy = cross_val_score(clf, x_train , y_train, cv=3, scoring='accuracy')
#cv_3_precision = cross_val_score(clf, x_train , y_train, cv=3, scoring='precision')
#cv_3_recall = cross_val_score(clf, x_train , y_train, cv=3, scoring='recall')

print('cv_avg_accuracy', np.round(cv_3_accuracy.mean(),3))
#print('cv_avg_precision', cv_3_precision.mean())
#print('cv_avg_recall', cv_3_recall.mean())

cv_avg_accuracy 0.863


In [288]:
#print(classification_report(y_train_np, y_predict))

In [53]:
clf.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 500,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 9,
 'solver': 'newton-cg',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [54]:
### predict on TEST SET #####
y_test_pred = clf.predict(x_test_np)

In [56]:
### PREDICTION ASSESMENT ###
precision, recall, f_1, support = precision_recall_fscore_support(y_test_np, y_test_pred, average='binary')
accuracy = accuracy_score(y_test_np, y_test_pred)
print('accuracy :', np.round(accuracy,3))
print('precision :', np.round(precision,3))
print('recall :', np.round(recall,3))
print('f_1 :', np.round(f_1,3))

accuracy : 0.872
precision : 0.862
recall : 0.833
f_1 : 0.847


In [232]:
#x_train_df_scale.hist(bins=50, figsize=(20,15))
#plt.show()

In [149]:
#x_train['passes_switches'].hist(bins=50, figsize=(20,15))
#plt.show()

In [157]:
#x_train.nutmegs.value_counts() 
#x_train['gk_clean_sheets']

In [None]:
#### check model predictions without features elimination 