In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from time import time
from sklearn.metrics import f1_score
from os import path, makedirs, walk
from joblib import dump, load
import json

In [2]:
def train_classifier(clf, X_train, y_train):
    clf.fit(X_train, y_train)


def predict_labels(clf, features, target):
    y_pred = clf.predict(features)

    acc = sum(target == y_pred) / float(len(y_pred))

    return f1_score(target, y_pred, average='micro'), acc

In [3]:
def model(clf, X_train, y_train, X_test, y_test):
    train_classifier(clf, X_train, y_train)
    f1, acc = predict_labels(clf, X_train, y_train)
    print("Training Info:")
    print("-" * 20)
    print("F1 Score:{}".format(f1))
    print("Accuracy:{}".format(acc))

    f1, acc = predict_labels(clf, X_test, y_test)
    print("Test Metrics:")
    print("-" * 20)
    print("F1 Score:{}".format(f1))
    print("Accuracy:{}".format(acc))


In [4]:
raw_data_15 = pd.read_csv(r"Seasons\season14-15.csv")
# raw_data_15['Season']=15
raw_data_16 = pd.read_csv(r"Seasons\season15-16.csv")
# raw_data_16['Season']=16
raw_data_17 = pd.read_csv(r"Seasons\season16-17.csv")
# raw_data_17['Season']=17
raw_data_18 = pd.read_csv(r"Seasons\season17-18.csv")
# raw_data_18['Season']=18
raw_data_19 = pd.read_csv(r"Seasons\season18-19.csv")
# raw_data_19['Season']=19
# raw_data_20 = pd.read_csv(r"Seasons\season19-20.csv")
# raw_data_20['Season']=20
EA_features=pd.read_csv("EA_rating15-20.csv") # need it put this within the dataframe
frames=[raw_data_15,raw_data_16,raw_data_17,raw_data_18,raw_data_19]
data=pd.concat(frames).reset_index()

data[data['HomeTeam'].isnull()].index.tolist()
data.drop([380],inplace=True)
data.drop(['index'],axis=1,inplace=True)

input_filter = ['Date_new','home_encoded', 'away_encoded','team_encoded', 'FTHG','FTAG','HTHG', 'HTAG', 'HS',
                'AS', 'HST', 'AST', 'HC','AC','HTR']
output_filter = ['FTR']#create a new Date for Data column

cols_to_consider = input_filter + output_filter


data=data.replace({'H':0,'D':1,'A':2})

new_date = [] 
for i in data["Date"]:
    i = str(i).replace("/","")
    new_date.append(i)

    
data["Date_new"] = pd.DataFrame(np.array(new_date))
data['Date_new'].fillna(12052019,inplace=True)
data["Date_new"]=data['Date_new'].astype(str).astype(int)

data.drop(['Div'],axis=1,inplace=True)

encoder = LabelEncoder()
home_encoded = encoder.fit_transform(data['HomeTeam'])
home_encoded_mapping = dict(
    zip(encoder.classes_, encoder.transform(encoder.classes_).tolist()))
data['home_encoded'] = home_encoded

away_encoded = encoder.fit_transform(data['AwayTeam'])
away_encoded_mapping = dict(
    zip(encoder.classes_, encoder.transform(encoder.classes_).tolist()))
data['away_encoded'] = away_encoded


team_encoded = encoder.fit_transform(data['Team'])
team_encoded_mapping = dict(
    zip(encoder.classes_, encoder.transform(encoder.classes_).tolist()))
data['team_encoded'] = team_encoded

data=data[cols_to_consider]
data

Unnamed: 0,Date_new,home_encoded,away_encoded,team_encoded,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST,HC,AC,HTR,FTR
0,16082014,0,7,0,2,1,1,1,14,4,6,2,9,3,1,0
1,16082014,12,8,12,2,2,1,2,11,13,3,3,3,6,2,1
2,16082014,15,23,15,1,2,0,1,14,5,5,4,4,0,2,2
3,16082014,19,11,19,0,1,0,0,19,11,6,4,8,9,1,2
4,16082014,21,1,21,0,1,0,0,12,7,2,2,2,8,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,12052019,13,28,13,2,0,1,0,13,7,5,2,4,1,0,0
3796,12052019,15,5,15,0,2,0,1,26,13,10,4,11,2,2,2
3797,12052019,20,10,20,1,1,1,0,10,10,3,3,4,3,0,1
3798,12052019,24,8,24,2,2,1,0,11,17,3,9,7,4,0,1


In [5]:
def average_season(variavel):
    stats=[]
    for i, j in zip(data["Date_new"],data['team_encoded']):
        oi = data.loc[data["Date_new"]<i]
        oi = oi.loc[oi["team_encoded"]==j]
        oi = oi[variavel].mean()
        stats.append(oi)

    data["avg_"+variavel] = pd.DataFrame(np.array(stats))
        
def average_last_3(variavel):
    stats = []
    for i, j in zip(data["Date_new"],data["team_encoded"]):
        oi = data.loc[data["Date_new"]<i]
        oi = oi.loc[oi["team_encoded"]==j]
        oi= oi.reset_index(drop=True)
        oi = oi[-3:]
        oi = oi[variavel].mean()
        stats.append(oi)

    data["last_3_avg_"+variavel] = pd.DataFrame(np.array(stats))    
    
    
def average_last_5(variavel):
    stats = []
    for i, j in zip(data["Date_new"],data["team_encoded"]):
        oi = data.loc[data["Date_new"]<i]
        oi = oi.loc[oi["team_encoded"]==j]
        oi= oi.reset_index(drop=True)
        oi = oi[-5:]
        oi = oi[variavel].mean()
        stats.append(oi)

    data["last_5_avg_"+variavel] = pd.DataFrame(np.array(stats))    


In [6]:
variaveis_average = data.drop(columns=['Date_new','team_encoded'
                                       ,'home_encoded', 'away_encoded', 'FTR'])
for i in variaveis_average.columns:
    average_season(i)
    average_last_3(i)
    average_last_5(i)

# sequence_5()
# sequence_3()
# sequence_1()

In [7]:
data

Unnamed: 0,Date_new,home_encoded,away_encoded,team_encoded,FTHG,FTAG,HTHG,HTAG,HS,AS,...,last_5_avrg_AST,avrg_HC,last_3_avrg_HC,last_5_avrg_HC,avrg_AC,last_3_avrg_AC,last_5_avrg_AC,avrg_HTR,last_3_avrg_HTR,last_5_avrg_HTR
0,16082014,0,7,0,2,1,1,1,14,4,...,2.8,6.390000,9.000000,8.4,4.520000,2.333333,2.2,0.850000,0.000000,0.2
1,16082014,12,8,12,2,2,1,2,11,13,...,4.2,6.515789,4.000000,5.8,5.189474,4.333333,3.6,0.852632,1.000000,1.4
2,16082014,15,23,15,1,2,0,1,14,5,...,3.6,5.389474,6.666667,7.8,4.547368,6.333333,5.8,0.894737,1.000000,1.0
3,16082014,19,11,19,0,1,0,0,19,11,...,4.2,5.722222,7.666667,6.2,5.277778,4.333333,4.0,0.777778,0.000000,0.4
4,16082014,21,1,21,0,1,0,0,12,7,...,4.2,6.142857,4.666667,5.2,4.961039,4.666667,5.0,0.818182,1.000000,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,12052019,13,28,13,2,0,1,0,13,7,...,4.4,5.521739,8.333333,7.8,4.507246,5.333333,5.2,0.913043,1.000000,1.2
3796,12052019,15,5,15,0,2,0,1,26,13,...,4.6,6.352113,5.000000,5.2,4.816901,6.333333,5.8,0.873239,1.000000,1.2
3797,12052019,20,10,20,1,1,1,0,10,10,...,3.4,6.216216,4.333333,5.6,4.648649,5.000000,4.6,0.945946,1.000000,1.2
3798,12052019,24,8,24,2,2,1,0,11,17,...,5.4,5.948276,5.000000,5.6,4.465517,3.666667,5.8,0.775862,0.666667,1.2


In [8]:
from sklearn.preprocessing import MinMaxScaler
X=data.drop(['Date_new','FTHG', 'FTAG', 'HTHG','HTAG', 'HS', 
             'AS', 'HST', 'AST', 'HC', 'AC', 'HTR', 'FTR'],axis=1)
X.fillna(0,inplace=True)
scaler = MinMaxScaler() 
# X =scaler.fit_transform(X)
y=data['FTR']
# X.to_csv('X-20.csv',index=False)

In [12]:
X
X.to_csv('X-20.csv',index=False)

In [9]:
svc_classifier = SVC(random_state=100, kernel='rbf')
lr_classifier = LogisticRegression(multi_class='ovr', max_iter=500)
nbClassifier = GaussianNB()
dtClassifier = DecisionTreeClassifier()
rfClassifier = RandomForestClassifier()

X_train, X_test, Y_train, Y_test = train_test_split(X, y)

eplPointsTable = pd.read_excel(r"Tables\starttable.xlsx")[['Team', 'Points']]
MatchesLeft = pd.read_excel(r"Tables\MatchesLeft.xlsx")
# x = pd.read_csv(r"Tables\X.csv")


svc_classifier.fit(X_train,Y_train)
lr_classifier.fit(X_train,Y_train)
nbClassifier.fit(X_train,Y_train)
dtClassifier.fit(X_train,Y_train)
rfClassifier.fit(X_train,Y_train)

# home_points=[]
# away_points=[]

# for index, row in MatchesLeft.iterrows():
#     home, away = row['HomeTeam'], row['AwayTeam']
#     assert(home in eplPointsTable.Team.values and away in eplPointsTable.Team.values)
#     Y_test = rfClassifier.predict(x)
#     for i in range(0,379):
#         if Y_test[i]==1:
#             home_points.append(3)
#             away_points.append(0)
#         elif Y_test[i]==2:
#             home_points.append(1)
#             away_points.append(1)
#         else:
#             away_points.append(3)
#             home_points.append(0)
        
#     eplPointsTable.loc[eplPointsTable.Team == home_encoded, 'Points'] += home_points
#     eplPointsTable.loc[eplPointsTable.Team == away_encoded, 'Points'] += away_points
    
# eplPointsTable = eplPointsTable.sort_values('Points', ascending=False)
# eplPointsTable.index = np.arange(1, len(eplPointsTable)+1) 
# eplPointsTable.round(2)

FileNotFoundError: [Errno 2] No such file or directory: 'Tables\\starttable - Copy.xlsx'

In [None]:
#issues
# how to include EA_rating
# how to improve features-take consideration home and away
# how to optimise classifers