# Predicting Match Outcomes from English Premier League Data 2016-2017


In [None]:
''' Basic imports'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
''' Machine learning imports'''
# Feature Importance
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

In [None]:
# Load the train/test splits that were cleaned in R, then separate the
# target column ('FTR') from the feature columns.

# Columns that must not be fed to the models as features.
_non_feature_cols = ['FTR', 'Upset', 'PredictedOutcome', 'UpsetNumeric']

clean_train = pd.read_csv('/Users/Kellen/Desktop/CSUMB SCHTUFF/CST463/EPL_Predictions/CleanPLTrain')

# Column labels of the training frame, captured before any column is dropped.
col_names = clean_train.columns.tolist()

train_y = clean_train['FTR']
clean_train = clean_train.drop(columns=_non_feature_cols)

clean_test = pd.read_csv('/Users/Kellen/Desktop/CSUMB SCHTUFF/CST463/EPL_Predictions/CleanPLTest')

test_y = clean_test['FTR']
clean_test = clean_test.drop(columns=_non_feature_cols)

# Summary of the remaining training features (dtypes, non-null counts).
clean_train.info()

## Basic Pre-Processing/Scaling

In [None]:
# Standardise the features. The scaler's statistics are learned from the
# training set only.
scaler = StandardScaler()
scaled_train_x = scaler.fit_transform(X=clean_train,y=None)

# Apply the *training* statistics to the test set. Re-fitting the scaler on
# the test data would scale the two sets differently and leak test-set
# information into preprocessing.
scaled_test_x = scaler.transform(clean_test)

## Feature Importance

In [None]:

# Build a forest of extremely randomized trees and compute the feature
# importances from it.
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

forest.fit(scaled_train_x, train_y)
importances = forest.feature_importances_
# Per-feature standard deviation of the importances across the individual
# trees; only needed as error bars for the (optional) plot below.
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
# Feature indices ordered from most to least important. NumPy arrays have no
# ``.index()`` method, so the ordering is obtained with argsort.
indices = np.argsort(importances)[::-1]
# Number of feature columns in the training matrix.
n_features = scaled_train_x.shape[1]

# Print the feature ranking
print("Feature ranking:")

for f in range(n_features):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
# plt.figure()
# plt.title("Feature importances")
# plt.bar(range(n_features), importances[indices],
#        color="r", yerr=std[indices], align="center")
# plt.xticks(range(n_features), indices)
# plt.xlim([-1, n_features])
# plt.show()

## Make some quick predictions

In [None]:
# Baseline to beat: the fraction of training rows whose result is 'H'
# (presumably a home win -- confirm against the data dictionary), i.e. the
# accuracy of always predicting 'H'.
h_share = (train_y == 'H').sum() / len(train_y)
print('Blind Prediction Rate: ' + str(round(h_share, 2)))
      

In [None]:
# Random forest baseline: 100 trees limited to depth 2, seeded so repeated
# runs give the same model.
rf = RandomForestClassifier(
    random_state=0,
    max_depth=2,
    n_estimators=100,
)
rf.fit(X=scaled_train_x, y=train_y)
# Mean accuracy on the test set; left as the last expression so the
# notebook displays it.
rf.score(X=scaled_test_x, y=test_y)

In [None]:
# Nearest-neighbour baseline: each test row takes the label of its single
# closest training row.
neigh = KNeighborsClassifier(
    n_neighbors=1,
)
neigh.fit(X=scaled_train_x, y=train_y)
# Mean accuracy on the test set; left as the last expression so the
# notebook displays it.
neigh.score(X=scaled_test_x, y=test_y)

In [None]:
# Tune a gradient-boosted classifier with an exhaustive, 5-fold
# cross-validated grid search over tree count, tree depth and learning rate.
# The estimator is seeded (like the other tree ensembles in this notebook)
# so the search is reproducible.
gbrt = GradientBoostingClassifier(random_state=0)
n_estimators = np.arange(1,20)
depth = np.arange(1,10)
learning_rates = [0.01,0.1,1]

param_grid = {'n_estimators': n_estimators , 'max_depth' : depth, 'learning_rate':learning_rates}

grid1 = GridSearchCV(gbrt, param_grid = param_grid, cv = 5)
# The target is already one-dimensional, so it is passed as-is (the other
# models do the same); the deprecated ``Series.ravel()`` call is not needed.
grid1.fit(scaled_train_x, train_y)

# Best mean cross-validated score (as a percentage) and the parameter
# combination that achieved it.
print("Best Accuracy: ",grid1.best_score_*100)
print("Best params: ", grid1.best_params_)

# Score of the refitted best estimator on the test set; left as the last
# expression so the notebook displays it.
grid1.score(scaled_test_x,test_y)

In [None]:
# Gaussian naive Bayes baseline with default settings.
nb = GaussianNB()
nb.fit(X=scaled_train_x, y=train_y)
# Mean accuracy on the test set; left as the last expression so the
# notebook displays it.
nb.score(X=scaled_test_x, y=test_y)