In [44]:
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score

In [2]:
from FixturesOdds import FixturesOdds
import matplotlib.pyplot as plt

In [3]:
import FootballClf

In [4]:
import warnings
warnings.simplefilter("ignore")

In [5]:
def filter_by_date_range(fix, start_date, end_date):
    fix.df = fix.df[fix.df.FixtureDateAsDate >= start_date]
    fix.df = fix.df[fix.df.FixtureDateAsDate < end_date]
    return fix

In [6]:
n_estimators = [int(x) for x in np.linspace(start=10, stop=100, num=5)]
max_features = ['auto', 'sqrt']
min_samples_leaf = [2,5,10,20]
criterion = ['gini']
min_samples_split=[200,400, 500,600,800]
param_grid = [{'n_estimators' : n_estimators, 'max_features' : max_features, 'criterion' : criterion, 
                      'min_samples_split' : min_samples_split, 'min_samples_leaf' : min_samples_leaf}]

In [58]:
start_date='2000-01-01'
end_date = '2018-08-01'

In [32]:
def get_fixs():
    ftest=FixturesOdds().fix_load('vwCSV_3','vwCSV_3.csv', False)
    ftest.do_calcs()
    ftest.clean()
    return ftest

In [18]:
def best_estimator(fo):
    clf =  RandomForestClassifier(random_state=0)
    X,y=fo.X(),fo.y()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    gs=GridSearchCV(clf, param_grid, cv=5, scoring='precision_weighted')
    gs.fit(X_train, y_train)
    return gs.best_estimator_

In [79]:
fo=FixturesOdds().fix_load('vwCSV_3','vwCSV_3.csv', False)
fo.do_calcs()
fo.clean()
fo = filter_by_date_range(fo, '2000-01-01', '2018-08-01')

In [80]:
clf = best_estimator(fo)

In [82]:
X_test,y_test=fo.X(),fo.y()
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           D       0.57      0.01      0.01      2277
           L       0.46      0.54      0.49      2631
           W       0.52      0.77      0.62      3736

   micro avg       0.50      0.50      0.50      8644
   macro avg       0.51      0.44      0.38      8644
weighted avg       0.51      0.50      0.42      8644



In [78]:
print ('train classsififers dropping fixtures where the predicted result is less than limit')
print ('limit  precsn fixtures')
for l_lim in np.linspace(0, 1.0, 5) :
    for  u_lim in np.linspace(0.2, 1.0, 3):
        ftrain = get_fixs()
        ftrain = filter_by_date_range(ftrain, start_date, end_date)
        ftrain.df = ftrain.df[(ftrain.df.ExpectedResult > l_lim) & (ftrain.df.ExpectedResult > (l_lim+u_lim))]
        clf = best_estimator(ftrain)

        ftest = get_fixs()
        ftest = filter_by_date_range(ftest, end_date, '2018-12-01')
        ftest.df = ftest.df[(ftest.df.ExpectedResult > l_lim ) & (ftest.df.ExpectedResult> (l_lim+u_lim)) ]
        X_test,y_test=ftest.X(),ftest.y()

        if X_test.size > 0:
            y_pred = clf.predict(X_test)
            print("%4.2f : %4.2f : %4.2f : %5d" %(l_lim, l_lim + u_lim, precision_score(y_test, y_pred, average='weighted'), len(ftest.df)) )
        else:
            print ("%4.2f is empty" % l_lim)

train classsififers dropping fixtures where the predicted result is less than limit
limit  precsn fixtures
0.00 : 0.20 : 0.46 :   575
0.00 : 0.60 : 0.35 :   469
0.00 : 1.00 : 0.67 :   262
0.25 : 0.45 : 0.35 :   527
0.25 : 0.85 : 0.35 :   350
0.25 : 1.25 : 0.37 :   171
0.50 : 0.70 : 0.35 :   427
0.50 : 1.10 : 0.38 :   229
0.50 : 1.50 : 0.36 :   111
0.75 : 0.95 : 0.51 :   306
0.75 : 1.35 : 0.39 :   147
0.75 : 1.75 : 0.38 :    70
1.00 : 1.20 : 0.40 :   197
1.00 : 1.60 : 0.39 :    92
1.00 : 2.00 : 0.46 :    54


In [77]:
for l_lim in np.linspace(0, 1.0, 5) :
  for  u_lim in np.linspace(0.2, 1.0, 3):
    print ("lower %4.2f upper %4.2f" % (l_lim, l_lim +u_lim))

lower 0.00 upper 0.20
lower 0.00 upper 0.60
lower 0.00 upper 1.00
lower 0.25 upper 0.45
lower 0.25 upper 0.85
lower 0.25 upper 1.25
lower 0.50 upper 0.70
lower 0.50 upper 1.10
lower 0.50 upper 1.50
lower 0.75 upper 0.95
lower 0.75 upper 1.35
lower 0.75 upper 1.75
lower 1.00 upper 1.20
lower 1.00 upper 1.60
lower 1.00 upper 2.00


In [62]:
print ('train classsififers dropping fixtures where the predicted result is less than limit')
print ('limit  precsn fixtures')
for lim in np.linspace(0, 1.0, 5) :
    ftrain = get_fixs()
    ftrain = filter_by_date_range(ftrain, start_date, end_date)
    ftrain.df = ftrain.df[ftrain.df.ExpectedResult > lim]
    
    ftest = get_fixs()
    ftest = filter_by_date_range(ftest, end_date, '2018-12-01')
    ftest.df = ftest.df[ftest.df.ExpectedResult > lim]
    X_test,y_test=ftest.X(),ftest.y()
    
    if X_test.size > 0:
        print("%4.2f  : %5d" %(lim, len(ftest.df)) )
    else:
        print ("%4.2f is empty" % lim)

train classsififers dropping fixtures where the predicted result is less than limit
limit  precsn fixtures
0.00  :   577
0.25  :   573
0.50  :   509
0.75  :   403
1.00  :   262


In [7]:
fo=FixturesOdds()
fo.fix_load('vwCSV_3','vwCSV_3.csv', False)
fo.do_calcs()
fo.clean()

<FixturesOdds.FixturesOdds at 0x111ed2d68>

In [8]:
fo = filter_by_date_range(fo, '2000-01-01', '2018-08-01')

In [9]:
clf =  RandomForestClassifier(random_state=0)
X,y=fo.X(),fo.y()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [10]:
gs=GridSearchCV(clf, param_grid, cv=5, scoring='precision_weighted')
gs.fit(X_train, y_train)
y_pred = gs.best_estimator_.fit(X_train, y_train).predict(X_test)

In [11]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           D       0.43      0.00      0.01       699
           L       0.44      0.51      0.47       812
           W       0.49      0.75      0.59      1083

   micro avg       0.47      0.47      0.47      2594
   macro avg       0.45      0.42      0.36      2594
weighted avg       0.46      0.47      0.40      2594



In [12]:
gs.best_estimator_.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 10,
 'min_samples_split': 200,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 32,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [15]:
fix_pred=FixturesOdds()
fix_pred.fix_load('vwCSV_3','vwCSV_3.csv', False) # no reresh
fix_pred.do_calcs()
fix_pred.clean_predict()

<FixturesOdds.FixturesOdds at 0x1123eefd0>

In [16]:
#see how we would have done in the 2018 season
fix_pred = filter_by_date_range(fix_pred, '2018-08-02', '2018-11-15')
fix_pred.filter_by_col('leagueid', int(1))

<FixturesOdds.FixturesOdds at 0x1123eefd0>

In [17]:
X_pred=fix_pred.X()
y_pred=fix_pred.y()

In [18]:
y_test = gs.best_estimator_.predict(X_pred)

In [19]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           D       0.00      0.00      0.00         0
           L       0.63      0.59      0.61        41
           W       0.83      0.58      0.68        69

   micro avg       0.58      0.58      0.58       110
   macro avg       0.49      0.39      0.43       110
weighted avg       0.76      0.58      0.66       110



In [21]:
from sklearn.metrics import confusion_matrix

In [23]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 0,  0,  0],
       [ 9, 24,  8],
       [15, 14, 40]])

In [24]:
cm/cm.astype(np.float).sum(axis=1)

array([[       nan, 0.        , 0.        ],
       [       inf, 0.58536585, 0.11594203],
       [       inf, 0.34146341, 0.57971014]])

In [25]:
from sklearn.metrics import precision_recall_fscore_support
_, recall, _, _ = precision_recall_fscore_support(y_test, y_pred)

In [26]:
recall

array([0.        , 0.58536585, 0.57971014])

In [28]:
c=FootballClf.FootballClf()
c.save(gs.best_estimator_, 'betting clf_2.2', 1, 'fixed AVG goal records (again)', 'odds_clf_dev2', ['ExpectedResult', 'FTG_3', 'FTG_5', 'HomeOdds', 'DrawOdds', 'AwayOdds'])


In [133]:
import pandas as pd

In [138]:
predictday='2018-11-24'
leagueid=1
refresh=False

In [134]:
pd.set_option('display.max_colwidth', 18)
pd.set_option('colheader_justify', 'left')
pd.set_option('display.expand_frame_repr', False)

In [181]:
fix_pred=FixturesOdds()
fix_pred.fix_load('vwCSV_3','vwCSV_3.csv', refresh) # no reresh
fix_pred.do_calcs()

fix_pred.add_live_odds('skybet', int(leagueid),predictday)
fix_pred.clean_predict()

fix_pred.filter_by_col('FixtureDateAsDate', predictday).filter_by_col('leagueid', int(leagueid))
X=fix_pred.X()

In [176]:
clf=gs.best_estimator_

In [182]:
c=FootballClf.FootballClf()
clf=c.load_by_name('betting clf_2.1')

In [183]:
fix_pred.df['prediction']=clf.predict(X)
fix_pred.df.sort_values('HomeTeam', inplace=True)

print(fix_pred.df[['FixtureDateAsDate','HomeTeam', 'AwayTeam', 'prediction', 'HomeOdds', 'DrawOdds', 'AwayOdds']])

    FixtureDateAsDate HomeTeam           AwayTeam         prediction  HomeOdds  DrawOdds  AwayOdds
116 2018-11-24         Brighton & Hov...   Leicester City  L           2.90     3.20      2.50    
467 2018-11-24                   Everton     Cardiff City  W           1.44     4.33      7.00    
597 2018-11-24                    Fulham      Southampton  L           2.50     3.40      2.70    
825 2018-11-24         Manchester United   Crystal Palace  W           1.40     4.50      8.00    
710 2018-11-24         Tottenham Hotspur          Chelsea  L           2.70     3.40      2.50    
239 2018-11-24                   Watford        Liverpool  L           6.00     4.20      1.53    
353 2018-11-24           West Ham United  Manchester City  L          11.00     6.00      1.25    


In [178]:
#existing classifier
c=FootballClf.FootballClf()
clf=c.load_by_name('betting clf_1.1')

In [179]:
fix_pred.df['prediction']=clf.predict(X)
fix_pred.df.sort_values('HomeTeam', inplace=True)

print(fix_pred.df[['FixtureDateAsDate','HomeTeam', 'AwayTeam', 'prediction', 'HomeOdds', 'DrawOdds', 'AwayOdds']])

    FixtureDateAsDate HomeTeam           AwayTeam         prediction  HomeOdds  DrawOdds  AwayOdds
116 2018-11-24         Brighton & Hov...   Leicester City  W           2.90     3.20      2.50    
598 2018-11-24                   Everton     Cardiff City  L           1.44     4.33      7.00    
483 2018-11-24                    Fulham      Southampton  L           2.50     3.40      2.70    
825 2018-11-24         Manchester United   Crystal Palace  W           1.40     4.50      8.00    
710 2018-11-24         Tottenham Hotspur          Chelsea  W           2.70     3.40      2.50    
239 2018-11-24                   Watford        Liverpool  L           6.00     4.20      1.53    
353 2018-11-24           West Ham United  Manchester City  W          11.00     6.00      1.25    
