In [2]:
from joblib import dump, load
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd

df = pd.read_csv("../data/processed/training.csv")
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)
col_to_remove = ["score1", "score2"]
X_train = train_set.drop(columns=col_to_remove, axis=1)
X_test = test_set.drop(columns=col_to_remove, axis=1)
y_train1, y_train2 = train_set["score1"], train_set["score2"]
y_test1, y_test2 = test_set["score1"], test_set["score2"]

print "Fitting and validating linear regression model"
linear_model = load("../models/linear_regression_score1.joblib")
scores1 = cross_val_score(linear_model, X_train, y_train1, cv=3, scoring='r2')
print "cross val scores for score1:{}".format(scores1)

Fitting and validating linear regression model
cross val scores for score1:[-1.68636132e+32 -3.25890564e+30 -6.47123521e+33]


In [6]:
print "Fitting and validating random forest"
rf_model = load("../models/random_forest_score1.joblib")
scores1 = cross_val_score(rf_model, X_train, y_train1, cv=3, scoring='r2')
print "cross val scores for score1:{}".format(scores1)


Fitting and validating random forest
cross val scores for score1:[0.26469125 0.27185484 0.24673161]


In [13]:
print "Loading the trained neural network model"
from keras.models import load_model, model_from_json
score = 'score1'
with open('../models/NN_{}.json'.format(score)) as f:
    model = model_from_json(f.read())
    model.load_weights('../models/NN_{}.h5'.format(score))
f.close()
print "model loaded: {}".format(model.summary())

Loading the trained neural network model
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1606)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               160700    
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 100)               10100     
_________________________________________________________________
score1_output (Dense)        (None, 1)                 101       
Total params: 191,101
Trainable params: 191,101
Non-trainable params: 0
____________

In [4]:
print "Generating and shaping real football game data for predicting."
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
processed = pd.read_csv("../data/processed/processed.csv")
original_df = processed
predict_case_indexes = [37, 68, 584, 432, 247]
rows_display = [original_df[original_df.index.isin([idx + 25831])] for idx in predict_case_indexes]
def prepare_data_for_predict_set():
    
    processed.drop(columns=['date', 'score1', 'score2'], axis=1, inplace=True)
    columns_to_dummy = ['league', 'team1', 'team2']
    df = pd.get_dummies(processed, columns=columns_to_dummy)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="constant", fill_value=0)),
        ('std_scaler', StandardScaler()),
    ])
    df_columns = df.columns.to_list()

    df[df_columns] = num_pipeline.fit_transform(df[df_columns])

    
    
    rows = [df[df.index.isin([idx])] for idx in predict_case_indexes]
    games = [[row.iloc[0]['year'], row.iloc[0]['month'], row.iloc[0]['day'], row.iloc[0]['team1'], row.iloc[0]['team2']] for row in rows_display]
    print "cases to predict:\n"
    for g in games:
        print g
    rows_ndarray = [row.to_numpy() for row in rows]
    return rows_ndarray

rows = prepare_data_for_predict_set()

Generating and shaping real football game data for predicting.
cases to predict:

[2019, 10, 18, 'Eintracht Frankfurt', 'Bayer Leverkusen']
[2019, 10, 19, 'Everton', 'West Ham United']
[2019, 10, 27, 'Liverpool', 'Tottenham Hotspur']
[2019, 10, 26, 'Bayern Munich', '1. FC Union Berlin']
[2019, 10, 20, 'Parma', 'Genoa']


In [5]:
print "Loading the trained neural network model, and using it to predict real live football match scores."
from keras.models import load_model, model_from_json
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

def load_model(score_name):
    with open('../models/NN_{}.json'.format(score_name)) as f:
        model = model_from_json(f.read())
        model.load_weights('../models/NN_{}.h5'.format(score_name))
    f.close()
    return model

model1 = load_model('score1')
model2 = load_model('score2')

def use_model_to_predict(score, row_value_array):
    model = model1 if score == 'score1' else model2
    prediction = model.predict(row_value_array)
    print "prediction for {} :{}".format(score, prediction)


for idx, row in enumerate(rows):
    row_display = rows_display[idx]
    print "predicting on {} vs {}".format(row_display.iloc[0]['team1'], row_display.iloc[0]['team2'])
    use_model_to_predict('score1', row)
    use_model_to_predict('score2', row)

Loading the trained neural network model, and using it to predict real live football match scores.
predicting on Eintracht Frankfurt vs Bayer Leverkusen
prediction for score1 :[[0.77133083]]
prediction for score2 :[[1.7512617]]
predicting on Everton vs West Ham United
prediction for score1 :[[1.0281343]]
prediction for score2 :[[0.97937214]]
predicting on Liverpool vs Tottenham Hotspur
prediction for score1 :[[3.5695443]]
prediction for score2 :[[0.9038845]]
predicting on Bayern Munich vs 1. FC Union Berlin
prediction for score1 :[[2.9759655]]
prediction for score2 :[[0.9898988]]
predicting on Parma vs Genoa
prediction for score1 :[[3.1643186]]
prediction for score2 :[[0.9877361]]
