## Import libraries

In [1]:
# Import the required libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import datetime
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

## Configuration

In [2]:
# Odds columns paired with the 'H', 'D' and 'A' results, and the label column
odd_H, odd_D, odd_A = 'INFO_BbAvH', 'INFO_BbAvD', 'INFO_BbAvA'
target = 'INFO_FTR'

# Timestamp of this run, down to the minute
start_date = '{:%Y-%m-%d-%H-%M}'.format(datetime.datetime.now())

# Scope of the model: which league, which season, how many years of history
league = 'F1'
season = 2017
historical_training_year = 9

# Common prefix of every file saved at the end of the notebook
filename = './models/{0}/{0}_{1}_'.format(league, season)

In [3]:
# Hyper-parameters forwarded to the MLP classifier when it is built
best_params = dict(
    hidden_layer_sizes=(170,),
    activation='logistic',
    solver='sgd',
    alpha=1.3,
    max_iter=150,
)

In [4]:
# Feature columns fed to the MLP, grouped by column-name prefix
best_features_MLP = [
    # A_MEANS_FIVE_*
    'A_MEANS_FIVE_AC', 'A_MEANS_FIVE_AS', 'A_MEANS_FIVE_AST',
    'A_MEANS_FIVE_FTAG', 'A_MEANS_FIVE_FTHG', 'A_MEANS_FIVE_FTR_H',
    'A_MEANS_FIVE_HC', 'A_MEANS_FIVE_HS', 'A_MEANS_FIVE_HST',
    'A_MEANS_FIVE_HTR_A',
    # H_MEANS_FIVE_*
    'H_MEANS_FIVE_AC', 'H_MEANS_FIVE_AS', 'H_MEANS_FIVE_AST',
    'H_MEANS_FIVE_AY', 'H_MEANS_FIVE_FTAG', 'H_MEANS_FIVE_FTHG',
    'H_MEANS_FIVE_FTR_A', 'H_MEANS_FIVE_FTR_H', 'H_MEANS_FIVE_HC',
    'H_MEANS_FIVE_HS', 'H_MEANS_FIVE_HST', 'H_MEANS_FIVE_HTR_H',
    # *_MEANS_THREE_*
    'A_MEANS_THREE_AC', 'A_MEANS_THREE_AS', 'A_MEANS_THREE_FTHG',
    'A_MEANS_THREE_HS', 'H_MEANS_THREE_AS',
    # *_STD_FIVE_*
    'A_STD_FIVE_HF', 'H_STD_FIVE_HC', 'H_STD_FIVE_HST',
]

## Import Data

In [5]:
# DB Sqlite connection
import os
import sqlite3

# Location of the SQLite database. The default is the original hard-coded path;
# set the SOCCER_PREDICT_DB environment variable to point the notebook at a
# database stored elsewhere (another machine, another checkout) without
# editing this cell.
db = os.environ.get(
    "SOCCER_PREDICT_DB",
    "/Users/thibaultclement/Project/ligue1-predict/src/notebook/data/db/soccer_predict.sqlite",
)
# Connection used by the SQL query that loads the pre-match data
conn = sqlite3.connect(db)
cur = conn.cursor()

In [6]:
# Load every row of pre_matchs ordered by INFO_Date ascending, then discard
# the 'index' column that comes back with the table
prematch_query = "SELECT * FROM pre_matchs ORDER BY INFO_Date ASC;"
df_all = pd.read_sql_query(prematch_query, conn).drop('index', axis=1)
df_all.shape

(37907, 190)

In [7]:
# Keep only games played from November to May, i.e. remove every game played
# from June to October (both months included).
# NOTE(review): the previous comment stated that October was excluded from the
# removal, but the condition below (month > 10) removes October as well.
# The code is left as is; confirm which behaviour is intended.
df_all['INFO_Date'] = pd.to_datetime(df_all['INFO_Date'])
match_month = df_all['INFO_Date'].dt.month
# .copy() makes the filtered frame independent from the unfiltered one, so that
# columns added to df_all in later cells do not trigger SettingWithCopyWarning.
df_all = df_all[(match_month < 6) | (match_month > 10)].copy()
df_all.shape

(26988, 190)

In [8]:
# Create the INFO_WIN and INFO_WIN_P columns: for each game they hold the odds
# of the result that actually happened (INFO_FTR), i.e. the payout of a
# correct bet. INFO_WIN reads the odd_H / odd_A / odd_D columns, INFO_WIN_P
# reads the INFO_PS* columns. Games whose INFO_FTR is none of 'H', 'A', 'D'
# keep 0.
# A list of pairs (not a dict) keeps the column creation order fixed.
win_sources = [
    ('INFO_WIN', [('H', odd_H), ('A', odd_A), ('D', odd_D)]),
    ('INFO_WIN_P', [('H', 'INFO_PSH'), ('A', 'INFO_PSA'), ('D', 'INFO_PSD')]),
]
for win_col, odds_by_result in win_sources:
    # Start from a float column: the odds written below are not integers, and
    # recent pandas versions no longer silently upcast an int column when
    # floats are assigned into it.
    df_all[win_col] = 0.0
    for result, odds_col in odds_by_result:
        df_all.loc[df_all['INFO_FTR'] == result, win_col] = df_all[odds_col]

## Methods

In [9]:
def get_dataset(league, season, historical_training_year, features):
    """Build the training set for one league over a window of past seasons.

    Rows are read from the module-level ``df_all`` frame and the label column
    is the module-level ``target``.

    Parameters
    ----------
    league
        Value matched against the ``INFO_Div`` column.
    season : int
        Year whose 1 August closes the training window (exclusive).
    historical_training_year : int
        Number of years before ``season``; 1 August of that earlier year opens
        the training window (exclusive).
    features : list
        Names of the columns used as model inputs.

    Returns
    -------
    tuple
        ``(df, X, y, imp, sc_X)``: the filtered frame, the feature matrix after
        dummy encoding, mean imputation and standardization, the label column,
        the fitted imputer and the fitted scaler.
    """
    # Window bounds as datetime.datetime (midnight) rather than datetime.date:
    # recent pandas versions refuse to compare a datetime64 column with plain
    # date objects, while datetime objects compare the same way everywhere.
    date_start_learn = datetime.datetime(season - historical_training_year, 8, 1)
    date_end_learn = datetime.datetime(season, 8, 1)

    # Filter by league and keep only the games strictly inside the window
    in_league = df_all['INFO_Div'] == league
    in_window = (df_all['INFO_Date'] > date_start_learn) & (df_all['INFO_Date'] < date_end_learn)
    df = df_all[in_league & in_window]

    # Filter by feature used to train
    X = pd.get_dummies(df[features])
    y = df[target]

    # Impute missing values (NaN) with the column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp = imp.fit(X)
    X = imp.transform(X)

    # Standardize features
    sc_X = StandardScaler().fit(X)
    X = sc_X.transform(X)
    return df, X, y, imp, sc_X

## Modeling

In [10]:
# Build the training set together with its fitted imputer and scaler
df, X, y, mean_imputer, standard_scaler = get_dataset(
    league, season, historical_training_year, best_features_MLP
)

# Pick the tuned hyper-parameters by name and create the classifier
mlp_param_names = ('hidden_layer_sizes', 'activation', 'solver', 'alpha', 'max_iter')
mlp_kwargs = {name: best_params[name] for name in mlp_param_names}
clf = MLPClassifier(random_state=0, **mlp_kwargs)

# Fit on the whole training set (the fitted estimator is the cell output)
clf.fit(X, y)

MLPClassifier(activation='logistic', alpha=1.3, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(170,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=150, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=0, shuffle=True,
       solver='sgd', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

## Save result

In [11]:
# Persist the fitted StandardScaler next to the model
scaler_filename = '{}{}'.format(filename, "scaler.pkl")
joblib.dump(standard_scaler, scaler_filename)

['./models/F1/F1_2017_scaler.pkl']

In [12]:
# Persist the fitted mean imputer next to the model
imputer_filename = '{}{}'.format(filename, "imputer.pkl")
joblib.dump(mean_imputer, imputer_filename)

['./models/F1/F1_2017_imputer.pkl']

In [13]:
# Persist the trained MLP classifier
model_filename = '{}{}'.format(filename, "model_MLP.pkl")
joblib.dump(clf, model_filename)

['./models/F1/F1_2017_model_MLP.pkl']