## Import libraries

In [1]:
# Importing the library
# Standard library
import datetime
import os
import sqlite3

# Third-party
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
from IPython.display import display # Manage multiple output per cell
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

## Configuration

In [2]:
# Odds columns, one per full-time result (Home win / Away win / Draw)
odd_H, odd_A, odd_D = 'INFO_BbAvH', 'INFO_BbAvA', 'INFO_BbAvD'
# Column holding the label to predict
target = 'INFO_FTR'
# Timestamp of this run: year-month-day-hour-minute
start_date = '{:%Y-%m-%d-%H-%M}'.format(datetime.datetime.now())

# Seasons used as test season by the search loop
# (single-season alternative from the original: [2016])
season_list = [2014, 2015, 2016]
# Leagues processed by the search loop
# (wider alternative from the original:
#  ['D1', 'E0', 'E1', 'E2', 'F1', 'I1', 'SP1', 'SC0'])
league_list = ['SC0']
# Number of seasons of history used for training
historical_training_year_list = [7]

In [3]:
# Feature columns fed to the MLP, grouped by prefix for readability
best_features_MLP = [
    # A_MEANS_FIVE_*
    'A_MEANS_FIVE_AC',
    'A_MEANS_FIVE_AS',
    'A_MEANS_FIVE_AST',
    'A_MEANS_FIVE_FTAG',
    'A_MEANS_FIVE_FTHG',
    'A_MEANS_FIVE_FTR_H',
    'A_MEANS_FIVE_HC',
    'A_MEANS_FIVE_HS',
    'A_MEANS_FIVE_HST',
    'A_MEANS_FIVE_HTR_A',
    # H_MEANS_FIVE_*
    'H_MEANS_FIVE_AC',
    'H_MEANS_FIVE_AS',
    'H_MEANS_FIVE_AST',
    'H_MEANS_FIVE_AY',
    'H_MEANS_FIVE_FTAG',
    'H_MEANS_FIVE_FTHG',
    'H_MEANS_FIVE_FTR_A',
    'H_MEANS_FIVE_FTR_H',
    'H_MEANS_FIVE_HC',
    'H_MEANS_FIVE_HS',
    'H_MEANS_FIVE_HST',
    'H_MEANS_FIVE_HTR_H',
    # *_MEANS_THREE_*
    'A_MEANS_THREE_AC',
    'A_MEANS_THREE_AS',
    'A_MEANS_THREE_FTHG',
    'A_MEANS_THREE_HS',
    'H_MEANS_THREE_AS',
    # *_STD_FIVE_*
    'A_STD_FIVE_HF',
    'H_STD_FIVE_HC',
    'H_STD_FIVE_HST',
]
# Each entry is [label written to the results, list of feature columns]
features_list = [
    ['best_features_MLP', best_features_MLP],
]

## Import Data

In [4]:
# DB Sqlite connection
# The database location can be overridden with the SOCCER_PREDICT_DB
# environment variable; when it is unset the original path is used.
db = os.environ.get(
    'SOCCER_PREDICT_DB',
    "/Users/thibaultclement/Project/ligue1-predict/src/notebook/data/db/soccer_predict.sqlite",
)
# sqlite3.connect() creates an empty database when the file does not exist,
# which would only surface later as a missing-table error on the first query.
# Fail here with an explicit message instead.
if not os.path.isfile(db):
    raise IOError("SQLite database not found: {}".format(db))
conn = sqlite3.connect(db)
cur = conn.cursor()

In [5]:
# Load every row of the pre_matchs table, ordered by match date (oldest first)
df_all = pd.read_sql_query(
    sql="SELECT * FROM pre_matchs ORDER BY INFO_Date ASC;",
    con=conn,
)
# Discard the 'index' column stored in the table
df_all = df_all.drop('index', axis=1)
df_all.shape

(37907, 190)

In [6]:
# Remove every game played from June to September (both included):
# only matches dated January-May or October-December are kept.
df_all['INFO_Date'] = pd.to_datetime(df_all['INFO_Date'])
match_month = df_all['INFO_Date'].dt.month
# .copy() detaches the filtered frame, so columns added to df_all afterwards
# are written to an independent frame and not to a slice of the unfiltered
# data (which triggers SettingWithCopyWarning).
df_all = df_all[(match_month < 6) | (match_month >= 10)].copy()
df_all.shape

(30912, 190)

In [7]:
# Create the INFO_WIN and INFO_WIN_P columns: for each match they contain the
# odds of the result that actually happened (i.e. the gain of a correct bet).
# INFO_WIN reads the odd_H / odd_A / odd_D columns, INFO_WIN_P reads the
# INFO_PSH / INFO_PSA / INFO_PSD columns. Rows whose INFO_FTR is none of
# 'H', 'A', 'D' keep the initial value 0.
win_columns = (
    ('INFO_WIN', (('H', odd_H), ('A', odd_A), ('D', odd_D))),
    ('INFO_WIN_P', (('H', 'INFO_PSH'), ('A', 'INFO_PSA'), ('D', 'INFO_PSD'))),
)
for win_column, odds_by_result in win_columns:
    # Start from a float column: writing float odds into an integer column
    # relies on silent upcasting, which recent pandas versions deprecate.
    df_all[win_column] = 0.0
    for result, odds_column in odds_by_result:
        # The right-hand side is aligned on the index, so each selected row
        # receives its own odds value.
        df_all.loc[df_all['INFO_FTR'] == result, win_column] = df_all[odds_column]

## Methods

In [8]:
def get_dataset(league, season, historical_training_year, features):
    """Build the training set and the test season for one league.

    Reads the module-level ``df_all`` frame and ``target`` column name.

    Training rows are the league's matches dated strictly between 1 August of
    ``season - historical_training_year`` and 1 August of ``season``. Test
    rows are those dated strictly between 1 August of ``season`` and 1 August
    of ``season + 1``.

    Parameters
    ----------
    league : value compared against the ``INFO_Div`` column.
    season : int, year in which the test season starts.
    historical_training_year : int, number of years of history used to train.
    features : list of column names used as model inputs.

    Returns
    -------
    tuple
        ``(df, df_test, X, y, X_test_season, y_test_season)`` where ``df`` and
        ``df_test`` are the filtered training / test frames, ``X`` and
        ``X_test_season`` are the imputed then standardized feature matrices,
        and ``y`` / ``y_test_season`` are the matching ``target`` columns.
        The imputer and the scaler are fitted on the training rows only.
    """
    # Filter by league
    df_league = df_all[df_all['INFO_Div'] == league]

    # Season boundaries. datetime.datetime (rather than datetime.date) is used
    # because recent pandas releases reject ordering comparisons between a
    # datetime64 column and datetime.date objects.
    date_start_learn = datetime.datetime(season - historical_training_year, 8, 1)
    date_end_learn = datetime.datetime(season, 8, 1)
    date_start_test_season = datetime.datetime(season, 8, 1)
    date_end_test_season = datetime.datetime(season + 1, 8, 1)

    match_date = df_league['INFO_Date']
    df_test = df_league[(match_date > date_start_test_season)
                        & (match_date < date_end_test_season)]
    df = df_league[(match_date > date_start_learn)
                   & (match_date < date_end_learn)]

    # Filter by feature used to train
    X = pd.get_dummies(df[features])
    y = df[target]
    # get_dummies is applied to each frame separately, so a category present
    # in only one of them would yield different columns. Align the test
    # matrix on the training columns (missing dummies become 0, unseen ones
    # are dropped) so the fitted transformers receive the layout they expect.
    X_test_season = pd.get_dummies(df_test[features]).reindex(
        columns=X.columns, fill_value=0)
    y_test_season = df_test[target]

    # Impute of missing values (NaN) with the training mean of each column
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp = imp.fit(X)
    X = imp.transform(X)
    X_test_season = imp.transform(X_test_season)

    # Standardize features with statistics computed on the training rows
    sc_X = StandardScaler().fit(X)
    X = sc_X.transform(X)
    X_test_season = sc_X.transform(X_test_season)
    return df, df_test, X, y, X_test_season, y_test_season

## Loop on league

In [9]:
# Init dataframe
result_df = pd.DataFrame(columns=[
    'league', 
    'season', 
    'historical_training_year', 
    'features',
    'best_score',
    'hidden_layer_sizes',
    'activation',
    'solver',
    'alpha',
    'max_iter'
])

In [10]:
# Hyper-parameter space sampled by the randomized search. It does not depend
# on any loop variable, so it is built once instead of on every iteration.
parameters = {
    # one hidden layer of 10, 20, ..., 230 units
    'hidden_layer_sizes': [(n_units, ) for n_units in range(10, 240, 10)],
    'activation': ['logistic'],
    'solver': ['sgd'],
    'alpha': np.arange(0, 3, 0.1).tolist(),
    'max_iter': np.arange(100, 300, 10).tolist(),
}

for league in league_list:
    for season in season_list:
        for historical_training_year in historical_training_year_list:
            for features in features_list:
                # features is [label, list of columns]
                features_name, features_columns = features
                # Single-argument print(...) behaves the same on Python 2 and 3
                print('{} {} {} {}'.format(
                    league, season, historical_training_year, features_name))
                df, df_test, X, y, X_test_season, y_test_season = get_dataset(
                    league, season, historical_training_year, features_columns)
                # train model
                classifier = MLPClassifier(random_state=0)
                clf = RandomizedSearchCV(
                    estimator=classifier,
                    param_distributions=parameters,
                    # candidates are ranked on cross-validated accuracy
                    scoring='accuracy',
                    cv=8,
                    n_jobs=-1,
                    verbose=1,
                    n_iter=30,
                    # Fix the sampling of the 30 candidates so that a re-run
                    # evaluates the same configurations.
                    random_state=0)
                clf.fit(X, y)
                best_score = clf.best_score_
                best_params = clf.best_params_
                # Add all info to result dataframe
                result_df.loc[len(result_df.index)] = [
                    league,
                    season,
                    historical_training_year,
                    features_name,
                    best_score,
                    best_params['hidden_layer_sizes'],
                    best_params['activation'],
                    best_params['solver'],
                    best_params['alpha'],
                    best_params['max_iter']
                ]

SC0 2014 7 best_features_MLP
Fitting 8 folds for each of 30 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  2.1min finished


SC0 2015 7 best_features_MLP
Fitting 8 folds for each of 30 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  2.8min finished


SC0 2016 7 best_features_MLP
Fitting 8 folds for each of 30 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   34.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  2.9min finished


## Save result

In [11]:
# Write the search results to CSV, creating the report directory when it does
# not exist yet (to_csv does not create missing directories).
# NOTE(review): the file name says LOG_LOSS, but the randomized search is run
# with scoring='accuracy' - confirm which one is intended before renaming.
report_path = './report/MLP_FTR_BYLEAGUE_BEST_HYPERPARAM_LOG_LOSS.csv'
report_dir = os.path.dirname(report_path)
if not os.path.isdir(report_dir):
    os.makedirs(report_dir)
result_df.to_csv(report_path)

### Final result
**TODO:** fill in from `result_df` — the best result is obtained with ??? and ??? years of history.