In [1]:
# Importing the library
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import datetime
import time
import sqlite3
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.externals import joblib



In [2]:
# Configuration
# Names of the columns holding the average Home / Away / Draw odds; these are the
# same column names that get_next_matches writes and concat_preds reads.
odd_H = 'INFO_BbAvH'
odd_A = 'INFO_BbAvA'
odd_D = 'INFO_BbAvD'
# Target column name; not referenced by any other cell shown in this notebook.
target = 'INFO_FTR'
# Single timestamp for the whole run; get_next_matches also uses `now` as the
# reference point when deciding how far ahead a match may be.
now = datetime.datetime.now()
# Minute-resolution run stamp; not referenced by any other cell shown here.
start_date = now.strftime("%Y-%m-%d-%H-%M")
# Day stamp used as the prefix of every CSV written under ./predict/.
today = now.strftime("%Y-%m-%d")
# Season identifier embedded in the model file paths built in the main loop.
season = 2017
# Outcome labels (Away win, Draw, Home win). concat_preds maps probability
# columns 0/1/2 to A/D/H, i.e. it relies on this order.
classes = ['A', 'D', 'H']
PHANTOMJS_PATH = './phantomjs'
# Module-level browser shared by every get_next_matches call; it is never quit
# in the cells shown here.
# NOTE(review): PhantomJS support has been deprecated in Selenium - confirm the
# pinned Selenium version still provides webdriver.PhantomJS.
browser = webdriver.PhantomJS(PHANTOMJS_PATH)
# NOTE: this value is replaced by the "Construct base layer" cell before anything
# reads it, so this assignment is effectively dead.
base_layer = ['XGBoost', 'NB', 'MLP']

# Division codes processed by the main loop; each one needs an entry in
# betbrain_url_dict and a branch in get_bet.
league_list = ['E0', 'E1', 'E2', 'SP1', 'D1', 'F1', 'I1', 'SC0']
# Alternative, shorter league selection kept for manual runs.
#league_list = ['E0', 'E1', 'E2', 'F1', 'SC0']
# Leagues predicted with the stacked model (make_prediction); every other league
# goes through make_prediction_MLP.
league_stacking = ['E0', 'E1', 'E2', 'SP1', 'SC0']

In [3]:
# Dictionary of team names.
# Key: a team's full name as found in the scraped match title (this is what
# get_next_matches passes to get_short_team_name).
# Value: the short name stored in the HomeTeam / AwayTeam columns, which is then
# used to look the team up in the historical-match queries.
# A scraped name that is missing here makes get_short_team_name raise KeyError.
team_name_dict = {
    'West Ham United': "West Ham",
    'Chelsea': "Chelsea",
    'Huddersfield Town': "Huddersfield",
    'Manchester City': "Man City",
    'Newcastle United FC': "Newcastle",
    'Stoke City FC': "Stoke",
    'Swansea City': "Swansea",
    'Southampton FC': "Southampton",
    'Everton': "Everton",
    'Tottenham Hotspur': "Tottenham",
    'Brighton and Hove Albion': "Brighton",
    'Watford FC': "Watford",
    'Manchester United FC': "Man United",
    'Burnley FC': "Burnley",
    'Crystal Palace FC': "Crystal Palace",
    'Bournemouth AFC': "Bournemouth",
    'Leicester City': "Leicester",
    'West Bromwich Albion FC': "West Brom",
    'Arsenal': "Arsenal",
    'Liverpool': "Liverpool",
    'Aston Villa': "Aston Villa",
    'Fulham': "Fulham",
    'Barnsley FC': "Barnsley",
    'Hull City AFC': "Hull",
    'Bolton Wanderers': "Bolton",
    'Queens Park Rangers FC': "QPR", 
    'Brentford': "Brentford",
    'Sunderland': "Sunderland",
    'Bristol City FC': "Bristol City",
    'Leeds United': "Leeds",
    'Derby County': "Derby",
    'Sheffield Wednesday FC': "Sheffield Weds",
    'Middlesbrough FC': "Middlesbrough",
    'Cardiff City FC': "Cardiff",
    'Nottingham Forest': "Nott'm Forest",
    'Burton Albion': "Burton",
    'Sheffield United FC': "Sheffield United",
    'Reading': "Reading",
    'Wolverhampton Wanderers': "Wolves",
    'Preston North End FC': "Preston",
    'Millwall': "Millwall",
    'Birmingham City FC': "Birmingham",
    'Ipswich Town FC': "Ipswich",
    'Norwich City FC': "Norwich",
    'AFC Wimbledon' : "AFC Wimbledon",
    'Plymouth Argyle FC' : "Plymouth",
    'Blackburn Rovers' : "Blackburn",
    'Portsmouth' : "Portsmouth",
    'Blackpool FC' : "Blackpool",
    'Wigan Athletic' : "Wigan",
    'Bradford City' : "Bradford",
    'Charlton Athletic FC' : "Charlton",
    'Doncaster Rovers' : "Doncaster",
    'Walsall' : "Walsall",
    'Gillingham' : "Gillingham",
    'Northampton Town' : "Northampton",
    'Milton Keynes Dons FC' : "Milton Keynes Dons",
    'Oldham Athletic' : "Oldham",
    'Oxford United' : "Oxford",
    'Rotherham United FC' : "Rotherham",
    'Rochdale' : "Rochdale",
    'Bristol Rovers' : "Bristol Rvs",
    'Scunthorpe United' : "Scunthorpe",
    'Peterborough United' : "Peterboro",
    'Shrewsbury Town' : "Shrewsbury",
    'Fleetwood Town' : "Fleetwood Town",
    'Southend United' : "Southend",
    'Bury' : "Bury",
    'Levante' : "Levante",
    'Getafe' : "Getafe",
    'Real Betis' : "Betis",
    'Alaves' : "Alaves",
    'Valencia CF' : "Valencia",
    'Sevilla FC' : "Sevilla",
    'FC Barcelona' : "Barcelona",
    'Malaga CF' : "Malaga",
    'Villarreal CF' : "Villarreal",
    'U.D. Las Palmas de Gran Canaria' : "Las Palmas",
    'Celta de Vigo' : "Celta",
    'Atletico Madrid' : "Ath Madrid",
    'Leganes' : "Leganes",
    'Athletic Bilbao' : "Ath Bilbao",
    'Real Madrid' : "Real Madrid",
    'Eibar' : "Eibar",
    'Real Sociedad' : "Sociedad",
    'RCD Espanyol' : "Espanol",
    'Deportivo La Coruna' : "La Coruna",
    'Girona FC' : "Girona",
    'Hearts Of Midlothian FC' : "Hearts",
    'St. Johnstone' : "St Johnstone",
    'Partick Thistle' : "Partick",
    'Dundee FC' : "Dundee",
    'Ross County' : "Ross County",
    'Hamilton Academicals FC' : "Hamilton",
    'Hibernian FC' : "Hibernian",
    'Aberdeen FC' : "Aberdeen",
    'Celtic FC' : "Celtic",
    'Motherwell' : "Motherwell",
    'Glasgow Rangers FC' : "Rangers",
    'Kilmarnock' : "Kilmarnock",
    'FC Schalke 04' : "Schalke 04",
    '1. FSV Mainz 05' : "Mainz",
    'Borussia Monchengladbach' : "M'gladbach",
    'Bayer 04 Leverkusen' : "Leverkusen",
    'Eintracht Frankfurt' : "Ein Frankfurt",
    'Borussia Dortmund' : "Dortmund",
    'FC Augsburg' : "Augsburg",
    'Hannover 96' : "Hannover",
    'RB Leipzig' : "RB Leipzig",
    'VfB Stuttgart' : "Stuttgart",
    'Hamburger SV' : "Hamburg",
    'FC Bayern Munchen' : "Bayern Munich",
    '1. FC Koln' : "FC Koln",
    'SV Werder Bremen' : "Werder Bremen",
    'SC Freiburg' : "Freiburg",
    'Hertha BSC Berlin' : "Hertha",
    'VfL Wolfsburg' : "Wolfsburg",
    'TSG 1899 Hoffenheim' : "Hoffenheim",
    'AS St. Etienne' : "St Etienne",
    'Montpellier HSC' : "Montpellier",
    'AS Monaco' : "Monaco",
    'Stade Caen' : "Caen",
    'Angers SCO' : "Angers",
    'FC Toulouse' : "Toulouse",
    'FC Metz' : "Metz",
    'Dijon FCO' : "Dijon",
    'FC Nantes' : "Nantes",
    'EA Guingamp' : "Guingamp",
    'SC Amiens' : "Amiens",
    'FC Girondins de Bordeaux' : "Bordeaux",
    'Stade Rennes FC' : "Rennes",
    'OSC Lille' : "Lille",
    'OGC Nice' : "Nice",
    'Racing Club Strasbourg' : "Strasbourg",
    'AC Troyes AC' : "Troyes",
    'Olympique Lyonnais' : "Lyon",
    'Olympique Marseille' : "Marseille",
    'Paris Saint-Germain FC' : "Paris SG",
    'Crotone' : "Crotone",
    'Napoli' : "Napoli",
    'Chievo Verona' : "Chievo",
    'AC Milan' : "Milan",
    'Genoa' : "Genoa",
    'Bologna F.C.' : "Bologna",
    'Benevento' : "Benevento",
    'AC Fiorentina' : "Fiorentina",
    'SPAL 1907 Ferrara' : "Spal",
    'Sassuolo' : "Sassuolo",
    'Torino FC' : "Torino",
    'AS Roma' : "Roma",
    'Udinese Calcio' : "Udinese",
    'Juventus FC' : "Juventus",
    'SS Lazio' : "Lazio",
    'Cagliari' : "Cagliari",
    'Internazionale Milano' : "Inter",
    'Sampdoria' : "Sampdoria",
    'Atalanta Bergamo' : "Atalanta",
    'Hellas Verona' : "Verona",
}


In [4]:
# Dictionary of BetBrain match-list URLs, keyed by the division code used in
# league_list. get_next_matches(div) opens betbrain_url_dict[div], so every
# league that is processed must have an entry here.
betbrain_url_dict = {
    'E0': 'https://www.betbrain.com/football/england/premier-league/#/matches/',
    'E1': 'https://www.betbrain.com/football/england/championship/#/matches/',
    'E2': 'https://www.betbrain.com/football/england/league-1/#/matches/',
    'SP1': 'https://www.betbrain.com/football/spain/primera-division/#/matches/',
    'SC0': 'https://www.betbrain.com/football/scotland/premiership/#/matches/',
    'D1': 'https://www.betbrain.com/football/germany/bundesliga/#/matches/',
    'F1': 'https://www.betbrain.com/football/france/ligue-1/#/matches/',
    'I1': 'https://www.betbrain.com/football/italy/serie-a/#/matches/'
}

In [5]:
# Feature lists to use.
# Feature names follow the pattern <side>_<aggregate>_<window>_<stat>, matching
# the keys produced by homeData ("H_...") and awayData ("A_...").

# Features selected for the MLP model.
best_features_MLP = [
    'A_MEANS_FIVE_AC', 'A_MEANS_FIVE_AS', 'A_MEANS_FIVE_AST',
    'A_MEANS_FIVE_FTAG', 'A_MEANS_FIVE_FTHG', 'A_MEANS_FIVE_FTR_H',
    'A_MEANS_FIVE_HC', 'A_MEANS_FIVE_HS', 'A_MEANS_FIVE_HST',
    'A_MEANS_FIVE_HTR_A', 'H_MEANS_FIVE_AC', 'H_MEANS_FIVE_AS',
    'H_MEANS_FIVE_AST', 'H_MEANS_FIVE_AY', 'H_MEANS_FIVE_FTAG',
    'H_MEANS_FIVE_FTHG', 'H_MEANS_FIVE_FTR_A', 'H_MEANS_FIVE_FTR_H',
    'H_MEANS_FIVE_HC', 'H_MEANS_FIVE_HS', 'H_MEANS_FIVE_HST',
    'H_MEANS_FIVE_HTR_H', 'A_MEANS_THREE_AC', 'A_MEANS_THREE_AS',
    'A_MEANS_THREE_FTHG', 'A_MEANS_THREE_HS', 'H_MEANS_THREE_AS',
    'A_STD_FIVE_HF', 'H_STD_FIVE_HC', 'H_STD_FIVE_HST',
]


def _build_all_features():
    """Return every engineered feature name, in the order the models expect.

    The previous hand-written literal listed 8 groups of the same 22 statistics
    (and was broken in the middle of a string). The groups are, in order:
    A/H MEANS_FIVE, A/H MEANS_THREE, A/H STD_FIVE, A/H STD_THREE. The order is
    preserved here because the persisted imputer/scaler/model receive the
    columns positionally.
    """
    stats = [
        'AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'FTAG', 'FTHG',
        'FTR_A', 'FTR_D', 'FTR_H', 'HC', 'HF', 'HR', 'HS', 'HST',
        'HTAG', 'HTHG', 'HTR_A', 'HTR_D', 'HTR_H', 'HY',
    ]
    names = []
    for aggregate in ('MEANS', 'STD'):
        for window in ('FIVE', 'THREE'):
            for side in ('A', 'H'):
                for stat in stats:
                    names.append('_'.join([side, aggregate, window, stat]))
    return names


# All 176 engineered features (used by the XGBoost base model).
all_features = _build_all_features()

# Features selected for the Naive Bayes model (currently the same selection as
# the MLP one, but kept as an independent list so the two can diverge).
best_features_NB = [
    'A_MEANS_FIVE_AC', 'A_MEANS_FIVE_AS', 'A_MEANS_FIVE_AST',
    'A_MEANS_FIVE_FTAG', 'A_MEANS_FIVE_FTHG', 'A_MEANS_FIVE_FTR_H',
    'A_MEANS_FIVE_HC', 'A_MEANS_FIVE_HS', 'A_MEANS_FIVE_HST',
    'A_MEANS_FIVE_HTR_A', 'H_MEANS_FIVE_AC', 'H_MEANS_FIVE_AS',
    'H_MEANS_FIVE_AST', 'H_MEANS_FIVE_AY', 'H_MEANS_FIVE_FTAG',
    'H_MEANS_FIVE_FTHG', 'H_MEANS_FIVE_FTR_A', 'H_MEANS_FIVE_FTR_H',
    'H_MEANS_FIVE_HC', 'H_MEANS_FIVE_HS', 'H_MEANS_FIVE_HST',
    'H_MEANS_FIVE_HTR_H', 'A_MEANS_THREE_AC', 'A_MEANS_THREE_AS',
    'A_MEANS_THREE_FTHG', 'A_MEANS_THREE_HS', 'H_MEANS_THREE_AS',
    'A_STD_FIVE_HF', 'H_STD_FIVE_HC', 'H_STD_FIVE_HST',
]

# [label, feature list] pairs.
features_list = [
    ['best_features_MLP', best_features_MLP],
    ['all_features', all_features],
    ['best_features_NB', best_features_NB]
]

In [6]:
# Construct base layer
# Each entry is [classifier name, [feature-list label, feature list]].
# make_prediction unpacks entries as (clf_name, features), uses features[1] as
# the column selection, and uses clf_name to build the file names of the
# persisted imputer / scaler / model for that base classifier.
# This assignment replaces the plain list of names set in the configuration cell.
base_layer = [
    ['XGBoost' ,['all_features', all_features]],
    ['NB', ['best_features_NB', best_features_NB]],
    ['MLP', ['best_features_MLP', best_features_MLP]],
]

In [7]:
# DB Sqlite connection
# NOTE(review): absolute, machine-specific path - as written, this cell only
# works where this exact file exists; consider making the location configurable.
db = "/Users/thibaultclement/Project/ligue1-predict/src/notebook/data/db/soccer_predict.sqlite"
# Module-level connection read by homeData / awayData through pd.read_sql.
# It is not closed by any cell shown in this notebook.
conn = sqlite3.connect(db)
# Cursor on the same connection; not used by any cell shown in this notebook.
cur = conn.cursor()

In [8]:
def get_short_team_name(name):
    """Translate a scraped full team name into the short name used in the notebook.

    Parameters
    ----------
    name : str
        Team name exactly as it appears as a key of ``team_name_dict``.

    Returns
    -------
    str
        The short name stored for ``name`` in ``team_name_dict``.

    Raises
    ------
    KeyError
        If ``name`` has no entry in ``team_name_dict``. The message names the
        missing team so the mapping can be extended.
    """
    try:
        return team_name_dict[name]
    except KeyError:
        # Same exception type as a plain dict lookup, but with an actionable message.
        raise KeyError(
            "No short name mapped for team %r; add it to team_name_dict" % (name,)
        )

In [9]:
# Extract next match list with odds
def get_next_matches(div):
    """Scrape the upcoming matches of one league together with their average odds.

    Opens the BetBrain page registered for ``div`` in ``betbrain_url_dict`` with
    the module-level ``browser``, switches to the Home/Draw/Away view, and reads
    every ``<li class="Match">`` entry from the rendered page.

    Scanning stops (``break``) at the first match that either kicks off more
    than 4 days after the module-level ``now`` or does not expose three
    average-odds values; matches listed after that one are not returned.

    Parameters
    ----------
    div : str
        Division code; must be a key of ``betbrain_url_dict``.

    Returns
    -------
    pandas.DataFrame
        One row per match with columns ``Div``, ``Date`` (the raw
        ``dd/mm/YYYY HH:MM`` text), ``HomeTeam``, ``AwayTeam``, ``INFO_BbAvH``,
        ``INFO_BbAvD`` and ``INFO_BbAvA`` (odds kept as scraped text). The index
        is 0..n-1; the frame is empty (but has these columns) when nothing
        qualifies.

    Raises
    ------
    KeyError
        If ``div`` is unknown, or a scraped team name is missing from
        ``team_name_dict``.
    """
    columns = ['Div','Date','HomeTeam','AwayTeam','INFO_BbAvH','INFO_BbAvD','INFO_BbAvA']

    # Browse the url and give the page time to render
    browser.get(betbrain_url_dict[div])
    time.sleep(15)

    # Click on "Home Draw Away". The tab sits under div[3] or div[4] depending
    # on the page layout, so try the first position and fall back to the second.
    try:
        browser.find_element(By.XPATH, '//*[@id="app"]/div/section/section/main/div[3]/div[1]/a[3]').click()
    except Exception:
        # Deliberately broad (but no longer a bare except, which would also
        # swallow KeyboardInterrupt / SystemExit): any failure at the first
        # position triggers the fallback; a failure there propagates.
        browser.find_element(By.XPATH, '//*[@id="app"]/div/section/section/main/div[4]/div[1]/a[3]').click()
    # Wait for the odds view to load
    time.sleep(10)

    soup = BeautifulSoup(browser.page_source, "html.parser")

    rows = []
    for match in soup.find_all("li", "Match"):
        average_odds = match.find_all('span', 'AverageOdds')
        team_spans = match.find('a', 'MatchTitleLink').find_all('span')
        match_date = match.find('time').text
        kickoff = datetime.datetime.strptime(match_date, '%d/%m/%Y %H:%M')
        # Only keep matches starting within the next few days
        if (kickoff - now).days > 4:
            break
        home_team = get_short_team_name(team_spans[1].text)
        away_team = get_short_team_name(team_spans[3].text)
        # Three values (home, draw, away) are indexed below, so require all
        # three; the previous "< 1" check let 1- or 2-element lists through
        # and then failed with IndexError.
        if len(average_odds) < 3:
            break
        home_odd = average_odds[0].find_all('span')[1].text
        draw_odd = average_odds[1].find_all('span')[1].text
        away_odd = average_odds[2].find_all('span')[1].text
        rows.append([div, match_date, home_team, away_team, home_odd, draw_odd, away_odd])

    # Build the frame once instead of concatenating inside the loop
    return pd.DataFrame(rows, columns=columns)

In [10]:
# Get all data for pre match on home team
def homeData(date, team, div, nb_matches, nb_matches_string):
    """Summarise a team's most recent home games played before ``date``.

    Reads up to ``nb_matches`` rows of ``cur_season_matchs_raw`` (through the
    module-level ``conn``) where the team was the home side in division ``div``
    and the match date is earlier than ``date``, one-hot encodes the full-time
    and half-time results, and returns the mean (rounded to 2 decimals) and the
    standard deviation (rounded to 3 decimals) of every statistic.

    Parameters
    ----------
    date : str
        Reference date in ``'%d/%m/%Y %H:%M'`` format.
    team : str
        Value matched against the ``HomeTeam`` column.
    div : str
        Value matched against the ``Div`` column.
    nb_matches : int
        Maximum number of past games to read (SQL ``LIMIT``).
    nb_matches_string : str
        Label inserted in the key names (e.g. ``'FIVE'``).

    Returns
    -------
    dict
        Keys ``'H_MEANS_<label>_<stat>'`` and ``'H_STD_<label>_<stat>'``. For a
        result indicator (``FTR_*`` / ``HTR_*``) that never occurred in the
        selected games, the value is ``0``.
    """
    # TODO: also retrieve how many goals the team conceded, and similar stats
    home_sql = '''
            SELECT Date, FTHG, FTR, HTHG, HTR, HS, HST, HF, HC, HY, HR, FTAG, HTAG, `AS`, AST, AF, AC, AY, AR
            FROM cur_season_matchs_raw
            WHERE Date < ? AND HomeTeam = ? AND Div = ? ORDER BY Date DESC LIMIT ?'''
    # The query compares ISO dates, so convert the scraped date first
    cutoff = datetime.datetime.strptime(date, '%d/%m/%Y %H:%M').strftime('%Y-%m-%d')
    history = pd.read_sql(home_sql, conn, params=[cutoff, team, div, nb_matches])
    # One-hot encode Full Time Result and Half Time Result
    history = pd.get_dummies(history, columns=['FTR', 'HTR'])

    # Statistics in the order in which their keys are inserted
    ordered_stats = (
        'FTHG', 'FTR_H', 'FTR_D', 'FTR_A',
        'HTHG', 'HTR_H', 'HTR_D', 'HTR_A',
        'HS', 'HST', 'HF', 'HC', 'HY', 'HR',
        'FTAG', 'HTAG', 'AS', 'AST', 'AF', 'AC', 'AY', 'AR',
    )
    # Indicator columns exist only if that result occurred at least once
    indicator_stats = ('FTR_H', 'FTR_D', 'FTR_A', 'HTR_H', 'HTR_D', 'HTR_A')

    dic = {}
    for key_prefix, aggregate, digits in (('H_MEANS_', 'mean', 2), ('H_STD_', 'std', 3)):
        for stat in ordered_stats:
            key = key_prefix + nb_matches_string + '_' + stat
            if stat in indicator_stats and stat not in history.columns:
                dic[key] = 0
            else:
                dic[key] = round(getattr(history[stat], aggregate)(), digits)
    return dic

In [11]:
# Get all data for pre match on away team
def awayData(date, team, div, nb_matches, nb_matches_string):
    """Summarise a team's most recent away games played before ``date``.

    Reads up to ``nb_matches`` rows of ``cur_season_matchs_raw`` (through the
    module-level ``conn``) where the team was the away side in division ``div``
    and the match date is earlier than ``date``, one-hot encodes the full-time
    and half-time results, and returns the mean (rounded to 2 decimals) and the
    standard deviation (rounded to 3 decimals) of every statistic.

    Parameters
    ----------
    date : str
        Reference date in ``'%d/%m/%Y %H:%M'`` format.
    team : str
        Value matched against the ``AwayTeam`` column.
    div : str
        Value matched against the ``Div`` column.
    nb_matches : int
        Maximum number of past games to read (SQL ``LIMIT``).
    nb_matches_string : str
        Label inserted in the key names (e.g. ``'FIVE'``).

    Returns
    -------
    dict
        Keys ``'A_MEANS_<label>_<stat>'`` and ``'A_STD_<label>_<stat>'``. For a
        result indicator (``FTR_*`` / ``HTR_*``) that never occurred in the
        selected games, the value is ``0``.
    """
    # TODO: also retrieve how many goals the team conceded, and similar stats
    away_sql = '''
            SELECT Date, FTAG, FTR, HTAG, HTR, `AS`, AST, AF, AC, AY, AR, FTHG, HTHG, HS, HST, HF, HC, HY, HR
            FROM cur_season_matchs_raw
            WHERE Date < ? AND AwayTeam = ? AND Div = ? ORDER BY Date DESC LIMIT ?'''
    # The query compares ISO dates, so convert the scraped date first
    cutoff = datetime.datetime.strptime(date, '%d/%m/%Y %H:%M').strftime('%Y-%m-%d')
    history = pd.read_sql(away_sql, conn, params=[cutoff, team, div, nb_matches])
    # One-hot encode Full Time Result and Half Time Result
    history = pd.get_dummies(history, columns=['FTR', 'HTR'])

    # Statistics in the order in which their keys are inserted
    ordered_stats = (
        'FTAG', 'FTR_H', 'FTR_D', 'FTR_A',
        'HTAG', 'HTR_H', 'HTR_D', 'HTR_A',
        'AS', 'AST', 'AF', 'AC', 'AY', 'AR',
        'FTHG', 'HTHG', 'HS', 'HST', 'HF', 'HC', 'HY', 'HR',
    )
    # Indicator columns exist only if that result occurred at least once
    indicator_stats = ('FTR_H', 'FTR_D', 'FTR_A', 'HTR_H', 'HTR_D', 'HTR_A')

    dic = {}
    for key_prefix, aggregate, digits in (('A_MEANS_', 'mean', 2), ('A_STD_', 'std', 3)):
        for stat in ordered_stats:
            key = key_prefix + nb_matches_string + '_' + stat
            if stat in indicator_stats and stat not in history.columns:
                dic[key] = 0
            else:
                dic[key] = round(getattr(history[stat], aggregate)(), digits)
    return dic

In [12]:
# Extract feature for each match in the list
def get_feature_from_matches_list(match_list_df):
    """Attach the historical features to every upcoming match.

    For each row, the five-game and three-game home history of the home team
    (``homeData``) and away history of the away team (``awayData``) are merged
    into the row's own values.

    The division is taken from the row's ``Div`` column rather than from a
    global loop variable, so the function no longer depends on which notebook
    cell ran last.

    Parameters
    ----------
    match_list_df : pandas.DataFrame
        Must provide ``Date``, ``HomeTeam``, ``AwayTeam`` and ``Div`` columns
        (the frame returned by ``get_next_matches`` does).

    Returns
    -------
    pandas.DataFrame
        One row per input match, holding the input columns plus every feature
        key returned by ``homeData`` / ``awayData``. The index is 0..n-1. An
        empty input gives an empty frame.
    """
    records = []
    for _, row in match_list_df.iterrows():
        match_dict = row.to_dict()
        division = row['Div']
        # Same merge order as before: home FIVE, away FIVE, home THREE, away THREE
        for nb_matches, label in ((5, 'FIVE'), (3, 'THREE')):
            match_dict.update(homeData(row['Date'], row['HomeTeam'], division, nb_matches, label))
            match_dict.update(awayData(row['Date'], row['AwayTeam'], division, nb_matches, label))
        records.append(match_dict)
    # Build the frame once instead of concatenating one-row frames in the loop
    return pd.DataFrame(records)

In [13]:
def get_layer_columns(base_layer, classes):
    """Name the columns of the stacking layer's input.

    Parameters
    ----------
    base_layer : iterable of 2-item sequences
        ``(classifier name, features)`` pairs; only the name is used.
    classes : iterable of str
        Outcome labels.

    Returns
    -------
    list of str
        ``name + label`` for every classifier and every label, grouped by
        classifier in ``base_layer`` order.
    """
    return [
        clf_name + result
        for clf_name, _features in base_layer
        for result in classes
    ]

In [14]:
def get_layer1_df(X_layer0, base_layer, cols):
    """Create the zero-filled frame that receives the base models' probabilities.

    Parameters
    ----------
    X_layer0 : array-like with a ``shape`` attribute
        Only ``shape[0]`` (the number of matches) is used.
    base_layer : list
        Kept for interface compatibility; the width now comes from ``cols``.
    cols : list of str
        Column names of the stacking-layer input.

    Returns
    -------
    pandas.DataFrame
        ``X_layer0.shape[0]`` rows by ``len(cols)`` columns, filled with zeros.
    """
    # Size the frame from the column list itself instead of assuming three
    # outcome classes per base model (the previous hard-coded "* 3").
    X_layer1 = np.zeros((X_layer0.shape[0], len(cols)))
    return pd.DataFrame(X_layer1, columns=cols)

In [15]:
def get_dataset(df, clf_name, features):
    """Turn the selected feature columns into the matrix a persisted model expects.

    The selected columns are dummy-encoded, then passed through the imputer and
    the scaler stored next to the model. Their file names are built from the
    module-level ``model_path`` prefix and ``clf_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Matches with their features.
    clf_name : str
        Classifier name used in the ``imputer<name>.pkl`` / ``scaler<name>.pkl``
        file names (may be an empty string).
    features : list of str
        Columns of ``df`` to use.

    Returns
    -------
    tuple
        ``(df, X)``: the input frame, unchanged, and the transformed matrix.
    """
    # NOTE(review): joblib.load unpickles the files - only load trusted artefacts.
    # Keep only the features the model was trained with
    encoded = pd.get_dummies(df[features])
    # Fill missing values (NaN) with the persisted imputer
    imputed = joblib.load(model_path+"imputer"+clf_name+".pkl").transform(encoded)
    # Standardise with the persisted scaler
    scaled = joblib.load(model_path+"scaler"+clf_name+".pkl").transform(imputed)
    return df, scaled

In [16]:
def concat_preds(df, preds, probs):
    """Combine the matches with the model's predictions and derived bet indicators.

    Works on a copy of ``df``: the caller's frame is no longer modified (the
    previous implementation only aliased it and added the columns in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Matches to predict. Must contain the odds columns named by the
        module-level ``odd_A`` / ``odd_D`` / ``odd_H`` as well as
        ``H_MEANS_FIVE_FTHG``, ``H_MEANS_FIVE_FTAG``, ``A_MEANS_FIVE_FTAG`` and
        ``A_MEANS_FIVE_FTHG``.
    preds : array-like
        Predicted label (``'A'``, ``'D'`` or ``'H'``) per row.
    probs : 2-D array
        Class probabilities per row; columns 0, 1 and 2 are stored as the
        probabilities of ``A``, ``D`` and ``H`` respectively.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with these extra columns:
        ``probs_A`` / ``probs_D`` / ``probs_H``; ``probs`` (the largest of the
        three); ``pred``; ``INFO_ODD_BET`` (the odd of the predicted outcome);
        ``prob_less_bet`` (``probs`` minus the probability implied by that odd,
        ``1/odd``); and ``H-A`` (home side's five-game goal difference at home
        minus away side's five-game goal difference away).
    """
    # Really copy the frame so the input is left untouched
    concat_df = df.copy()

    # Add probabilities
    concat_df['probs_A'] = probs[:, 0]
    concat_df['probs_D'] = probs[:, 1]
    concat_df['probs_H'] = probs[:, 2]
    concat_df['probs'] = concat_df[['probs_A', 'probs_D', 'probs_H']].max(axis=1)

    # Add the predicted outcome
    concat_df['pred'] = preds

    # For each row, pick the odd of the predicted outcome and compare the
    # model's confidence with the probability implied by that odd.
    concat_df['INFO_ODD_BET'] = 0
    concat_df['prob_less_bet'] = 0
    for outcome, odd_column in (('A', odd_A), ('D', odd_D), ('H', odd_H)):
        is_outcome = concat_df.pred == outcome
        concat_df.loc[is_outcome, 'INFO_ODD_BET'] = concat_df[odd_column]
        implied_prob = concat_df[odd_column].apply(lambda x: 1 / float(x))
        concat_df.loc[is_outcome, 'prob_less_bet'] = concat_df['probs'] - implied_prob

    # Historical goal-difference gap between the home team and the away team
    concat_df['H-A'] = (concat_df['H_MEANS_FIVE_FTHG'] - concat_df['H_MEANS_FIVE_FTAG']) - (concat_df['A_MEANS_FIVE_FTAG'] - concat_df['A_MEANS_FIVE_FTHG'])

    return concat_df
    

In [17]:
# Make prediction
def make_prediction(base_layer, matches_to_predict_df):
    """Predict the matches with the stacked model.

    Every base classifier's class probabilities are written into the
    stacking-layer input, then the persisted second-layer model produces the
    final labels and probabilities. Model files are located through the
    module-level ``model_path`` prefix.

    Parameters
    ----------
    base_layer : list
        Entries ``[classifier name, [label, feature list]]``.
    matches_to_predict_df : pandas.DataFrame
        Matches with their features and odds.

    Returns
    -------
    pandas.DataFrame
        Result of ``concat_preds`` for ``matches_to_predict_df`` and the
        second-layer predictions.
    """
    # Prepare structure of layer 1
    cols = get_layer_columns(base_layer, classes)
    X_layer1 = get_layer1_df(matches_to_predict_df, base_layer, cols)

    # Predict with each model of the base layer
    for clf_name, features in base_layer:
        # get_dataset hands the input frame back unchanged; only X is needed
        _, X = get_dataset(matches_to_predict_df, clf_name, features[1])

        clf_base = joblib.load(model_path+"model_layer0_"+clf_name+".pkl")
        predict_probs = clf_base.predict_proba(X)
        X_layer1.loc[:, [clf_name+result for result in classes]] = predict_probs

    # Predict the stacking layer
    clf_1 = joblib.load(model_path+"model_layer1.pkl")
    # Predict target values
    y_pred = clf_1.predict(X_layer1)
    # Predict probabilities
    y_probs = clf_1.predict_proba(X_layer1)

    # Use the input frame explicitly rather than a variable left over from the
    # last loop iteration (which was undefined for an empty base layer).
    return concat_preds(matches_to_predict_df, y_pred, y_probs)

In [18]:
# Make prediction
def make_prediction_MLP(matches_to_predict_df):
    """Predict the matches with the standalone MLP model.

    Uses the ``best_features_MLP`` columns, the imputer/scaler stored without a
    classifier suffix, and the ``model_MLP.pkl`` file under the module-level
    ``model_path`` prefix.

    Parameters
    ----------
    matches_to_predict_df : pandas.DataFrame
        Matches with their features and odds.

    Returns
    -------
    pandas.DataFrame
        Result of ``concat_preds`` for the matches and the MLP's predictions.
    """
    # Prepare the model input
    source_df, design_matrix = get_dataset(matches_to_predict_df, '', best_features_MLP)

    # Load the persisted model
    mlp_model = joblib.load(model_path+"model_MLP.pkl")

    # Predicted labels, then class probabilities
    labels = mlp_model.predict(design_matrix)
    probabilities = mlp_model.predict_proba(design_matrix)

    return concat_preds(source_df, labels, probabilities)

In [19]:
# Post-prediction filters per league. Each rule is (column, comparison, value)
# and the rules of a league are applied one after the other.
_BET_FILTERS = {
    'E0': [('INFO_ODD_BET', '<=', 4), ('pred', '==', 'H'), ('prob_less_bet', '>=', 0)],
    'E1': [('INFO_ODD_BET', '<=', 1.9)],
    'E2': [('INFO_ODD_BET', '>=', 1.6), ('INFO_ODD_BET', '<=', 2), ('pred', '==', 'H'),
           ('H-A', '>=', -0.5), ('H-A', '<=', 2.5)],
    'SP1': [('INFO_ODD_BET', '<=', 2), ('pred', '==', 'H'), ('prob_less_bet', '<=', -0.1)],
    'SC0': [('INFO_ODD_BET', '>=', 2), ('INFO_ODD_BET', '<=', 3), ('pred', '==', 'A'),
            ('prob_less_bet', '<=', 0.1)],
    'D1': [('INFO_ODD_BET', '<=', 4), ('pred', '==', 'H'), ('prob_less_bet', '>=', -0.1)],
    'F1': [('INFO_ODD_BET', '<=', 4), ('prob_less_bet', '<=', -0.1)],
    'I1': [('INFO_ODD_BET', '<=', 2.8), ('H-A', '>=', -2), ('H-A', '<=', 0.2)],
}


def _coerce_numeric_columns(df):
    """Return a copy of ``df`` where every fully numeric column is numeric.

    A column that cannot be parsed entirely is kept as it is. This is the
    behaviour of ``df.apply(pd.to_numeric, errors='ignore')`` without relying
    on the deprecated ``errors='ignore'`` option.
    """
    def _numeric_or_unchanged(column):
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column
    return df.apply(_numeric_or_unchanged)


def get_bet(league, df):
    """Keep only the predictions that pass the league's manual betting filter.

    Parameters
    ----------
    league : str
        Division code; must be one of the keys of ``_BET_FILTERS``.
    df : pandas.DataFrame
        Predictions as returned by ``concat_preds``; must contain the columns
        referenced by the league's rules (``INFO_ODD_BET``, ``pred``,
        ``prob_less_bet``, ``H-A``). It is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (with numeric-looking columns converted to numbers)
        that satisfy every rule of the league.

    Raises
    ------
    ValueError
        If no filter is defined for ``league``. Previously the function fell
        through and returned ``None``, which only failed later in the caller.
    """
    if league not in _BET_FILTERS:
        raise ValueError("No betting filter defined for league %r" % (league,))

    # Scraped odds arrive as text; make them comparable with the thresholds
    to_bet_df = _coerce_numeric_columns(df)
    for column, comparison, threshold in _BET_FILTERS[league]:
        values = to_bet_df[column]
        if comparison == '<=':
            keep = values <= threshold
        elif comparison == '>=':
            keep = values >= threshold
        else:
            keep = values == threshold
        to_bet_df = to_bet_df[keep]
    return to_bet_df

In [20]:
# Get all bets
# For every league: scrape the upcoming matches, build their features, predict,
# apply the league's manual filter, write the league's CSV and collect the rows.
# NOTE: `league` and `model_path` are deliberately module-level names - the
# prediction helpers (get_dataset, make_prediction, make_prediction_MLP) read
# `model_path` as a global.
bet_cols = ['Date', 'Div', 'HomeTeam', 'AwayTeam', 'pred','probs' , 'prob_less_bet', 'H-A', 'INFO_ODD_BET']
league_bets = []
for league in league_list:
    # Parenthesised so the cell runs under both Python 2 and Python 3
    print(league)
    filename = './predict/'+today+'_'+league
    model_path = '../MODELING/models/'+league+'/'+league+'_'+str(season)+'_'
    # Get the coming matches with their odds from BetBrain
    match_list_df = get_next_matches(league)
    if match_list_df.empty:
        # Nothing to predict for this league; the feature / prediction steps
        # cannot work on an empty frame, so move on to the next league.
        continue
    # Extract features from historical data
    to_predict_df = get_feature_from_matches_list(match_list_df)
    to_predict_df = to_predict_df.reset_index()
    # Make prediction
    if league in league_stacking:
        result_df = make_prediction(base_layer, to_predict_df)
    else:
        result_df = make_prediction_MLP(to_predict_df)
    # Apply post prediction manual filter
    bet_df = get_bet(league, result_df)
    bet_df[bet_cols].to_csv(filename+'.csv')
    league_bets.append(bet_df[bet_cols])

# Assemble the combined frame once instead of concatenating in every iteration
if league_bets:
    all_bets_df = pd.concat(league_bets)
else:
    all_bets_df = pd.DataFrame(columns=bet_cols)

E0
E1
E2
SP1
D1
F1
I1
SC0


In [21]:
# Write the bets collected for all leagues into one CSV named after today's date.
all_bets_df.to_csv('./predict/'+today+'_ALL.csv')