In [1]:
# Reload the autoreload extension and set mode 2, so imported modules are
# re-imported before each cell runs (edits to local modules take effect
# without restarting the kernel).
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
from fastai.tabular.all import *
from wwf.tab.export import *
from bnb import *

from classes import *
from utils import *

In [3]:
# Two-digit season code. Further down it selects the archive to download
# ({SEASON}{SEASON+1}), the raw_data sub-folder that is read, and (minus one)
# the season pulled from the historical data file.
SEASON = '22'
# Seed with the season number. NOTE(review): presumably fastai's set_seed,
# seeding the random generators for reproducibility - confirm.
set_seed(int(SEASON))

## Download HKJC odds

In [4]:
# Working directories, all relative to the notebook's current directory.
path_raw, path_data, path_output = (
    Path(folder) for folder in ('raw_data', 'data', 'output')
)

In [5]:
# Visit the landing page first and keep the cookies it returns, so they can be
# sent explicitly with the odds request.
session = requests.Session()
cookies = session.get('http://bet.hkjc.com').cookies

In [6]:
# Endpoint queried for the odds JSON (jsontype=odds_chl).
odds_url = 'https://bet.hkjc.com/football/getJSON.aspx?jsontype=odds_chl.aspx'
response = session.post(
    odds_url,
    headers={'referer':'http://bet.hkjc.com'},
    cookies=cookies,
    timeout=30,  # do not let a stalled connection hang the notebook forever
)
# Stop here on an HTTP error (4xx/5xx) rather than archiving and then trying to
# parse an error page in the cells below.
response.raise_for_status()

In [7]:
# Archive the raw payload under a timestamped name so a run can be replayed later.
json_dir = path_data/'json'
# On a fresh checkout the folder may not exist yet; open() would fail without it.
json_dir.mkdir(parents=True, exist_ok=True)
snapshot_file = json_dir/f'odds_chl-{datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}.txt'
# Explicit UTF-8: the platform default encoding may be unable to represent
# non-ASCII text contained in the payload.
with open(snapshot_file, 'w', encoding='utf-8') as f:
    f.write(response.text)

In [8]:
# Decode the payload; its element at index 1 holds the 'matches' collection.
payload = json.loads(response.text)
matches_json = payload[1]['matches']

In [9]:
# Example: replay a previously archived snapshot instead of the live response
# (uncomment and adjust the file name).
# matches_json = json.loads(open(path_data/'json'/'odds_chl-2021-09-21-14-03-17.txt').read())[1]['matches']

In [10]:
# Wrap every raw match record in a Match, then export each one as a flat list
# of fields (one list per match).
matches = list(map(Match, matches_json))
odds = [match.export() for match in matches]

['20220213SUN1', '2022-Feb-13', '14:45:00', 'Australian Division 1 [AD1]', 'Central Coast Mariners', 'Perth Glory', 'true', '14.5', '1.76', '1.94', 'false', '15.5', '2.22', '1.58']
['20220213SUN2', '2022-Feb-13', '15:45:00', 'Australian Division 1 [AD1]', 'Brisbane Roar', 'Macarthur FC', 'false', '11.5', '2.40', '1.50', 'true', '10.5', '1.90', '1.80', 'false', '14.5', '4.55', '1.15']
['20220213SUN5', '2022-Feb-13', '19:30:00', 'Italian Division 1 [ISA]', 'AC Milan', 'Sampdoria', 'true', '10.5', '2.00', '1.72', 'false', '14.5', '5.30', '1.11', 'false', '11.5', '2.50', '1.46']
['20220213SUN6', '2022-Feb-13', '20:00:00', 'French Division 1 [FFL]', 'Monaco', 'Lorient', 'true', '9.5', '1.90', '1.80', 'false', '10.5', '2.40', '1.50', 'false', '13.5', '5.30', '1.11']
['20220213SUN7', '2022-Feb-13', '20:00:00', 'Eng League 1 [ED2]', 'Sheff Wednesday', 'Rotherham', 'false', '11.5', '2.30', '1.54', 'true', '10.5', '1.85', '1.85', 'false', '14.5', '4.90', '1.13']
['20220213SUN9', '2022-Feb-13', '

In [11]:
# Columns identifying a match, in the order the exported rows list them.
cols_match = ['MatchDay', 'Date', 'Time', 'LeagueJC', 'HomeTeamJC', 'AwayTeamJC']
# Generic names of the four fields that describe one betting line.
cols_odds  = ['MAINLINE', 'CHL_LINE', 'CHL_H', 'CHL_L']
# A row carries up to three lines: the same four fields suffixed _0, _1, _2.
cols_odds0, cols_odds1, cols_odds2 = (
    [f'{field}_{slot}' for field in cols_odds] for slot in range(3)
)
# Columns that will receive the model output.
cols_pred  = ['prob_0', 'prob_1', 'prob_2', 'total_count']

# Full column layout of one exported row.
cols = [*cols_match, *cols_odds0, *cols_odds1, *cols_odds2]

In [12]:
# Exported rows vary in length (a match may list fewer than three lines).
# pandas pads short rows only up to the LONGEST row present, so when no match
# has all three lines the constructor fails on the column-count mismatch.
# Pad every row to the full width first so the frame can always be built.
n_cols = len(cols)
if not isinstance(odds, pd.DataFrame):  # keep the cell safe to re-run
    odds = [list(row) + [None] * (n_cols - len(row)) for row in odds]
odds = pd.DataFrame(odds, columns=cols)
# Normalise missing entries (None) to NaN.
odds = odds.fillna(value=np.nan)

In [13]:
# Every per-line field except the leading MAINLINE flag is numeric; cast them to float.
cols_odds_ = [
    col
    for group in (cols_odds0, cols_odds1, cols_odds2)
    for col in group[1:]
]
odds[cols_odds_] = odds[cols_odds_].astype(float)

In [14]:
# Drop the first 8 characters of the match id (e.g. '20220213SUN1' -> 'SUN1').
odds['MatchDay'] = odds['MatchDay'].str.slice(start=8)
odds['Date'] = pd.to_datetime(odds['Date'])
# Parse the clock string, then keep only the time-of-day part.
kickoff = pd.to_datetime(odds['Time'], format='%H:%M:%S')
odds['Time'] = kickoff.dt.time

In [15]:
# Join date and time into one timestamp string per match.
stamp = odds['Date'].dt.date.map(str) + '-' + odds['Time'].map(str)
# Interpret the timestamps as Hong Kong local time ...
odds['DateTimeJC'] = pd.to_datetime(stamp).dt.tz_localize('Hongkong')
# ... and express the same instants in the 'GB' time zone.
odds['DateTime'] = odds['DateTimeJC'].dt.tz_convert('GB')

In [16]:
# Lookup tables used below to translate HKJC league and team names.
map_league, map_team = (
    pd.read_csv(path_data/fname) for fname in ('league.csv', 'team.csv')
)

In [17]:
# Keep only the league mappings for these division codes.
div = ['E0', 'D1', 'SP1', 'I1', 'F1']
keep_league = map_league['Div'].isin(div)
map_league = map_league.loc[keep_league].reset_index(drop=True)

In [18]:
# Attach the division code. The inner join also drops every match whose league
# is not listed in map_league.
league_lookup = map_league[['LeagueJC', 'Div']]
odds = pd.merge(odds, league_lookup, how='inner', on='LeagueJC')

In [19]:
# Translate the HKJC team names on both sides, home first and then away.
# Inner joins: a match with a team missing from map_team is dropped.
for jc_col, team_col in (('HomeTeamJC', 'HomeTeam'), ('AwayTeamJC', 'AwayTeam')):
    team_lookup = map_team[['TeamNameJC', 'TeamName']].rename(columns={'TeamName': team_col})
    odds = odds.merge(team_lookup, 'inner', left_on=jc_col, right_on='TeamNameJC')
    # The join key from the lookup side is no longer needed.
    odds = odds.drop(columns=['TeamNameJC'])

## Download recent stats

In [74]:
# Download latest results in current season
# (the archive folder is SEASON followed by SEASON+1, e.g. '2223' for SEASON='22';
#  needs the `wget` command and network access; -q keeps it silent)
!wget -q https://www.football-data.co.uk/mmz4281/{SEASON}{int(SEASON)+1}/data.zip -O raw_data/data.zip

# Unzip to folder
# (-o overwrites files left by a previous run without prompting; needs `unzip`)
!unzip -q -o raw_data/data.zip -d raw_data/{SEASON}

In [20]:
# Columns required from every results file; read_csv raises for a file that
# lacks any of them, and such files are skipped below.
usecols = ['Div', 'Date', 'HomeTeam', 'AwayTeam', 'HC', 'AC', 'FTHG', 'FTAG', 'HS', 'AS', 'HST', 'AST']
dtype = {'HC':'float', 'AC':'float'}
parse_dates = ['Date']

# Only sub-folders of raw_data named after one of these seasons are read.
seasons = [SEASON]

dfs = []

for folder in sorted(path_raw.iterdir()):
    if not (folder.is_dir() and folder.name in seasons):
        continue
    for file in sorted(folder.glob('*.csv')):
        try:
            df = pd.read_csv(file, usecols=usecols, dtype=dtype, parse_dates=parse_dates, dayfirst=True)
        except Exception as err:
            # Still best effort (an unreadable file must not stop the run), but
            # report what was skipped so missing data is visible. Catching
            # Exception rather than using a bare except lets KeyboardInterrupt
            # and SystemExit propagate.
            print(f'Skipping {file}: {err!r}')
            continue
        # Tag each row with the season folder it came from.
        df['Season'] = folder.name
        dfs.append(df)

In [21]:
# Stack all files, drop rows with any missing value, and order the result
# by division, date and home team.
df_season = (
    pd.concat(dfs)
    .dropna()
    .sort_values(['Div', 'Date', 'HomeTeam'])
    .reset_index(drop=True)
)

In [22]:
# Historical results, reduced to the season just before SEASON.
df_hist = (
    pd.read_csv(path_data/'data.csv', dtype={'HC':'float', 'AC':'float'}, parse_dates=['Date'])
    .query(f'Season == {int(SEASON)-1}')
    .reset_index(drop=True)
)

In [23]:
# Put the previous season's rows (same columns, same order) ahead of the
# current season's rows.
hist_aligned = df_hist.loc[:, df_season.columns]
df_season = pd.concat([hist_aligned, df_season])

In [24]:
# Features from past games, computed separately for home and for away fixtures.
stats_home_away = [
    'FTHG', 'HS', 'HST', 'HC',
    'FTAG', 'AS', 'AST', 'AC',
]
df_home, df_away = joinLastGamesStatsHomeAway(df_season, stats_home_away)

In [25]:
# Features from past games, per team: "for" and "against".
# Each entry is a triple; presumably (home column, away column, name of the
# combined stat) - verify against joinLastGamesStatsForAgainst.
stat_triples = [
    ('FTHG', 'FTAG', 'FTG'),
    ('HS', 'AS', 'S'),
    ('HST', 'AST', 'ST'),
    ('HC', 'AC', 'C'),
]
df_for, df_against = joinLastGamesStatsForAgainst(df_season, stat_triples)

In [26]:
def _latest_avgs(frame, team_col):
    "Most recent row of the 'Avg' columns for each team in `team_col`"
    # Order by team, then date, so the last row of a group is its latest game.
    ordered = frame.sort_values([team_col, 'Date']).reset_index(drop=True)
    avg_cols = ordered.columns[ordered.columns.str.contains('Avg')]
    return ordered.groupby(team_col)[avg_cols].last().reset_index()


df_home = _latest_avgs(df_home, 'HomeTeam')
df_away = _latest_avgs(df_away, 'AwayTeam')

In [27]:
# Left joins keep every match even when a team has no history rows.
odds = pd.merge(odds, df_home, how='left', on='HomeTeam')
odds = pd.merge(odds, df_away, how='left', on='AwayTeam')

In [28]:
# The 'Avg' feature columns of the "for" table.
avg_cols = df_for.columns[df_for.columns.str.contains('Avg')]

# Rename map for the home side: prefix each feature with 'Home', join key -> HomeTeam.
cols_home = dict(zip(avg_cols, 'Home'+avg_cols))
cols_home.update({'Team':'HomeTeam'})

# Rename map for the away side. Zip over the column index itself rather than
# over the cols_home dict: iterating that dict lines up with the columns only
# by accident (insertion order, plus zip silently dropping the extra 'Team' key).
cols_away = dict(zip(avg_cols, 'Away'+avg_cols))
cols_away.update({'Team':'AwayTeam'})

# Keep the last row per team.
# NOTE(review): unlike df_home/df_away, df_for is not sorted by date here before
# .last() - confirm the helper already returns rows in chronological order.
df_for = df_for.groupby('Team')[avg_cols].last().reset_index()
odds = odds.merge(df_for.rename(columns=cols_home), 'left', 'HomeTeam').merge(df_for.rename(columns=cols_away), 'left', 'AwayTeam')

In [29]:
# The 'Avg' feature columns of the "against" table.
avg_cols = df_against.columns[df_against.columns.str.contains('Avg')]

# Rename map for the home side: prefix each feature with 'Home', join key -> HomeTeam.
cols_home = dict(zip(avg_cols, 'Home'+avg_cols))
cols_home.update({'Team':'HomeTeam'})

# Rename map for the away side. Zip over the column index itself rather than
# over the cols_home dict: iterating that dict lines up with the columns only
# by accident (insertion order, plus zip silently dropping the extra 'Team' key).
cols_away = dict(zip(avg_cols, 'Away'+avg_cols))
cols_away.update({'Team':'AwayTeam'})

# Keep the last row per team.
# NOTE(review): df_against is not sorted by date here before .last() - confirm
# the helper already returns rows in chronological order.
df_against = df_against.groupby('Team')[avg_cols].last().reset_index()
odds = odds.merge(df_against.rename(columns=cols_home), 'left', 'HomeTeam').merge(df_against.rename(columns=cols_away), 'left', 'AwayTeam')

In [30]:
# Derive date-part columns from 'DateTime' with no name prefix, keeping the
# original column (drop=False). NOTE(review): presumably fastai's add_datepart,
# which modifies `odds` in place - confirm. The trailing ';' suppresses the
# notebook echo of the return value.
add_datepart(odds, 'DateTime', prefix='', drop=False);

In [31]:
# Preview the first five matches, transposed so the many columns read as rows.
display_df(odds.iloc[:5].transpose())

Unnamed: 0,0,1,2,3,4
MatchDay,SUN5,SUN24,SUN25,SUN26,SUN45
Date,2022-02-13 00:00:00,2022-02-13 00:00:00,2022-02-13 00:00:00,2022-02-13 00:00:00,2022-02-14 00:00:00
Time,19:30:00,22:00:00,22:00:00,22:00:00,01:00:00
LeagueJC,Italian Division 1 [ISA],Italian Division 1 [ISA],Italian Division 1 [ISA],Italian Division 1 [ISA],Italian Division 1 [ISA]
HomeTeamJC,AC Milan,Verona,Empoli,Genoa,Sassuolo
AwayTeamJC,Sampdoria,Udinese,Cagliari,Salernitana,Roma
MAINLINE_0,true,false,true,false,false
CHL_LINE_0,10.5,14.5,9.5,10.5,11.5
CHL_H_0,2.0,5.8,1.8,2.3,2.6
CHL_L_0,1.72,1.09,1.9,1.54,1.43


## Load model

In [32]:
# Load the exported learner from disk.
# NOTE(review): if this is fastai's load_learner it unpickles the file, which can
# execute arbitrary code - only load model files from a trusted source; confirm.
learn_bnb = load_learner('models/learn_bnb.pkl')

In [33]:
# Load the saved tabular object; `to.new(odds)` further down reuses it for the
# new matches.
# NOTE(review): the .pkl extension suggests a pickle - same trust caveat as for
# the learner; confirm.
to = load_pandas('models/to.pkl')

In [34]:
def predict(self, row):
    """Predict on a single row given as a Pandas Series.

    Returns a tuple: the decoded inputs and the first prediction.
    """
    # Turn the Series into a one-row frame and build a test dataloader from it.
    single_row = row.to_frame().T
    test_dl = self.dls.test_dl(single_row)
    # Cast the continuous columns to float32 before running the model.
    test_dl.dataset.conts = test_dl.dataset.conts.astype(np.float32)
    inputs, predictions, _ = self.get_preds(dl=test_dl, with_input=True, with_decoded=False)
    # Decode the inputs back through the dataloaders' pipeline.
    decoded = self.dls.decode(tuplify(inputs))
    return decoded, predictions[0]


# Bind the function to this learner instance, replacing its own `predict`.
learn_bnb.predict = MethodType(predict, learn_bnb)

In [35]:
# Create a new tabular object for the upcoming matches from the loaded one,
# then run its processing step on them.
to_tst = to.new(odds)
to_tst.process()
# to_tst.items.head()

In [36]:
# Derive a dataloader for the processed matches from the learner's validation loader.
tst_dl = learn_bnb.dls.valid.new(to_tst)
# Show the rows (up to 999) as a visual check of what goes into the model.
tst_dl.show(max_n=999)

Unnamed: 0,Div,HomeTeam,AwayTeam,FTHGLast5Avg,HSLast5Avg,HSTLast5Avg,HCLast5Avg,FTAGLast5Avg,ASLast5Avg,ASTLast5Avg,ACLast5Avg,HomeFTGForLast5Avg,HomeSForLast5Avg,HomeSTForLast5Avg,HomeCForLast5Avg,AwayFTGForLast5Avg,AwaySForLast5Avg,AwaySTForLast5Avg,AwayCForLast5Avg,HomeFTGAgainstLast5Avg,HomeSAgainstLast5Avg,HomeSTAgainstLast5Avg,HomeCAgainstLast5Avg,AwayFTGAgainstLast5Avg,AwaySAgainstLast5Avg,AwaySTAgainstLast5Avg,AwayCAgainstLast5Avg
0,I1,Milan,Sampdoria,1.4,18.6,5.0,6.8,1.4,11.6,3.8,2.8,2.2,18.8,6.6,5.4,0.6,8.6,1.4,1.8,1.0,10.8,3.8,3.8,1.4,15.8,6.0,8.0
1,I1,Verona,Udinese,1.0,17.4,4.8,7.6,2.0,15.0,5.6,5.0,2.0,17.6,5.2,8.2,1.4,9.4,3.6,3.4,1.4,10.0,3.2,5.8,1.8,12.4,3.4,6.2
2,I1,Empoli,Cagliari,2.0,16.0,5.0,6.4,0.4,9.2,3.2,1.6,1.8,15.0,5.2,4.2,1.0,10.8,3.4,3.6,3.4,19.2,8.6,5.2,1.2,11.8,4.2,3.4
3,I1,Genoa,Salernitana,0.2,7.8,1.6,4.2,0.6,8.0,3.0,3.8,0.2,6.6,1.2,4.8,0.6,7.4,3.0,2.6,1.6,17.2,6.2,5.4,3.4,21.0,9.2,9.0
4,I1,Sassuolo,Roma,1.4,17.4,5.8,5.8,1.8,15.6,5.8,5.0,1.8,17.0,6.4,5.8,2.0,15.2,6.2,6.2,2.0,14.0,4.4,5.0,2.0,13.4,4.2,1.8
5,I1,Atalanta,Juventus,2.4,16.6,6.0,5.2,2.2,14.8,6.0,4.4,1.4,14.6,4.8,6.0,1.8,14.6,3.6,4.0,1.2,7.4,2.6,2.6,0.8,14.2,3.8,4.4
6,I1,Spezia,Fiorentina,1.0,9.4,3.4,3.2,1.0,9.8,3.0,5.0,1.2,10.2,4.0,3.6,2.0,16.4,6.0,6.0,0.6,19.0,4.4,7.6,1.6,7.8,3.0,2.6
7,F1,Monaco,Lorient,2.6,12.8,5.8,5.6,0.4,11.8,3.4,3.8,1.6,14.0,4.8,6.2,1.0,16.2,5.2,5.6,1.2,10.8,4.0,3.4,2.4,10.4,4.0,4.6
8,F1,Clermont,St Etienne,0.6,13.6,4.4,4.8,0.4,9.0,2.2,5.2,0.6,10.8,3.2,4.0,0.4,8.4,3.2,3.8,1.4,13.4,4.8,5.2,1.2,11.4,4.2,5.2
9,F1,Angers,Strasbourg,1.0,8.4,3.0,4.8,2.0,14.6,5.4,3.8,0.6,9.4,3.2,4.2,2.0,16.4,5.8,5.0,1.4,11.8,4.8,2.8,1.4,8.8,3.2,3.4


In [37]:
# get_preds returns a pair; only its first element (the model output) is used.
raw_outputs = learn_bnb.get_preds(dl=tst_dl)
pred, _ = raw_outputs

In [38]:
# Split the model output: the first three columns become probabilities through
# a softmax, the remaining one becomes total_count as 1 / softplus(x).
logits, raw_count = pred[:, :3], pred[:, 3:]
probs = torch.softmax(logits, dim=-1)
total_count = 1 / F.softplus(raw_count)

In [39]:
# Store the three probabilities and total_count next to each match.
pred_block = torch.cat((probs, total_count), dim=-1)
odds[cols_pred] = pred_block

In [40]:
# One frame per line slot (_0, _1, _2): match columns, that slot's four fields
# renamed to the generic names, and the prediction columns.
odds0, odds1, odds2 = (
    odds[cols_match + slot_cols + cols_pred].rename(columns=dict(zip(slot_cols, cols_odds)))
    for slot_cols in (cols_odds0, cols_odds1, cols_odds2)
)

In [41]:
# Stack the slots into one row per (match, line); rows with any missing value
# (for example an unused slot) are dropped.
stacked = pd.concat([odds0, odds1, odds2])
odds = stacked.dropna().reset_index(drop=True)
# The flag arrives as the strings 'true'/'false'; turn it into a real boolean.
odds['MAINLINE'] = odds['MAINLINE'].eq('true')

In [42]:
# Peek at the first ten (match, line) rows.
odds.iloc[:10]

Unnamed: 0,MatchDay,Date,Time,LeagueJC,HomeTeamJC,AwayTeamJC,MAINLINE,CHL_LINE,CHL_H,CHL_L,prob_0,prob_1,prob_2,total_count
0,SUN5,2022-02-13,19:30:00,Italian Division 1 [ISA],AC Milan,Sampdoria,True,10.5,2.0,1.72,0.841879,0.097874,0.060247,63.769421
1,SUN24,2022-02-13,22:00:00,Italian Division 1 [ISA],Verona,Udinese,False,14.5,5.8,1.09,0.816663,0.107582,0.075754,43.525993
2,SUN25,2022-02-13,22:00:00,Italian Division 1 [ISA],Empoli,Cagliari,True,9.5,1.8,1.9,0.811614,0.106871,0.081515,43.559822
3,SUN26,2022-02-13,22:00:00,Italian Division 1 [ISA],Genoa,Salernitana,False,10.5,2.3,1.54,0.814481,0.114824,0.070696,43.280457
4,SUN45,2022-02-14,01:00:00,Italian Division 1 [ISA],Sassuolo,Roma,False,11.5,2.6,1.43,0.820584,0.076629,0.102788,52.110069
5,SUN55,2022-02-14,03:45:00,Italian Division 1 [ISA],Atalanta,Juventus,False,13.5,5.1,1.12,0.790586,0.127494,0.08192,33.306583
6,MON8,2022-02-15,03:45:00,Italian Division 1 [ISA],Spezia,Fiorentina,False,12.5,4.55,1.15,0.773642,0.079393,0.146965,30.771444
7,SUN6,2022-02-13,20:00:00,French Division 1 [FFL],Monaco,Lorient,True,9.5,1.9,1.8,0.817027,0.09956,0.083413,46.204102
8,SUN27,2022-02-13,22:00:00,French Division 1 [FFL],Clermont,St. Etienne,True,9.5,1.9,1.8,0.796928,0.115523,0.087549,34.365765
9,SUN28,2022-02-13,22:00:00,French Division 1 [FFL],Angers SCO,Strasbourg,False,10.5,2.55,1.45,0.804246,0.105031,0.090723,37.186325


In [43]:
# Grid of (count, count) pairs on which each fitted distribution is evaluated.
# It has to contain every pair whose sum lies below the largest line; a fixed
# 0..14 grid misses pairs such as (15, 0) for a 15.5 line and so understates
# the "low" probability. 15 stays the minimum size, so smaller lines use the
# same grid as before.
# NOTE(review): assumes log_prob scores each grid point independently, so a
# larger grid does not change the values of the points already present.
line_max = odds['CHL_LINE'].max()
grid_size = 15 if pd.isna(line_max) else max(15, int(np.ceil(line_max)))
counts = torch.arange(0., float(grid_size))
value = torch.cartesian_prod(counts, counts)  # loop-invariant: built once
totals = value.sum(-1)

prob_hilo = []

for p0, p1, p2, tc, line in zip(odds['prob_0'], odds['prob_1'], odds['prob_2'], odds['total_count'], odds['CHL_LINE']):
    # Row-local names, so the notebook-level `probs` / `total_count` tensors
    # from the prediction cell are not overwritten.
    row_probs = torch.tensor((p0, p1, p2), device='cpu')
    row_total_count = torch.tensor(tc, device='cpu')

    bnb_corner = BivariateNegativeBinomial(total_count=row_total_count, probs=row_probs)
    corner = bnb_corner.log_prob(value).exp()

    # "Low" wins when the combined count is below the line; "high" is the complement.
    prob_lo = corner[totals < line].sum()
    prob_hi = 1 - prob_lo

    prob_hilo.append([prob_hi.item(), prob_lo.item()])

  return _VF.cartesian_prod(tensors)  # type: ignore[attr-defined]


In [44]:
# Attach the [prob_hi, prob_lo] pairs collected in prob_hilo; they were appended
# while iterating over the rows of `odds`, so the order matches.
odds[['prob_hi', 'prob_lo']] = prob_hilo

In [45]:
def _kelly_fraction(prob, price):
    "Kelly fraction (prob * price - 1) / (price - 1) for win probability `prob` at odds `price`"
    return (prob * price - 1) / (price - 1)


odds['kelly_hi'] = _kelly_fraction(odds['prob_hi'], odds['CHL_H'])
odds['kelly_lo'] = _kelly_fraction(odds['prob_lo'], odds['CHL_L'])

In [46]:
# Keep the larger of the two Kelly fractions, but only when it is positive;
# otherwise there is nothing to bet on and the value is NaN.
kelly_hi, kelly_lo = odds['kelly_hi'], odds['kelly_lo']
has_edge = np.maximum(kelly_hi, kelly_lo) > 0
better_fraction = np.where(kelly_hi > kelly_lo, kelly_hi, kelly_lo)
odds['kelly'] = np.where(has_edge, better_fraction, np.nan)

In [47]:
# Name the side with the larger Kelly fraction, or None when neither is positive.
kelly_hi, kelly_lo = odds['kelly_hi'], odds['kelly_lo']
has_edge = np.maximum(kelly_hi, kelly_lo) > 0
better_side = np.where(kelly_hi > kelly_lo, 'High', 'Low')
odds['bet'] = np.where(has_edge, better_side, None)

In [48]:
# Best opportunities first, then renumber the rows.
ranked = odds.sort_values(by='kelly', ascending=False)
odds = ranked.reset_index(drop=True)

In [49]:
# Rate main-line rows by Kelly fraction: above 0.1 '$', above 0.2 '$$',
# above 0.3 '$$$'. Rows off the main line, or at or below 0.1, get None.
is_mainline = odds['MAINLINE']==True
rating = np.where(odds['kelly']>0.1, '$', None)
rating = np.where(odds['kelly']>0.2, '$$', rating)
rating = np.where(odds['kelly']>0.3, '$$$', rating)
odds['selected'] = np.where(is_mainline, rating, None)

In [50]:
# Remove the raw model outputs and the per-side Kelly columns from the report.
working_cols = [*cols_pred, 'kelly_hi', 'kelly_lo']
odds = odds.drop(columns=working_cols)

In [51]:
# Rows that have both a bet side and a rating.
is_pick = odds['bet'].notna() & odds['selected'].notna()
odds.loc[is_pick]

Unnamed: 0,MatchDay,Date,Time,LeagueJC,HomeTeamJC,AwayTeamJC,MAINLINE,CHL_LINE,CHL_H,CHL_L,prob_hi,prob_lo,kelly,bet,selected
0,SUN5,2022-02-13,19:30:00,Italian Division 1 [ISA],AC Milan,Sampdoria,True,10.5,2.0,1.72,0.632713,0.367287,0.265426,High,$$
2,SUN29,2022-02-13,22:00:00,French Division 1 [FFL],Brest,Troyes,True,9.5,1.98,1.73,0.611613,0.388387,0.215299,High,$$
7,SUN44,2022-02-14,00:30:00,German Division 1 [GSL],Hoffenheim,Bielefeld,True,10.5,2.1,1.65,0.328967,0.671033,0.16493,Low,$
8,SUN45,2022-02-14,01:00:00,Italian Division 1 [ISA],Sassuolo,Roma,True,10.5,2.05,1.68,0.570673,0.429327,0.161791,High,$
9,SUN35,2022-02-13,23:15:00,Spanish Division 1 [SFL],Levante,Betis,True,9.5,2.07,1.67,0.566559,0.433441,0.161475,High,$
15,SUN27,2022-02-13,22:00:00,French Division 1 [FFL],Clermont,St. Etienne,True,9.5,1.9,1.8,0.383456,0.616544,0.137225,Low,$
17,SUN55,2022-02-14,03:45:00,Italian Division 1 [ISA],Atalanta,Juventus,True,9.5,1.88,1.82,0.391177,0.608823,0.131777,Low,$
18,SUN21,2022-02-13,22:00:00,Eng Premier [EPL],Burnley,Liverpool,True,11.5,2.1,1.65,0.343291,0.656709,0.128568,Low,$


In [52]:
# Every bettable line of the matches that have at least one rated pick.
is_pick = odds['bet'].notna() & odds['selected'].notna()
picked_days = odds.loc[is_pick, 'MatchDay']
display_df(odds[odds['MatchDay'].isin(picked_days) & odds['bet'].notna()])

Unnamed: 0,MatchDay,Date,Time,LeagueJC,HomeTeamJC,AwayTeamJC,MAINLINE,CHL_LINE,CHL_H,CHL_L,prob_hi,prob_lo,kelly,bet,selected
0,SUN5,2022-02-13,19:30:00,Italian Division 1 [ISA],AC Milan,Sampdoria,True,10.5,2.0,1.72,0.632713,0.367287,0.265426,High,$$
1,SUN44,2022-02-14,00:30:00,German Division 1 [GSL],Hoffenheim,Bielefeld,False,13.5,4.25,1.17,0.108636,0.891364,0.252328,Low,
2,SUN29,2022-02-13,22:00:00,French Division 1 [FFL],Brest,Troyes,True,9.5,1.98,1.73,0.611613,0.388387,0.215299,High,$$
3,SUN5,2022-02-13,19:30:00,Italian Division 1 [ISA],AC Milan,Sampdoria,False,11.5,2.5,1.46,0.52624,0.47376,0.2104,High,
4,SUN44,2022-02-14,00:30:00,German Division 1 [GSL],Hoffenheim,Bielefeld,False,11.5,2.65,1.42,0.23707,0.76293,0.198477,Low,
5,SUN29,2022-02-13,22:00:00,French Division 1 [FFL],Brest,Troyes,False,10.5,2.55,1.45,0.499733,0.500267,0.17698,High,
6,SUN55,2022-02-14,03:45:00,Italian Division 1 [ISA],Atalanta,Juventus,False,13.5,5.1,1.12,0.088588,0.911412,0.173176,Low,
7,SUN44,2022-02-14,00:30:00,German Division 1 [GSL],Hoffenheim,Bielefeld,True,10.5,2.1,1.65,0.328967,0.671033,0.16493,Low,$
8,SUN45,2022-02-14,01:00:00,Italian Division 1 [ISA],Sassuolo,Roma,True,10.5,2.05,1.68,0.570673,0.429327,0.161791,High,$
9,SUN35,2022-02-13,23:15:00,Spanish Division 1 [SFL],Levante,Betis,True,9.5,2.07,1.67,0.566559,0.433441,0.161475,High,$


In [53]:
# Write the report to a file named after today's date, floats rounded to two decimals.
report_file = path_output/f'odds-{datetime.now().strftime("%Y-%m-%d")}.csv'
odds.to_csv(report_file, float_format='%.2f', index=False)

## END