### Simulation for in-play 'bookmaking'
 - laying all horses at % lesser than bsp (null model - e.g. all 10% less than bsp)
 - determining if matched (on previous data)
 - calculating returns (on previous data
 - Using two other staking strategies after this (based on average / quartile decrease)

#### 0 : Importing packages

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pymysql
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#### 1 : Importing data

In [2]:
connection = pymysql.connect(host='localhost', user='root', passwd ='xboxpass32', database = 'smartform')
cursor = connection.cursor()

query = ''' SELECT race_id, runner_id, prize_money, num_runners, historic_betfair_win_prices.bsp, historic_betfair_win_prices.inplay_min, inplay_traded
            FROM historic_races
            JOIN historic_runners USING (race_id) JOIN historic_betfair_win_prices ON race_id=sf_race_id
                 AND
                 runner_id = sf_runner_id
            WHERE(CAST(historic_races.meeting_date AS Datetime) >= '2015-01-01')
            ORDER BY race_id, runner_id
        '''
cursor.execute(query)
rows = cursor.fetchall()

start = query.find('SELECT') + 7
end = query.find('\n            FROM', start)
names = query[start:end].split(', ')

df = pd.DataFrame(list(rows), columns=names)
del rows

print('No. Runners : ', len(df.index), '\nNo. Races : ', df['race_id'].nunique())
df.head(3)

603787

No. Runners :  603787 
No. Races :  64243


Unnamed: 0,race_id,runner_id,prize_money,num_runners,historic_betfair_win_prices.bsp,historic_betfair_win_prices.inplay_min,inplay_traded
0,494609,1060410,,12,12.5,10.0,8680
1,494609,2055946,76862.4,12,8.06,3.1,17662
2,494609,2061900,19359.0,12,18.91,10.0,5080


#### 2 : Data Preprocessing

In [3]:
# removing nulls
df = df.loc[df['historic_betfair_win_prices.bsp'] != 0]
df = df.loc[df['historic_betfair_win_prices.inplay_min'] != 0]
# setting data types
df['historic_betfair_win_prices.bsp'] = df['historic_betfair_win_prices.bsp'].astype(float)
df['historic_betfair_win_prices.inplay_min'] = df['historic_betfair_win_prices.inplay_min'].astype(float)
# data processing for 'win'
df['win'] = np.where(df['prize_money'] == df.groupby('race_id')['prize_money'].transform(np.max), 1, 0)
df.drop('prize_money', axis = 1, inplace = True)
# renaming
df.rename(columns={'num_runners' : 'n_runners',
                   'historic_betfair_win_prices.bsp' : 'bsp',
                   'historic_betfair_win_prices.inplay_min' : 'ip_min',
                   'inplay_traded' : 'ip_traded'}, inplace = True)



#### 3 : Creating variables

In [4]:
df['bsp_prob'] = round(np.power(df['bsp'].astype(float), -1), 5)
df['ip_prob'] = round(np.power(df['ip_min'].astype(float), -1), 5)
df['dif_prob'] = df['ip_prob'] - df['bsp_prob']

#### 4 : Finding unique race-ids

In [5]:
race_ids = df['race_id'].unique()
race_ids_low = df.loc[df['n_runners'] <5]['race_id'].unique()

#### 5.0 Concept
Identifying the number fo horse that have a decrease in price during a race.

In [6]:
# viewing % of runners that decrease
prices_df = df.copy()
prices_df['ip_dec'] = np.where(prices_df['bsp'] > df['ip_min'], 1, 0)
prices_df['ip_dec'].value_counts(normalize=True)

1    0.890415
0    0.109585
Name: ip_dec, dtype: float64

In [7]:
# viewing % runners that decrease based on their bsp probability
prices_df['bsp_prob_bin'] = pd.cut(prices_df['bsp_prob'], np.arange(0,1,0.1))
decreases = prices_df.groupby('bsp_prob_bin')['ip_dec'].agg(['count', 'sum']).reset_index()
decreases['%'] = round(decreases['sum'] / decreases['count'], 4) * 100
decreases

Unnamed: 0,bsp_prob_bin,count,sum,%
0,"(0.0, 0.1]",374202,324354,86.68
1,"(0.1, 0.2]",137091,125322,91.42
2,"(0.2, 0.3]",51770,48674,94.02
3,"(0.3, 0.4]",21320,20448,95.91
4,"(0.4, 0.5]",9149,8854,96.78
5,"(0.5, 0.6]",5130,5009,97.64
6,"(0.6, 0.7]",2450,2408,98.29
7,"(0.7, 0.8]",1143,1135,99.3
8,"(0.8, 0.9]",456,450,98.68


#### 5 Testing strategies in markets 
As shown above, runners are more likely than not experience a decrease in price during a race.(lower odds runners experiencing price decreases more often - likely as they are more likely to win)

- 5.1 LAY : Laying at a % decrease with optimal stakes. Focus : 'close favourite' markets
- 5.2 BACK LAY : Backing at BSP & Laying as close to IP_min. Focus : all markets

### 5.1 

In [9]:
# finding races with two short favs ( < 3.0)
race_vc = df.loc[df['bsp'] < 3.0]['race_id'].value_counts
race_counts = pd.concat((race_vc().rename('Count'),),axis='columns',)
race_ids = race_counts.loc[race_counts['Count'] == 2].index
short_df = df.loc[df['race_id'].isin(race_ids)].loc[df['bsp'] < 3.0] 
short_df.head(2) # 2772

Unnamed: 0,race_id,runner_id,n_runners,bsp,ip_min,ip_traded,win,bsp_prob,ip_prob,dif_prob
49,564037,1452782,6,2.53,1.05,40624,0,0.39526,0.95238,0.55712
53,564037,2003639,6,2.48,2.46,12120,0,0.40323,0.4065,0.00327


In [10]:
# LAY - only calculating lay book % with 10% decrease in prob and optimal staking. race by race
short_df['lay_prob'] = short_df['bsp_prob'] + 0.01 # <- VARY HERE
short_df['lay_prob'] = np.where(short_df['lay_prob'] > 1.0, 1.0, short_df['lay_prob'])
# short_df['lay_price'] = np.power(short_df['lay_prob'], -1)
# short_df['lay_price'] = np.where(short_df['lay_price'] == 1.0, 1.01, short_df['lay_price']) # is this necessary?
short_df['lay_price'] = 1.25
short_df['lp_book'] = round(short_df.groupby('race_id')['lay_price'].transform(lambda x: sum(np.power(x, -1))).astype(float), 2)
short_df['lay_size'] = (short_df['lay_prob'] / short_df['lp_book']) * 20 # £20
short_df['match'] = np.where(short_df['ip_min'] < short_df['lay_price'], 1, 0)
short_df['pl'] = np.where(short_df['match'] == 0, 0,
                              np.where(short_df['win'] == 0, short_df['lay_size'] * 0.95, -(short_df['lay_size'] * (short_df['lay_price'] -1)))
                             )
short_df['pl'].sum()
# Result : unprofitable
# Believe due to price/match trade-off. As prices decrease to gain a profiatble book - not enough matches are made.

-710.1545187500001

### 5.2 

In [23]:
# BACK LAY - Backing at BSP and laying at % lower. (optimal staking?). runner by runner. short races 
all_df = df.copy()
all_df.head()

Unnamed: 0,race_id,runner_id,n_runners,bsp,ip_min,ip_traded,win,bsp_prob,ip_prob,dif_prob
0,494609,1060410,12,12.5,10.0,8680,0,0.08,0.1,0.02
1,494609,2055946,12,8.06,3.1,17662,0,0.12407,0.32258,0.19851
2,494609,2061900,12,18.91,10.0,5080,0,0.05288,0.1,0.04712
3,494609,2065439,12,2.81,1.01,474988,1,0.35587,0.9901,0.63423
4,494609,2066092,12,26.48,10.0,8572,0,0.03776,0.1,0.06224


In [30]:
# take for example first (as comparison) : result if backing at BSP and laying at the exact minimum price in play
BACK_STAKE = 20
LAY_STAKE = 20

# back bets (all runners at bsp) 
all_df['back_size'] = BACK_STAKE
all_df['back_price'] = all_df['bsp']
all_df['back_match'] = np.where(all_df['bsp'] >= all_df['back_price'], 1, 0)
all_df['back_net_pl'] = np.where(all_df['back_match'] == 0,
                             0,
                             np.where(all_df['win'] == 0,
                                     -all_df['back_size'],
                                      (all_df['back_size'] * all_df['back_price'])))
all_df['back_real_pl'] = np.where(all_df['back_net_pl'] > 0,
                                  (all_df['back_net_pl'] - all_df['back_size']) * 0.95,
                                  all_df['back_net_pl'])

# lay bets (all runners at ip_min)
all_df['lay_size'] = LAY_STAKE
all_df['lay_price'] = all_df['ip_min']
all_df['lay_match'] = np.where(all_df['ip_min'] <= all_df['lay_price'], 1, 0)
all_df['lay_net_pl'] = np.where(all_df['lay_match'] == 0, 0,
                                np.where(all_df['win'] == 0, all_df['lay_size'],
                                         -(all_df['lay_size'] * (all_df['lay_price'] -1)))
                               )
all_df['lay_real_pl'] = np.where(all_df['lay_net_pl'] > 0,
                                 (all_df['lay_size']) * 0.95,
                                  all_df['lay_net_pl'])

all_df['total_real_pl'] = all_df['back_real_pl'] + all_df['lay_real_pl']                                  

In [38]:
all_df.loc[all_df['race_id'].isin(race_ids_low[2:3])]
all_df.loc[all_df['race_id'].isin(race_ids_low[2:3])]['total_real_pl'].sum()
print('If backing at BSP & laying at excatly IP_MIN : £', all_df['total_real_pl'].sum())

Unnamed: 0,race_id,runner_id,n_runners,bsp,ip_min,ip_traded,win,bsp_prob,ip_prob,dif_prob,...,back_price,back_match,back_net_pl,back_real_pl,lay_size,lay_price,lay_match,lay_net_pl,lay_real_pl,total_real_pl
88,564045,1528716,4,9.0,2.0,71033,0,0.11111,0.5,0.38889,...,9.0,1,-20.0,-20.0,20,2.0,1,20.0,19.0,-1.0
89,564045,1614504,4,5.0,3.85,20681,0,0.2,0.25974,0.05974,...,5.0,1,-20.0,-20.0,20,3.85,1,20.0,19.0,-1.0
90,564045,1702372,4,5.96,3.6,26880,0,0.16779,0.27778,0.10999,...,5.96,1,-20.0,-20.0,20,3.6,1,20.0,19.0,-1.0
91,564045,1836947,4,1.85,1.01,243429,1,0.54054,0.9901,0.44956,...,1.85,1,37.0,16.15,20,1.01,1,-0.2,-0.2,15.95


12.95

If backing at BSP & laying at excatly IP_MIN : £ 5524280.73


In [95]:
### devising a hedging calculator...
def hedging(df):
    '''
    Function applies hedging to identify lay_size
    Assumption : commission = 5%
    '''
    return (df['back_price'] * df['back_size']) / df['lay_price'] * (1 - 0.05 / 2)

In [93]:
hedging(50, 2, 1.5, 0.00)

66.66666666666667

In [214]:
gef_df = df.copy()
gef_df = gef_df.loc[gef_df['bsp'] < 1.5]

BACK_STAKE = 20
# back bets (all runners at bsp) 
gef_df['back_size'] = BACK_STAKE
gef_df['back_price'] = gef_df['bsp']
gef_df['back_match'] = np.where(gef_df['bsp'] >= gef_df['back_price'], 1, 0)
gef_df['back_net_pl'] = np.where(gef_df['back_match'] == 0,
                             0,
                             np.where(gef_df['win'] == 0,
                                     -gef_df['back_size'],
                                      (gef_df['back_size'] * gef_df['back_price'])))
gef_df['back_real_pl'] = np.where(gef_df['back_net_pl'] > 0,
                                  (gef_df['back_net_pl'] - gef_df['back_size']) * 0.95,
                                  gef_df['back_net_pl'])

# lay bets (all runners at ip_min)

gef_df['lay_prob'] = gef_df['bsp_prob'] + 0.10
gef_df['lay_prob'] = np.where(gef_df['lay_prob'] > 1.0, 1.0, gef_df['lay_prob'])
gef_df['lay_price'] = np.power(gef_df['lay_prob'], -1)
gef_df['lay_price'] = np.where(gef_df['lay_price'] == 1.0, 1.01, gef_df['lay_price']) # is this necessary?
gef_df['lay_size'] = gef_df.apply(hedging, axis = 1)
gef_df['lay_match'] = np.where(gef_df['ip_min'] <= gef_df['lay_price'], 1, 0)
gef_df['lay_net_pl'] = np.where(gef_df['lay_match'] == 0, 0,
                                np.where(gef_df['win'] == 0, gef_df['lay_size'],
                                         -(gef_df['lay_size'] * (gef_df['lay_price'] -1)))
                               )
gef_df['lay_real_pl'] = np.where(gef_df['lay_net_pl'] > 0,
                                 (gef_df['lay_size']) * 0.95,
                                  gef_df['lay_net_pl'])

gef_df['total_real_pl'] = gef_df['back_real_pl'] + gef_df['lay_real_pl']  

In [215]:
# gef_df.loc[gef_df['race_id'].isin(race_ids_low[5:7])][['race_id', 'runner_id', 'back_price', 'back_size',
#                                                        'back_real_pl', 'lay_price', 'lay_size', 'ip_min',
#                                                        'lay_real_pl','total_real_pl']]
gef_df[['race_id', 'runner_id', 'back_price', 'back_size',
                                                       'back_real_pl', 'lay_price', 'lay_size', 'ip_min',
                                                       'lay_real_pl', 'win','total_real_pl']].head(5)
print(gef_df['total_real_pl'].mean(), gef_df['total_real_pl'].sum())

Unnamed: 0,race_id,runner_id,back_price,back_size,back_real_pl,lay_price,lay_size,ip_min,lay_real_pl,win,total_real_pl
54,564038,1425573,1.44,20,-20.0,1.258748,22.307875,1.3,0.0,0,-20.0
101,564047,1923056,1.35,20,6.65,1.189428,22.13248,1.01,-4.19252,1,2.45748
128,564050,2011697,1.44,20,8.36,1.258748,22.307875,1.01,-5.772125,1,2.587875
340,564071,1730466,1.45,20,-20.0,1.266368,22.327637,1.2,21.211255,0,1.211255
399,564078,2057590,1.36,20,6.84,1.197189,22.151891,1.01,-4.368109,1,2.471891


-0.5337429659373931 -1237.7499380088138


In [216]:
gef_df.shape

(2319, 22)

In [None]:
roi vs price dif hedging plot - then could just go one the 'needed price decrease to be profitable'
- need to factor in losses of unmatched though??? - can these be measured ?

In [158]:
# To do :
# - Re-do the lay strategy and only apply commission to the 'net winnings' ? Have I already done this ? - I think so

In [None]:
# Assumptions:
# - can place desired stake on each bet (yet likely as data taken for payouts of £100 + ?)