In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Render the value of every top-level expression statement in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
import seaborn as sns

In [6]:
# Connection settings are read from the environment so that no credentials are
# stored in the notebook. SMARTFORM_DB_PASSWORD must be set (a KeyError is
# raised otherwise); host and user fall back to 'localhost' and 'root'.
connection = pymysql.connect(host=os.environ.get('SMARTFORM_DB_HOST', 'localhost'),
                             user=os.environ.get('SMARTFORM_DB_USER', 'root'),
                             password=os.environ['SMARTFORM_DB_PASSWORD'],
                             database='smartform')

# Single source of truth for the selected columns: the same list builds the
# SELECT clause and labels the DataFrame, so the two cannot drift apart.
names = ['race_id', 'runner_id', 'prize_money', 'num_runners',
         'historic_betfair_win_prices.bsp', 'historic_betfair_win_prices.inplay_min',
         'inplay_traded']

query = ''' SELECT {columns}
            FROM historic_races
            JOIN historic_runners USING (race_id) JOIN historic_betfair_win_prices ON race_id=sf_race_id
                 AND
                 runner_id = sf_runner_id
            WHERE(CAST(historic_races.meeting_date AS Datetime) >= '2015-01-01')
            ORDER BY race_id, runner_id
        '''.format(columns=', '.join(names))

# Fetch every row, then release the cursor and the connection even if the
# query fails part-way through.
try:
    with connection.cursor() as cursor:
        cursor.execute(query)
        rows = cursor.fetchall()
finally:
    connection.close()

# One DataFrame row per fetched record, labelled with the selected column names.
df = pd.DataFrame(list(rows), columns=names)
del rows  # the raw tuples are no longer needed once the frame exists

print('No. Runners : ', len(df.index), '\nNo. Races : ', df['race_id'].nunique())
df.head(3)

603787

No. Runners :  603787 
No. Races :  64243


Unnamed: 0,race_id,runner_id,prize_money,num_runners,historic_betfair_win_prices.bsp,historic_betfair_win_prices.inplay_min,inplay_traded
0,494609,1060410,,12,12.5,10.0,8680
1,494609,2055946,76862.4,12,8.06,3.1,17662
2,494609,2061900,19359.0,12,18.91,10.0,5080


In [7]:
# renaming (done first so the rest of the cell can use the short names)
df = df.rename(columns={'num_runners' : 'n_runners',
                        'historic_betfair_win_prices.bsp' : 'bsp',
                        'historic_betfair_win_prices.inplay_min' : 'ip_min',
                        'inplay_traded' : 'ip_traded'})

# setting data types
df = df.astype({'bsp': float, 'ip_min': float})

# data processing for 'win': flag the runner(s) whose prize money equals the
# highest prize money in their race. This is computed BEFORE any rows are
# removed, so that dropping a runner with an unusable price cannot promote a
# lower-placed runner to "winner".
race_top_prize = df.groupby('race_id')['prize_money'].transform('max')
df['win'] = np.where(df['prize_money'] == race_top_prize, 1, 0)

# removing unusable prices: rows whose BSP or in-play minimum price is zero or
# missing (a missing price would otherwise survive as NaN and propagate into
# the probabilities derived from it).
has_prices = (df['bsp'].notna() & df['ip_min'].notna()
              & (df['bsp'] != 0) & (df['ip_min'] != 0))
df = df.loc[has_prices].drop(columns='prize_money')




In [8]:
# Distinct race ids overall, and those appearing on rows with n_runners below 5.
race_ids = df['race_id'].unique()
race_ids_low = df.loc[df['n_runners'].lt(5), 'race_id'].unique()

In [10]:
# Work on a copy holding only the race id and the two prices.
prices_df = df[['race_id', 'bsp', 'ip_min']].copy()

# Reciprocal of each price, rounded to 5 decimal places, and the difference
# between the in-play-minimum value and the BSP value.
prices_df['bsp_prob'] = np.power(prices_df['bsp'].astype(float), -1).round(5)
prices_df['ip_prob'] = np.power(prices_df['ip_min'].astype(float), -1).round(5)
prices_df['dif_prob'] = prices_df['ip_prob'].sub(prices_df['bsp_prob'])

# Bucket the BSP-based value using edges spaced 0.05 apart, starting at 0.
prices_df['bsp_binned'] = pd.cut(prices_df['bsp_prob'], bins=np.arange(0.00, 1.05, 0.05))

def percentile(n):
    """Return a function that computes the n-th percentile of its argument.

    The returned function is named 'percentile_<n>', so anything that labels
    results by function name gets a distinct label for each n.
    """
    label = 'percentile_%s' % n

    def _nth_percentile(values):
        return np.percentile(values, n)

    _nth_percentile.__name__ = label
    return _nth_percentile

# Percentiles of dif_prob within each bsp_binned group. Aggregating the single
# selected column with a list of named functions yields one flat column per
# function; reset_index then turns the group key into a 'bsp_binned' column.
prices_groupby = (
    prices_df.groupby('bsp_binned')['dif_prob']
    .agg([percentile(level) for level in (10, 25, 50, 75, 90, 99)])
    .reset_index()
)

In [14]:
# Inspect the column labels of the per-bin percentile table.
prices_groupby.columns

Index(['bsp_binned', 'percentile_10', 'percentile_25', 'percentile_50',
       'percentile_75', 'percentile_90', 'percentile_99'],
      dtype='object')

In [44]:
# test run
# Take the needed columns for a small sample of races (race_ids_low[1:3]) in one
# step, as an explicit copy so the column assignments below modify test_df
# itself rather than a slice of df.
test_df = df.loc[df['race_id'].isin(race_ids_low[1:3]),
                 ['race_id', 'runner_id', 'bsp', 'ip_min', 'win']].copy()

# back_price + size
test_df['back_price'] = test_df['bsp'] # backing at BSP
test_df['back_size'] = 20 # £20


# lay_price
# Lookup from BSP-probability bin to the 25th-percentile change for that bin.
# NOTE(review): the original comment here said "median change" although the
# column used is 'percentile_25' — confirm which percentile is intended.
prices_dict = dict(zip(prices_groupby['bsp_binned'], prices_groupby['percentile_25']))
test_df['bsp_prob'] = round(np.power(test_df['bsp'].astype(float), -1), 5) # bsp_prob
test_df['bsp_binned'] = pd.cut(test_df['bsp_prob'], np.arange(0.00, 1.05, 0.05)) # granularity of bin
test_df["bsp_binned"] = test_df["bsp_binned"].apply(lambda x: x.mid) # converting 'bin' class to midpoint
# NOTE(review): the lookup values are bin midpoints while prices_dict is built
# from prices_groupby['bsp_binned'] — confirm the two match as intended.
test_df['bsp_prob_add'] = test_df['bsp_binned'].map(prices_dict).fillna(0) # fill with zero - no change
test_df['lay_prob'] = test_df['bsp_prob'] + test_df['bsp_prob_add']
# Cap lay_prob at 1.0 and replace an exact 0 with 0.01 before inverting it.
test_df['lay_prob'] = np.where(test_df['lay_prob'] > 1.0, 1.0, test_df['lay_prob'])
test_df['lay_prob'] = np.where(test_df['lay_prob'] == 0, 0.01, test_df['lay_prob'])
test_df['lay_price'] = np.power(test_df['lay_prob'], -1)
# A lay price of exactly 1.0 is replaced with 1.01.
test_df['lay_price'] = np.where(test_df['lay_price'] == 1.0, 1.01, test_df['lay_price'])

# lay_size (hedging)
def hedging(df, commission=0.05):
    '''
    Function applies hedging to identify lay_size.

    Parameters
    ----------
    df : mapping, row or DataFrame
        Anything indexable by 'back_price', 'back_size' and 'lay_price'.
    commission : float, default 0.05
        Commission rate; the default reproduces the previously hard-coded 5%.

    Returns
    -------
    back_price * back_size / lay_price, scaled by (1 - commission / 2).

    TODO(review): the original author marked this formula as "needs fixing";
    the (1 - commission / 2) scaling is kept exactly as it was.
    '''
    return (df['back_price'] * df['back_size']) / df['lay_price'] * (1 - commission / 2)

# hedging only does column arithmetic, so it can be called on the whole frame
# at once instead of row by row through apply(axis=1).
test_df['lay_size'] = hedging(test_df)

In [45]:
# Preview the first rows of the test sample with the derived back/lay columns.
test_df.head()

Unnamed: 0,race_id,runner_id,bsp,ip_min,win,back_price,back_size,bsp_prob,bsp_binned,bsp_prob_add,lay_prob,lay_price,lay_size
78,564043,718620,6.48,3.55,0,6.48,20,0.15432,0.175,0.03167,0.18599,5.376633,23.501696
79,564043,1411366,3.85,1.01,1,3.85,20,0.25974,0.275,0.05729,0.31703,3.154276,23.801027
80,564043,1493226,4.2,2.02,0,4.2,20,0.2381,0.225,0.04253,0.28063,3.563411,22.983597
81,564043,1627990,2.86,2.78,0,2.86,20,0.34965,0.325,0.068023,0.417673,2.39422,23.293595
88,564045,1528716,9.0,2.0,0,9.0,20,0.11111,0.125,0.02091,0.13202,7.57461,23.16951


In [46]:
result_df = test_df[['race_id','runner_id','back_price','bsp','back_size','lay_price','lay_size','ip_min', 'win']].copy()

# Share of winnings kept after the 5% commission applied to each winning bet.
net_of_commission = 0.95

# back_match: 1 when BSP is at least the requested back price, else 0
result_df['back_match'] = np.where(result_df['bsp'] >= result_df['back_price'], 1, 0)

# lay_match: 1 when the in-play minimum price is at or below the lay price, else 0
result_df['lay_match'] = np.where(result_df['ip_min'] <= result_df['lay_price'], 1, 0)

# back payout:
#   unmatched -> 0
#   loser     -> lose the back stake
#   winner    -> stake * (price - 1), net of commission
result_df['back_payout'] = np.where(result_df['back_match'] == 0,
                                    0,
                                    np.where(result_df['win'] == 0,
                                             -result_df['back_size'],
                                             ((result_df['back_size'] * result_df['back_price'])
                                              - result_df['back_size']) * net_of_commission)
                                    )

# lay payout:
#   unmatched -> 0
#   loser     -> keep the lay stake, net of commission
#   winner    -> pay out lay_size * (lay_price - 1)
result_df['lay_payout'] = np.where(result_df['lay_match'] == 0,
                                   0,
                                   np.where(result_df['win'] == 0,
                                            result_df['lay_size'] * net_of_commission,
                                            -(result_df['lay_size'] * (result_df['lay_price'] - 1)))
                                   )

# Show a short preview only; the full frame is no longer printed in plain text.
result_df.head(4)

Unnamed: 0,race_id,runner_id,back_price,bsp,back_size,lay_price,lay_size,ip_min,win,back_match,lay_match,back_payout,lay_payout
78,564043,718620,6.48,6.48,20,5.376633,23.501696,3.55,0,1,1,-20.0,22.326612
79,564043,1411366,3.85,3.85,20,3.154276,23.801027,1.01,1,1,1,54.15,-51.273973
80,564043,1493226,4.2,4.2,20,3.563411,22.983597,2.02,0,1,1,-20.0,21.834417
81,564043,1627990,2.86,2.86,20,2.39422,23.293595,2.78,0,1,0,-20.0,0.0


In [None]:
# TODO: look into case 79 (index 79, the runner with win == 1 in the output above) - open question