In [1]:
### importing packages

In [3]:
import json
from pathlib import Path, PurePath # To define open and save locations that are cross-compatible between Windows/Linux

import betfairlightweight
from betfairlightweight import filters

import datetime

import pandas as pd
import numpy as np

from bz2 import BZ2File # To unzip the Betfair data from its downloaded format

from betfairlightweight import StreamListener
from betfairlightweight.streaming.stream import MarketStream

In [4]:
### 'logging in'

In [5]:
project_dir = Path.cwd().parents[1]
logins_dir = project_dir / 'api_logins.json'

with open(logins_dir) as f:
    login_dict =  json.load(f)
    
trading = betfairlightweight.APIClient(username=login_dict['my_username'],
                                       password=login_dict['my_password'],
                                       app_key=login_dict['my_app_key'],
                                       certs=login_dict['certs_path'])

trading.login()

<LoginResource>

In [6]:
# returns list of 'data dictionaries'
data_dicts = trading.historic.get_my_data()
data_dicts;

In [7]:
# calculate range of dates for advanced data
adv_range = [d['forDate'] for d in data_dicts if d['plan'] == 'Advanced Plan']

# calculate range of dates for pro data
pro_range = [d['forDate'] for d in data_dicts if d['plan'] == 'Pro Plan']

In [8]:
# find min date for adv_data
adv_min_date = datetime.datetime.strptime(min(adv_range), '%Y-%m-%dT%H:%M:%S')

# find max data for adv data
def last_day_of_month(any_day):
    next_month = any_day.replace(day=28) + datetime.timedelta(days=4)  
    return next_month - datetime.timedelta(days=next_month.day)

adv_max_temp = datetime.datetime.strptime(max(adv_range), '%Y-%m-%dT%H:%M:%S')
adv_max_date = last_day_of_month(adv_max_temp) 

In [9]:
# find min date for pro_data
pro_min_date = datetime.datetime.strptime(min(pro_range), '%Y-%m-%dT%H:%M:%S')

# find max data for pro_data
def last_day_of_month(any_day):
    next_month = any_day.replace(day=28) + datetime.timedelta(days=4)  
    return next_month - datetime.timedelta(days=next_month.day)

pro_max_temp = datetime.datetime.strptime(max(pro_range), '%Y-%m-%dT%H:%M:%S')
pro_max_date = last_day_of_month(pro_max_temp) 

In [10]:
# list files within advanced data range (GB Data)
adv_file_list = trading.historic.get_file_list(
    "Horse Racing",
    "Advanced Plan",
    from_day=adv_min_date.day,
    from_month=adv_min_date.day,
    from_year=adv_min_date.year,
    to_day=adv_max_date.day,
    to_month=adv_max_date.month,
    to_year=adv_max_date.year,
    market_types_collection=["WIN"],
    countries_collection=["GB"],
    file_type_collection=["M"]
)
print("No. items :", len(adv_file_list))

No. items : 1858


In [11]:
# list files within pro data range (US Data)
pro_file_list = trading.historic.get_file_list(
    "Horse Racing",
    "Pro Plan",
    from_day=pro_min_date.day,
    from_month=pro_min_date.month,
    from_year=pro_min_date.year,
    to_day=pro_max_date.day,
    to_month=pro_max_date.month,
    to_year=pro_max_date.year,
    market_types_collection=["WIN"],
    countries_collection=["US"],
    file_type_collection=["M"]
)
print("No. items :", len(pro_file_list))

No. items : 661


In [12]:
# where to store our advanced data
adv_dir = project_dir / 'data' / 'raw' / 'api' / 'advanced'

# where to store our pro data
pro_dir = project_dir / 'data' / 'raw' / 'api' / 'pro'

In [16]:
# downloading advanced data to disk (eta 2 hours using %%time estimate for one download)
adv_file_dirs = [] # list of directories of each download

for file in adv_file_list[0:5]:
    download = trading.historic.download_file(file_path = file, store_directory = adv_dir)
    print(download)
    adv_file_dirs.append(download)

/Users/tombardrick/Documents/projects/betfair/betfair_project/data/raw/api/advanced/1.166897828.bz2
/Users/tombardrick/Documents/projects/betfair/betfair_project/data/raw/api/advanced/1.166897833.bz2
/Users/tombardrick/Documents/projects/betfair/betfair_project/data/raw/api/advanced/1.166897838.bz2
/Users/tombardrick/Documents/projects/betfair/betfair_project/data/raw/api/advanced/1.166897843.bz2
/Users/tombardrick/Documents/projects/betfair/betfair_project/data/raw/api/advanced/1.166897848.bz2


In [17]:
# downloading pro data to disk (eta 50 mins, using %%time estimate)
pro_file_dirs = [] # list of directories of each download

for file in pro_file_list[0:5]:
    download = trading.historic.download_file(file_path = file, store_directory = pro_dir)
    print(download)
    pro_file_dirs.append(download)

/Users/tombardrick/Documents/projects/betfair/betfair_project/data/raw/api/pro/1.170262288.bz2
/Users/tombardrick/Documents/projects/betfair/betfair_project/data/raw/api/pro/1.170262291.bz2
/Users/tombardrick/Documents/projects/betfair/betfair_project/data/raw/api/pro/1.170262294.bz2
/Users/tombardrick/Documents/projects/betfair/betfair_project/data/raw/api/pro/1.170262297.bz2
/Users/tombardrick/Documents/projects/betfair/betfair_project/data/raw/api/pro/1.170262314.bz2


In [33]:
# proceesing bz2 to text
adv_extfile_dirs = []

for file in adv_file_dirs:
    zipfile = BZ2File(file) # open the file
    data = zipfile.read() # get the decompressed data
    newfilepath = file.split('.bz2')[0] # removing the extension and saving without a filetype
    open(newfilepath, 'wb').write(data) # write an uncompressed file
    adv_extfile_dirs.append(newfilepath)
    zipfile.close()

In [483]:
datadict = {'Time': [],
       'MarketId' : [],
       'Status' : [],
       'Inplay' : [], 
       'SelectionId' : [],
       'LastPriceTraded' : [],
       'TotalMatched' : [],
       'BSP' : [],
       'AdjFactor' :  [],
       'RunnerStatus' : [],
       'MktTotalMatched' : [],
       'RaceInfo' : [],
       'Venue' : [],
       'BackSize': [],
       'BackPrice': [],
       'LayPrice' : [],
       'LaySize' : []
}

In [484]:
class HistoricalStream(MarketStream):
    # create custom listener and stream

    def _init_(self, listener):
        super(HistoricalStream, self)._init_(listener)


    def on_process(self, market_books):
        for market_book in market_books:
            for runner in market_book.runners:
                datadict['Time'].append(market_book.publish_time)
                datadict['MarketId'].append(float(market_book.market_id))
                datadict['Status'].append(market_book.status)
                datadict['Inplay'].append(market_book.inplay)
                datadict['SelectionId'].append(runner.selection_id)
                datadict['LastPriceTraded'].append(runner.last_price_traded)
                datadict['TotalMatched'].append(runner.total_matched)
                datadict['BSP'].append(runner.sp.actual_sp)
                datadict['AdjFactor'].append(runner.adjustment_factor)
                datadict['RunnerStatus'].append(runner.status)
                datadict['MktTotalMatched'].append(market_book.total_matched)
                datadict['RaceInfo'].append(market_book.market_definition.name)
                datadict['Venue'].append(market_book.market_definition.venue)
                
                atb_size = [x.size for x in runner.ex.available_to_back]
                datadict['BackSize'].append(atb_size)
                atb_price = [x.price for x in runner.ex.available_to_back]
                datadict['BackPrice'].append(atb_price)   
                atl_price = [x.price for x in runner.ex.available_to_lay]
                datadict['LayPrice'].append(atl_price)
                atl_size = [x.size for x in runner.ex.available_to_lay]
                datadict['LaySize'].append(atl_size)




                
                
class HistoricalListener(StreamListener):
    def _add_stream(self, unique_id, stream_type):
        if stream_type == "marketSubscription":
            return HistoricalStream(self)

In [485]:
listener = HistoricalListener(max_latency=None)

for file in adv_extfile_dirs:
    stream = trading.streaming.create_historical_stream(directory=file, listener=listener)
    stream.start() 
    print(str(file) + " stream completed.")

/Users/tombardrick/Documents/projects/betfair/betfair_project/data/raw/api/advanced/1.166897828 stream completed.
/Users/tombardrick/Documents/projects/betfair/betfair_project/data/raw/api/advanced/1.166897833 stream completed.
/Users/tombardrick/Documents/projects/betfair/betfair_project/data/raw/api/advanced/1.166897838 stream completed.
/Users/tombardrick/Documents/projects/betfair/betfair_project/data/raw/api/advanced/1.166897843 stream completed.
/Users/tombardrick/Documents/projects/betfair/betfair_project/data/raw/api/advanced/1.166897848 stream completed.


In [486]:
def dict_to_df(datadict):
    
    df = pd.DataFrame(datadict)
    
    df.sort_values(by = 'Time')
    
    df['MarketId'] = df['MarketId'].astype(str)
    df['SelectionId'] = df['SelectionId'].astype(str)
    
    df['LayPrice'] = df['LayPrice'].apply(lambda x: x[0] if x else np.nan)
    df['LaySize'] = df['LaySize'].apply(lambda x: x[0] if x else np.nan)
    df['BackPrice'] = df['BackPrice'].apply(lambda x: x[0] if x else np.nan)
    df['BackSize'] = df['BackSize'].apply(lambda x: x[0] if x else np.nan)
    
    return df

In [487]:
df = dict_to_df(datadict)
df.head(5)

Unnamed: 0,Time,MarketId,Status,Inplay,SelectionId,LastPriceTraded,TotalMatched,BSP,AdjFactor,RunnerStatus,MktTotalMatched,RaceInfo,Venue,BackSize,BackPrice,LayPrice,LaySize
0,2019-12-31 08:17:23.840,1.166897828,OPEN,False,5637043,0.0,0.0,,6.75,ACTIVE,,1m Hcap,Southwell,5300.3,1.01,,
1,2019-12-31 08:17:23.840,1.166897828,OPEN,False,19436245,0.0,0.0,,2.01,ACTIVE,,1m Hcap,Southwell,5300.3,1.01,,
2,2019-12-31 08:17:23.840,1.166897828,OPEN,False,13663992,0.0,0.0,,30.37,ACTIVE,,1m Hcap,Southwell,5300.3,1.01,,
3,2019-12-31 08:17:23.840,1.166897828,OPEN,False,19252822,0.0,0.0,,14.67,ACTIVE,,1m Hcap,Southwell,5300.3,1.01,,
4,2019-12-31 08:17:23.840,1.166897828,OPEN,False,13331255,0.0,0.0,,31.62,ACTIVE,,1m Hcap,Southwell,5300.3,1.01,,


In [488]:
# df['TradeSize'] = df.groupby(['MarketId','SelectionId', 'LastPriceTraded'])['TotalMatched'].diff()

# converting to datetime
df['Time'] = pd.to_datetime(df['Time'], format="%Y-%m-%d %H:%M:%S", errors='coerce')

# calculating inplay start for each race (assigning to new columns)
df['StartTime'] = df['Time'].where(df['Inplay'] == True).groupby(df['MarketId']).transform('min')

# calculating difference between each time point and start time
df['TimeDif'] = (df['Time'] - df['StartTime']).astype('timedelta64[s]')

df = df.drop('StartTime', 1)

In [489]:
df.head()

Unnamed: 0,Time,MarketId,Status,Inplay,SelectionId,LastPriceTraded,TotalMatched,BSP,AdjFactor,RunnerStatus,MktTotalMatched,RaceInfo,Venue,BackSize,BackPrice,LayPrice,LaySize,TimeDif
0,2019-12-31 08:17:23.840,1.166897828,OPEN,False,5637043,0.0,0.0,,6.75,ACTIVE,,1m Hcap,Southwell,5300.3,1.01,,,-100390.0
1,2019-12-31 08:17:23.840,1.166897828,OPEN,False,19436245,0.0,0.0,,2.01,ACTIVE,,1m Hcap,Southwell,5300.3,1.01,,,-100390.0
2,2019-12-31 08:17:23.840,1.166897828,OPEN,False,13663992,0.0,0.0,,30.37,ACTIVE,,1m Hcap,Southwell,5300.3,1.01,,,-100390.0
3,2019-12-31 08:17:23.840,1.166897828,OPEN,False,19252822,0.0,0.0,,14.67,ACTIVE,,1m Hcap,Southwell,5300.3,1.01,,,-100390.0
4,2019-12-31 08:17:23.840,1.166897828,OPEN,False,13331255,0.0,0.0,,31.62,ACTIVE,,1m Hcap,Southwell,5300.3,1.01,,,-100390.0


In [490]:
# filter in some way before large merge? np.nans somewhere?

In [491]:
# extract race info

In [492]:
def extract_furlongs(market_name):
    '''
    Assuming distance is always stated 1st within 'MarketName', with space followed after.
    Distance given in format of furlongs, miles or both.
    8 furlongs in a mile.
    '''
    
    distance = market_name.split(' ')[0]
    
    if 'm' in distance:
        m = distance.split('m')[0]
        distance = distance.replace(m + 'm', '')
        
        if 'f' in distance:
            f = distance.split('f')[0]
            
            return (int(m) * 8) + int(f)

        return int(m) * 8
    
    else:
        f = distance.split('f')[0]
        
        return int(f)

In [493]:
df['Distance'] = df['RaceInfo'].apply(lambda x: extract_furlongs(x))

In [494]:
def extract_race_type(market_name):
    if 'Hrd' in market_name:
        return 'Hurdle'
    if 'Chs' in market_name:
        return 'Chase'
    if 'NHF' in market_name:
        return 'NHF'
    else:
        return 'Flat'
    

In [495]:
df['RaceType'] = df['RaceInfo'].apply(lambda x: extract_race_type(x))

In [496]:
df.groupby('MarketId')['TimeDif'].min(); # 300 bins -> ~ 5.5 minutes

In [497]:
df.groupby('MarketId')['TimeDif'].max(); # 50 bins -> ~ 3 seconds (for 1 mile race) 

In [498]:
T_pre = 300 # ~ every five minutes
T_post = 50 # ~ every 1-3 seconds

df['T_pre'] = df.where(df['Inplay'] == False).groupby('MarketId')['TimeDif'].apply(lambda x: pd.cut(x, T_pre, labels = [i for i in range(-T_pre, 0)])).astype(float)

df['T_post'] = df.where(df['Inplay'] == True).groupby('MarketId')['TimeDif'].apply(lambda x: pd.cut(x, T_post, labels = [i for i in range(0, T_post)])).astype(float)

df['T'] = df['T_pre'].fillna(df['T_post']).astype(int)

df.drop(columns = ['T_pre', 'T_post'], inplace = True)

In [528]:
df.loc[(df['SelectionId'] == "12819181")].sample(10).sort_values('T')

Unnamed: 0,Time,MarketId,Status,Inplay,SelectionId,LastPriceTraded,TotalMatched,BSP,AdjFactor,RunnerStatus,...,RaceInfo,Venue,BackSize,BackPrice,LayPrice,LaySize,TimeDif,Distance,RaceType,T
73273,2019-12-31 15:57:27.225,1.166897838,OPEN,False,12819181,0.0,0.0,,4.54,ACTIVE,...,5f Hcap,Southwell,1.32,12.0,40.0,3.35,-76981.0,5,Flat,-222
77452,2019-12-31 19:44:16.956,1.166897838,OPEN,False,12819181,14.5,23.56,,4.54,ACTIVE,...,5f Hcap,Southwell,2.12,14.5,20.0,2.07,-63371.0,5,Flat,-183
77956,2019-12-31 20:02:23.322,1.166897838,OPEN,False,12819181,16.5,24.78,,4.54,ACTIVE,...,5f Hcap,Southwell,2.13,14.5,16.5,4.61,-62285.0,5,Flat,-180
78628,2019-12-31 20:13:05.269,1.166897838,OPEN,False,12819181,16.5,34.0,,4.54,ACTIVE,...,5f Hcap,Southwell,3.22,14.5,20.0,2.64,-61643.0,5,Flat,-178
86748,2019-12-31 23:05:09.891,1.166897838,OPEN,False,12819181,15.5,117.82,,4.54,ACTIVE,...,5f Hcap,Southwell,1.16,15.0,15.5,4.23,-51318.0,5,Flat,-148
94756,2020-01-01 06:32:40.381,1.166897838,OPEN,False,12819181,16.5,233.66,,4.54,ACTIVE,...,5f Hcap,Southwell,7.54,16.0,17.0,12.0,-24468.0,5,Flat,-71
112277,2020-01-01 10:38:30.074,1.166897838,OPEN,False,12819181,29.0,417.52,,4.54,ACTIVE,...,5f Hcap,Southwell,2.2,25.0,29.0,9.54,-9718.0,5,Flat,-28
114909,2020-01-01 11:08:40.439,1.166897838,OPEN,False,12819181,28.0,430.94,,4.54,ACTIVE,...,5f Hcap,Southwell,3.82,26.0,29.0,9.32,-7907.0,5,Flat,-23
138093,2020-01-01 13:04:31.455,1.166897838,OPEN,False,12819181,27.0,1501.31,,3.76,ACTIVE,...,5f Hcap,Southwell,15.18,27.0,28.0,14.98,-956.0,5,Flat,-3
144939,2020-01-01 13:21:17.515,1.166897838,OPEN,True,12819181,1.15,30349.0,28.12,3.76,ACTIVE,...,5f Hcap,Southwell,520.24,1.15,1.16,67.77,50.0,5,Flat,17


In [None]:
# weighted average price?

In [None]:
# also apply T by runner or keep at _market_?

In [None]:
# include filters before groupby e.g. min trade size > , status = OPEN

In [None]:
# way of including all back side prices / sizes (no. columns may change however for example)
# using only best available for the moment

In [None]:
# make data undesrtanding doc e.g.
- how often we have time updates
- no . updates pre / post start
- no updates per horse
- no. missing values for lay / back ?
- df.loc[df['BackPrice'] > df['LayPrice']].head() - where trades aren't taken at the higher odds skip past?