In [41]:
# packages
import os
import re

import numpy as np
import pandas as pd
import pymysql
from sqlalchemy import create_engine


# configs
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
%matplotlib inline

## Analysing how fundamental / technical features influence price changes

- Create 'natural' & 'engineered' features (bins & one-hot)
- Create target (ip_min - bsp) - excluding 1.01's... 
- Measure their influence on price (using ANOVA?)
- Identify those with the 'biggest influence'

#### 1. Reading in data

In [11]:
# The connection URL (which embeds the database password) is read from the
# SMARTFORM_DB_URL environment variable instead of being hardcoded in the notebook.
# NOTE(review): the password that was previously committed here should be rotated.
db_connection_str = os.environ.get('SMARTFORM_DB_URL')
if not db_connection_str:
    raise RuntimeError(
        "Environment variable SMARTFORM_DB_URL is not set; expected a SQLAlchemy URL such as "
        "'mysql+pymysql://<user>:<password>@localhost/smartform'"
    )
db_connection = create_engine(db_connection_str)

# One row per runner per race: race-level, runner-level and Betfair win-price columns
# for meetings dated between 2016-10-01 and 2020-01-01, ordered by race then runner.
df = pd.read_sql('''
                 SELECT
                  race_id,
                  course,
                  race_type,
                  going,
                  handicap,
                  maiden,
                  num_runners,
                  distance_yards,
                  added_money,
                  runner_id,
                  distance_travelled,
                  form_figures,
                  gender,
                  age,
                  bred,
                  in_race_comment,
                  owner_id,
                  trainer_id,
                  jockey_id,
                  dam_id,
                  sire_id,
                  position_in_betting,
                  days_since_ran,
                  weight_pounds,
                  finish_position,
                  amended_position,
                  bf_race_id,
                  bf_runner_id,
                  bsp,
                  inplay_min,
                  win
                 FROM
                  historic_races
                  JOIN historic_runners USING (race_id)
                  JOIN historic_betfair_win_prices ON race_id = sf_race_id
                  AND runner_id = sf_runner_id
                WHERE
                  (
                    CAST(historic_races.meeting_date AS Datetime) BETWEEN '2016-10-01'
                    AND '2020-01-01'
                  )
                ORDER BY
                  race_id,
                  runner_id
                ''',
                con=db_connection)
print('No. Rows : ', len(df.index))
# A SQLAlchemy Engine has no close(); use db_connection.dispose() to release
# its pooled connections once the database is no longer needed.

No. Rows :  1103357


### 2. Data Processing

#### 2.0 Correct finish position 

In [12]:
# final_position takes amended_position wherever that is non-null and falls back to
# finish_position otherwise; the two source columns are then discarded.
df['final_position'] = np.where(
    df['amended_position'].notna(),
    df['amended_position'],
    df['finish_position'],
)
df = df.drop(columns=['finish_position', 'amended_position'])

#### 2.1 Dropping missing values

In [13]:
# Remove every row that has a missing value in any column, then report how many
# rows were dropped and how many are left.
prev_rows = len(df)
df.dropna(axis=0, how='any', inplace=True)
rows_left = len(df)
print('Rows Removed: ', prev_rows - rows_left, '\nRows Remaining : ', rows_left)

Rows Removed:  174128 /nRows Remaining :  929229


In [14]:
# TODO: come back to this and fix it? (presumably the blanket dropna above - confirm)

#### 2.2 Remove winners 
Creating a sample that is not affected by winners, which cause a fat right tail in price decreases.

In [15]:
# Keep only the runners that did not win (win == 0) and report the row counts.
# The explicit .copy() makes the filtered frame independent of the original, so the
# column assignments made on df in later cells do not raise SettingWithCopyWarning.
prev_rows = len(df.index)
df = df[df['win'] == 0].copy()
print('Rows Removed: ', prev_rows - len(df.index), '\nRows Remaining : ', len(df.index))

Rows Removed:  108030 /nRows Remaining :  821199


#### 2.3 Form transformation

Capturing how form may influence price movements. Attempt to capture effects like 'fitness' & __'consistency'__.

- Taking previous 3 runs as single features
- Taking length of form as a feature
- Summing all form as an aggregate (letters -> 9 as more likely bad than good)
- Average form as form_sum / form_len

In [78]:
# One column per recent run: form_N is the single character N places from the end of
# form_figures. Previously form_3 / form_2 used the slices [-3:] / [-2:], which stored
# the last three / two characters together rather than one position each, contradicting
# both the comments and form_1. A form string shorter than N characters yields NaN.
df['form_3'] = df['form_figures'].str[-3]  # pos in 3rd last race
df['form_2'] = df['form_figures'].str[-2]  # pos in 2nd last race
df['form_1'] = df['form_figures'].str[-1]  # pos in last race
df['form_len'] = df['form_figures'].str.len().astype(int)  # length of form figures

In [83]:
# Numeric aggregates of the form string.
# regex=True is passed explicitly: without it, pandas >= 2.0 treats '[A-Z]' as a literal
# substring, so the upper-case letters would silently never be converted.
df['form_figures_num'] = df['form_figures'].str.replace(r'[A-Z]', '9', regex=True)  # converting letters to '9'
df['form_int_list'] = df['form_figures_num'].apply(lambda form: re.findall(r'\d+', form))  # runs of consecutive digits
df['form_ints_list'] = df['form_int_list'].apply(
    lambda runs: [sum(int(digit) for digit in run) for run in runs]
)  # digit sum of each run
df['form_sum'] = df['form_ints_list'].apply(sum)  # sum of all digits
# form_len (created in the previous cell) is the full string length, so any
# non-digit characters in form_figures are counted in the denominator.
df['form_avg'] = (df['form_sum'] / df['form_len']).round(2)  # form_sum per character

In [86]:
# inspecting
# df[['form_figures', 'form_figures_num', 'form_int_list', 'form_ints_list', 'form_sum', 'form_len', 'form_avg']].tail(50)

# all seems to work well; perhaps apply the same aggregation to the 3 most recent runs as well?

In [87]:
# The engineered form_* columns have been derived from it in the cells above,
# so the raw 'form_figures' column is removed.
df = df.drop(columns=['form_figures'])

In [None]:
# further data processing : 
# headgear encoding
# going encoding
# previous race vars

# prev race missing val treatment (ensure this doesn't affect features)
# prev race feature creation


In [None]:
# prev vars
# Presumably the variables from which previous-race features will be built - TODO confirm.
# NOTE(review): 'finish_position' is dropped in section 2.0 (replaced by 'final_position')
# and no column named 'form' is created in the cells above - confirm these names.
prev_vars = [
    'form',
    'handicap',
    'maiden',
    'finish_position',
]

In [None]:
# difference vars
# Presumably the variables for which a race-to-race difference will be computed - TODO confirm.
dif_vars = ['handicap']

In [None]:
# variable lists
# Columns intended for one-hot encoding (add in engineered vars).
one_hots = [
    'course',
    'race_type',
    'going',
    'gender',
    'age',
    'bred',
    'owner_id',
    'trainer_id',
    'jockey_id',
    'dam_id',
    'sire_id',
    'form_3',
    'form_2',
    'form_1',
]

# Columns intended for binning (add in engineered vars).
# NOTE(review): 'early_traded' and 'total_traded' are not selected by the SQL query
# in section 1 - confirm where they are meant to come from.
bins = [
    'num_runners',
    'distance_yards',
    'added_money',
    'distance_travelled',
    'days_since_ran',
    'weight_pounds',
    'early_traded',
    'total_traded',
]

In [None]:
# combos
# course x jockey
# course x trainer
# race_type x jockey
# race_type x trainer
# len_form x aggregate


In [None]:
# - data processing
# - create features (prev races, nlp)
# - create combinations
# - create targets (av_price -> bsp, bsp -> ip_min (w/o winners?))
# - before inspecting 'statistical factors', reduce features that only have a small sample, e.g. sire/jockey combo?
