In [81]:
# packages
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import pymysql
from sqlalchemy import create_engine


# configs: raise pandas' display limits so wide / long frames are not truncated
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Analysing how fundamental / technical features influence price changes

- Create 'natural' & 'engineered' features (bins & one-hot)
- Create target (`inplay_min` - `bsp`), excluding 1.01s
- Measure their influence on price (using ANOVA?)
- Identify those with the 'biggest influence'

### 1. Reading in data

In [113]:
# Database credentials are read from the environment so that no secret is
# stored in the notebook. SMARTFORM_DB_PASSWORD is mandatory (a KeyError is
# raised if it is unset); the other settings fall back to the values this
# notebook has always used.
db_user = os.environ.get('SMARTFORM_DB_USER', 'root')
db_password = os.environ['SMARTFORM_DB_PASSWORD']
db_host = os.environ.get('SMARTFORM_DB_HOST', 'localhost')
db_name = os.environ.get('SMARTFORM_DB_NAME', 'smartform')

# quote_plus escapes characters such as '@', ':' or '/' that would otherwise
# corrupt the connection URL.
db_connection_str = 'mysql+pymysql://{user}:{password}@{host}/{name}'.format(
    user=quote_plus(db_user),
    password=quote_plus(db_password),
    host=db_host,
    name=db_name,
)
db_connection = create_engine(db_connection_str)

# One row per runner per race: race-level and runner-level columns joined to
# the Betfair win prices (bsp, inplay_min, win) for meetings between
# 2010-10-01 and 2020-01-01, ordered by race then runner.
df = pd.read_sql('''
                 SELECT
                  race_id,
                  course,
                  race_type,
                  going,
                  handicap,
                  maiden,
                  num_runners,
                  distance_yards,
                  added_money,
                  runner_id,
                  distance_travelled,
                  form_figures,
                  gender,
                  age,
                  bred,
                  in_race_comment,
                  owner_id,
                  trainer_id,
                  jockey_id,
                  dam_id,
                  sire_id,
                  position_in_betting,
                  days_since_ran,
                  weight_pounds,
                  finish_position,
                  amended_position,
                  bf_race_id,
                  bf_runner_id,
                  bsp,
                  inplay_min,
                  win
                 FROM
                  historic_races
                  JOIN historic_runners USING (race_id)
                  JOIN historic_betfair_win_prices ON race_id = sf_race_id
                  AND runner_id = sf_runner_id
                WHERE
                  (
                    CAST(historic_races.meeting_date AS Datetime) BETWEEN '2010-10-01'
                    AND '2020-01-01'
                  )
                ORDER BY
                  race_id,
                  runner_id
                ''',
                con=db_connection)
print('No. Rows : ', len(df.index))
# An Engine has no .close(); call db_connection.dispose() once no further
# queries are needed to release its pooled connections.

No. Rows :  1103357


### 2. Data Processing

#### 2.0 Correct finish position 

In [115]:
# Prefer the amended position wherever one is recorded; otherwise fall back to
# the original finish position. The two source columns are then discarded.
has_amended = df['amended_position'].notnull()
df['final_position'] = np.where(
    has_amended,
    df['amended_position'],
    df['finish_position'],
)
df = df.drop(columns=['finish_position', 'amended_position'])

#### 2.1 Remove winners 
Create a sample that excludes winners, which cause a fat right tail in price decreases.

In [121]:
# Keep only non-winning runners. The explicit .copy() makes the filtered frame
# independent of the frame it was selected from, so assigning new columns to it
# afterwards does not raise pandas' SettingWithCopyWarning.
df = df.loc[df['win'] == 0].copy()

#### 2.2 Form transformation

In [134]:
# Trailing-form features taken from the end of the form string: its last
# three characters, last two characters, and final character.
df['form_3'] = df['form_figures'].str.slice(start=-3)
df['form_2'] = df['form_figures'].str.slice(start=-2)
df['form_1'] = df['form_figures'].str.get(-1)
df[['form_3', 'form_2', 'form_1', 'form_figures']].head()

# TODO:
# - if the string is shorter than 2 characters, apply some rule
# - form aggregations
# - form rules, e.g. "if contains ..."
# - one-hots for the previous two runs? combos?

Unnamed: 0,form_3,form_2,form_1,form_figures
0,130,30,0,120130
1,18,18,8,16018
2,327,27,7,15327
3,910,10,0,29910
4,322,22,2,4322


In [135]:
# Preview the first 20 rows of the derived form columns beside the raw form string.
df.loc[:, ['form_3', 'form_2', 'form_1', 'form_figures']].head(20)

Unnamed: 0,form_3,form_2,form_1,form_figures
0,130,30,0,120130
1,18,18,8,16018
2,327,27,7,15327
3,910,10,0,29910
4,322,22,2,4322
5,112,12,2,512181112
6,327,27,7,161304327
7,714,14,4,850714
8,21,21,1,21
9,15,15,5,215324015


In [None]:
# prev vars
# 'final_position' is used here because 'finish_position' is dropped from df in
# section 2.0 once 'final_position' has been derived from it.
# NOTE(review): 'form' is not a column in df (the raw column is 'form_figures',
# with derived 'form_1' / 'form_2' / 'form_3') — confirm which one is intended.
prev_vars = ['form', 'handicap', 'maiden', 'final_position']

In [None]:
# difference vars: names of the columns to build difference features from
dif_vars = ['handicap']

In [None]:
# variable lists (engineered variables still to be added to both)

# Columns to one-hot encode.
one_hots = [
    'course',
    'race_type',
    'going',
    'gender',
    'age',
    'bred',
    'owner_id',
    'trainer_id',
    'jockey_id',
    'dam_id',
    'sire_id',
]

# Columns to bin.
# NOTE(review): 'early_traded' and 'total_traded' are not selected by the query
# in section 1 — confirm where they come from.
bins = [
    'num_runners',
    'distance_yards',
    'added_money',
    'distance_travelled',
    'days_since_ran',
    'weight_pounds',
    'early_traded',
    'total_traded',
]

In [None]:
# combos: pairs of columns to combine into interaction features
# (course x jockey, course x trainer, race_type x jockey, race_type x trainer).
# Stored as data so the cell is valid Python; jockey / trainer are written with
# the '_id' column names that the query in section 1 selects.
combos = [
    ('course', 'jockey_id'),
    ('course', 'trainer_id'),
    ('race_type', 'jockey_id'),
    ('race_type', 'trainer_id'),
]


In [None]:
# TODO - remaining steps:
# - data processing
# - create features (prev races, nlp)
# - create combinations
# - create targets (av_price -> bsp, bsp -> ip_min (without winners?))
