In [81]:
# packages
from sqlalchemy import create_engine
import pymysql
import pandas as pd
import numpy as np


# configs
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
%matplotlib inline

## Analysing how fundamental / technical features influence price changes

- Create 'natural' & 'engineered' features (bins & one-hot)
- Create target (ip_min - bsp) - excluding 1.01's... 
- Measure their influence on price (using ANOVA?)
- Identify those with the 'biggest influence'

#### 1. Reading in data

In [106]:
db_connection_str = 'mysql+pymysql://root:xboxpass32@localhost/smartform'
db_connection = create_engine(db_connection_str)

df = pd.read_sql('''
                 SELECT
                  race_id,
                  course,
                  race_type,
                  going,
                  handicap,
                  maiden,
                  num_runners,
                  distance_yards,
                  added_money,
                  runner_id,
                  distance_travelled,
                  form_figures,
                  gender,
                  age,
                  bred,
                  in_race_comment,
                  owner_id,
                  trainer_id,
                  jockey_id,
                  dam_id,
                  sire_id,
                  position_in_betting,
                  days_since_ran,
                  weight_pounds,
                  finish_position,
                  amended_position,
                  bf_race_id,
                  bf_runner_id,
                  bsp,
                  av_price,
                  early_price,
                  ante_maxprice,
                  ante_minprice,
                  inplay_max,
                  inplay_min,
                  early_traded,
                  total_traded,
                  inplay_traded,
                  win
                 FROM
                  historic_races
                  JOIN historic_runners USING (race_id)
                  JOIN historic_betfair_win_prices ON race_id = sf_race_id
                  AND runner_id = sf_runner_id
                WHERE
                  (
                    CAST(historic_races.meeting_date AS Datetime) BETWEEN '2010-10-01'
                    AND '2020-01-01'
                  )
                ORDER BY
                  race_id,
                  runner_id
                ''',
                con=db_connection)
print('No. Rows : ', len(df.index))
# db_connection.close()

No. Rows :  1103357


In [107]:
df.loc[df['bsp'] == 0 | df['inplay_min'] == 0].shape

TypeError: cannot compare a dtyped [float64] array with a scalar of type [bool]

In [85]:
df.head(1)

Unnamed: 0,race_id,course,race_type,going,handicap,maiden,num_runners,distance_yards,added_money,runner_id,distance_travelled,form_figures,gender,age,bred,in_race_comment,owner_id,trainer_id,jockey_id,dam_id,sire_id,position_in_betting,days_since_ran,weight_pounds,finish_position,amended_position,bf_race_id,bf_runner_id,bsp,av_price,early_price,ante_maxprice,ante_minprice,inplay_max,inplay_min,early_traded,total_traded,inplay_traded,win
0,866014,Newmarket,Flat,Good to Soft,0,0,28,1320,150000.0,2436056,1.0,42,G,2,UK,"held up stand side, ridden and headway over 1f...",1451894,272,1154425.0,1539088,1596094,10,18.0,121,10.0,,163213113,25679575,37.17,31.0,34.4,65.0,27.0,1000.0,30.0,694,9180,149,0


### 2. Data Processing

#### 2.0 Correct finish position 

In [86]:
df['final_position'] = np.where(df['amended_position'].notnull(), df['amended_position'], df['finish_position'])
df.drop(['finish_position', 'amended_position'], axis = 1, inplace = True)

#### 2.1 Remove winners 
Creating sample not affected by winners (causing fat right tail in price decreases).

In [92]:
df = df.loc[df['win'] == 0]

##### 2.1 Form transformation

In [None]:
# removing outliers & (bsp, null markets)
df = df.loc[df['bsp'] != 1.01 & df['bsp'] != 0]

In [None]:
# prev vars
prev_vars = ['form', 'handicap', 'maiden', 'finish_position']

In [None]:
# difference vars
dif_vars = ['handicap', ]

In [None]:
# variable lists
one_hots = ['course', 'race_type', 'going', 'gender', 'age', 'bred', 'owner_id', 'trainer_id',
           'jockey_id', 'dam_id', 'sire_id'] # add in engineered vars
bins = ['num_runners', 'distance_yards', 'added_money', 'distance_travelled', 'days_since_ran', 'weight_pounds',
        'early_traded', 'total_traded'] # add in engineered vars

In [None]:
# combos
course x jockey
course x trainer
race_type x jockey
race_type x trainer


In [None]:
- data processing 
- create features (prev races, nlp)
- create combinations
- create targets (av_price -> bsp, bsp -> ip_min (w/o winners?))
