## 4. Regression Analysis 

In [1]:
import cmocean
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import statsmodels.api as sm

from feature_engineering import feature_engineer

%matplotlib inline

# Read data from the data folder
file_directory = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), '')) + '\\'
horse_race_df = pd.read_csv(file_directory + 'data/horse_race.csv', low_memory=False, index_col=0)
horse_race_df['age_int'] = horse_race_df['sex_age'].apply(lambda x: re.search(r'\d+', x).group(0)).astype(int)

# Do some simple data transformation
horse_race_df['run_date'] = horse_race_df['run_date'].apply(pd.Timestamp)
horse_race_df = horse_race_df.sort_values(['horse_id', 'run_date'])
try:
    first_occur_df = pd.read_csv(file_directory + 'data/first_occurence_race.csv', low_memory=False, index_col=0)
    first_occur_df['run_date'] = first_occur_df['run_date'].apply(pd.Timestamp)
    first_occur_df = first_occur_df.sort_values(['horse_id', 'run_date'])
except FileNotFoundError:
    horse_race_sorted = horse_race_df.copy()
    horse_id_set = set()
    first_occur_dict = {}
    for index, value in horse_race_sorted.iterrows():
        if value['horse_id'] not in horse_id_set:
            horse_id_set.add(value['horse_id'])
            first_occur_dict[index] = value
    first_occur_df = pd.DataFrame.from_dict(first_occur_dict, orient='index')
    first_occur_df.to_csv(file_directory + 'data/first_occurence_race.csv', encoding='utf-8')
    
columns_to_drop = [
    'race', 'title', 'horse', 'sex_age',
    'distance', 'run_time', 'breeder',
    'jockey', 'margin', 'trainer_x', 'trainer_y', 'owner_x', 'owner_y', 'horse_name', 'date_of_birth', 
    'transaction_price', 'prize_obtained', 'race_record', 'highlight_race', 'relatives', 'status', 'prize'
]
for column in columns_to_drop:
    try:
        first_occur_df.drop(column, axis=1, inplace=True)
        horse_race_df.drop(column, axis=1, inplace=True)
    except ValueError:
        continue
        
horse_race_df = horse_race_df[horse_race_df['finishing_position'].apply(lambda x: bool(re.search(r'\d+', x)))]
horse_race_df['finishing_position'] = horse_race_df['finishing_position'].apply(lambda x: re.search(r'\d+', x).group(0))
horse_race_df['finishing_position'] = horse_race_df['finishing_position'].astype(int)

  if not mpl.cbook.is_string_like(rgbin[0]):
  from pandas.core import datetools


## 4.1 Feature Engineering 

In [2]:
# Functions have been moved to feature_engineering.py for code reuse
# See feature_engineering.py for more function details

if 1 == 0:
    # TODO: Check parent information (70%+ missing values)
    columns_to_drop_again = ['run_date', 'horse_id', 'parents']
    new_df = first_occur_df[columns_to_drop_again].copy()

    target_columns = ['horse_id', 'date_of_birth', 'breeder_id', 'gender', 'breed', 'race_record']
    new_df['parent_id_1'] = new_df['parents'].apply(lambda x: x.split(' ')[0])
    new_df['parent_id_2'] = new_df['parents'].apply(lambda x: x.split(' ')[1])
    new_df.drop('parents', axis=1, inplace=True)
    new_df = new_df[new_df['parent_id_1'].isin(horse_df['horse_id'].astype(str))]
    new_df = new_df[new_df['parent_id_2'].isin(horse_df['horse_id'].astype(str))]

    new_df.tail()

## 4.2 Regression Analysis

### 4.2.1 OLS for First Occurence Race

In [3]:
new_df_first = feature_engineer(first_occur_df)
X_first = new_df_first.loc[:, new_df_first.columns != 'run_time_1000']
y_first = new_df_first.loc[:, 'run_time_1000']
X_first = sm.add_constant(X_first)
results = sm.OLS(y_first, X_first).fit()
results.summary()

0,1,2,3
Dep. Variable:,run_time_1000,R-squared:,0.433
Model:,OLS,Adj. R-squared:,0.431
Method:,Least Squares,F-statistic:,247.5
Date:,"Tue, 20 Mar 2018",Prob (F-statistic):,0.0
Time:,21:38:24,Log-Likelihood:,-135220.0
No. Observations:,67877,AIC:,270900.0
Df Residuals:,67667,BIC:,272800.0
Df Model:,209,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,27.6812,5.030,5.503,0.000,17.821,37.541
jockey_weight,0.0788,0.008,9.787,0.000,0.063,0.095
win_odds,0.0003,0.000,2.069,0.039,1.32e-05,0.000
win_fav,0.1154,0.003,41.803,0.000,0.110,0.121
horse_weight,0.0072,0.000,28.365,0.000,0.007,0.008
curr_age,-0.6282,0.026,-24.384,0.000,-0.679,-0.578
horse_weight_increase,-0.0492,0.002,-20.738,0.000,-0.054,-0.045
age_stated,0.3200,0.017,18.387,0.000,0.286,0.354
jockey_age,-0.0040,0.001,-3.499,0.000,-0.006,-0.002

0,1,2,3
Omnibus:,88410.912,Durbin-Watson:,1.981
Prob(Omnibus):,0.0,Jarque-Bera (JB):,271386027.843
Skew:,6.294,Prob(JB):,0.0
Kurtosis:,312.513,Cond. No.,1.01e+16


### 4.2.2 OLS for Full Race

In [4]:
new_df_full = feature_engineer(horse_race_df)
X_full = new_df_full.loc[:, new_df_full.columns != 'run_time_1000']
y_full = new_df_full.loc[:, 'run_time_1000']
X_full = sm.add_constant(X_full)
results = sm.OLS(y_full, X_full).fit()
results.summary()

0,1,2,3
Dep. Variable:,run_time_1000,R-squared:,0.572
Model:,OLS,Adj. R-squared:,0.572
Method:,Least Squares,F-statistic:,4340.0
Date:,"Tue, 20 Mar 2018",Prob (F-statistic):,0.0
Time:,21:40:37,Log-Likelihood:,-1300500.0
No. Observations:,677627,AIC:,2601000.0
Df Residuals:,677417,BIC:,2604000.0
Df Model:,209,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,68.8892,1.099,62.677,0.000,66.735,71.043
jockey_weight,0.0205,0.002,11.203,0.000,0.017,0.024
win_odds,0.0014,3.23e-05,43.404,0.000,0.001,0.001
win_fav,0.0710,0.001,96.891,0.000,0.070,0.072
horse_weight,0.0019,7.85e-05,23.941,0.000,0.002,0.002
curr_age,-0.5944,0.006,-100.763,0.000,-0.606,-0.583
horse_weight_increase,-0.0092,0.000,-21.543,0.000,-0.010,-0.008
age_stated,0.2870,0.006,51.122,0.000,0.276,0.298
jockey_age,-0.0011,0.000,-3.460,0.001,-0.002,-0.000

0,1,2,3
Omnibus:,332957.043,Durbin-Watson:,1.217
Prob(Omnibus):,0.0,Jarque-Bera (JB):,64067606.365
Skew:,1.263,Prob(JB):,0.0
Kurtosis:,50.568,Cond. No.,1.01e+16


In [5]:
horse_race_df_grp_by = horse_race_df.set_index(['horse_id', 'run_date'])
horse_race_df_grp_by['run_time_diff'] = horse_race_df_grp_by['run_time_1000'].diff()
horse_race_df_grp_by = horse_race_df_grp_by[~horse_race_df_grp_by.index.isin(first_occur_df.set_index(['horse_id', 
                                                                                                       'run_date']).index)]
horse_race_df_grp_by.reset_index(inplace=True)
new_df_full_diff = feature_engineer(horse_race_df_grp_by)
new_df_full_diff['last_run_time'] = new_df_full_diff['run_time_1000'] - new_df_full_diff['run_time_diff']
new_df_full_diff.drop('run_time_diff', inplace=True, axis=1)

X_full_diff = new_df_full_diff.loc[:, new_df_full_diff.columns != 'run_time_1000']
y_full_diff = new_df_full_diff.loc[:, 'run_time_1000']
X_full_diff = sm.add_constant(X_full_diff)
results_diff = sm.OLS(y_full_diff, X_full_diff).fit()
results_diff.summary()

0,1,2,3
Dep. Variable:,run_time_1000,R-squared:,0.648
Model:,OLS,Adj. R-squared:,0.648
Method:,Least Squares,F-statistic:,5348.0
Date:,"Tue, 20 Mar 2018",Prob (F-statistic):,0.0
Time:,21:42:40,Log-Likelihood:,-1113400.0
No. Observations:,609756,AIC:,2227000.0
Df Residuals:,609545,BIC:,2230000.0
Df Model:,210,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,52.6417,1.061,49.614,0.000,50.562,54.721
jockey_weight,0.0085,0.002,4.892,0.000,0.005,0.012
win_odds,0.0009,3.09e-05,29.304,0.000,0.001,0.001
win_fav,0.0404,0.001,57.334,0.000,0.039,0.042
horse_weight,-0.0002,7.62e-05,-2.559,0.010,-0.000,-4.57e-05
curr_age,-0.2941,0.006,-50.248,0.000,-0.306,-0.283
horse_weight_increase,0.0020,0.000,4.749,0.000,0.001,0.003
age_stated,0.1306,0.006,23.332,0.000,0.120,0.142
jockey_age,-0.0006,0.000,-2.035,0.042,-0.001,-2.37e-05

0,1,2,3
Omnibus:,143018.469,Durbin-Watson:,1.796
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2474795.849
Skew:,0.677,Prob(JB):,0.0
Kurtosis:,12.776,Cond. No.,1.01e+16
