## 4. Predictive Analysis 

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns
import statsmodels.api as sm

def get_dummies_order_by_count(df, column_name):
    """One-hot encode ``df[column_name]`` with columns ordered by frequency.

    Indicator columns are arranged from the most to the least frequent
    category (the order produced by ``value_counts()``) and the last column,
    i.e. the least frequent category, is dropped -- presumably so that it acts
    as the reference level and the dummies are not collinear with a constant.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode.
    column_name : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        ``uint8`` indicator columns labelled by category value, indexed like
        ``df``.
    """
    # Categories in descending count order. NaN is left out both here and by
    # get_dummies, so the two sets of labels agree.
    categories_by_count = df[column_name].value_counts().index
    dummies = pd.get_dummies(df[column_name])
    # `reindex_axis` was removed in pandas 1.0; `reindex(columns=...)` is the
    # supported equivalent.
    ordered = dummies.reindex(columns=categories_by_count)
    # Newer pandas returns boolean dummies; cast to the historical uint8 so the
    # columns stay numeric when joined with other features.
    return ordered.iloc[:, :-1].astype('uint8')

# Set some specs for plotting
%matplotlib inline
mpl.rcParams['figure.figsize'] = (16.0, 8.0)
mpl.style.use('ggplot')
plt.rcParams['font.family'] = 'IPAGothic'

# Load every table from the data folder. All files share the same read
# options; the first CSV column is used as the index.
csv_options = dict(low_memory=False, index_col=0)
race_df = pd.read_csv('data/race.csv', **csv_options)
horse_df = pd.read_csv('data/horse.csv', **csv_options)
individual_df = pd.read_csv('data/individual.csv', **csv_options)
trainer_df = pd.read_csv('data/trainer.csv', **csv_options)
jockey_df = pd.read_csv('data/jockey.csv', **csv_options)
horse_race_df = pd.read_csv('data/horse_race.csv', **csv_options)

# Take the first run of digits found in `sex_age` and store it as an integer.
age_digits = horse_race_df['sex_age'].apply(lambda text: re.search(r'\d+', text).group(0))
horse_race_df['age_int'] = age_digits.astype(int)

# Build (or load from the CSV cache) a frame holding one row per horse: the
# first row for that horse after sorting by `horse_id` and `run_date`.
try:
    first_occur_df = pd.read_csv('data/first_occurence_race.csv', low_memory=False, index_col=0)
except FileNotFoundError:
    # Cache miss: compute the frame from `horse_race_df`.
    # drop_duplicates(keep='first') on the sorted frame selects the same rows
    # as a manual "seen ids" loop, but it is vectorized, keeps the column
    # dtypes, and does not lose rows when index labels repeat.
    first_occur_df = (
        horse_race_df
        .sort_values(['horse_id', 'run_date'])
        .drop_duplicates(subset='horse_id', keep='first')
    )
    # Persist the result so later runs can take the fast path above.
    first_occur_df.to_csv('data/first_occurence_race.csv', encoding='utf-8')
    
# Columns removed from both `first_occur_df` and `horse_race_df`.
columns_to_drop = [
    # TODO: Update columns to drop when breeder id is available
    'run_date', 'race', 'title', 'horse', 'sex_age',
    'distance', 'run_time',
    'jockey', 'margin', 'trainer_x', 'trainer_y', 'owner_x', 'owner_y', 'horse_name', 'date_of_birth',
    'transaction_price', 'prize_obtained', 'race_record', 'highlight_race', 'relatives', 'status', 'prize'
]
# errors='ignore' skips labels a frame does not contain. Each frame is handled
# on its own, so a column missing from one frame is still removed from the
# other. (Catching ValueError does not help here: pandas >= 0.21 raises
# KeyError for a missing label.)
first_occur_df = first_occur_df.drop(columns_to_drop, axis=1, errors='ignore')
horse_race_df = horse_race_df.drop(columns_to_drop, axis=1, errors='ignore')

## 4.1 Feature Engineering 

In [2]:
# Work on a copy so `first_occur_df` is left untouched and this cell can be
# re-run from the same starting point.
first_occur_df_copy = first_occur_df.copy()

# Columns that are replaced by one-hot indicator columns.
dummied_cols = ['place', 'type', 'track', 'weather', 'condition', 'gender', 'breed', 'bracket', 'horse_number']
for col in dummied_cols:
    # Name each indicator '<source column>-<category>'.
    dummies = get_dummies_order_by_count(first_occur_df_copy, col).rename(
        columns=lambda category, col=col: '-'.join([col, str(category)]))
    # Swap the raw column for its indicators. The column is known to exist at
    # this point (the helper above has just read it), so no guard is needed.
    first_occur_df_copy = first_occur_df_copy.join(dummies).drop(col, axis=1)

# Keep the digits that precede the parenthesised suffix of `horse_weight` and
# convert them to float.
first_occur_df_copy['horse_weight'] = (
    first_occur_df_copy['horse_weight']
    .apply(lambda text: re.search(r'(\d+)\(.+\)', text).group(1))
    .astype(float)
)

# Remaining columns that are removed before modelling.
columns_to_drop_first = ['time', 'finishing_position', 'corner_position', 'run_time_last_600',
                         'horse_id', 'jockey_id', 'owner_id', 'trainer_id', 'breeder',
                         'place_of_birth', 'parents']
# errors='ignore' skips labels that are absent. Catching ValueError would not,
# because pandas >= 0.21 raises KeyError for a missing label.
first_occur_df_copy = first_occur_df_copy.drop(columns_to_drop_first, axis=1, errors='ignore')

## 4.2 Regression Models

In [3]:
# Ordinary least squares with `run_time_1000` as the response and every other
# column of the engineered frame, plus an intercept, as predictors.
is_predictor = first_occur_df_copy.columns != 'run_time_1000'
y = first_occur_df_copy.loc[:, 'run_time_1000']
X = sm.add_constant(first_occur_df_copy.loc[:, is_predictor])
results = sm.OLS(y, X).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:          run_time_1000   R-squared:                       0.507
Model:                            OLS   Adj. R-squared:                  0.507
Method:                 Least Squares   F-statistic:                     1485.
Date:                Wed, 21 Feb 2018   Prob (F-statistic):               0.00
Time:                        21:50:17   Log-Likelihood:            -1.7305e+05
No. Observations:               86528   AIC:                         3.462e+05
Df Residuals:                   86467   BIC:                         3.468e+05
Df Model:                          60                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              43.8712      0.437    1

## Some EDA to be integrated into the descriptive analysis notebook