## 4. Predictive Analysis 

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns
import statsmodels.api as sm

# Set some specs for plotting
%matplotlib inline
mpl.rcParams['figure.figsize'] = (16.0, 8.0)
mpl.style.use('ggplot')
plt.rcParams['font.family'] = 'IPAGothic'

# Read data from the data folder
race_df = pd.read_csv('data/race.csv', low_memory=False, index_col=0)
horse_df = pd.read_csv('data/horse.csv', low_memory=False, index_col=0)
individual_df = pd.read_csv('data/individual.csv', low_memory=False, index_col=0)
trainer_df = pd.read_csv('data/trainer.csv', low_memory=False, index_col=0)
jockey_df = pd.read_csv('data/jockey.csv', low_memory=False, index_col=0)
horse_race_df = pd.read_csv('data/horse_race.csv', low_memory=False, index_col=0)
horse_race_df['age_int'] = horse_race_df['sex_age'].apply(lambda x: re.search(r'\d+', x).group(0)).astype(int)

# Do some simple data transformation
try:
    first_occur_df = pd.read_csv('data/first_occurence_race.csv', low_memory=False, index_col=0)
except FileNotFoundError:
    horse_race_sorted = horse_race_df.sort_values(['horse_id', 'run_date'])
    horse_id_set = set()
    first_occur_dict = {}
    for index, value in horse_race_sorted.iterrows():
        if value['horse_id'] not in horse_id_set:
            horse_id_set.add(value['horse_id'])
            first_occur_dict[index] = value
    first_occur_df = pd.DataFrame.from_dict(first_occur_dict, orient='index')
    first_occur_df.to_csv('data/first_occurence_race.csv', encoding='utf-8')
    
columns_to_drop = [
    'race', 'title', 'horse', 'sex_age',
    'distance', 'run_time', 'breeder',
    'jockey', 'margin', 'trainer_x', 'trainer_y', 'owner_x', 'owner_y', 'horse_name', 'date_of_birth', 
    'transaction_price', 'prize_obtained', 'race_record', 'highlight_race', 'relatives', 'status', 'prize'
]
for column in columns_to_drop:
    try:
        first_occur_df.drop(column, axis=1, inplace=True)
        horse_race_df.drop(column, axis=1, inplace=True)
    except ValueError:
        continue
        
horse_race_df = horse_race_df[horse_race_df['finishing_position'].apply(lambda x: bool(re.search(r'\d+', x)))]
horse_race_df['finishing_position'] = horse_race_df['finishing_position'].apply(lambda x: re.search(r'\d+', x).group(0))
horse_race_df['finishing_position'] = horse_race_df['finishing_position'].astype(int)

  from pandas.core import datetools


## 4.1 Feature Engineering 

In [11]:
def get_dummies_order_by_count(df, column_name):
    # Get dummies by descending count order
    return pd.get_dummies(df[column_name]).reindex_axis(df[column_name].value_counts().index, axis=1).iloc[:, :-1]

def parse_time_stamp(time_string):
    # Parse timestamp expressed in hours
    time_split = time_string.split(':')
    hour = int(time_split[0])
    if hour < 12:
        return '10-12'
    elif hour > 12 and hour < 15:
        return '12-15'
    else:
        return '15-'
    
def get_trainer_jockey_profile(df, individual):
    # Merge with trainer/jockey dataframe
    assert individual in ['trainer', 'jockey']
    if individual == 'trainer':
        merge_df = trainer_df
    elif individual == 'jockey':
        merge_df = jockey_df
    df = df.merge(merge_df[['%s_id' % individual, 'date_of_birth', 'place_of_birth']], 
                  on='%s_id' % individual, suffixes=['', '_%s' % individual])
    df['run_date'] = df['run_date'].apply(lambda x: pd.Timestamp(x))
    df['date_of_birth'] = df['date_of_birth'].apply(lambda x: pd.Timestamp(x))
    df['%s_age' % individual] = df['run_date'].subtract(df['date_of_birth']).dt.days / 365.0
    df.drop(['date_of_birth'], axis=1, inplace=True)
    df['place_of_birth_%s' % individual] = df['place_of_birth_%s' % individual].apply(lambda x: 'tokyo' if x == u'東京都' \
                                                                                      else 'outside_tokyo')
    return df

def feature_engineer(race_df):
    
    new_df = race_df.copy()

    # Feature engineering
    has_horse_weight = new_df['horse_weight'].apply(lambda x: bool(re.search(r'(\d+)\(.+\)', x)))
    new_df = new_df[has_horse_weight]
    new_df['horse_weight_increase'] = new_df['horse_weight'].apply(lambda x: re.search(r'\(.?(\d+)\)', x).group(1))
    new_df['horse_weight_increase'] = new_df['horse_weight_increase'].astype(float)
    new_df['horse_weight'] = new_df['horse_weight'].apply(lambda x: re.search(r'(\d+)\(.+\)', x).group(1))
    new_df['horse_weight'] = new_df['horse_weight'].astype(float)

    new_df['time'] = new_df['time'].apply(lambda x: parse_time_stamp(x))

    for individual in ['jockey', 'trainer']:
        new_df = get_trainer_jockey_profile(new_df, individual)

    # Get dummy columns
    dummied_cols = ['place', 'type', 'track', 'weather', 'condition', 'gender', 'breed', 'bracket', 'horse_number', 'time',
                    'place_of_birth_jockey', 'place_of_birth_trainer']
    for cols in dummied_cols:
        new_df = new_df.join(get_dummies_order_by_count(new_df, 
                                                       cols).rename(columns=lambda x: '-'.join([cols, str(x)])))
        try:
            new_df.drop(cols, axis=1, inplace=True)
        except ValueError:
            continue

    # Drop some other columns
    columns_to_drop_again = ['run_date', 
                             'finishing_position', 'corner_position', 'run_time_last_600', 
                             'jockey_id', 'owner_id', 'trainer_id', 'breeder_id', 
                             'parents', 
                             'horse_id', 'age_int', 'place_of_birth']
    if 1 == 1:
        for cols in columns_to_drop_again:
            try:
                new_df.drop(cols, axis=1, inplace=True)
            except ValueError:
                continue
    
    return new_df

new_df = feature_engineer(horse_race_df)

  app.launch_new_instance()


## 4.2 Regression Models

In [12]:
X = new_df.loc[:, new_df.columns != 'run_time_1000']
y = new_df.loc[:, 'run_time_1000']
X = sm.add_constant(X)
results = sm.OLS(y, X).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:          run_time_1000   R-squared:                       0.557
Model:                            OLS   Adj. R-squared:                  0.557
Method:                 Least Squares   F-statistic:                 1.635e+04
Date:                Mon, 26 Feb 2018   Prob (F-statistic):               0.00
Time:                        16:54:23   Log-Likelihood:            -1.6583e+06
No. Observations:              857855   AIC:                         3.317e+06
Df Residuals:                  857788   BIC:                         3.317e+06
Df Model:                          66                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------
