## 4. Predictive Analysis 

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns
import statsmodels.api as sm
import xgboost as xgb

from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

# Read data from the data folder
race_df = pd.read_csv('data/race.csv', low_memory=False, index_col=0)
horse_df = pd.read_csv('data/horse.csv', low_memory=False, index_col=0)
individual_df = pd.read_csv('data/individual.csv', low_memory=False, index_col=0)
trainer_df = pd.read_csv('data/trainer.csv', low_memory=False, index_col=0)
jockey_df = pd.read_csv('data/jockey.csv', low_memory=False, index_col=0)
horse_race_df = pd.read_csv('data/horse_race.csv', low_memory=False, index_col=0)
horse_race_df['age_int'] = horse_race_df['sex_age'].apply(lambda x: re.search(r'\d+', x).group(0)).astype(int)

# Do some simple data transformation
horse_race_df['run_date'] = horse_race_df['run_date'].apply(pd.Timestamp)
horse_race_df = horse_race_df.sort_values(['horse_id', 'run_date'])
try:
    first_occur_df = pd.read_csv('data/first_occurence_race.csv', low_memory=False, index_col=0)
    first_occur_df['run_date'] = first_occur_df['run_date'].apply(pd.Timestamp)
except FileNotFoundError:
    horse_race_sorted = horse_race_df.copy()
    horse_id_set = set()
    first_occur_dict = {}
    for index, value in horse_race_sorted.iterrows():
        if value['horse_id'] not in horse_id_set:
            horse_id_set.add(value['horse_id'])
            first_occur_dict[index] = value
    first_occur_df = pd.DataFrame.from_dict(first_occur_dict, orient='index')
    first_occur_df.to_csv('data/first_occurence_race.csv', encoding='utf-8')
    
columns_to_drop = [
    'race', 'title', 'horse', 'sex_age',
    'distance', 'run_time', 'breeder',
    'jockey', 'margin', 'trainer_x', 'trainer_y', 'owner_x', 'owner_y', 'horse_name', 'date_of_birth', 
    'transaction_price', 'prize_obtained', 'race_record', 'highlight_race', 'relatives', 'status', 'prize'
]
for column in columns_to_drop:
    try:
        first_occur_df.drop(column, axis=1, inplace=True)
        horse_race_df.drop(column, axis=1, inplace=True)
    except ValueError:
        continue
        
horse_race_df = horse_race_df[horse_race_df['finishing_position'].apply(lambda x: bool(re.search(r'\d+', x)))]
horse_race_df['finishing_position'] = horse_race_df['finishing_position'].apply(lambda x: re.search(r'\d+', x).group(0))
horse_race_df['finishing_position'] = horse_race_df['finishing_position'].astype(int)

  from pandas.core import datetools


In [2]:
# Set some specs for plotting
mpl.rcParams['figure.figsize'] = (16.0, 8.0)
mpl.style.use('ggplot')
plt.rcParams['font.family'] = 'IPAGothic'

%matplotlib inline

## 4.1 Feature Engineering 

In [3]:
def get_dummies_order_by_count(df, column_name):
    # Get dummies by descending count order
    return pd.get_dummies(df[column_name]).reindex(df[column_name].value_counts().index, axis=1).iloc[:, :-1]

def parse_time_stamp(time_string):
    # Parse timestamp expressed in hours
    time_split = time_string.split(':')
    hour = int(time_split[0])
    if hour < 12:
        return '10-12'
    elif hour > 12 and hour < 15:
        return '12-15'
    else:
        return '15-after'
    
def get_trainer_jockey_profile(df, individual):
    # Merge with trainer/jockey dataframe
    assert individual in ['trainer', 'jockey']
    if individual == 'trainer':
        merge_df = trainer_df
    elif individual == 'jockey':
        merge_df = jockey_df
    df = df.merge(merge_df[['%s_id' % individual, 'date_of_birth', 'place_of_birth']], 
                  on='%s_id' % individual, suffixes=['', '_%s' % individual])
    df['run_date'] = df['run_date'].apply(lambda x: pd.Timestamp(x))
    df['date_of_birth'] = df['date_of_birth'].apply(lambda x: pd.Timestamp(x))
    df['%s_age' % individual] = df['run_date'].subtract(df['date_of_birth']).dt.days / 365.0
    df.drop(['date_of_birth'], axis=1, inplace=True)
    df['place_of_birth_%s' % individual] = df['place_of_birth_%s' % individual].apply(lambda x: 'tokyo' if x == u'東京都' \
                                                                                      else 'outside_tokyo')
    return df

def feature_engineer(race_df, dummy=True, drop_columns=True):
    
    new_df = race_df.copy()

    # Feature engineering
    has_horse_weight = new_df['horse_weight'].apply(lambda x: bool(re.search(r'(\d+)\(.+\)', x)))
    new_df = new_df[has_horse_weight]
    new_df['horse_weight_increase'] = new_df['horse_weight'].apply(lambda x: re.search(r'\(.?(\d+)\)', x).group(1))
    new_df['horse_weight_increase'] = new_df['horse_weight_increase'].astype(float)
    new_df['horse_weight'] = new_df['horse_weight'].apply(lambda x: re.search(r'(\d+)\(.+\)', x).group(1))
    new_df['horse_weight'] = new_df['horse_weight'].astype(float)

    new_df['time'] = new_df['time'].apply(lambda x: parse_time_stamp(x))

    for individual in ['jockey', 'trainer']:
        new_df = get_trainer_jockey_profile(new_df, individual)

    # Get dummy columns
    if dummy:
        dummied_cols = ['place', 'type', 'track', 'weather', 'condition', 'gender', 'breed', 'bracket', 'horse_number', 
                        'time', 'place_of_birth_jockey', 'place_of_birth_trainer']
        for cols in dummied_cols:
            new_df = new_df.join(get_dummies_order_by_count(new_df, 
                                                           cols).rename(columns=lambda x: '-'.join([cols, str(x)])))
            try:
                new_df.drop(cols, axis=1, inplace=True)
            except ValueError:
                continue

    # Drop some other columns
    columns_to_drop_again = ['finishing_position', 'corner_position', 'run_time_last_600', 
                             'jockey_id', 'owner_id', 'trainer_id', 'breeder_id', 
                             'parents', 'age_int', 'place_of_birth']
    if drop_columns:
        for cols in columns_to_drop_again:
            try:
                new_df.drop(cols, axis=1, inplace=True)
            except ValueError:
                continue
    
    return new_df.sort_values(['horse_id', 'run_date']).set_index(['horse_id', 'run_date'])

## 4.2 Regression Analysis

### 4.2.1 OLS for First Occurence Race

In [4]:
new_df_first = feature_engineer(first_occur_df)
X_first = new_df_first.loc[:, new_df_first.columns != 'run_time_1000']
y_first = new_df_first.loc[:, 'run_time_1000']
X_first = sm.add_constant(X_first)
results = sm.OLS(y_first, X_first).fit()
results.summary()

0,1,2,3
Dep. Variable:,run_time_1000,R-squared:,0.426
Model:,OLS,Adj. R-squared:,0.425
Method:,Least Squares,F-statistic:,971.6
Date:,"Thu, 08 Mar 2018",Prob (F-statistic):,0.0
Time:,19:28:27,Log-Likelihood:,-172260.0
No. Observations:,86622,AIC:,344600.0
Df Residuals:,86555,BIC:,345300.0
Df Model:,66,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,60.2242,0.633,95.151,0.000,58.984,61.465
jockey_weight,0.0838,0.007,12.578,0.000,0.071,0.097
win_odds,0.0003,0.000,2.471,0.013,5.54e-05,0.000
win_fav,0.1028,0.002,45.776,0.000,0.098,0.107
horse_weight,0.0074,0.000,33.384,0.000,0.007,0.008
curr_age,-0.2335,0.013,-17.429,0.000,-0.260,-0.207
horse_weight_increase,-0.0534,0.002,-25.199,0.000,-0.058,-0.049
jockey_age,-0.0042,0.001,-4.859,0.000,-0.006,-0.002
trainer_age,-0.0024,0.001,-3.542,0.000,-0.004,-0.001

0,1,2,3
Omnibus:,102675.695,Durbin-Watson:,1.962
Prob(Omnibus):,0.0,Jarque-Bera (JB):,226979364.451
Skew:,5.268,Prob(JB):,0.0
Kurtosis:,253.554,Cond. No.,1e+16


### 4.2.2 OLS for Full Race

In [5]:
new_df_full = feature_engineer(horse_race_df)
X_full = new_df_full.loc[:, new_df_full.columns != 'run_time_1000']
y_full = new_df_full.loc[:, 'run_time_1000']
X_full = sm.add_constant(X_full)
results = sm.OLS(y_full, X_full).fit()
results.summary()

0,1,2,3
Dep. Variable:,run_time_1000,R-squared:,0.557
Model:,OLS,Adj. R-squared:,0.557
Method:,Least Squares,F-statistic:,16350.0
Date:,"Thu, 08 Mar 2018",Prob (F-statistic):,0.0
Time:,19:29:19,Log-Likelihood:,-1658300.0
No. Observations:,857855,AIC:,3317000.0
Df Residuals:,857788,BIC:,3317000.0
Df Model:,66,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,44.6347,0.142,314.697,0.000,44.357,44.913
jockey_weight,0.0207,0.002,13.127,0.000,0.018,0.024
win_odds,0.0015,2.91e-05,51.592,0.000,0.001,0.002
win_fav,0.0670,0.001,106.068,0.000,0.066,0.068
horse_weight,0.0019,7.04e-05,26.474,0.000,0.002,0.002
curr_age,-0.3122,0.002,-174.727,0.000,-0.316,-0.309
horse_weight_increase,-0.0095,0.000,-24.545,0.000,-0.010,-0.009
jockey_age,-0.0038,0.000,-15.223,0.000,-0.004,-0.003
trainer_age,-0.0036,0.000,-17.275,0.000,-0.004,-0.003

0,1,2,3
Omnibus:,379363.098,Durbin-Watson:,1.185
Prob(Omnibus):,0.0,Jarque-Bera (JB):,50886002.269
Skew:,1.111,Prob(JB):,0.0
Kurtosis:,40.666,Cond. No.,3900000000000000.0


In [6]:
horse_race_df_grp_by = horse_race_df.set_index(['horse_id', 'run_date'])
horse_race_df_grp_by['run_time_diff'] = horse_race_df_grp_by['run_time_1000'].diff()
horse_race_df_grp_by = horse_race_df_grp_by[~horse_race_df_grp_by.index.isin(first_occur_df.set_index(['horse_id', 
                                                                                                       'run_date']).index)]
horse_race_df_grp_by.reset_index(inplace=True)
new_df_full_diff = feature_engineer(horse_race_df_grp_by)
new_df_full_diff['last_run_time'] = new_df_full_diff['run_time_1000'] - new_df_full_diff['run_time_diff']
new_df_full_diff.drop('run_time_diff', inplace=True, axis=1)

X_full_diff = new_df_full_diff.loc[:, new_df_full_diff.columns != 'run_time_1000']
y_full_diff = new_df_full_diff.loc[:, 'run_time_1000']
X_full_diff = sm.add_constant(X_full_diff)
results_diff = sm.OLS(y_full_diff, X_full_diff).fit()
results_diff.summary()

0,1,2,3
Dep. Variable:,run_time_1000,R-squared:,0.64
Model:,OLS,Adj. R-squared:,0.64
Method:,Least Squares,F-statistic:,20450.0
Date:,"Thu, 08 Mar 2018",Prob (F-statistic):,0.0
Time:,19:30:07,Log-Likelihood:,-1414600.0
No. Observations:,771242,AIC:,2829000.0
Df Residuals:,771174,BIC:,2830000.0
Df Model:,67,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,30.5710,0.142,215.162,0.000,30.293,30.850
jockey_weight,0.0078,0.001,5.235,0.000,0.005,0.011
win_odds,0.0009,2.76e-05,33.620,0.000,0.001,0.001
win_fav,0.0360,0.001,59.542,0.000,0.035,0.037
horse_weight,-0.0004,6.79e-05,-5.214,0.000,-0.000,-0.000
curr_age,-0.1599,0.002,-91.514,0.000,-0.163,-0.156
horse_weight_increase,0.0021,0.000,5.620,0.000,0.001,0.003
jockey_age,-0.0026,0.000,-10.811,0.000,-0.003,-0.002
trainer_age,-0.0015,0.000,-7.381,0.000,-0.002,-0.001

0,1,2,3
Omnibus:,189316.328,Durbin-Watson:,1.811
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3523847.206
Skew:,0.709,Prob(JB):,0.0
Kurtosis:,13.375,Cond. No.,3850000000000000.0


## 4.3 Regression Models

In [7]:
max_window = 3
try:
    df_combined = pd.read_csv('data/horse_race_combined.csv', low_memory=False, index_col=0)
    df_combined['run_date'] = df_combined['run_date'].apply(lambda x: pd.Timestamp(x))
    df_combined.set_index(['horse_id', 'run_date'], inplace=True)
    dependent = ['run_time_1000',
                 'run_time_diff', 'run_time_quo', 
                 'run_time_mean', 'run_time_median'] + \
                ['run_time_ma_window_%s' % str(idx) for idx in range(2, max_window + 1)] + \
                ['run_time_ewma_window_%s' % str(idx) for idx in range(2, max_window + 1)]
    df_combined_y = df_combined[dependent].copy()
    df_combined_x = df_combined[list(filter(lambda x: x not in dependent, df_combined.columns))].copy()
    df_y_original_dict = {}
    df_y_original_dict['run_time_diff'] = df_combined_x['last_run_time']
    df_y_original_dict['run_time_quo'] = df_combined_x['last_run_time']
    for col_name in dependent[3:]:
        df_combined_y[col_name + '_diff'] = df_combined_y['run_time_1000'] - df_combined_y[col_name]
        df_combined_y[col_name + '_quo'] = df_combined_y['run_time_1000'] / df_combined_y[col_name]
        df_y_original_dict[col_name + '_diff'] = df_combined_y[col_name]
        df_y_original_dict[col_name + '_quo'] = df_combined_y[col_name]
        df_combined_y.drop(col_name, axis=1, inplace=True)
except FileNotFoundError:
    df_combined = horse_race_df.set_index(['horse_id', 'run_date'])
    df_combined['run_time_diff'] = df_combined['run_time_1000'].diff()
    df_combined['last_run_time'] = df_combined['run_time_1000'] - df_combined['run_time_diff']
    df_combined['run_time_quo'] = df_combined['run_time_1000'] / df_combined['last_run_time']
    df_combined['run_time_mean'] = df_combined['last_run_time']

    df_reset = df_combined['run_time_mean'].reset_index()
    horse_id_lst = list(df_reset['horse_id'])
    run_time_mean_lst = list(df_reset['run_time_mean'])
    new_run_time_mean_lst = []
    new_run_time_median_lst = []
    curr_index = horse_id_lst[0]
    curr_count = 0
    curr_sum = 0
    curr_stored = []
    for index, value in zip(horse_id_lst, run_time_mean_lst):
        if index != curr_index:
            curr_count = 1
            curr_sum = value
            curr_index = index
            curr_stored = [value]
        else:
            curr_count += 1
            curr_sum += value
            curr_stored.append(value)
        new_run_time_mean_lst.append(curr_sum / (curr_count * 1.0))
        new_run_time_median_lst.append(np.median(curr_stored))
    df_combined['run_time_mean'] = pd.Series(new_run_time_mean_lst, index=df_combined.index)
    df_combined['run_time_median'] = pd.Series(new_run_time_median_lst, index=df_combined.index)

    for window in range(2, max_window + 1):
        ma = df_combined.groupby(level=0)['run_time_1000'].rolling(window).mean().groupby(level=0).shift(1)
        ma = ma.reset_index(level=1)['run_time_1000'].reset_index()
        ewma = df_combined.groupby(level=0)['run_time_1000'].apply(lambda series: series.ewm(ignore_na=True, 
                                                                                             min_periods=window, 
                                                                                             adjust=True,
                                                                                             com=0.030927835051546).mean())
        ewma = ewma.groupby(level=0).shift(1)
        df_combined['run_time_ma_window_%s' % str(window)] = ma.set_index(['horse_id', 'run_date'])['run_time_1000']
        df_combined['run_time_ewma_window_%s' % str(window)] = ewma
    df_combined.reset_index().to_csv('data/horse_race_combined.csv', encoding='utf-8')

In [8]:
df_combined_y[list(filter(lambda x: 'diff' in x or 'quo' in x, df_combined_y.columns))].T

horse_id,1986102130,1989107128,1989107128,1989107128,1990104469,1990104469,1991100019,1991100019,1991103654,1991103654,...,2015110099,2015110099,2015110099,2015110100,2015110100,2015110102,2015110106,2015110106,2015110106,2015110106
run_date,2000-01-29 15:40:00,2000-02-05 13:45:00,2000-03-11 15:10:00,2000-04-15 15:30:00,2000-03-18 11:45:00,2000-04-15 15:30:00,2000-02-12 15:45:00,2000-03-19 15:00:00,2000-02-26 11:50:00,2000-03-11 15:10:00,...,2017-12-24 10:35:00,2018-01-14 10:00:00,2018-02-04 10:10:00,2018-01-13 10:50:00,2018-02-04 10:25:00,2018-02-18 12:45:00,2017-11-25 10:35:00,2018-01-21 10:20:00,2018-01-27 12:50:00,2018-02-11 12:15:00
run_time_diff,1.642857,-1.146258,0.51049,2.697936,1.279356,1.83003,0.214286,-1.160714,-1.56714,0.568839,...,2.071429,0.583333,-0.22619,-0.297619,1.166667,-1.392857,3.371795,-2.690476,-2.580357,1.25
run_time_quo,1.027381,0.983224,1.007599,1.039856,1.019112,1.026826,1.003567,0.980746,0.977388,1.008398,...,1.034856,1.009485,0.996357,0.995138,1.019152,0.977625,1.053982,0.959132,0.959135,1.02064
run_time_mean_diff,1.642857,-1.146258,-0.062639,2.656177,1.279356,2.469709,0.214286,-1.053571,-1.56714,-0.214731,...,0.392857,0.845238,0.407738,-0.297619,1.017857,-1.392857,3.371795,-1.004579,-3.250076,-1.187557
run_time_mean_quo,1.027381,0.983224,0.999076,1.039215,1.019112,1.036545,1.003567,0.982493,0.977388,0.996866,...,1.006429,1.013802,1.006635,0.995138,1.016668,0.977625,1.053982,0.98434,0.949068,0.98115
run_time_median_diff,1.642857,-1.146258,-0.062639,2.697936,1.279356,2.469709,0.214286,-1.053571,-1.56714,-0.214731,...,0.392857,0.583333,0.065476,-0.297619,1.017857,-1.392857,3.371795,-1.004579,-2.580357,-0.989698
run_time_median_quo,1.027381,0.983224,0.999076,1.039856,1.019112,1.036545,1.003567,0.982493,0.977388,0.996866,...,1.006429,1.009485,1.00106,0.995138,1.016668,0.977625,1.053982,0.98434,0.959135,0.984241
run_time_ma_window_2_diff,,,0.255245,1.348968,,0.915015,,-0.580357,,0.284419,...,1.035714,0.291667,-0.113095,,0.583333,,,-1.345238,-1.290179,0.625
run_time_ma_window_2_quo,,,1.003785,1.019539,,1.013235,,0.99028,,1.004181,...,1.017129,1.00472,0.998175,,1.009485,,,0.97914,0.979141,1.010215
run_time_ma_window_3_diff,,,,1.968787,,,,,,,...,,1.079365,0.043651,,,,,,-2.617063,-0.026786
run_time_ma_window_3_quo,,,,1.028774,,,,,,,...,,1.017693,1.000706,,,,,,0.958577,0.999567


In [9]:
# Model testing for run time residual
x = feature_engineer(df_combined_x.reset_index())
x = x.drop('last_run_time', axis=1)
y = df_combined_y.loc[df_combined_y.index.isin(x.index), 'run_time_diff']

# OLS
reg = linear_model.LinearRegression(fit_intercept=False)
scores_reg = cross_val_score(reg, x, y, scoring='neg_mean_squared_error')
print("RMSE for OLS: %0.5f" % np.sqrt(-scores_reg.mean()))

RMSE for OLS: 2.03554


In [10]:
class ModelComparer(object): 
    
    no_drop_col = ['run_time_1000']
    
    def __init__(self, X_df, y_df, original_y_df_dict):
        self.X = feature_engineer(X_df.reset_index())
        self.y = y_df[y_df.index.isin(self.X.index)]
        self.y_original = original_y_df_dict
        self.run_time_serie = self.y['run_time_1000']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.3)
        self.model_dict = {}
        
    def add_model(self, model_method, model_name):
        
        print('Adding model named %s ' % model_name)
        self.model_dict[model_name] = {}
        
        for y_col_name in self.y.columns:
            X_train = self.X_train.copy()
            X_test = self.X_test.copy()
            model = model_method()
            self.model_dict[model_name]['Model Spec'] = repr(model)
            
            if y_col_name not in self.no_drop_col:
                X_train.drop('last_run_time', axis=1, inplace=True)
                X_test.drop('last_run_time', axis=1, inplace=True)
                
            y_train = self.y_train[y_col_name].dropna()
            y_test = self.y_test[y_col_name].dropna()
            
            X_train = X_train[X_train.index.isin(y_train.index)]
            X_test = X_test[X_test.index.isin(y_test.index)]
            
            print('Performing analysis on column %s for model %s (Size: %s)' % (y_col_name, model_name, str(X_train.shape)))
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            
            if y_col_name not in self.no_drop_col:
                y_pred = pd.Series(y_pred, index=X_test.index)
                original_serie = df_y_original_dict[y_col_name]
                original_serie = original_serie[original_serie.index.isin(y_pred.index)]
                run_time_serie = self.run_time_serie[self.run_time_serie.index.isin(y_pred.index)]
                if 'quo' in y_col_name:
                    y_pred = y_pred * original_serie
                elif 'diff' in y_col_name:
                    y_pred = y_pred + original_serie
                self.model_dict[model_name]['Transformed RMSE: %s (%s)' % (y_col_name, 
                                                                           X_train.shape[0])] = '%.6f' % self.get_rmse(y_pred, 
                                                                                                                       run_time_serie)  
            else:
                self.model_dict[model_name]['Original RMSE: %s (%s)' % (y_col_name, 
                                                                        X_train.shape[0])] = '%.6f' % self.get_rmse(y_pred, 
                                                                                                                    y_test)
        
    def get_report(self):
        try:
            df = pd.DataFrame.from_dict(self.model_dict)
            return df.sort_values(df.columns[0])
        except IndexError:
            return
    
    @staticmethod
    def get_rmse(y_true, y_pred):
        diff = np.sum(np.abs(y_true - y_pred) ** 2)
        return (diff / y_true.shape[0]) ** 1/2
    
mc = ModelComparer(df_combined_x, df_combined_y, df_y_original_dict)

In [11]:
def get_ols_base(**kwargs):
    reg = linear_model.LinearRegression(fit_intercept=False)
    return reg

def get_xgboost_base(**kwargs):
    xgb_model = xgb.XGBRegressor(learning_rate=0.01)
    return xgb_model

def get_decision_tree_base(**kwargs):
    dt = DecisionTreeRegressor(max_depth=6)
    return dt

def get_random_forest_base(**kwargs):
    regr = RandomForestRegressor(max_depth=6)
    return regr

def get_gbm_base(**kwargs):
    params = {'n_estimators': 500, 
              'max_depth': 4, 
              'min_samples_split': 2, 
              'learning_rate': 0.01, 
              'loss': 'ls'}
    clf = GradientBoostingRegressor(**params)
    return clf

def get_ann_base(**kwargs):
    mlp = MLPRegressor(max_iter=1500)
    return mlp

mc.add_model(get_ols_base, 'OLS - Base Model')
mc.add_model(get_xgboost_base, 'XGB - Base Model (0.01LR)')
mc.add_model(get_decision_tree_base, 'DT - Base Model (6MD)')
mc.add_model(get_random_forest_base, 'RF - Base Model (6MD)')
mc.add_model(get_gbm_base, 'GBM - Base Model (4MD, 0.01LR)')
mc.add_model(get_ann_base, 'ANN - Base Model (1500MI)')

Adding model named OLS - Base Model 
Performing analysis on column run_time_1000 for model OLS - Base Model (Size: (539869, 68))
Performing analysis on column run_time_diff for model OLS - Base Model (Size: (539869, 67))
Performing analysis on column run_time_quo for model OLS - Base Model (Size: (539869, 67))
Performing analysis on column run_time_mean_diff for model OLS - Base Model (Size: (539869, 67))
Performing analysis on column run_time_mean_quo for model OLS - Base Model (Size: (539869, 67))
Performing analysis on column run_time_median_diff for model OLS - Base Model (Size: (539869, 67))
Performing analysis on column run_time_median_quo for model OLS - Base Model (Size: (539869, 67))
Performing analysis on column run_time_ma_window_2_diff for model OLS - Base Model (Size: (484666, 67))
Performing analysis on column run_time_ma_window_2_quo for model OLS - Base Model (Size: (484666, 67))
Performing analysis on column run_time_ma_window_3_diff for model OLS - Base Model (Size: (

Adding model named ANN - Base Model (1500MI) 
Performing analysis on column run_time_1000 for model ANN - Base Model (1500MI) (Size: (539869, 68))
Performing analysis on column run_time_diff for model ANN - Base Model (1500MI) (Size: (539869, 67))
Performing analysis on column run_time_quo for model ANN - Base Model (1500MI) (Size: (539869, 67))
Performing analysis on column run_time_mean_diff for model ANN - Base Model (1500MI) (Size: (539869, 67))
Performing analysis on column run_time_mean_quo for model ANN - Base Model (1500MI) (Size: (539869, 67))
Performing analysis on column run_time_median_diff for model ANN - Base Model (1500MI) (Size: (539869, 67))
Performing analysis on column run_time_median_quo for model ANN - Base Model (1500MI) (Size: (539869, 67))
Performing analysis on column run_time_ma_window_2_diff for model ANN - Base Model (1500MI) (Size: (484666, 67))
Performing analysis on column run_time_ma_window_2_quo for model ANN - Base Model (1500MI) (Size: (484666, 67))
P

In [12]:
mc.get_report()

Unnamed: 0,ANN - Base Model (1500MI),DT - Base Model (6MD),"GBM - Base Model (4MD, 0.01LR)",OLS - Base Model,RF - Base Model (6MD),XGB - Base Model (0.01LR)
Transformed RMSE: run_time_ewma_window_2_diff (484666),0.002497,0.001698,0.001681,0.001772,0.001687,0.018421
Transformed RMSE: run_time_ewma_window_3_diff (484666),0.135396,0.001698,0.001681,0.001772,0.001688,0.018421
Transformed RMSE: run_time_ma_window_2_diff (484666),0.522305,0.486719,0.482178,0.507604,0.484101,0.524914
Transformed RMSE: run_time_ma_window_3_diff (435322),0.678377,0.675069,0.664348,0.715875,0.671649,0.736623
Original RMSE: run_time_1000 (539869),1.031564,1.078805,0.999206,1.153729,1.066123,252.930681
Transformed RMSE: run_time_mean_diff (539869),1.379783,1.415373,1.387710,1.479975,1.408421,1.605763
Transformed RMSE: run_time_median_diff (539869),1.500340,1.512756,1.486612,1.578168,1.505330,1.697243
Transformed RMSE: run_time_diff (539869),1.969845,1.985402,1.968545,2.069507,1.975807,2.095074
Transformed RMSE: run_time_ma_window_3_quo (435322),12.438364,0.676686,0.665647,0.717149,0.672827,64.454447
Transformed RMSE: run_time_ewma_window_2_quo (484666),12.994731,0.001693,0.001677,0.001769,0.001684,63.931375
