# Modelling

In [1]:
import pandas as pd
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
import matplotlib as plt
import pickle
import os

In [2]:
# Directory (relative path) that holds the pickled inputs loaded below and
# receives the modelling CSV written by the final cell.
DATA_DIR = '../data'

## Load Target

In [3]:
# Load the target table (winning box per race_id).
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector.
# NOTE: pickle can execute arbitrary code while loading - only read files
# produced by this project.
with open(os.path.join(DATA_DIR, 'race_winner.p'), 'rb') as fh:
    race_winner = pickle.load(fh)
race_winner

Unnamed: 0_level_0,winning_box
race_id,Unnamed: 1_level_1
2539775,3
2851623,5
2539776,3
2851624,4
2539777,1
...,...
2539770,1
2539771,4
2539772,2
2539773,1


## Load Filters

In [4]:
# Load the runners_6 filter (keyed by race_id).
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector.
# NOTE: pickle can execute arbitrary code while loading - only read files
# produced by this project.
with open(os.path.join(DATA_DIR, 'runners_6.p'), 'rb') as fh:
    runners_6 = pickle.load(fh)
runners_6

16022
16809
16895
16896
16897
...
4642019
4642020
4642021
4642022
4642023


In [5]:
# Load the grade table (race date_time per race_id).
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector.
# NOTE: pickle can execute arbitrary code while loading - only read files
# produced by this project.
with open(os.path.join(DATA_DIR, 'grade.p'), 'rb') as fh:
    grade = pickle.load(fh)
grade

Unnamed: 0_level_0,date_time
race_id,Unnamed: 1_level_1
2539774,2008-11-15 20:58:00+00:00
2539775,2008-11-15 21:14:00+00:00
2851623,2010-08-30 10:07:00+00:00
2851624,2010-08-30 10:23:00+00:00
2539777,2008-11-15 21:45:00+00:00
...,...
2539765,2008-11-15 18:38:00+00:00
2539766,2008-11-15 18:56:00+00:00
2539767,2008-11-15 19:11:00+00:00
2539769,2008-11-15 19:42:00+00:00


## Load Benchmark

In [6]:
# Load the benchmark pick (one box number per race_id).
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector.
# NOTE: pickle can execute arbitrary code while loading - only read files
# produced by this project.
with open(os.path.join(DATA_DIR, 'benchmark.p'), 'rb') as fh:
    benchmark = pickle.load(fh)
benchmark

Unnamed: 0_level_0,benchmark
race_id,Unnamed: 1_level_1
16022,1
16809,2
16895,4
16896,5
16897,2
...,...
4642020,4
4642021,3
4642022,6
4642023,3


## Load Features

In [7]:
# Load the pre-computed feature bundle.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector.
# NOTE: pickle can execute arbitrary code while loading - only read files
# produced by this project.
with open(os.path.join(DATA_DIR, 'features_dict_25days.p'), 'rb') as fh:
    features_dict = pickle.load(fh)

# 'features' is the collection of per-feature frames merged into the modelling table below.
features_list = features_dict['features']
# 'no_qual_races' is kept for reference; it is not used in the cells below.
no_qual_races = features_dict['no_qual_races']

## Join Features

In [8]:
# Build the base table by joining on the shared index: start from `grade`,
# then merge in `runners_6`, the target (`race_winner`) and the `benchmark`.
# merge() defaults to an inner join, so only ids present in every input survive.
starting_point = (
    grade
    .merge(runners_6, left_index=True, right_index=True)
    .merge(race_winner, left_index=True, right_index=True)
    .merge(benchmark, left_index=True, right_index=True)
)
starting_point

Unnamed: 0_level_0,date_time,winning_box,benchmark
race_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2539774,2008-11-15 20:58:00+00:00,6,3
2539775,2008-11-15 21:14:00+00:00,3,2
2851623,2010-08-30 10:07:00+00:00,5,6
2851624,2010-08-30 10:23:00+00:00,4,5
2539777,2008-11-15 21:45:00+00:00,1,2
...,...,...,...
2539765,2008-11-15 18:38:00+00:00,1,6
2539766,2008-11-15 18:56:00+00:00,4,3
2539767,2008-11-15 19:11:00+00:00,6,6
2539769,2008-11-15 19:42:00+00:00,6,4


In [9]:
# Work on a copy so `starting_point` stays untouched, then fold every feature
# frame into it with an index-on-index merge (inner join by default).
modelling_data = starting_point.copy()
for feature_frame in features_list:
    modelling_data = pd.merge(
        modelling_data,
        feature_frame,
        left_index=True,
        right_index=True,
    )
modelling_data

Unnamed: 0_level_0,date_time,winning_box,benchmark,min_time_1,min_time_2,min_time_3,min_time_4,min_time_5,min_time_6,avg_time_1,...,pcnt_place_3,pcnt_place_4,pcnt_place_5,pcnt_place_6,pcnt_show_1,pcnt_show_2,pcnt_show_3,pcnt_show_4,pcnt_show_5,pcnt_show_6
race_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2539774,2008-11-15 20:58:00+00:00,6,3,29.08,28.95,29.13,28.89,28.76,29.74,29.176667,...,0.5,0.500000,1.0,0.000000,0.666667,1.000000,0.500000,0.500000,1.0,0.000000
2539775,2008-11-15 21:14:00+00:00,3,2,29.11,29.10,29.25,29.13,29.10,29.02,29.120000,...,0.0,0.500000,0.0,0.333333,0.333333,1.000000,0.666667,0.500000,0.0,0.666667
2851623,2010-08-30 10:07:00+00:00,5,6,30.02,29.95,30.74,30.23,29.84,29.78,30.415000,...,0.0,0.000000,0.5,0.333333,0.500000,0.000000,0.000000,0.000000,0.5,1.000000
2851624,2010-08-30 10:23:00+00:00,4,5,29.99,30.11,30.11,29.53,29.71,29.71,30.117500,...,0.0,0.666667,1.0,0.666667,0.000000,0.000000,0.000000,0.666667,1.0,0.666667
2539777,2008-11-15 21:45:00+00:00,1,2,,,31.08,,30.32,29.47,,...,0.0,,0.0,1.000000,,,0.000000,,0.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2539765,2008-11-15 18:38:00+00:00,1,6,28.85,29.00,29.13,29.10,29.50,28.87,28.865000,...,0.5,0.500000,0.0,0.333333,1.000000,0.666667,0.500000,0.750000,0.0,0.333333
2539766,2008-11-15 18:56:00+00:00,4,3,28.89,29.44,29.55,29.43,29.67,29.04,29.193333,...,0.0,0.000000,0.0,0.500000,1.000000,0.000000,0.000000,0.000000,0.0,0.500000
2539767,2008-11-15 19:11:00+00:00,6,6,29.65,29.71,30.10,29.52,29.95,29.56,29.890000,...,0.5,0.500000,0.0,0.333333,0.333333,0.500000,0.750000,0.500000,0.0,0.666667
2539769,2008-11-15 19:42:00+00:00,6,4,29.30,28.98,,28.99,29.52,29.57,29.300000,...,,0.666667,0.0,0.000000,1.000000,0.500000,,1.000000,0.0,0.666667


In [10]:
# Sanity check: list every column name that made it into the joined table.
modelling_data.columns.to_numpy()

array(['date_time', 'winning_box', 'benchmark', 'min_time_1',
       'min_time_2', 'min_time_3', 'min_time_4', 'min_time_5',
       'min_time_6', 'avg_time_1', 'avg_time_2', 'avg_time_3',
       'avg_time_4', 'avg_time_5', 'avg_time_6', 'min_stime_1',
       'min_stime_2', 'min_stime_3', 'min_stime_4', 'min_stime_5',
       'min_stime_6', 'avg_stime_1', 'avg_stime_2', 'avg_stime_3',
       'avg_stime_4', 'avg_stime_5', 'avg_stime_6', 'avg_fin_1',
       'avg_fin_2', 'avg_fin_3', 'avg_fin_4', 'avg_fin_5', 'avg_fin_6',
       'pcnt_win_1', 'pcnt_win_2', 'pcnt_win_3', 'pcnt_win_4',
       'pcnt_win_5', 'pcnt_win_6', 'pcnt_place_1', 'pcnt_place_2',
       'pcnt_place_3', 'pcnt_place_4', 'pcnt_place_5', 'pcnt_place_6',
       'pcnt_show_1', 'pcnt_show_2', 'pcnt_show_3', 'pcnt_show_4',
       'pcnt_show_5', 'pcnt_show_6'], dtype=object)

In [11]:
# target must start from 0
# The label columns hold box numbers that start at 1, so each is shifted down by one.
# The shift is guarded on the column's current minimum: once a column already
# starts at 0, re-running this cell leaves it alone instead of subtracting again
# (which would produce -1 labels).
# Caveat: the guard relies on the lowest box number being present in each column.
for label_col in ('winning_box', 'benchmark'):
    if modelling_data[label_col].min() >= 1:
        modelling_data[label_col] = modelling_data[label_col] - 1

## Store Modelling Data

In [12]:
# Persist the joined modelling table (index included) as CSV inside DATA_DIR.
modelling_data.to_csv(
    path_or_buf=os.path.join(DATA_DIR, 'modelling_data_25days.csv'),
)