In [1]:
import pandas as pd
import sqlite3

def select_return_table(table_name, cursor=None):
    """Load every row of one SQLite table into a pandas DataFrame.

    Parameters
    ----------
    table_name : str
        Name of the table to read.
    cursor : sqlite3.Cursor, optional
        Cursor to run the query on. When omitted, the notebook-level ``curs``
        cursor is used, so existing one-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per record, with columns named after the table's columns.
    """
    if cursor is None:
        # Fall back to the notebook-global cursor
        cursor = curs
    # Identifiers cannot be bound as SQL parameters, so quote the table name
    # (doubling any embedded double quote) instead of interpolating it raw.
    quoted_name = '"%s"' % table_name.replace('"', '""')
    cursor.execute('SELECT * FROM %s' % quoted_name)
    # cursor.description already carries the result column names, so a second
    # PRAGMA table_info round trip is unnecessary.
    column = [field[0] for field in cursor.description]
    return pd.DataFrame(cursor.fetchall(), columns=column)

def get_missing_value_perc(df, cond=lambda x: x == 'null'):
    """Count the cells flagged as missing in each column and their share of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    cond : callable, optional
        Element-wise predicate returning True for a missing cell. By default a
        cell is missing when it equals the string ``'null'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of ``df`` with two columns: ``"Missing Value"``
        (number of flagged cells) and ``"Missing Value (%)"`` (that number as a
        percentage of the row count, formatted with two decimals).
    """
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; look the method up on the class so a data column that
    # happens to be called "map" cannot shadow it.
    if hasattr(pd.DataFrame, 'map'):
        elementwise = pd.DataFrame.map
    else:
        elementwise = pd.DataFrame.applymap
    # Evaluate the predicate once and reuse the resulting mask
    mask = elementwise(df, cond)
    df_sum = mask.sum()
    # Every column has len(df) cells, whatever predicate is used
    df_percentage = df_sum / len(df)
    df_percentage = df_percentage.apply(lambda x: "{0:.2f}%".format(x * 100))
    return pd.concat([df_sum, df_percentage], axis=1, keys=["Missing Value", "Missing Value (%)"])

## I. Database Connection

In [2]:
# Establish database connection and list the tables the database contains
conn = sqlite3.connect('temp/race.db')
curs = conn.cursor()
# SQL string literals take single quotes. SQLite accepts the double-quoted form
# only as a legacy fallback (double quotes denote identifiers), and that
# fallback can be disabled when SQLite is built.
table_name = curs.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
# Each element is a 1-tuple holding one table name
print(table_name)

[('race_record',), ('horse_record',), ('individual_record',), ('trainer_profile',), ('jockey_profile',)]


In [3]:
# Load each table listed in `table_name` into its own DataFrame, keyed by table name
record_dict = {name: select_return_table(name) for (name,) in table_name}

# Bind one notebook-level name per table for the sections that follow
race_df, horse_df, individual_df, trainer_df, jockey_df = (
    record_dict['race_record'],
    record_dict['horse_record'],
    record_dict['individual_record'],
    record_dict['trainer_profile'],
    record_dict['jockey_profile'],
)

## II. Integrity Check

### i. Race Record

In [4]:
# Snapshot of the race_record dataframe: print its (rows, columns) shape,
# then display describe() transposed so that each column gets one row
print(race_df.shape)
race_df.describe().T

(287681, 27)


Unnamed: 0,count,unique,top,freq
run_date,287681,681,2002-04-21,547
place,287681,10,中山,41958
race,287681,12,3R,26656
title,287681,1642,3歳未勝利,67621
type,287681,3,ダ,140215
track,287681,4,右,189136
distance,287681,67,1200m,68470
weather,287681,6,晴,170843
condition,287681,4,良,225379
time,287681,85,12:50,8336


In [5]:
# Spot-check the number of distinct races per (run_date, place); the expectation
# is that (almost) every day/place pair has 12. sample() is unseeded, so the
# 10 groups displayed differ between runs.
race_count = curs.execute('SELECT DISTINCT run_date, place, race from race_record').fetchall()
race_count_df = pd.DataFrame(race_count, columns=['run_date', 'place', 'race'])
race_count_df.groupby(['run_date', 'place']).count().sample(n=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,race
run_date,place,Unnamed: 2_level_1
2005-12-10,中山,6
2000-07-15,小倉,12
2001-04-07,中山,12
2005-08-07,新潟,12
2005-04-03,中山,12
2001-10-13,東京,12
2001-07-21,小倉,12
2000-03-19,阪神,12
2006-08-13,新潟,6
2002-07-27,小倉,12


### ii. Horse Record

In [6]:
# Check the data columns of horse_record by displaying 3 randomly sampled rows
horse_df.sample(n=3)

Unnamed: 0,horse_name,date_of_birth,trainer,owner,breeder,place_of_birth,transaction_price,prize_obtained,race_record,highlight_race,relatives,parents,status,gender,breed,offer_info
1457,マイネルシェーバー,1997年3月1日,田口輝彦 (笠松),足立達昭,飛渡牧場,新冠町,-,"1,250万円 (中央) /1,671万円 (地方)",77戦9勝 [ 9-9-6-53 ],02'報知中津特別(OP),ハワイアンエルフ 、 イシノハワイアン,リンドシェーバー キクカエルフ,,セ,鹿毛,
14419,シンボリスキャン,1999年4月18日,斎藤宏 (美浦),高橋一郎,シンボリ牧場,門別町,-,510万円 (中央) /854万円 (地方),66戦18勝 [ 18-16-7-25 ],02'3歳未出走,シンボリプレオ 、 マツニシキ,スキャン ビューティゲラン,抹消,牡,黒鹿毛,
30147,グラインダー,2003年3月4日,菅原欣也 (栃木),戸崎利彦,藤原牧場,静内町,-,870万円 (中央) /18万円 (地方),25戦2勝 [ 2-0-1-22 ],08'3歳上500万下,カサロス 、 ディスパージョン,ラムタラ ダイヤモンドロッチ,,牡,栗毛,


In [7]:
# Snapshot of the horse_record dataframe: print its (rows, columns) shape,
# then display describe() transposed so that each column gets one row
print(horse_df.shape)
horse_df.describe().T

(32648, 16)


Unnamed: 0,count,unique,top,freq
horse_name,32648,32648,セレスエンブレム,1
date_of_birth,32648,1905,2002年4月3日,72
trainer,32648,1447,藤沢和雄 (美浦),147
owner,32648,4677,サンデーレーシング,493
breeder,32648,3017,ノーザンファーム,1175
place_of_birth,32648,92,浦河町,6205
transaction_price,32648,2771,-,27523
prize_obtained,32648,15805,0万円,5213
race_record,32648,12211,2戦0勝 [ 0-0-0-2 ],1167
highlight_race,32648,6651,,10355


### iii. Individual Record

In [8]:
# Check the data columns of individual_record by displaying 3 randomly sampled rows
individual_df.sample(n=3)

Unnamed: 0,individual_type,name,year,rank,first,second,third,out,races_major,wins_major,...,wins_flat,races_grass,wins_grass,races_dirt,wins_dirt,wins_percent,wins_percent_2nd,wins_percent_3rd,prize_obtained,representative_horse
46621,馬主,名駿,1998,454,1,2,2,17,0,0,...,1,18,1,4,0,0.045,0.136,0.227,2919.7,グランドウィン
43904,生産者,佐藤信広,2010,574,0,2,1,26,0,0,...,0,11,0,18,0,0.0,0.069,0.103,2014.6,アドバンスヘイロー
4016,生産者,斉藤スタッド,2002,464,2,3,6,27,1,0,...,2,22,1,17,1,0.051,0.128,0.282,3189.1,アマートベンハー


In [9]:
# Snapshot of the individual_record dataframe: print its (rows, columns) shape,
# then display describe() transposed so that each column gets one row
print(individual_df.shape)
individual_df.describe().T

(84936, 23)


Unnamed: 0,count,unique,top,freq
individual_type,84936,4,馬主,35940
name,84936,7384,村田牧場,33
year,84936,33,2000,4159
rank,84936,1499,1313,736
first,84936,190,0,34471
second,84936,164,0,36226
third,84936,152,0,34665
out,84936,744,1,5995
races_major,84936,124,0,56020
wins_major,84936,29,0,78458


### iv. Trainer Profile

In [10]:
# Check the data columns of trainer_profile by displaying 3 randomly sampled rows
trainer_df.sample(n=3)

Unnamed: 0,trainer_name,date_of_birth,place_of_birth,first_run_date,first_run_horse,first_win_date,first_win_horse
93,[東]高松邦男,1948/02/26,千葉県,1979/03/03,キリープリンス,1979/05/06,キョウエイジョージ
568,[地]工藤勉,1959/08/29,,,,,
12,[東]佐藤吉勝,1957/11/04,福島県,1999/03/07,ドミニカシチー,1999/04/17,ニシノボルドー


In [11]:
# Snapshot of the trainer_profile dataframe: print its (rows, columns) shape,
# then display describe() transposed so that each column gets one row
print(trainer_df.shape)
trainer_df.describe().T

(784, 7)


Unnamed: 0,count,unique,top,freq
trainer_name,784,784,[地]斉藤尭,1
date_of_birth,784,756,1956/10/09,2
place_of_birth,784,36,,563
first_run_date,784,154,,565
first_run_horse,784,220,,565
first_win_date,784,208,,565
first_win_horse,784,220,,565


### v. Jockey Profile

In [12]:
# Check the data columns of jockey_profile by displaying 3 randomly sampled rows
jockey_df.sample(n=3)

Unnamed: 0,jockey_name,date_of_birth,place_of_birth,blood_type,height,weight,first_flat_run_date,first_flat_run_horse,first_flat_win_date,first_flat_win_horse,first_obs_run_date,first_obs_run_horse,first_obs_win_date,first_obs_win_horse
578,納谷和玖,1973/11/22,,,,,2006/04/16,ワイエスハッスル,,,,,,
213,寺島祐治,1968/03/12,,,,,1987/03/07,クボノブレーブ,1987/04/19,シービースマイル,,,,
326,吉田順治,1968/01/12,,,,,1997/08/16,ワンダーアモン,,,,,,


In [13]:
# Snapshot of the jockey_profile dataframe: print its (rows, columns) shape,
# then display describe() transposed so that each column gets one row
print(jockey_df.shape)
jockey_df.describe().T

(593, 14)


Unnamed: 0,count,unique,top,freq
jockey_name,593,593,田辺弘,1
date_of_birth,593,580,1977/03/02,2
place_of_birth,593,36,,408
blood_type,593,5,,410
height,593,23,,408
weight,593,16,,408
first_flat_run_date,593,316,,29
first_flat_run_horse,593,557,,29
first_flat_win_date,593,318,,244
first_flat_win_horse,593,348,,244


## III. Preprocessing

### i. Missing Value

In [14]:
# Check missing values in race_record: per-column count and percentage of
# cells equal to the string 'null' (the helper's default condition)
get_missing_value_perc(race_df)

Unnamed: 0,Missing Value,Missing Value (%)
run_date,0,0.00%
place,0,0.00%
race,0,0.00%
title,0,0.00%
type,0,0.00%
track,0,0.00%
distance,0,0.00%
weather,0,0.00%
condition,0,0.00%
time,0,0.00%


In [15]:
# Check missing values in horse_record: per-column count and percentage of
# cells equal to the string 'null' (the helper's default condition)
get_missing_value_perc(horse_df)

Unnamed: 0,Missing Value,Missing Value (%)
horse_name,0,0.00%
date_of_birth,0,0.00%
trainer,0,0.00%
owner,0,0.00%
breeder,0,0.00%
place_of_birth,0,0.00%
transaction_price,0,0.00%
prize_obtained,0,0.00%
race_record,0,0.00%
highlight_race,0,0.00%


In [16]:
# Check missing values in individual_record: per-column count and percentage of
# cells equal to the string 'null' (the helper's default condition)
get_missing_value_perc(individual_df)

Unnamed: 0,Missing Value,Missing Value (%)
individual_type,0,0.00%
name,0,0.00%
year,0,0.00%
rank,0,0.00%
first,63,0.07%
second,63,0.07%
third,63,0.07%
out,63,0.07%
races_major,63,0.07%
wins_major,63,0.07%


In [17]:
# Check missing values in trainer_profile: per-column count and percentage of
# cells equal to the string 'null' (the helper's default condition)
get_missing_value_perc(trainer_df)

Unnamed: 0,Missing Value,Missing Value (%)
trainer_name,0,0.00%
date_of_birth,0,0.00%
place_of_birth,563,71.81%
first_run_date,565,72.07%
first_run_horse,565,72.07%
first_win_date,565,72.07%
first_win_horse,565,72.07%


In [18]:
# Check missing values in jockey_profile: per-column count and percentage of
# cells equal to the string 'null' (the helper's default condition)
get_missing_value_perc(jockey_df)

Unnamed: 0,Missing Value,Missing Value (%)
jockey_name,0,0.00%
date_of_birth,0,0.00%
place_of_birth,408,68.80%
blood_type,410,69.14%
height,408,68.80%
weight,408,68.80%
first_flat_run_date,29,4.89%
first_flat_run_horse,29,4.89%
first_flat_win_date,244,41.15%
first_flat_win_horse,244,41.15%
