In [1]:
import pandas as pd
import sqlite3

## I. Database Connection

In [2]:
# Establish the database connection and list the tables it contains
conn = sqlite3.connect('temp/race.db')
curs = conn.cursor()
# 'table' must be a single-quoted SQL string literal: double quotes denote an
# identifier in SQL and are only treated as a string through a legacy SQLite
# fallback that some builds disable.
# fetchall() yields one 1-tuple per table, e.g. ('race_record',)
table_name = curs.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
print(table_name)

[('race_record',), ('horse_record',), ('individual_record',), ('trainer_profile',), ('jockey_profile',)]


In [3]:
def select_return_table(table_name, cursor=None):
    """Load every row of a SQLite table into a pandas DataFrame.

    Parameters
    ----------
    table_name : str
        Name of the table to read.
    cursor : sqlite3.Cursor, optional
        Cursor to run the query on. When omitted, the notebook-level
        ``curs`` cursor is used, which keeps existing calls working.

    Returns
    -------
    pandas.DataFrame
        One row per record, with columns named after the table's columns.
        An empty table yields an empty frame that still has its columns.
    """
    active_cursor = curs if cursor is None else cursor
    # Table names cannot be bound as SQL parameters, so quote the identifier
    # explicitly (doubling any embedded quote). This keeps names containing
    # spaces or keywords working and stops the name from altering the query.
    quoted_name = '"%s"' % str(table_name).replace('"', '""')
    result = active_cursor.execute('SELECT * FROM %s' % quoted_name)
    # The result set already carries its column names, so no separate
    # PRAGMA table_info query is needed.
    column = [description[0] for description in result.description]
    return pd.DataFrame(result.fetchall(), columns=column)

# Materialise every table as a dataframe, keyed by its table name
# (each entry of table_name is a row whose first element is the name)
record_dict = dict(
    (row[0], select_return_table(row[0]))
    for row in table_name
)

# Convenience handles for the individual tables
race_df = record_dict['race_record']
horse_df = record_dict['horse_record']
individual_df = record_dict['individual_record']
trainer_df = record_dict['trainer_profile']
jockey_df = record_dict['jockey_profile']

## II. Integrity Check

### i. Race Record

In [4]:
# Snapshot of the race_record dataframe: its dimensions, then the
# describe() summary transposed so each column becomes a row
print(race_df.shape)
race_df.describe().transpose()

(287681, 27)


Unnamed: 0,count,unique,top,freq
run_date,287681,681,2005-05-21,547
place,287681,10,中山,41958
race,287681,12,3R,26656
title,287681,1642,3歳未勝利,67621
type,287681,3,ダ,140215
track,287681,4,右,189136
distance,287681,67,1200m,68470
weather,287681,6,晴,170843
condition,287681,4,良,225379
time,287681,85,12:50,8336


In [5]:
# Spot-check that (almost) all race days at a given place have 12 distinct races.
# Only 10 randomly chosen (run_date, place) groups are displayed; random_state is
# fixed so Restart & Run All shows the same groups every time.
race_count = curs.execute('SELECT DISTINCT run_date, place, race from race_record').fetchall()
race_count_df = pd.DataFrame(race_count, columns=['run_date', 'place', 'race'])
race_count_df.groupby(['run_date', 'place']).count().sample(n=10, random_state=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,race
run_date,place,Unnamed: 2_level_1
2003-02-02,中山,12
2003-08-30,新潟,12
2005-07-24,函館,12
2004-07-18,新潟,12
2004-11-07,東京,12
2006-08-26,札幌,6
2004-09-26,中山,12
2006-09-03,小倉,6
2006-08-05,新潟,6
2004-07-03,函館,12


### ii. Horse Record

In [6]:
# Check the data columns on a few randomly chosen horse_record rows.
# random_state is fixed so a re-run displays the same rows.
horse_df.sample(n=3, random_state=0)

Unnamed: 0,horse_name,date_of_birth,trainer,owner,breeder,place_of_birth,transaction_price,prize_obtained,race_record,highlight_race,relatives,parents,status,gender,breed,offer_info
23818,ウォーターブリュー,2002年5月17日,武邦彦 (栗東),山岡良一,本巣牧場,浦河町,-,460万円 (中央),5戦0勝 [ 0-0-2-3 ],,ウォーターダグ 、 ウォーターベルーガ,ドリームウェル ウォーターリブ,抹消,牡,栗毛,
3609,ジェニアリータ,1997年1月24日,新関力 (美浦),伊達秀和,Hidekazu Date,米,-,770万円 (中央),14戦1勝 [ 1-0-1-12 ],00'3歳未勝利,ヘヴンリーブルー 、 ノースドール,Gone West ゴールデンソネット,抹消,牝,鹿毛,
1277,ノボユウユウ,1996年3月17日,諏訪富三 (美浦),池ばた,Stillmeadow Farm,米,-,"1,232万円 (中央) /150万円 (地方)",32戦1勝 [ 1-1-2-28 ],スピカ特別,,Woodman Glen Kate,抹消,牡,鹿毛,


In [7]:
# Snapshot of the horse_record dataframe: its dimensions, then the
# describe() summary transposed so each column becomes a row
print(horse_df.shape)
horse_df.describe().transpose()

(32648, 16)


Unnamed: 0,count,unique,top,freq
horse_name,32648,32648,ウインスポット,1
date_of_birth,32648,1905,2002年4月3日,72
trainer,32648,1447,藤沢和雄 (美浦),147
owner,32648,4677,サンデーレーシング,493
breeder,32648,3017,ノーザンファーム,1175
place_of_birth,32648,92,浦河町,6205
transaction_price,32648,2771,-,27523
prize_obtained,32648,15805,0万円,5213
race_record,32648,12211,2戦0勝 [ 0-0-0-2 ],1167
highlight_race,32648,6651,,10355


### iii. Individual Record

In [8]:
# Check the data columns on a few randomly chosen individual_record rows.
# random_state is fixed so a re-run displays the same rows.
individual_df.sample(n=3, random_state=0)

Unnamed: 0,individual_type,name,year,rank,first,second,third,out,races_major,wins_major,...,wins_flat,races_grass,wins_grass,races_dirt,wins_dirt,wins_percent,wins_percent_2nd,wins_percent_3rd,prize_obtained,representative_horse
75835,生産者,小野与市,1991,564,2,0,0,5,0,0,...,0,1,0,6,2,0.286,0.286,0.286,2280.0,ハクバブリッジ
38299,生産者,長浜牧場,1999,338,3,4,2,28,4,0,...,3,30,3,7,0,0.081,0.189,0.243,4892.1,トウカイティアラ
72436,馬主,加藤鈴幸,1999,1331,0,0,0,8,2,0,...,0,7,0,1,0,0.0,0.0,0.0,0.0,メルクバルブルボン


In [9]:
# Snapshot of the individual_record dataframe: its dimensions, then the
# describe() summary transposed so each column becomes a row
print(individual_df.shape)
individual_df.describe().transpose()

(84936, 23)


Unnamed: 0,count,unique,top,freq
individual_type,84936,4,馬主,35940
name,84936,7384,シンボリ牧場,33
year,84936,33,2000,4159
rank,84936,1499,1313,736
first,84936,190,0,34471
second,84936,164,0,36226
third,84936,152,0,34665
out,84936,744,1,5995
races_major,84936,124,0,56020
wins_major,84936,29,0,78458


### iv. Trainer Profile

In [10]:
# Check the data columns on a few randomly chosen trainer_profile rows.
# random_state is fixed so a re-run displays the same rows.
trainer_df.sample(n=3, random_state=0)

Unnamed: 0,trainer_name,date_of_birth,place_of_birth,first_run_date,first_run_horse,first_win_date,first_win_horse
590,[地]足立勝久,1943/09/21,,,,,
235,[西]伊藤修司,1930/02/22,,,,,
427,[地]保利良次,1955/04/19,,,,,


In [11]:
# Snapshot of the trainer_profile dataframe: its dimensions, then the
# describe() summary transposed so each column becomes a row
print(trainer_df.shape)
trainer_df.describe().transpose()

(784, 7)


Unnamed: 0,count,unique,top,freq
trainer_name,784,784,[西]崎山博樹,1
date_of_birth,784,756,1944/09/11,2
place_of_birth,784,36,,563
first_run_date,784,154,,565
first_run_horse,784,220,,565
first_win_date,784,208,,565
first_win_horse,784,220,,565


### v. Jockey Profile

In [12]:
# Check the data columns on a few randomly chosen jockey_profile rows.
# random_state is fixed so a re-run displays the same rows.
jockey_df.sample(n=3, random_state=0)

Unnamed: 0,jockey_name,date_of_birth,place_of_birth,blood_type,height,weight,first_flat_run_date,first_flat_run_horse,first_flat_win_date,first_flat_win_horse,first_obs_run_date,first_obs_run_horse,first_obs_win_date,first_obs_win_horse
573,田中博康,1985/12/05,埼玉県,B型,159cm,47kg,2006/03/04,ペイルローズ,2006/03/18,タイキエンデバー,,,,
518,野澤憲彦,1974/07/12,,,,,2003/11/29,マルハチエトルリア,,,,,,
472,ナカタニ,1970/10/21,,,,,1992/03/14,ニホンピロナーリー,1992/03/14,ニホンピロナーリー,,,,


In [13]:
# Snapshot of the jockey_profile dataframe: its dimensions, then the
# describe() summary transposed so each column becomes a row
print(jockey_df.shape)
jockey_df.describe().transpose()

(593, 14)


Unnamed: 0,count,unique,top,freq
jockey_name,593,593,繁田健一,1
date_of_birth,593,580,1977/03/02,2
place_of_birth,593,36,,408
blood_type,593,5,,410
height,593,23,,408
weight,593,16,,408
first_flat_run_date,593,316,,29
first_flat_run_horse,593,557,,29
first_flat_win_date,593,318,,244
first_flat_win_horse,593,348,,244


## III. Preprocessing