In [112]:
import pandas as pd
import numpy as np 
import datetime
from tqdm.notebook import tqdm 
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import requests
from bs4 import BeautifulSoup
import re
from tqdm.notebook import tqdm
import time
from urllib.request import urlopen
import optuna.integration.lightgbm as lgb_o

クラス定義

In [113]:
class DataProcessor:
    """    
    Attributes:
    ----------
    data : pd.DataFrame
        rawデータ
    data_p : pd.DataFrame
        preprocessing後のデータ
    data_h : pd.DataFrame
        merge_horse_results後のデータ
    data_pe : pd.DataFrame
        merge_peds後のデータ
    data_c : pd.DataFrame
        process_categorical後のデータ
    no_peds: Numpy.array
        merge_pedsを実行した時に、血統データが存在しなかった馬のhorse_id一覧
    """
    
    def __init__(self):
        self.data = pd.DataFrame()
        self.data_p = pd.DataFrame()
        self.data_h = pd.DataFrame()
        self.data_pe = pd.DataFrame()
        self.data_c = pd.DataFrame()
        
    def merge_horse_results(self, hr, n_samples_list=[5, 9, 'all']):
        """
        馬の過去成績データから、
        n_samples_listで指定されたレース分の着順と賞金の平均を追加してdata_hに返す
        Parameters:
        ----------
        hr : HorseResults
            馬の過去成績データ
        n_samples_list : list, default [5, 9, 'all']
            過去何レース分追加するか
        """
        self.data_h = self.data_p.copy()
        for n_samples in n_samples_list:
            self.data_h = hr.merge_all(self.data_h, n_samples=n_samples)
	
	#6/6追加： 馬の出走間隔追加
        self.data_h['interval'] = (self.data_h['date'] - self.data_h['latest']).dt.days
        self.data_h.drop(['開催', 'latest'], axis=1, inplace=True)
	
	    
    def merge_peds(self, peds):
        """
        5世代分血統データを追加してdata_peに返す
        Parameters:
        ----------
        peds : Peds.peds_e
            Pedsクラスで加工された血統データ。
        """
	
        self.data_pe = \
            self.data_h.merge(peds, left_on='horse_id', right_index=True,
                                                             how='left')
        self.no_peds = self.data_pe[self.data_pe['peds_0'].isnull()]\
            ['horse_id'].unique()
        if len(self.no_peds) > 0:
            print('scrape peds at horse_id_list "no_peds"')
            
    def process_categorical(self, le_horse, le_jockey, results_m):
        """
        カテゴリ変数を処理してdata_cに返す
        Parameters:
        ----------
        le_horse : sklearn.preprocessing.LabelEncoder
            horse_idを0始まりの整数に変換するLabelEncoderオブジェクト。
        le_jockey : sklearn.preprocessing.LabelEncoder
            jockey_idを0始まりの整数に変換するLabelEncoderオブジェクト。
        results_m : Results.data_pe
            ダミー変数化のとき、ResultsクラスとShutubaTableクラスで列を合わせるためのもの
        """
	
        df = self.data_pe.copy()
        
        #ラベルエンコーディング。horse_id, jockey_idを0始まりの整数に変換
        mask_horse = df['horse_id'].isin(le_horse.classes_)
        new_horse_id = df['horse_id'].mask(mask_horse).dropna().unique()
        le_horse.classes_ = np.concatenate([le_horse.classes_, new_horse_id])
        df['horse_id'] = le_horse.transform(df['horse_id'])
        mask_jockey = df['jockey_id'].isin(le_jockey.classes_)
        new_jockey_id = df['jockey_id'].mask(mask_jockey).dropna().unique()
        le_jockey.classes_ = np.concatenate([le_jockey.classes_, new_jockey_id])
        df['jockey_id'] = le_jockey.transform(df['jockey_id'])
        
        #horse_id, jockey_idをpandasのcategory型に変換
        df['horse_id'] = df['horse_id'].astype('category')
        df['jockey_id'] = df['jockey_id'].astype('category')
        
        #そのほかのカテゴリ変数をpandasのcategory型に変換してからダミー変数化
        #列を一定にするため
        weathers = results_m['weather'].unique()
        race_types = results_m['race_type'].unique()
        ground_states = results_m['ground_state'].unique()
        sexes = results_m['性'].unique()
        df['weather'] = pd.Categorical(df['weather'], weathers)
        df['race_type'] = pd.Categorical(df['race_type'], race_types)
        df['ground_state'] = pd.Categorical(df['ground_state'], ground_states)
        df['性'] = pd.Categorical(df['性'], sexes)
        df = pd.get_dummies(df, columns=['weather', 'race_type', 'ground_state', '性'])
        
        self.data_c = df
        

リザルトクラス定義

In [151]:
class Results(DataProcessor):
    def __init__(self, results):
        super(Results, self).__init__()
        self.data = results
        
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)
    
    @staticmethod
    def scrape(race_id_list):
        """
        レース結果データをスクレイピングする関数
        Parameters:
        ----------
        race_id_list : list
            レースIDのリスト
        Returns:
        ----------
        race_results_df : pandas.DataFrame
            全レース結果データをまとめてDataFrame型にしたもの
        """

        #race_idをkeyにしてDataFrame型を格納
        race_results = {}
        for race_id in tqdm(race_id_list):
            try:
                url = "https://db.netkeiba.com/race/" + race_id
                #メインとなるテーブルデータを取得
                df = pd.read_html(url)[0]

                html = requests.get(url)
                html.encoding = "EUC-JP"
                soup = BeautifulSoup(html.text, "html.parser")

                #天候、レースの種類、コースの長さ、馬場の状態、日付をスクレイピング
                texts = (
                    soup.find("div", attrs={"class": "data_intro"}).find_all("p")[0].text
                    + soup.find("div", attrs={"class": "data_intro"}).find_all("p")[1].text
                )
                info = re.findall(r'\w+', texts)
                for text in info:
                    if text in ["芝", "ダート"]:
                        df["race_type"] = [text] * len(df)
                    if "障" in text:
                        df["race_type"] = ["障害"] * len(df)
                    if "m" in text:
                        df["course_len"] = [int(re.findall(r"\d+", text)[0])] * len(df)
                    if text in ["良", "稍重", "重", "不良"]:
                        df["ground_state"] = [text] * len(df)
                    if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                        df["weather"] = [text] * len(df)
                    if "年" in text:
                        df["date"] = [text] * len(df)

                #馬ID、騎手IDをスクレイピング
                horse_id_list = []
                horse_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/horse")}
                )
                for a in horse_a_list:
                    horse_id = re.findall(r"\d+", a["href"])
                    horse_id_list.append(horse_id[0])
                jockey_id_list = []
                jockey_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/jockey")}
                )
                for a in jockey_a_list:
                    jockey_id = re.findall(r"\d+", a["href"])
                    jockey_id_list.append(jockey_id[0])
                df["horse_id"] = horse_id_list
                df["jockey_id"] = jockey_id_list

                #インデックスをrace_idにする
                df.index = [race_id] * len(df)

                race_results[race_id] = df
                time.sleep(1)
            #存在しないrace_idを飛ばす
            except IndexError:
                continue
            #wifiの接続が切れた時などでも途中までのデータを返せるようにする
            except Exception as e:
                print(e)
                break
            #Jupyterで停止ボタンを押した時の対処
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる
        race_results_df = pd.concat([race_results[key] for key in race_results])

        return race_results_df
    
    #前処理    
    def preprocessing(self):
        df = self.data.copy()

        # 着順に数字以外の文字列が含まれているものを取り除く
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)
        df['rank'] = df['着順'].map(lambda x:1 if x<4 else 0)

        # 性齢を性と年齢に分ける
        df["性"] = df["性齢"].map(lambda x: str(x)[0])
        df["年齢"] = df["性齢"].map(lambda x: str(x)[1:]).astype(int)

        # 馬体重を体重と体重変化に分ける
        df["体重"] = df["馬体重"].str.split("(", expand=True)[0]
        df["体重変化"] = df["馬体重"].str.split("(", expand=True)[1].str[:-1]
	
	#errors='coerce'で、"計不"など変換できない時に欠損値にする
        df['体重'] = pd.to_numeric(df['体重'], errors='coerce')
        df['体重変化'] = pd.to_numeric(df['体重変化'], errors='coerce')

        # 単勝をfloatに変換
        df["単勝"] = df["単勝"].astype(float)
	# 距離は10の位を切り捨てる
        df["course_len"] = df["course_len"].astype(float) // 100

        # 不要な列を削除
        df.drop(["タイム", "着差", "調教師", "性齢", "馬体重", '馬名', '騎手', '人気', '着順'],
                axis=1, inplace=True)

        df["date"] = pd.to_datetime(df["date"], format="%Y年%m月%d日")
        
        #開催場所
        df['開催'] = df.index.map(lambda x:str(x)[4:6])
	
	#6/6出走数追加
        df['n_horses'] = df.index.map(df.index.value_counts())

        self.data_p = df
    
    #カテゴリ変数の処理
    def process_categorical(self):
        self.le_horse = LabelEncoder().fit(self.data_pe['horse_id'])
        self.le_jockey = LabelEncoder().fit(self.data_pe['jockey_id'])
        super().process_categorical(self.le_horse, self.le_jockey, self.data_pe)


出馬クラス定義

In [121]:

class ShutubaTable(DataProcessor):
    def __init__(self, shutuba_tables):
        super(ShutubaTable, self).__init__()
        self.data = shutuba_tables
    
    @classmethod
    def scrape(cls, race_id_list, date):
        data = pd.DataFrame()
        for race_id in tqdm(race_id_list):
            url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + race_id
            df = pd.read_html(url)[0]
            df = df.T.reset_index(level=0, drop=True).T

            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")

            texts = soup.find('div', attrs={'class': 'RaceData01'}).text
            texts = re.findall(r'\w+', texts)
            for text in texts:
                if 'm' in text:
                    df['course_len'] = [int(re.findall(r'\d+', text)[0])] * len(df)
                if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                    df["weather"] = [text] * len(df)
                if text in ["良", "稍重", "重"]:
                    df["ground_state"] = [text] * len(df)
                if '不' in text:
                    df["ground_state"] = ['不良'] * len(df)
                # 2020/12/13追加
                if '稍' in text:
                    df["ground_state"] = ['稍重'] * len(df)
                if '芝' in text:
                    df['race_type'] = ['芝'] * len(df)
                if '障' in text:
                    df['race_type'] = ['障害'] * len(df)
                if 'ダ' in text:
                    df['race_type'] = ['ダート'] * len(df)
            df['date'] = [date] * len(df)

            # horse_id
            horse_id_list = []
            horse_td_list = soup.find_all("td", attrs={'class': 'HorseInfo'})
            for td in horse_td_list:
                horse_id = re.findall(r'\d+', td.find('a')['href'])[0]
                horse_id_list.append(horse_id)
            # jockey_id
            jockey_id_list = []
            jockey_td_list = soup.find_all("td", attrs={'class': 'Jockey'})
            for td in jockey_td_list:
                jockey_id = re.findall(r'\d+', td.find('a')['href'])[0]
                jockey_id_list.append(jockey_id)
            df['horse_id'] = horse_id_list
            df['jockey_id'] = jockey_id_list

            df.index = [race_id] * len(df)
            data = data.append(df)
            time.sleep(1)
        return cls(data)
             
    #前処理            
    def preprocessing(self):
        df = self.data.copy()
        
        df["性"] = df["性齢"].map(lambda x: str(x)[0])
        df["年齢"] = df["性齢"].map(lambda x: str(x)[1:]).astype(int)

        # 馬体重を体重と体重変化に分ける
        df = df[df["馬体重(増減)"] != '--']
        df["体重"] = df["馬体重(増減)"].str.split("(", expand=True)[0].astype(int)
        df["体重変化"] = df["馬体重(増減)"].str.split("(", expand=True)[1].str[:-1]
        # 2020/12/13追加：増減が「前計不」などのとき欠損値にする
        df['体重変化'] = pd.to_numeric(df['体重変化'], errors='coerce')
        
        df["date"] = pd.to_datetime(df["date"])
        
        df['枠'] = df['枠'].astype(int)
        df['馬番'] = df['馬番'].astype(int)
        df['斤量'] = df['斤量'].astype(int)
	
        df['開催'] = df.index.map(lambda x:str(x)[4:6])
	
	#6/6出走数追加
        df['n_horses'] = df.index.map(df.index.value_counts())

        # 使用する列を選択
        df = df[['枠', '馬番', '斤量', 'course_len', 'weather','race_type',
        'ground_state', 'date', 'horse_id', 'jockey_id', '性', '年齢',
       '体重', '体重変化', '開催', 'n_horses']]
        
        self.data_p = df.rename(columns={'枠': '枠番'})


HorseResultsクラス定義

In [122]:

class HorseResults:
    def __init__(self, horse_results):
        self.horse_results = horse_results[['日付', '着順', '賞金', '着差', '通過', '開催', '距離']]
        self.preprocessing()
    
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)
    
    @staticmethod
    def scrape(horse_id_list):
        """
        馬の過去成績データをスクレイピングする関数
        Parameters:
        ----------
        horse_id_list : list
            馬IDのリスト
        Returns:
        ----------
        horse_results_df : pandas.DataFrame
            全馬の過去成績データをまとめてDataFrame型にしたもの
        """

        #horse_idをkeyにしてDataFrame型を格納
        horse_results = {}
        for horse_id in tqdm(horse_id_list):
            try:
                url = 'https://db.netkeiba.com/horse/' + horse_id
                df = pd.read_html(url)[3]
                #受賞歴がある馬の場合、3番目に受賞歴テーブルが来るため、4番目のデータを取得する
                if df.columns[0]=='受賞歴':
                    df = pd.read_html(url)[4]
                df.index = [horse_id] * len(df)
                horse_results[horse_id] = df
                time.sleep(1)
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる        
        horse_results_df = pd.concat([horse_results[key] for key in horse_results])

        return horse_results_df
    
    def preprocessing(self):
        df = self.horse_results.copy()

        # 着順に数字以外の文字列が含まれているものを取り除く
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)

        df["date"] = pd.to_datetime(df["日付"])
        df.drop(['日付'], axis=1, inplace=True)
        
        #賞金のNaNを0で埋める
        df['賞金'].fillna(0, inplace=True)
        
        #1着の着差を0にする
        df['着差'] = df['着差'].map(lambda x: 0 if x<0 else x)
        
        #レース展開データ
        #n=1: 最初のコーナー位置, n=4: 最終コーナー位置
        def corner(x, n):
            if type(x) != str:
                return x
            elif n==4:
                return int(re.findall(r'\d+', x)[-1])
            elif n==1:
                return int(re.findall(r'\d+', x)[0])
        df['first_corner'] = df['通過'].map(lambda x: corner(x, 1))
        df['final_corner'] = df['通過'].map(lambda x: corner(x, 4))
        
        df['final_to_rank'] = df['final_corner'] - df['着順']
        df['first_to_rank'] = df['first_corner'] - df['着順']
        df['first_to_final'] = df['first_corner'] - df['final_corner']
        
        #開催場所
        df['開催'] = df['開催'].str.extract(r'(\D+)')[0].map(place_dict).fillna('11')
        #race_type
        df['race_type'] = df['距離'].str.extract(r'(\D+)')[0].map(race_type_dict)
        #距離は10の位を切り捨てる
        df['course_len'] = df['距離'].str.extract(r'(\d+)').astype(int) // 100
        df.drop(['距離'], axis=1, inplace=True)
        #インデックス名を与える
        df.index.name = 'horse_id'
        
        self.horse_results = df
        self.target_list = ['着順', '賞金', '着差', 'first_corner', 'final_corner',
                            'first_to_rank', 'first_to_final','final_to_rank']
    
    #n_samplesレース分馬ごとに平均する
    def average(self, horse_id_list, date, n_samples='all'):
        target_df = self.horse_results.query('index in @horse_id_list')
        
        #過去何走分取り出すか指定
        if n_samples == 'all':
            filtered_df = target_df[target_df['date'] < date]
        elif n_samples > 0:
            filtered_df = target_df[target_df['date'] < date].\
                sort_values('date', ascending=False).groupby(level=0).head(n_samples)
        else:
            raise Exception('n_samples must be >0')
        
	#集計して辞書型に入れる
        self.average_dict = {}
        self.average_dict['non_category'] = filtered_df.groupby(level=0)[self.target_list].mean()\
            .add_suffix('_{}R'.format(n_samples))
        for column in ['course_len', 'race_type', '開催']:
            self.average_dict[column] = filtered_df.groupby(['horse_id', column])\
                [self.target_list].mean().add_suffix('_{}_{}R'.format(column, n_samples))
	
	#6/6追加: 馬の出走間隔追加のために、全レースの日付を変数latestに格納
        if n_samples == 5:
            self.latest = filtered_df.groupby('horse_id')['date'].max().rename('latest')
    
    def merge(self, results, date, n_samples='all'):
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        self.average(horse_id_list, date, n_samples)
        merged_df = df.merge(self.average_dict['non_category'], left_on='horse_id',
                             right_index=True, how='left')
        for column in ['course_len','race_type', '開催']:
            merged_df = merged_df.merge(self.average_dict[column], 
                                        left_on=['horse_id', column],
                                        right_index=True, how='left')
	
	#6/6追加：馬の出走間隔追加のために、全レースの日付を変数latestに格納
        if n_samples == 5:
            merged_df = merged_df.merge(self.latest, left_on='horse_id',
                             right_index=True, how='left')
        return merged_df
    
    def merge_all(self, results, n_samples='all'):
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results, date, n_samples) for date in tqdm(date_list)])
        return merged_df

#開催場所をidに変換するための辞書型
place_dict = {
    '札幌':'01',  '函館':'02',  '福島':'03',  '新潟':'04',  '東京':'05', 
    '中山':'06',  '中京':'07',  '京都':'08',  '阪神':'09',  '小倉':'10'
}

#レースタイプをレース結果データと整合させるための辞書型
race_type_dict = {
    '芝': '芝', 'ダ': 'ダート', '障': '障害'
}


In [6]:
#使えない
hr = HorseRselts(horse_results)
hr.horse_results

NameError: name 'HorseRselts' is not defined

In [12]:
hr = HorseResults.read_pickle(['horse_results.pickle'])
hr.horse_results

Unnamed: 0_level_0,着順,賞金,着差,通過,開催,date,first_corner,final_corner,final_to_rank,first_to_rank,first_to_final,race_type,course_len
horse_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017105318,11,0.0,0.9,3-3-3-2,03,2021-07-17,3.0,2.0,-9.0,-8.0,1.0,芝,20
2017105318,16,0.0,1.9,1-1-1-1,06,2020-12-26,1.0,1.0,-15.0,-15.0,0.0,芝,22
2017105318,4,110.0,0.2,7-6-4-4,01,2020-08-02,7.0,4.0,0.0,3.0,3.0,芝,20
2017105318,6,0.0,0.6,10-7-7-2,01,2019-08-31,10.0,2.0,-4.0,4.0,8.0,芝,18
2017105318,1,500.0,0.0,1-1-1-1,01,2019-07-27,1.0,1.0,0.0,0.0,0.0,芝,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017101106,7,0.0,1.6,5-5-6-7,09,2020-03-14,5.0,7.0,0.0,-2.0,-2.0,ダート,18
2017101106,9,0.0,1.7,8-8-10-9,07,2020-02-29,8.0,9.0,0.0,-1.0,-1.0,ダート,19
2017101106,13,0.0,1.8,1-1-1-1,08,2020-02-09,1.0,1.0,-12.0,-12.0,0.0,ダート,18
2017101106,5,51.0,1.6,10-9-9-7,08,2020-01-25,10.0,7.0,2.0,5.0,3.0,ダート,18


In [132]:

class Peds:
    def __init__(self, peds):
        self.peds = peds
        self.peds_e = pd.DataFrame() #after label encoding and transforming into category
    
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)
    
    @staticmethod
    def scrape(horse_id_list):
        """
        血統データをスクレイピングする関数
        Parameters:
        ----------
        horse_id_list : list
            馬IDのリスト
        Returns:
        ----------
        peds_df : pandas.DataFrame
            全血統データをまとめてDataFrame型にしたもの
        """

        peds_dict = {}
        for horse_id in tqdm(horse_id_list):
            try:
                url = "https://db.netkeiba.com/horse/ped/" + horse_id
                df = pd.read_html(url)[0]

                #重複を削除して1列のSeries型データに直す
                generations = {}
                for i in reversed(range(5)):
                    generations[i] = df[i]
                    df.drop([i], axis=1, inplace=True)
                    df = df.drop_duplicates()
                ped = pd.concat([generations[i] for i in range(5)]).rename(horse_id)

                peds_dict[horse_id] = ped.reset_index(drop=True)
                time.sleep(1)
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #列名をpeds_0, ..., peds_61にする
        peds_df = pd.concat([peds_dict[key] for key in peds_dict], axis=1).T.add_prefix('peds_')

        return peds_df
    
    def encode(self):
        df = self.peds.copy()
        for column in df.columns:
            df[column] = LabelEncoder().fit_transform(df[column].fillna('Na'))
        self.peds_e = df.astype('category')


In [32]:
p = Peds.read_pickle(['peds.pickle'])
p.encode()
p.peds_e

Unnamed: 0,peds_0,peds_1,peds_2,peds_3,peds_4,peds_5,peds_6,peds_7,peds_8,peds_9,...,peds_52,peds_53,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61
2017105318,503,2272,180,372,179,377,98,189,94,169,...,51,604,118,325,18,188,166,870,818,511
2017104612,222,5585,156,303,752,4633,49,140,203,319,...,156,367,135,122,455,286,267,383,368,1795
2017103879,236,5206,75,23,492,3810,0,105,51,25,...,445,655,73,107,14,315,46,645,344,1880
2017106259,259,1403,185,365,783,2044,21,88,37,71,...,188,579,116,110,369,784,14,1166,1264,2934
2017104140,235,7267,163,366,394,1048,98,169,47,194,...,145,235,10,39,384,410,380,11,667,2330
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017102659,444,1584,189,123,565,3266,78,68,97,236,...,313,564,24,280,310,253,48,928,445,2262
2016102166,272,3585,153,337,493,3315,38,152,112,212,...,5,510,118,242,225,949,446,1118,1189,2608
2016103450,469,4390,189,133,658,4848,78,68,192,293,...,341,118,138,297,70,352,267,329,685,1326
2017105662,232,7326,163,260,531,4431,98,169,233,312,...,344,277,150,400,171,55,239,203,151,2113


In [96]:
r = Results.read_pickle(['results.pickle'])
r.preprocessing()
r.data_p.head() 

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,jockey_id,rank,性,年齢,体重,体重変化,開催,n_horses
201901010101,1,1,54.0,1.4,18.0,曇,芝,良,2019-07-27,2017105318,5339,1,牡,2,518,-16,1,9
201901010101,3,3,54.0,3.5,18.0,曇,芝,良,2019-07-27,2017104612,5203,1,牡,2,496,-8,1,9
201901010101,4,4,51.0,46.6,18.0,曇,芝,良,2019-07-27,2017103879,1180,1,牡,2,546,6,1,9
201901010101,8,9,51.0,56.8,18.0,曇,芝,良,2019-07-27,2017106259,1179,0,牡,2,458,-8,1,9
201901010101,5,5,54.0,140.3,18.0,曇,芝,良,2019-07-27,2017104140,1062,0,牡,2,436,0,1,9


In [97]:
r.merge_horse_results(hr, n_samples_list=[5, 9, 'all'])
r.data_h.head()

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,...,final_to_rank_race_type_allR,着順_開催_allR,賞金_開催_allR,着差_開催_allR,first_corner_開催_allR,final_corner_開催_allR,first_to_rank_開催_allR,first_to_final_開催_allR,final_to_rank_開催_allR,interval
201901010101,1,1,54.0,1.4,18.0,曇,芝,良,2019-07-27,2017105318,...,-1.0,,,,,,,,,48.0
201901010101,3,3,54.0,3.5,18.0,曇,芝,良,2019-07-27,2017104612,...,0.0,,,,,,,,,13.0
201901010101,4,4,51.0,46.6,18.0,曇,芝,良,2019-07-27,2017103879,...,,,,,,,,,,42.0
201901010101,8,9,51.0,56.8,18.0,曇,芝,良,2019-07-27,2017106259,...,3.0,,,,,,,,,13.0
201901010101,5,5,54.0,140.3,18.0,曇,芝,良,2019-07-27,2017104140,...,-3.0,,,,,,,,,20.0


In [98]:
r.merge_peds(p.peds_e)
r.data_pe.head()

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,...,peds_52,peds_53,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61
201901010101,1,1,54.0,1.4,18.0,曇,芝,良,2019-07-27,2017105318,...,51,604,118,325,18,188,166,870,818,511
201901010101,3,3,54.0,3.5,18.0,曇,芝,良,2019-07-27,2017104612,...,156,367,135,122,455,286,267,383,368,1795
201901010101,4,4,51.0,46.6,18.0,曇,芝,良,2019-07-27,2017103879,...,445,655,73,107,14,315,46,645,344,1880
201901010101,8,9,51.0,56.8,18.0,曇,芝,良,2019-07-27,2017106259,...,188,579,116,110,369,784,14,1166,1264,2934
201901010101,5,5,54.0,140.3,18.0,曇,芝,良,2019-07-27,2017104140,...,145,235,10,39,384,410,380,11,667,2330


In [99]:
r.process_categorical() #r.le_horse, r.le_jockeyに対応関係が保持される

In [100]:
r.data_c

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,date,horse_id,jockey_id,rank,年齢,...,race_type_芝,race_type_ダート,race_type_障害,ground_state_良,ground_state_不良,ground_state_稍重,ground_state_重,性_牡,性_牝,性_セ
201901010101,1,1,54.0,1.4,18.0,2019-07-27,10614,152,1,2,...,1,0,0,1,0,0,0,1,0,0
201901010101,3,3,54.0,3.5,18.0,2019-07-27,10194,142,1,2,...,1,0,0,1,0,0,0,1,0,0
201901010101,4,4,51.0,46.6,18.0,2019-07-27,9851,136,1,2,...,1,0,0,1,0,0,0,1,0,0
201901010101,8,9,51.0,56.8,18.0,2019-07-27,11089,135,0,2,...,1,0,0,1,0,0,0,1,0,0
201901010101,5,5,54.0,140.3,18.0,2019-07-27,9967,47,0,2,...,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201908040412,5,5,57.0,15.4,18.0,2019-10-13,1535,63,0,5,...,0,1,0,0,0,0,1,1,0,0
201908040412,2,2,57.0,36.0,18.0,2019-10-13,189,87,0,7,...,0,1,0,0,0,0,1,1,0,0
201908040412,1,1,57.0,7.2,18.0,2019-10-13,1985,83,0,5,...,0,1,0,0,0,0,1,1,0,0
201908040412,4,4,55.0,23.9,18.0,2019-10-13,740,163,0,6,...,0,1,0,0,0,0,1,0,1,0


In [102]:
race_id_list = ['2020010106{}' .format(str(i).zfill(2)) for i in range(1, 13, 1)]
st = ShutubaTable.scrape(race_id_list, '2020/08/09')

  0%|          | 0/12 [00:00<?, ?it/s]

In [149]:
#前処理
st.preprocessing()

#馬の過去成績データの追加、　新しい馬はNaNが返される
st.merge_horse_results(hr)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [148]:
#五世代分の血統データの追加
#st.merge_peds(p.peds_e)

#scrape peds at horse_id_list "no_peds" と返された場合
peds_new = Peds.scrape(st.no_peds)
peds.to_pickle('peds_h.pickle') #pedsを更新する前にバックアップ

peds = update_data(p.peds, peds_new)
peds.to_pickle('peds.pickle')

p = Peds.read_pickle(['peds.pickle'])
p.encode()
st.merge_peds(p.peds_e)


  0%|          | 0/34 [00:00<?, ?it/s]

NameError: name 'peds' is not defined

In [135]:
peds_new = Peds.scrape(st.no_peds)


  0%|          | 0/34 [00:00<?, ?it/s]

In [137]:
peds_new.to_pickle('peds_h.pickle')

In [139]:
peds = update_data(p.peds, peds_new)

NameError: name 'update_data' is not defined

In [145]:
p.encode()

In [144]:
p = Peds.read_pickle(['peds.pickle'])

In [129]:
st.no_peds

array(['2018101480', '2018105226', '2018103956', '2018104036',
       '2018105434', '2018101331', '2018102410', '2018106349',
       '2017102919', '2017101400', '2017101067', '2017100213',
       '2017103329', '2017103943', '2017102931', '2017104757',
       '2017104001', '2017104870', '2017106213', '2017104231',
       '2017103103', '2017101364', '2018105280', '2018103574',
       '2018105648', '2018102843', '2018100174', '2018106583',
       '2018106513', '2018100430', '2017102121', '2015101808',
       '2017101727', '2015104457'], dtype=object)

In [153]:
st.process_categorical(r.le_horse, r.le_jockey, r.data_pe)

In [154]:
st.data_c

Unnamed: 0,枠番,馬番,斤量,course_len,date,horse_id,jockey_id,年齢,体重,体重変化,...,race_type_芝,race_type_ダート,race_type_障害,ground_state_良,ground_state_不良,ground_state_稍重,ground_state_重,性_牡,性_牝,性_セ
202001010601,1,1,54,1200,2020-08-09,11501,66,2,464,0,...,1,0,0,1,0,0,0,0,1,0
202001010601,2,2,54,1200,2020-08-09,11502,126,2,448,4,...,1,0,0,1,0,0,0,0,1,0
202001010601,3,3,53,1200,2020-08-09,11503,136,2,402,0,...,1,0,0,1,0,0,0,0,1,0
202001010601,4,4,54,1200,2020-08-09,11504,54,2,408,2,...,1,0,0,1,0,0,0,0,1,0
202001010601,5,5,54,1200,2020-08-09,11505,47,2,418,0,...,1,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202001010612,6,12,57,1200,2020-08-09,3373,34,5,474,2,...,1,0,0,1,0,0,0,1,0,0
202001010612,7,13,57,1200,2020-08-09,1668,65,6,494,6,...,1,0,0,1,0,0,0,1,0,0
202001010612,7,14,57,1200,2020-08-09,7873,30,4,486,-4,...,1,0,0,1,0,0,0,1,0,0
202001010612,8,15,52,1200,2020-08-09,10139,85,3,460,4,...,1,0,0,1,0,0,0,0,1,0


In [162]:
r.data_c['horse_id'].sort_values().reset_index()

Unnamed: 0,index,horse_id
0,201906030108,0
1,201907010505,0
2,201902010211,1
3,201906030611,1
4,201908020511,1
...,...,...
47113,201908050106,11498
47114,201905050501,11499
47115,201905050206,11499
47116,201907040606,11499


In [168]:
print(r.data_c.columns)

Index(['枠番', '馬番', '斤量', '単勝', 'course_len', 'date', 'horse_id', 'jockey_id',
       'rank', '年齢',
       ...
       'race_type_芝', 'race_type_ダート', 'race_type_障害', 'ground_state_良',
       'ground_state_不良', 'ground_state_稍重', 'ground_state_重', '性_牡', '性_牝',
       '性_セ'],
      dtype='object', length=188)


In [169]:
print(st.data_c.columns)

Index(['枠番', '馬番', '斤量', 'course_len', 'date', 'horse_id', 'jockey_id', '年齢',
       '体重', '体重変化',
       ...
       'race_type_芝', 'race_type_ダート', 'race_type_障害', 'ground_state_良',
       'ground_state_不良', 'ground_state_稍重', 'ground_state_重', '性_牡', '性_牝',
       '性_セ'],
      dtype='object', length=186)


In [173]:
def split_data(df, test_size=0.3):
    sorted_id_list = df.sort_value("date").index.unique()
    train_id_list = sorted_id_list[: round(len(sorted_id_list) * (1 - text_size))]
    test_id_list = sorted_id_list[round(len(sorted_id_list) * (1 - test_size)) :]
    train = df.loc[train_id_list]
    test = df.loc[test_id_list]
    return train, test

In [178]:
train, test = split_data(r.data_c)

AttributeError: 'DataFrame' object has no attribute 'sort_value'

In [179]:
_a_a_a_ = pd.read_pickle('horse_results.pickle')
_a_a_a_

Unnamed: 0,日付,開催,天気,R,レース名,映像,頭数,枠番,馬番,オッズ,...,着差,ﾀｲﾑ指数,通過,ペース,上り,馬体重,厩舎ｺﾒﾝﾄ,備考,勝ち馬(2着馬),賞金
2017105318,2021/07/17,1福島5,晴,9.0,南相馬特別(1勝クラス),,16.0,5.0,10,5.2,...,0.9,**,3-3-3-2,34.9-35.9,36.5,532(-38),,,フィデリオグリーン,
2017105318,2020/12/26,5中山7,晴,9.0,立志賞(1勝クラス),,18.0,6.0,12,3.5,...,1.9,**,1-1-1-1,37.5-36.8,38.7,570(+18),,,ルトロヴァイユ,
2017105318,2020/08/02,1札幌4,晴,7.0,3歳以上1勝クラス,,8.0,8.0,8,1.5,...,0.2,**,7-6-4-4,36.6-34.7,34.5,552(+28),,,シャムロックヒル,110.0
2017105318,2019/08/31,2札幌5,曇,11.0,札幌2歳S(G3),,12.0,7.0,10,1.8,...,0.6,**,10-7-7-2,36.0-37.3,37.3,524(+6),,,ブラックホール,
2017105318,2019/07/27,1札幌1,曇,1.0,2歳未勝利,,9.0,1.0,1,1.4,...,-1.8,**,1-1-1-1,35.9-36.5,36.5,518(-16),,,(プントファイヤー),500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017101106,2020/03/14,1阪神5,雨,2.0,3歳未勝利,,13.0,6.0,9,137.1,...,1.6,**,5-5-6-7,38.3-37.1,38.2,460(0),,,サンライズプラウ,
2017101106,2020/02/29,1中京1,曇,2.0,3歳未勝利,,12.0,7.0,10,45.4,...,1.7,**,8-8-10-9,30.6-38.4,38.7,460(-2),,,ワンダークンナパー,
2017101106,2020/02/09,2京都4,晴,2.0,3歳未勝利,,15.0,2.0,3,40.9,...,1.8,**,1-1-1-1,38.3-38.3,40.1,462(0),,,クールインパクト,
2017101106,2020/01/25,1京都8,曇,3.0,3歳未勝利,,11.0,7.0,9,28.0,...,1.6,**,10-9-9-7,36.9-37.8,38.3,462(+14),,,フームスムート,51.0


In [180]:
len(_a_a_a_)

251337

In [181]:
_s_s_s_ = pd.read_pickle('results.pickle')
_s_s_s_

Unnamed: 0,着順,枠番,馬番,馬名,性齢,斤量,騎手,タイム,着差,単勝,人気,馬体重,調教師,course_len,weather,race_type,ground_state,date,horse_id,jockey_id
201901010101,1,1,1,ゴルコンダ,牡2,54.0,ルメール,1:48.3,,1.4,1.0,518(-16),[東] 木村哲也,1800,曇,芝,良,2019年7月27日,2017105318,05339
201901010101,2,3,3,プントファイヤー,牡2,54.0,岩田康誠,1:50.1,大,3.5,2.0,496(-8),[東] 手塚貴久,1800,曇,芝,良,2019年7月27日,2017104612,05203
201901010101,3,4,4,ラグリマスネグラス,牡2,51.0,団野大成,1:50.9,5,46.6,6.0,546(+6),[東] 藤沢和雄,1800,曇,芝,良,2019年7月27日,2017103879,01180
201901010101,4,8,9,キタノコドウ,牡2,51.0,菅原明良,1:51.5,3.1/2,56.8,7.0,458(-8),[東] 高木登,1800,曇,芝,良,2019年7月27日,2017106259,01179
201901010101,5,5,5,ネモフィラブルー,牡2,54.0,川島信二,1:51.7,1.1/2,140.3,9.0,436(0),[西] 矢作芳人,1800,曇,芝,良,2019年7月27日,2017104140,01062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201910021212,12,6,11,スリープレッピー,セ6,56.0,森裕太朗,2:45.7,1/2,120.3,15.0,458(+8),[西] 藤沢則雄,2600,曇,芝,重,2019年9月1日,2013104167,01165
201910021212,13,1,1,バリオラージュ,牡5,54.0,斎藤新,2:46.0,1.3/4,7.5,4.0,460(+2),[西] 角居勝彦,2600,曇,芝,重,2019年9月1日,2014105643,01178
201910021212,14,2,3,サンライズアミーゴ,牡4,54.0,亀田温心,2:46.2,1,99.2,12.0,478(+14),[西] 牧浦充徳,2600,曇,芝,重,2019年9月1日,2015102081,01176
201910021212,15,6,12,トロハ,牝3,52.0,武豊,2:46.2,クビ,17.5,8.0,468(+2),[西] 浜田多実,2600,曇,芝,重,2019年9月1日,2016104221,00666


In [182]:
len(_s_s_s_)

47574