In [1]:
import pandas as pd
import numpy as np 
import datetime
from tqdm.notebook import tqdm 
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import requests
from bs4 import BeautifulSoup
import re
from tqdm.notebook import tqdm
import time
from urllib.request import urlopen
import optuna.integration.lightgbm as lgb_o

In [2]:
class DataProcessor:
    """    
    Attributes:
    ----------
    data : pd.DataFrame
        rawデータ
    data_p : pd.DataFrame
        preprocessing後のデータ
    data_h : pd.DataFrame
        merge_horse_results後のデータ
    data_pe : pd.DataFrame
        merge_peds後のデータ
    data_c : pd.DataFrame
        process_categorical後のデータ
    no_peds: Numpy.array
        merge_pedsを実行した時に、血統データが存在しなかった馬のhorse_id一覧
    """
    
    def __init__(self):
        self.data = pd.DataFrame()
        self.data_p = pd.DataFrame()
        self.data_h = pd.DataFrame()
        self.data_pe = pd.DataFrame()
        self.data_c = pd.DataFrame()
        
    def merge_horse_results(self, hr, n_sample_list=[5, 9, 'all']):
        
        """
        馬の過去成績データから、
        n_samples_listで指定されたレース分の着順と賞金の平均を追加してdata_hに返す
        Parameters:
        ----------
        hr : HorseResults
            馬の過去成績データ
        n_samples_list : list, default [5, 9, 'all']
            過去何レース分追加するか
        """
        self.data_h = self.data_p.copy()
        for n_samples in n_sanples_list:
            self.data_h = hr.merge_all(self.data_h, n_samples=n_samples)
            
        #6/6追加　馬の出走感感覚の追加
        self.data_h['interval'] = (self.data_h['date'] - self.data_h['latest']).dt.days
        self.data_h.drop(['開催', 'latest'], axis=1, inplace=True)
        
    def merge_peds(self, peds):
        """
        5世代分血統データを追加してdata_peに返す
        Parameters:
        ----------
        peds : Peds.peds_e
            Pedsクラスで加工された血統データ。
        """
        
        self.data_pe = \
            self.data_h.merge(peds, left_on='horse_id', right_index=True,how='left')
        self.no_peds = self.data_pe[self.data_pe['peds_0'].isnull()]\
            ['horse_id'].unique()
        if len(self.no_peds) > 0:
            print('scrape peds at horse_id_list "no_peds"')
            
    def process_categorical(self, le_horse, le_jockey, results_m):
        """
        カテゴリ変数を処理してdata_cに返す
        Parameters:
        ----------
        le_horse : sklearn.preprocessing.LabelEncoder
            horse_idを0始まりの整数に変換するLabelEncoderオブジェクト。
        le_jockey : sklearn.preprocessing.LabelEncoder
            jockey_idを0始まりの整数に変換するLabelEncoderオブジェクト。
        results_m : Results.data_pe
            ダミー変数化のとき、ResultsクラスとShutubaTableクラスで列を合わせるためのもの
        """
        df = self.data_pe.copy()
        #ラベルコーディングhorse_id, jockly_idを0始まりの整数に変換
        mask_horse = df['horse_id'].isin(le_horse.classes_)
        new_horse_id = df['horse_id'].mask(mask_horse).dropna().unique()
        le_horse.classes_ = np.concatenate([le_horse.classes_, new_horse_id])
        df['horse_id'] = le_horse.transform(df['horse_id'])
        mask_jockey = df['jockey_id'].isin(le_jockey.classes_)
        new_jockey_id = df['jockey_id'].mask(mask_jockey).dropna(). unique()
        le_jockey.classes_ = np.concatenate([le_jockey.classes_, new_jockey_id])
        df['jockey_id'] = le_jockey.transform(df['jokcey_id'])
        
        #horse_id, jockey_idをpandasのカテゴリー型に変換
        df['horse_id'] = df['horse_id'].astype('category')
        df['jockey_id'] = df[jockey_id].astype('category')
        
        #その他のカテゴリ変数をpandasのカテゴリ型にしてからダミー変数化
        #列を一定にする
        weathers = results_m['weather'].unique()
        race_types = results_m['race_type'].unique()
        ground_states = results_m['ground_state'].unique()
        sexes = results_m = ['性'].unique()
        df['weather'] = pd.Categorical(df['weather'], weathers)
        df['race_type'] = pd.Categorical(df['race_type'], race_types)
        df['ground_sate'] = pd.Categorical(df['ground_state'], ground_states)
        df['性'] = pd.Categorical(df['性'], sexes)
        df = pd.get_dummies(df, columns=['weather', 'race_type', 'ground_state', '性'])
        
        self.data_c = df
        

In [3]:
class DataProcessor:
    """    
    Attributes:
    ----------
    data : pd.DataFrame
        rawデータ
    data_p : pd.DataFrame
        preprocessing後のデータ
    data_h : pd.DataFrame
        merge_horse_results後のデータ
    data_pe : pd.DataFrame
        merge_peds後のデータ
    data_c : pd.DataFrame
        process_categorical後のデータ
    no_peds: Numpy.array
        merge_pedsを実行した時に、血統データが存在しなかった馬のhorse_id一覧
    """
    
    def __init__(self):
        self.data = pd.DataFrame()
        self.data_p = pd.DataFrame()
        self.data_h = pd.DataFrame()
        self.data_pe = pd.DataFrame()
        self.data_c = pd.DataFrame()
        
    def merge_horse_results(self, hr, n_sample_list=[5, 9, all]):
        """
        馬の過去成績データから、
        n_samples_listで指定されたレース分の着順と賞金の平均を追加してdata_hに返す
        Parameters:
        ----------
        hr : HorseResults
            馬の過去成績データ
        n_samples_list : list, default [5, 9, 'all']
            過去何レース分追加するか
        """
        
        self.data_h = self.data_p.copy()
        for n_sample in n_samples_list:
            self.data_h = hr.merge_all(self.data_h, n_samples=n_samples)
            
        #6/6の追加、馬の出馬感覚追加
        self.data_h['interval'] = (self.data_h['date'] - self.data_h['latest']).dt.days
        self.data_h.drop(['開催','latest'], axis=1, inplace=True)
        
    def merge_peds(self,peds):
        """
        5世代分血統データを追加してdata_peに返す
        Parameters:
        ----------
        peds : Peds.peds_e
            Pedsクラスで加工された血統データ。
        """
        
        self.data_pe = \
        self.data_h.merge(peds, left_on='horse_id', right_index=True, how='left')
        self.no_peds = self.data.data_pe[self_pe['peds_0'].isnull()] ['horse_id'].unique()
        if len(self.no_peds) > 0:
            print('scrape peds as horse_id_list "no_peds"')
            
        def process_categorical(self, le_horse, le_jockey, results_m):
            """
            カテゴリ変数を処理してdata_cに返す
            Parameters:
            ----------
            le_horse : sklearn.preprocessing.LabelEncoder
                horse_idを0始まりの整数に変換するLabelEncoderオブジェクト。
            le_jockey : sklearn.preprocessing.LabelEncoder
                jockey_idを0始まりの整数に変換するLabelEncoderオブジェクト。
            results_m : Results.data_pe
                ダミー変数化のとき、ResultsクラスとShutubaTableクラスで列を合わせるためのもの
            """
            
            df = self.data_pe.copy()
            
            #ラベルコーディング。horse_id,jockey_idを0始まりの変数に変換
            mask_horse = df['horse_id'].isin(le_horse.classes_)
            new_horse_id = df['horse_id'].mask(mask_horse).dropna().unique()
            le_horse.classes_ = np.concatenate([le_horse.classes_, new_horse_id])
            df['horse_id'] = le_horse.transform(df['horse_id'])
            mask_jockey = df['jockey_id'].isin(le_jockey.classes_)
            new_jockey_id = df['horse_id'].mask(mask_jockey).dropna().unique()
            le_jockey.classes_ = np.concatenate([le_jockey.classes_, new_jockey_id])
            df['jockey_id'] = le_jockey.transform(df['jockey_id'])
            
            #horse_id, jockey_idをpandasのカテゴリー型に変換
            df['horse_id'] = df['horse_id'].astype('categoty')
            df['jockey_id'] = df['jockey_id'].astype('category')
            
            #その他のカテゴリ変数をpandasのカテゴリ型に変換してからダミー変数化
            #列を一定にする
            weather = results_m['weather'].unique()
            race_types = results_m['race_type'].unique()
            ground_states = results_m['ground_states'].unique()
            sexes = results_m['性'].unique()
            df['weather'] = pd.Categorical(df['weather'], weathers)
            df['race_type'] = pd.Categorical(df['race_type'], race_types)
            df['ground_state'] = pd.Categorical(df['ground_sate'], ground_states)
            df['性'] = pd.Categorical(df['性'], sexes)
            df = pd.get_dummies(df, ccolumns=['weather', 'race_type', 'ground_sate', '性'])
            
            self.data_c = df

In [13]:
class Results(DataProcessor):
    def __init__(self, results):
        super(Results, self).__init__()
        self.data = results
        
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)
    
    @staticmethod
    def scrape(race_id_list):
        """
        レース結果データをスクレイピングする関数
        Parameters:
        ----------
        race_id_list : list
            レースIDのリスト
        Returns:
        ----------
        race_results_df : pandas.DataFrame
            全レース結果データをまとめてDataFrame型にしたもの
        """
            
        #race_idをkeyにしてdataframe型を格納
        race_results = {}
        for race_id in tqdm(race_id_list):
            try:
                url = "https://db.netkeiba.com/race" + race_id
                #メインとなるテーブルデータを取得
                df = pd.read_html(url)[0]
                
                html = requests.get(url)
                html.encoding = "EUC-JP"
                soup = BeautifulSoup(html.text, "html.parser")
                
                #天候、レース種類、コースの長さ、馬場の状態、日付をスクレイピング
                texts = (
                    soup.find("div", attrs={"class" "data_intro"}).find_all("p")[0].text + soup.find("div", attrs={"class": "ata_intro"}).find_all("p")[1].test)
                info = re.findall('r\w+', texts)
                for text in info:
                    if text in ["芝", "ダート"]:
                        df["race_type"] = [text] * len(df)
                    if "障" in text:
                        df["course_len"] = ["障害"] * len(df)
                    if "m" in text:
                        df["course_len"] = [int(re.findall(r"\d+", text)[0])] * len(df)
                    if text in ["良", "稍重", "重", "不良"]:
                        df["ground_state"] = [text] * len(df)
                    if text in ["雲", "晴", "雨", "小雨", "小雪", "雪"]:
                        df["weather"] = [text] * len(df)
                    if "年" in text:
                        df["date"] = [text] * len(df)
                        
                #馬ID,騎手IDをスクレイピング
                horse_id_list = []
                horse_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all("a", attrs={"href": re.compile("^/horse")})
                
                for a in horse_a_list:
                    horse_id = re.findall(r"\d+", a["href"])
                    horse_id_list.append(horse_id[0])
                jockey_id_list = []
                jockey_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all("a", attrs={"href": re.compile("^/jockey")})
                
                for a in jockey_a_list:
                    jockey_id = re.findall(r"\d+", a["href"])
                    jockey_id_list.append(jockey_id[0])
                df["horse_id"] = horse_id_list
                df["jockey_id"] = jockey_id_list
                
                #インデックスをrace_idにする
                df.index = [race_id] * len(df)
                
                race_results[race_id] = df
                time.sleep(1)
                
            #存在しないrace_idを飛ばす
            except IndexErrot:
                continue
            #wifiが切れた時にデータを一時的に返す
            except Exception as e:
                print(e)
                break
            #jupyterで停止ボタンを押した時の対処
            except:
                break
            
        #pd.DataFrame型に対して一つのデータにまとめる
        race_results_df = pd.concat([race_results[key] for key in race_results])
        
        return race_results_df
    
    #前処理
    def preprocessing(self):
        df = self.data.copy()
        
        #掠順に数位以外の文字列が含まれているものを取り除く
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)
        df['rank'] = df['着順'].map(lambda x:1 if x<4 else 0)
        
        #性年を性と年齢に分ける
        df["性"] = df['性齡'].map(lambda x: str(x)[0])
        df['年齢'] = df['性齡'].map(lambda x: str(x)[1:]).astype(int)
                    
        #馬体重を体重と体重変化に分ける
        df['体重'] = df['馬体重'].str.split("(", expand=True)[0]
        df['体重変化'] = df['馬体重'].str.split("(", expand=True)[1].str[:-1]
        
        #errors='coerce'で、"計不"など変換できない時に欠損値にする
        df['体重'] = pd.to_numeric(df['体重'], errors='coerce')
        df['体重変化'] = pd.to_numeric(df['体重変化'], errors='coerce')
        
        #単勝をfloatに変換
        df['単勝'] = df['単勝'].astype(float)
        #距離は10の値を切り捨てる
        df['course_len'] = df["course_len"].astype(float) // 100
        
        #不要な列を削除
        df.drop(["タイム","着差","調教師","性齡","馬体重","馬名","騎手","人気","着順"], axis=1, inplace=True)
        df['date'] = pd.to_datetime(df["date"], format="%Y年%m月%d日")
        
        #開催場所
        df['開催'] = df.index.map(lambda x:str(x)[4:6])
        
        #6/6出走数追加
        df['n_horses'] = df.index.map(df.index.value_counts())

        self.data_p = df
    #カテゴリ変数の処理
    def process_categorical(self):
        self.le_horse = LabelEncorder().fit(self.data_pe['horse_id'])
        self.le_jockey = LabelEncorder().fit(self.data_pe['jockey_id'])
        super().process_categorical(self.li_horse, self.le_jockey, self.data_pe)
        

In [5]:
class ShutubaTable(DataProcessor):
    def __init__(self, shutubaTables):
        super(ShutubaTable,self).__init__()
        self.data = shutuba_tables
        
    @classmethod
    def scrape(cls, race_id, date):
        data = pd.DataFrame()
        for race_id in tqdm(race_id_list):
            url = url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + race_id
            df = pd.read_html(url)[0]
            df = df.T.reset_index(level=0, drop=True).T
            
            html = requests.get(url)
            html.encording = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")
            
            texts = soup.find('div', attrs={'class': 'RaceData01'}).text
            texts = re.findall(r'\w+', texts)
            for text in texts:
                if 'm' in text:
                    df['course_len'] = [int(re.findall(r'\d+', text)[0])] * len(df)
                if text in ["曇","晴","雨","小雨","小雪","雪"]:
                    df["weather"] = [text] * len(df)
                if text in ["良","稍重","量"]:
                    df['ground_state'] = [text] * len(df)
                if '不' in text:
                    df['ground_state'] = ['不良'] * len(df)
                #2020/12/13追加
                if '稍' in text:
                    df['grpund_state'] = ['稍重'] * len(df)
                if '芝' in text:
                    df['race_type'] = ['芝'] * len(df)
                if '障' in text:
                    df['race_type'] = ['障害'] * len(df)
                if 'ダ' in text:
                    df['race_type'] = ['ダート'] * len(df)
            df['date'] = [date] * len(df)
            
            #horse_id
            horse_id_list = []
            horse_id_list = soup.find_all("td", attrs={'class': 'HorseInfo'})
            for td in horse_td_list:
                horse_id = re.findall(r'\d+', td.find('a')['href'])[0]
                horse_id_list.append(horse_id)
            #jockey_id
            jockey_id_list = []
            jockey_id_list = soup.find_all("td", attrs={'class': 'jockey'})
            for td in jockey_td_list:
                jockey_id = re.findall(r'\d+', td.find('a')['href'])[0]
                jockey_id_list.append(jockey_id)
            df['horse_id'] = horse_id_list
            df['jockey_id'] = jockey_id_list
            
            df.index = [race_id] *len(df)
            data = data.append(df)
            time.sleep(1)
        return cls(data)
    
    #前処理
    def preprocessing(self):
        df = self.data.copy()
        
        df['性'] = df["性齡"].map(lambda x: str(x)[0])
        df["年齢"] = df["性齡"].map(lambda x: str(x)[1:]).astype(int)
        
        #馬体重を体重と体重変化に分ける
        df = df[df["馬体重(増減)"] != '--']
        df["体重"] = df["馬体重(増減)"].str.split("(", expand=True)[0].astype(int)
        df["体重変化"] = df["馬体重(増減)"].str.split("(", expand=True)[1].str[:-1]
        #2020/12/13追加:増減が前計府などのとき欠損値にする
        df['体重変化'] = pd.to_numeric(df['体重変化'],errors='coerce')
        
        df['date'] = pd.to_datetime(df["date"])
        
        df['枠'] = df['枠'].astype(int)
        df['馬番'] = df['馬番'].astype(int)
        df['斤量'] = df['斤量'].astype(int)
        
        df['開催'] = df.index.map(lambda x:str(x)[4:6])
        
        #6/6出走数追加
        df['n_horses'] = df.index.map(df.index.value_counts())
        
        #使用する列を選択
        df = df[["枠","馬番","斤量","course_len","weather","race_type","groud_state","date","horse_id","jockey_id","性","年齢","体重","体重変化","開催","n_horses"]]
        self.data_p = df.rename(colunbs={'枠': '枠番'})

In [5]:
class HorseResults:
    def __init__(self, horse_results):
        self.horse_results = horse_results[['日付','着順','賞金','着差','通過','開催','距離']]
        self.preprocessing()
        
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)
    
    @classmethod
    def scrape(horse_id_list):
        """
        馬の過去成績データをスクレイピングする関数
        Parameters:
        ----------
        horse_id_list : list
            馬IDのリスト
        Returns:
        ----------
        horse_results_df : pandas.DataFrame
            全馬の過去成績データをまとめてDataFrame型にしたもの
        """
        #horse_idをkeyにしてDataFrame型を格納
        horse_results = {}
        for horse_id in tqdm(horse_id_list):
            try:
                url = 'https://db.netkeiba.com/horse/' + horse_id
                df = pd.read_html(url)[3]
                #受賞歴がある馬の場合、3番目に受賞テーブルがあるため、4番目のデータを取得する
                if df.columns[0]=='受賞歴':
                    df = pd.read_html(url)[4]
                df.index = [horse_id] * len(df)
                horse_results[horse_id] = df
                time.sleep(1)
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break
            
        #pd.DataFrame型にして一つのデータにまとめる
        horse_results_df = pd.concat([horse_results[key] for key in horse_results])
        
        return horse_results_df
    
    def preprocessing(self):
        df = self.horse_results.copy()
        
        #着順に数字以外の文字列が含まれているものを取り除く
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)
        
        df["date"] = pd.to_datetime(df["日付"])
        df.drop(['日付'], axis=1, inplace=True)
        
        #賞金のNaNを0で埋める
        df['賞金'].fillna(0, inplace=True)
        
        #1着の着差を0にする
        df['着差'] = df['着差'].map(lambda x: 0 if x<0 else x)
        
        #レース展開データ
        #n=1 際車のコーナー位置 n=4 最終コーナー位置
        def corner(x, n):
            if type(x) != str:
                return x
            elif n==4:
                return int(re.findall(r'\d+', x)[-1])
            elif n==1:
                return int(re.findall(r'\d+', x)[0])
        df['first_corner'] = df['通過'].map(lambda x: corner(x, 1))
        df['final_corner'] = df['通過'].map(lambda x: corner(x, 4))
        
        df['final_to_rank'] = df['final_corner'] - df['着順']
        df['first_to_rank'] = df['first_corner'] - df['着順']
        df['first_to/final'] = df['first_corner'] - df['final_corner']
        
        #開催場所
        df['開催'] = df['開催'].str.extract(r'(\D+)')[0].map(place_dict).fillna('11')
        #race_type
        df['race_type'] = df['距離'].str.extract(r'(\D+)')[0].map(race_type_dict)
        #距離ほ10のくらいを切り捨てる
        df['course_len'] = df['距離'].str.extract(r'(\d+)').astype(int) // 100
        df.drop(['距離'], axis=1, inplace=True)
        #インデックス名を与える
        df.index.name = 'horse_id'
        
        self.horse_results = df
        self.target_list = ['着順','賞金','着差','first_corner','final_corner','first_to_rank','first_to_final','final_to_rank']
        
    #n_samplesレース分馬ごとに平均する
    def average(self, horse_id_list, date, n_sample='all'):
        target_df = self.horse_results.query('index in @horse_id_list')
        
        #過去何走分取り出すか決める
        if n_sample == 'all':
            filtered_df = target_df[target_d['date'] < date]
        elif n_sample > 0:
            filtered_df = target_df[target_df['date'] < date].sort_values('date', ascending=False).groupby(level=0).head(n_sample)
        else:
            raise Execption('n_sample must be > 0')
            
        
        #集計して辞書型に入れる
        self.average_dict = {}
        self.average_dict['non_category'] = filtered_df.groupby(level=0)[self.target_list].mean\
            .add_suffix('_{}R'.format(n_sample))
        for column in ['course_len', 'race_type', '開催']:
            self.average_dict[column] = filered_df.groupby(['horse_id', column])\
                [self.target_list].mean().add_suffix('_{}_{}R'.format(column, n_samples))
            
        #6/6追加,　馬の出走感覚追加のために全レースの日付をlatestに格納
        if n_sample == 5:
            self.latest = filtered_df.groupby('horse_id')['date'].max().rename('latest')
            
    def merge(self, results, date, n_sample='all'):
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        self.average(horse_id_list, date, n_sample)
        merged_df = df.merge(self.average_dict['non_category'], left_on='hprse_id', right_index=True, how='left')
        
        for column in ['course_len', 'race_type', '開催']:
            merged_df = merged_df.merge(self.average_dict[column],
                                       left_on = ['horse_id', column],
                                       right_index = True, how='left')
            
        #6/6追加, 馬の出走間隔追加のために、全レースの日付を変数latestに格納
        if n_sample == 5:
            merged_df = merged_df.merge(self.latest, left_on = 'horse_id',
                                       right_index=True, how='left')
        return merged_df
        
    def merge_all(self, results, n_sample='all'):
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results, date, n_sample) for date in tqdm(date_list)])
        return merged_df
    
#開催場所をidに変換するための辞書型
place_dict = {
    '札幌':'01','函館':'02','福島':'03','新潟':'04','東京':'05',
    '中山':'06','中京':'07','京都':'08','阪神':'09','小倉':'10'
}
    
#レースタイプをレース結果とデータ整合させるための辞書
race_type_dict = {
    '芝':'芝','ダ':'ダート','障':'障害'
}
        

クラアス内にhorse_resultsに、前処理された過去成績データが入る
下のはpickleではないものの呼び出し　だから使えない

In [3]:
#使わない
hr = HorseResults(horse_results)
hr.horse_results

NameError: name 'HorseResults' is not defined

保存してあるpickleデータからオブジェクトを作る

In [9]:
hr = HorseResults.read_pickle(['horse_results.pickle'])
hr.horse_results

Unnamed: 0_level_0,着順,賞金,着差,通過,開催,date,first_corner,final_corner,final_to_rank,first_to_rank,first_to/final,race_type,course_len
horse_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017105318,11,0.0,0.9,3-3-3-2,03,2021-07-17,3.0,2.0,-9.0,-8.0,1.0,芝,20
2017105318,16,0.0,1.9,1-1-1-1,06,2020-12-26,1.0,1.0,-15.0,-15.0,0.0,芝,22
2017105318,4,110.0,0.2,7-6-4-4,01,2020-08-02,7.0,4.0,0.0,3.0,3.0,芝,20
2017105318,6,0.0,0.6,10-7-7-2,01,2019-08-31,10.0,2.0,-4.0,4.0,8.0,芝,18
2017105318,1,500.0,0.0,1-1-1-1,01,2019-07-27,1.0,1.0,0.0,0.0,0.0,芝,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017101106,7,0.0,1.6,5-5-6-7,09,2020-03-14,5.0,7.0,0.0,-2.0,-2.0,ダート,18
2017101106,9,0.0,1.7,8-8-10-9,07,2020-02-29,8.0,9.0,0.0,-1.0,-1.0,ダート,19
2017101106,13,0.0,1.8,1-1-1-1,08,2020-02-09,1.0,1.0,-12.0,-12.0,0.0,ダート,18
2017101106,5,51.0,1.6,10-9-9-7,08,2020-01-25,10.0,7.0,2.0,5.0,3.0,ダート,18


In [10]:
class Peds:
    def __init__(self, peds):
        self.peds = peds
        self.peds_e = pd.DataFrame()#after label encoding and transforming into category
        
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = updata_data(df, pd.read_pickle(path))
        return cls(df)
    
    @staticmethod
    def scrape(horse_id_list):
        """
        血統データをスクレイピングする関数
        Parameters:
        ----------
        horse_id_list : list
            馬IDのリスト
        Returns:
        ----------
        peds_df : pandas.DataFrame
            全血統データをまとめてDataFrame型にしたもの
        """
        
        peds_dict = {}
        for horse_id in tqdm(horse_id_list):
            try:
                url = "https://db.netkeiba.com/horse/ped/" + horse_id
                df = pd.read_html(url)[0]
                
                #重複を削除して一列のseries型に直す
                generations = {}
                for i in reversed(range(5)):
                    generations[i] = df[i]
                    df.drop([i], axis=1, inplace=True)
                    df = df.drop_duplicates()
                ped = pd.concat([generations[i] for i in range[5]]).rename(horse_id)
                
                peds_dict[horse_id] = ped.reset_index(drop=True)
                time.sleep(1)
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break
                
        #列名をpeds_0, peds_61にする
        peds_df = pd.concat([peds_dict[key] for key in peds_dict], axis=1).T.add_prefix('peds_')
        
        return peds_df
    
    def encode(self):
        df = self.peds.copy()
        for column in df.columns:
            df[column] = LabelEncoder().fit_transform(df[column].fillna('Na'))
        self.peds_e = df.astype('category')
        

In [11]:
p = Peds.read_pickle(['peds.pickle'])
p.encode()
p.peds_e

Unnamed: 0,peds_0,peds_1,peds_2,peds_3,peds_4,peds_5,peds_6,peds_7,peds_8,peds_9,...,peds_52,peds_53,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61
2017105318,503,2272,180,372,179,377,98,189,94,169,...,51,604,118,325,18,188,166,870,818,511
2017104612,222,5585,156,303,752,4633,49,140,203,319,...,156,367,135,122,455,286,267,383,368,1795
2017103879,236,5206,75,23,492,3810,0,105,51,25,...,445,655,73,107,14,315,46,645,344,1880
2017106259,259,1403,185,365,783,2044,21,88,37,71,...,188,579,116,110,369,784,14,1166,1264,2934
2017104140,235,7267,163,366,394,1048,98,169,47,194,...,145,235,10,39,384,410,380,11,667,2330
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017102659,444,1584,189,123,565,3266,78,68,97,236,...,313,564,24,280,310,253,48,928,445,2262
2016102166,272,3585,153,337,493,3315,38,152,112,212,...,5,510,118,242,225,949,446,1118,1189,2608
2016103450,469,4390,189,133,658,4848,78,68,192,293,...,341,118,138,297,70,352,267,329,685,1326
2017105662,232,7326,163,260,531,4431,98,169,233,312,...,344,277,150,400,171,55,239,203,151,2113


In [14]:
r = Results.read_pickle(['results.pickle'])
r.preprocessing()
r.data_p.head()

KeyError: '性齡'