In [3]:
import pandas as pd
import time
from tqdm.notebook import tqdm
import pickle
import re 
from urllib.request import urlopen
import numpy as np
import requests
from bs4 import BeautifulSoup 
import matplotlib.pyplot as plt
from itertools import combinations
from itertools import permutations
import warnings
warnings.simplefilter(action="ignore")
import lightgbm as lgb
import optuna
import optuna.integration.lightgbm as lgb_o

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split

In [4]:
#閾値設定
def gain(return_func, X, n_samples=100, range_=[0.5, 3.5]):
    gain = {}
    for i in tqdm(range(n_samples)):
        threshold = range_[1] * i / n_samples + range_[0] * (1 - (i / n_samples))
        n_bets, return_rate, n_hits, std = return_func(X, threshold)
        if n_bets > 2:
            gain[threshold] = {"return_rate":return_rate,
                                "n_hits":n_hits,
                                "std":std,
                                "n_bets":n_bets}
    return pd.DataFrame(gain).T

#入力年のrace_id_listの制作
def race_id_c(year):
    race_id_list = []
    
    race_id_head = year
    for place in range(1,11,1):
        for kai in range(1,6,1):
            for day in range(1,13,1):
                for r in range(1,13,1):
                    race_id = str(place).zfill(2) + str(kai).zfill(2) +\
                    str(day).zfill(2) + str(r).zfill(2)
                    race_ids = race_id_head + race_id
                    race_id_list.append(race_ids)
    return race_id_list

#時系列を元にデータ分割
def split_data(df, test_size=0.3):
    sorted_id_list = df.sort_values("date").index.unique()
    train_id_list = sorted_id_list[: round(len(sorted_id_list) * (1 - test_size))]
    test_id_list = sorted_id_list[round(len(sorted_id_list) * (1 - test_size)) :]
    train = df.loc[train_id_list]#.drop(['date'], axis=1)
    test = df.loc[test_id_list]#.drop(['date'], axis=1)
    return train, test

def plot(df, label=' '):
    plt.fill_between(df.index, y1=df['return_rate']-df['std'],
        y2=df['return_rate']+df['std'],alpha=0.3)
 
    plt.plot(df.index, df['return_rate'], label=label)
    plt.legend()
    plt.grid(True)

place_dict = {
    '札幌':'01',  '函館':'02',  '福島':'03',  '新潟':'04',  '東京':'05', 
    '中山':'06',  '中京':'07',  '京都':'08',  '阪神':'09',  '小倉':'10'
}

race_type_dict = {
    '芝': '芝', 'ダ': 'ダート', '障': '障害'
}

In [5]:
horse_results = pd.read_pickle("pickle_file/horse_results_21.pickle")
peds_data = pd.read_pickle("pickle_file/n_peds_all2.pickle")
results_data = pd.read_pickle('pickle_file/results_all.pickle')
return_tables = pd.read_pickle('pickle_file/Return_tables_all.pickle')

dataprocessor

In [54]:

class DataProcessor:
    def __init__(self):
        self.data = pd.DataFrame() # raw data
        self.data_p = pd.DataFrame() #after preprocessing
        self.data_h = pd.DataFrame() #after merging horse_results
        self.data_pe = pd.DataFrame() #after merging peds
        self.data_c = pd.DataFrame() #after processing categorical features
        self.data_ = pd.DataFrame()
        #self.no_peds = pd.DataFrame()#親データがないhorse_id_list
    
    # 馬の過去成績のデータ追加    
    def merge_horse_results(self, hr, n_samples_list=[5, 9, 'all']):
        self.data_h = self.data_p.copy()
        for n_samples in n_samples_list:
            self.data_h = hr.merge_all(self.data_h, n_samples=n_samples)
            
        #self.data_h["interval"] = (self.data_h["data"] - self.data_h["latest"]).dt.days
        self.data_h.drop(["開催"],axis=1, inplace=True)
    
    # 馬の親データの追加                
    def merge_peds(self, peds):
        self.data_pe = self.data_h.merge(peds,left_on='horse_id',
        right_index=True, how='left')
        
        self.no_peds = self.data_pe[self.data_pe['peds_0'].isnull()]\
            ['horse_id'].unique()
        if len(self.no_peds) > 0:
            print('scrape peds at horse_id_list "no_peds"')
    
    # 質的変数への変換        
    def process_categorical(self, le_horse, le_jockey, results_m):
        df = self.data_pe.copy()
        
        #ラベルエンコーディング　horse_id,jockey_idを0始まりの整数に変換
        mask_horse = df['horse_id'].isin(le_horse.classes_)
        new_horse_id = df['horse_id'].mask(mask_horse).dropna().unique()
        le_horse.classes_ = np.concatenate([le_horse.classes_, new_horse_id])
        df['horse_id'] = le_horse.transform(df['horse_id'])
        
        mask_jockey = df['jockey_id'].isin(le_jockey.classes_)
        new_jockey_id = df['jockey_id'].mask(mask_jockey).dropna().unique()
        le_jockey.classes_ = np.concatenate([le_jockey.classes_, new_jockey_id])
        df['jockey_id'] = le_jockey.transform(df['jockey_id'])
        
        df["horse_id"] = df["horse_id"].astype('category')
        df["jockey_id"] = df["jockey_id"].astype('category')
        
        weathers = results_m['weather'].unique()
        race_types = results_m['race_type'].unique()
        ground_states = results_m['ground_state'].unique()
        sexes = results_m['性'].unique()
        df['weather'] = pd.Categorical(df['weather'], weathers)
        df['race_type'] = pd.Categorical(df['race_type'], race_types)
        df['ground_state'] = pd.Categorical(df['ground_state'], ground_states)
        df['性'] = pd.Categorical(df['性'], sexes)
        
        df = pd.get_dummies(df, columns=['weather', 'race_type', 'ground_state', '性'])
        #race_idを軸に馬番をsort
        df = df.reset_index().sort_values(["index","馬番"]).set_index('index')
        df.index.name = None
        
        self.data_c = df
     
    
# Results class 
# 足りないhorse_idをスクレイプした際は確認したのちdef to_data_frameを使う必要あり
class Results(DataProcessor):
    def __init__(self, results):
        super(Results, self).__init__()
        self.data = results
    
    # path_listはpickle名
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)
    
    @staticmethod
    def scrape(race_id_list, pre_race_results={}):
        race_results = pre_race_results
        for race_id in tqdm(race_id_list):
            if race_id in race_results.key():
                continue
            
            try:
                time.sleep(1)
                url = "https://db.netkeiba.com/race/" + race_id
                df = pd.read_html(url)[0]
                html = requests.get(url)
                html.encode = "EUC-JP"
                soup = BeautifulSoup(html.text, "html.parser")
                
                texts = (
                    soup.find("div", attrs={"class" : "data_intro"}).find_all("p")[0].tesxt
                    + soup.find("div", attrs={"class" : "data_intro"}).find_all("p")[1].tesxt
                )
                info = re.findall(r"\w+", texts)
                for text in info:
                    if text in ["芝", "ダート"]:
                        df["race_type"] = [text] * len(df)
                    if "障" in text:
                        df["race_type"] = ["障害"] * len(df)
                    if "m" in text:
                        df["course_len"] = [int(re.findall(r"\d+", text)[0])] * len(df)
                    if text in ["良", "稍重", "稍", "重", "不良"]:
                        df["ground_state"] = [text] * len(df)
                    if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                        df["weather"] = [text] * len(df)
                    if "年" in text:
                        df["date"] = [text] * len(df)
                        
                #馬ID、騎手IDをスクレイピング
                horse_id_list = []
                horse_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/horse")}
                )
                for a in horse_a_list:
                    horse_id = re.findall(r"\d+", a["href"])
                    horse_id_list.append(horse_id[0])
                jockey_id_list = []
                jockey_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/jockey")}
                )
                for a in jockey_a_list:
                    jockey_id = re.findall(r"\d+", a["href"])
                    jockey_id_list.append(jockey_id[0])
                df["horse_id"] = horse_id_list
                df["jockey_id"] = jockey_id_list

                #インデックスをrace_idにする
                df.index = [race_id] * len(df)
                
                race_results[race_id] = df 
                
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break
        return race_results
        # 一度dataframe型に直さずに出力を返す        
        #return race_results
        
    def to_data_frame(race_results):
        race_results_df = pd.concat([race_results[key] for key in race_results])

        return race_results_df
        
        
        
    # to_dataframe関数で出力後にdataframe型に変換が可能            
    #def to_dataframe(race_results):
        #race_results_df = pd.concat([race_results[key] for key in race_results])
        
        #r.data_rr = race_results_df                    
                
    # regressionをtrueにすることでsecond,着順が表示され回帰が行える。
    def preprocessing(self, regression=False, ranking=False):
        df = self.data.copy()

        # 着順に数字以外の文字列が含まれているものを取り除く
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)
        df['rank'] = df['着順'].map(lambda x: 1 if x < 4 else 0)

        # 性齢を性と年齢に分ける
        df["性"] = df["性齢"].map(lambda x: str(x)[0])
        df["年齢"] = df["性齢"].map(lambda x: str(x)[1:]).astype(int)

        # 馬体重を体重と体重変化に分ける (馬体重修正するかも)
        df["体重"] = df["馬体重"].str.split("(", expand=True)[0].astype(int)
        df["体重変化"] = df["馬体重"]\
            .str.split("(", expand=True)[1].replace("前計不", "0").str[:-1].astype(dtype = int)

        # データをint, floatに変換
        df["単勝"] = df["単勝"].astype(float)
        df['course_len'] = df['course_len'].astype(float) // 100

        # 不要な列を削除
        #回帰子を作るために一時的に着順,タイムの列を削除しない。
        #df.drop(["タイム", "着差", "調教師", "性齢", "馬体重","馬名","騎手","人気","着順"], axis=1, inplace=True)
        df.drop(["着差", "調教師", "性齢", "馬体重","馬名","騎手","人気","cource_len"], axis=1, inplace=True)

        df["date"] = pd.to_datetime(df["date"], format="%Y年%m月%d日")
        
        df["タイム"].fillna("0", inplace=True)
        df["a"] = df["タイム"].map(lambda x:re.sub(r"\D", "", x)).astype(str)
        df["second"] = df["a"].map(lambda x: 0 if x=="0" \
        else (int(x[0]) * 60) + int(x[1:3]) + int(x[1:])/10).astype(float)
        df.drop("a", axis=1 ,inplace=True)
        df.drop(["タイム"],axis=1, inplace=True)
        
        df["rls"] = df["second"]\
        .map(lambda x: np.sqrt(np.log(x)))
        
        df["ranking"] = df["着順"].map(lambda x: x if x==1 else\
            (x if x==2 else(x if x==3 else(x if x==4 else(x if x==5 else 0)))))
        
        df['開催'] = df.index.map(lambda x: str(x)[4:6])
        
        df["n_horses"] = df.index.map(df.index.value_counts())
        
        if regression == True:
            self.data_p = df
        else:
            self.data_p = df.drop(["second","着順","rls"],axis=1)
            
        if ranking == True:
            self.data_p = df
        else:
            self.data_p = df.drop(["ranking"], axis=1)
    
    def process_categorical(self):
        self.le_horse = LabelEncoder().fit(self.data_pe['horse_id'])
        self.le_jockey = LabelEncoder().fit(self.data_pe['jockey_id'])
        super().process_categorical(self.le_horse, self.le_jockey, self.data_pe)
  
    
# ShutubaTable class        
class ShutubaTable(DataProcessor):
    def __init__(self, shutuba_tables):
        super(ShutubaTable, self).__init__()
        self.data = shutuba_tables
        
    @classmethod    
    def scrape(cls, race_id_list, date):
        data = pd.DataFrame()
        for race_id in tqdm(race_id_list):
            
            url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + race_id
            df = pd.read_html(url)[0]
            df = df.T.reset_index(level=0, drop=True).T
            
            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")
            
            texts = soup.find("div", attrs={"class":"RaceData01"}).text
            texts = re.findall(r"\w+", texts)
            for text in texts:
                if 'm' in text:
                    df['course_len'] = [int(re.findall(r'\d+', text)[0])] * len(df)
                if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                    df["weather"] = [text] * len(df)
                if text in ["良", "稍重", "稍", "重"]:
                    df["ground_state"] = [text] * len(df)
                if '不' in text:
                    df["ground_state"] = ['不良'] * len(df)
                if '芝' in text:
                    df['race_type'] = ['芝'] * len(df)
                if '障' in text:
                    df['race_type'] = ['障害'] * len(df)
                if 'ダ' in text:
                    df['race_type'] = ['ダート'] * len(df)
            df['date'] = [date] * len(df)
            
            # horse_id
            horse_id_list = []
            horse_td_list = soup.find_all("td", attrs={'class': 'HorseInfo'})
            for td in horse_td_list:
                horse_id = re.findall(r'\d+', td.find('a')['href'])[0]
                horse_id_list.append(horse_id)
            # jockey_id
            jockey_id_list = []
            jockey_td_list = soup.find_all("td", attrs={'class': 'Jockey'})
            for td in jockey_td_list:
                jockey_id = re.findall(r'\d+', td.find('a')['href'])[0]
                jockey_id_list.append(jockey_id)
            df['horse_id'] = horse_id_list
            df['jockey_id'] = jockey_id_list

            df.index = [race_id] * len(df)
            data = data.append(df)
            time.sleep(1)
        return cls(data)

    #disclosuer = Trueで馬体重が公開されていないデータでも予測が行える        
    def preprocessing(self, disclosuer=False):
        df = self.data.copy()
        
        df["性"] = df["性齢"].map(lambda x: str(x)[0])
        df["年齢"] = df["性齢"].map(lambda x: str(x)[1:]).astype(int)

        # 馬体重を体重と体重変化に分ける
        if disclosuer == True:
            df["体重"] = 470
            df["体重変化"] = 0
        else:
            df = df[df["馬体重(増減)"] != '--']
            df["体重"] = df["馬体重(増減)"].str.split("(", expand=True)[0].astype(int)

            df["体重変化"] = df["馬体重(増減)"].str.split("(", expand=True)[1].replace("前計不)", "0)").str[:-1].astype(dtype = int)
            
        
        #df["体重変化"] = df["馬体重(増減)"].str.split("(", expand=True)[1]
        #df["体重変化"] = df["体重変化"].replace("前計不)", "0)")
        #df["体重変化"] = df["体重変化"].str[:-1].astype(int)
        
        df["date"] = pd.to_datetime(df["date"])
        
        df['枠'] = df['枠'].astype(int)
        df['馬番'] = df['馬番'].astype(int)
        df['斤量'] = df['斤量'].astype(int)
        
        df["開催"] = df.index.map(lambda x:str(x)[4:6])
        
        df["n_horses"] = df.index.map(df.index.value_counts())

        # 不要な列を削除
        df = df[['枠', '馬番', '斤量', 'course_len', 'weather','race_type',
        'ground_state', 'date', 'horse_id', 'jockey_id', '性', '年齢',
        '体重', '体重変化',"開催","n_horses"]]
        
        self.data_p = df.rename(columns={'枠': '枠番'})
        

horse_results

In [7]:
class Horse_Results:
    def __init__(self, horse_results):
        self.horse_results = horse_results[['日付','着順','賞金','着差','通過','開催','距離']]
        self.preprocessing()
        #self.horse_results.rename(columns={'着順':'着順_ave','賞金':'賞金_ave'}, inplace=True)
    
    # path_listはHorse_Results.pickle名
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.concat([pd.read_pickle(path) for path in path_list])
        return cls(df)
    # 使い方
    # Horse_Results.read_pickle([pickle名])
    
    @staticmethod
    def scrape(horse_id_list):
        #horse_idをkeyにしてDataFrame型を格納
        horse_results = {}
        for horse_id in tqdm(horse_id_list):
            try:
                url = 'https://db.netkeiba.com/horse/' + horse_id
                df = pd.read_html(url)[3]
                #受賞歴がある馬の場合、3番目に受賞歴テーブルが来るため、4番目のデータを取得する
                if df.columns[0]=='受賞歴':
                    df = pd.read_html(url)[4]
                df.index = [horse_id] * len(df)
                horse_results[horse_id] = df
                time.sleep(1)
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる        
        horse_results_df = pd.concat([horse_results[key] for key in horse_results])

        return horse_results_df
        
        
    def preprocessing(self):
        df = self.horse_results.copy()

        # 着順に数字以外の文字列が含まれているものを取り除く
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)

        df["date"] = pd.to_datetime(df["日付"])
        df.drop(['日付'], axis=1, inplace=True)
        
        #賞金のNaNを0で埋める
        df['賞金'].fillna(0, inplace=True)
        
        #1着の着差を0にする
        df['着差'] = df['着差'].map(lambda x: 0 if x<0 else x)
        
        #レース展開データ
        #n=1: 最初のコーナー位置, n=4: 最終コーナー位置
        def corner(x, n):
            if type(x) != str:
                return x
            elif n==4:
                return int(re.findall(r'\d+', x)[-1])
            elif n==1:
                return int(re.findall(r'\d+', x)[0])
        df['first_corner'] = df['通過'].map(lambda x: corner(x, 1))
        df['final_corner'] = df['通過'].map(lambda x: corner(x, 4))
        
        df['final_to_rank'] = df['final_corner'] - df['着順']
        df['first_to_rank'] = df['first_corner'] - df['着順']
        df['first_to_final'] = df['first_corner'] - df['final_corner']
        
        #開催場所
        df['開催'] = df['開催'].str.extract(r'(\D+)')[0].map(place_dict).fillna('11')
        #race_type
        df['race_type'] = df['距離'].str.extract(r'(\D+)')[0].map(race_type_dict)
        #距離
        df['course_len'] = df['距離'].str.extract(r'(\d+)').astype(int) // 100
        df.drop(['距離'], axis=1, inplace=True)
        
        #インデックス名を与える
        df.index.name = 'horse_id'
    
        self.horse_results = df
        self.target_list = ['着順', '賞金', '着差', 'first_corner',
                            'first_to_rank', 'first_to_final','final_to_rank']
        
        
    def average(self, horse_id_list, date, n_samples='all'):
        target_df = self.horse_results.loc[horse_id_list]
        
        #過去何走分取り出すか指定
        if n_samples == 'all':
            filtered_df = target_df[target_df['date'] < date]
        elif n_samples > 0:
            filtered_df = target_df[target_df['date'] < date].\
                sort_values('date', ascending=False).groupby(level=0).head(n_samples)
        else:
            raise Exception('n_samples must be >0')
          
        self.average_dict = {}
        self.average_dict['non_category'] = filtered_df.groupby(level=0)[self.target_list]\
            .mean().add_suffix('_{}R'.format(n_samples))
        for column in ['course_len', 'race_type', '開催']:
            self.average_dict[column] = filtered_df.groupby(['horse_id', column])\
                [self.target_list].mean().add_suffix('_{}_{}R'.format(column, n_samples))    
    
    def merge(self, results, date, n_samples='all'):
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        self.average(horse_id_list, date, n_samples)
        merged_df = df.merge(self.average_dict['non_category'], left_on='horse_id',
                             right_index=True, how='left')
        for column in ['course_len','race_type', '開催']:
            merged_df = merged_df.merge(self.average_dict[column], 
                                        left_on=['horse_id', column],
                                        right_index=True, how='left')
        return merged_df
    
    def merge_all(self, results, n_samples='all'):
        date_list = results['date'].unique()
        merged_df = pd.concat(
            [self.merge(results, date, n_samples) for date in tqdm(date_list)]
        )
        return merged_df
    
   

return

In [8]:
class Return:
    def __init__(self, return_tables):
        self.return_tables = return_tables
    
    # path_listはpathではなく保存名で良い
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.concat([pd.read_pickle(path) for path in path_list])
        return cls(df)
        
    @staticmethod
    def scrape(race_id_list):
        return_tables = {}
        for race_id in tqdm(race_id_list):
            try:
                url = "https://db.netkeiba.com/race/" + race_id

                #普通にスクレイピングすると複勝やワイドなどが区切られないで繋がってしまう。
                #そのため、改行コードを文字列brに変換して後でsplitする
                f = urlopen(url)
                html = f.read()
                html = html.replace(b'<br />', b'br')
                dfs = pd.read_html(html)

                #dfsの1番目に単勝〜馬連、2番目にワイド〜三連単がある
                df = pd.concat([dfs[1], dfs[2]])

                df.index = [race_id] * len(df)
                return_tables[race_id] = df
                time.sleep(0.7)
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる
        return_tables_df = pd.concat([return_tables[key] for key in return_tables])
        return return_tables_df
    
    @property
    def sanrenpuku(self):
        sanrenpuku = self.return_tables[self.return_tables[0] == "三連複"][[1,2]]
        wins = sanrenpuku[1].str.split('-', expand=True)[[0,1,2]].add_prefix('wins_')
        return_ = sanrenpuku[2].rename('return')
        df = pd.concat([wins, return_], axis=1)
        return df.apply(lambda x:pd.to_numeric(x.str.replace(',',''), errors='coerce'))
    
    @property
    def sanrentan(self):
        sanrentan = self.return_tables[self.return_tables[0] == "三連単"][[1,2]]
        wins = sanrentan[1].str.split('→', expand=True)[[0,1,2]].add_prefix('wins_')
        return_ = sanrentan[2].rename('return')
        df = pd.concat([wins, return_], axis=1)
        return df.apply(lambda x: pd.to_numeric(x.str.replace(',',''), errors='coerce'))
    
    @property #本来ならRetrun(return_tables).fukusho(retrun_tables)の形だが、
    #Retrun(return_tables).fukushoで扱える
    def fukusho(self):
        fukusho = self.return_tables[self.return_tables[0] == '複勝'][[1,2]]
        wins = fukusho[1].str.split('br', expand=True)[[0,1,2]]
        wins.columns = ['win_0','win_1','win_2']
        returns = fukusho[2].str.split('br', expand=True)[[0,1,2]]
        returns.columns = ['return_0','return_1','return_2']
        df = pd.concat([wins, returns], axis=1)
        for column in df.columns:
            df[column] = df[column].str.replace(',', "")
        return df.fillna(0).astype(int)
        
    
    @property
    def wide(self):
        wide = self.return_tables[self.return_tables[0] == "ワイド"][[1,2]]
        wins = wide[1].str.split('br', expand=True)[[0,1,2]]
        wins = wins.stack().str.split('-', expand=True).add_prefix('win_')
        return_ = wide[2].str.split('br', expand=True)[[0,1,2]]
        return_ = return_.stack().rename("return")
        df = pd.concat([wins, return_], axis=1)
        return df.apply(lambda x: pd.to_numeric(x.str.replace(",",""),errors='coerce'))
    
    @property #単勝に対して予測を行う
    def tansho(self):
        tansho = self.return_tables[self.return_tables[0] == '単勝'][[1,2]]
        tansho.columns = ['win','return']
        
        for column in tansho.columns:
            tansho[column] = pd.to_numeric(tansho[column], errors='coerce')
        return tansho
    
    @property
    def umaren(self):
        umaren = self.return_tables[self.return_tables[0] == "馬連"][[1,2]]
        wins = umaren[1].str.split('-', expand=True)[[0,1]].add_prefix("win_")
        #wins.columns = (["win_1","win_2"])#.add_prefix("win_")
        return_ = umaren[2].rename('return')
        df = pd.concat([wins, return_], axis=1)
        return df.apply(lambda x: pd.to_numeric(x, errors="coerce"))
    
    @property
    def umatan(self):
        umatan = self.return_tables[self.return_tables[0] == '馬単'][[1,2]]
        wins = umatan[1].str.split("→", expand=True)[[0,1]].add_prefix("win_")
        return_ = umatan[2].rename('return')
        df = pd.concat([wins, return_], axis=1)
        return df.apply(lambda x: pd.to_numeric(x, errors="coerce"))
        
        #for column in umaren.columns:
            #umaren[column] = pd.to_numeric(umaren[column], errors='coerce')

modeleva

In [9]:
#このクラス内において、第一引数にあたるxにはX_testのようなテストデータを入れる(単勝項目有)
# return_tables_path = pickle_path
class ModelEvaluator:
    
    def __init__(self, model, return_tables_list):
        self.model = model
        self.rt = Return.read_pickle(return_tables_list)
        self.fukusho = self.rt.fukusho
        self.tansho = self.rt.tansho
        self.umaren = self.rt.umaren
        self.umatan = self.rt.umatan
        self.wide = self.rt.wide
        self.sanrentan = self.rt.sanrentan
        self.sanrenpuku = self.rt.sanrenpuku
        #self.std = std

    #3着以内に入る確率を予測、表示
    #X = Objective Variable type
    #引数train 項目に"単勝"があればdropする
    #引数std 標準偏差の計算を行う
    #引数minmax 出力された値のスケーリングを行う
    def predict_proba(self, X, train=True, std=True, minmax=False):
        #相対評価工程
        if train:
            proba = pd.Series(self.model.predict_proba(X.drop(["単勝"], axis=1))[:,1], index=X.index)
        else:
            proba = pd.Series(self.model.predict_proba(X, axis=1)[:,1], index=X.index)
        #proba = pd.Series(self.model.predict_proba(X)[:,1], index=X.index)
        if std:
            standerd_scaler = lambda x: (x - x.mean()) / x.std()
            proba = proba.groupby(level=0).transform(standerd_scaler)
            
        #min-maxスケーリング
        if minmax:
            proba = (proba - proba.min()) / (proba.max() - proba.min())
        return proba 
    
    #閾値(threshold)を設定する　デフォルト0.6
    #predict_probaで確率がthreshold以上であれば1を出力(1=賭ける)
    def predict(self, X, threshold=0.6):
        y_pred = self.predict_proba(X)
        self.proba = y_pred
        return [0 if p<threshold else 1 for p in y_pred]
        
    #auc曲線のスコアを求める
    def roc_auc_score(self, y_test, X_test):
        return roc_auc_score(y_test, lgb_clf.predict_proba(X_test.drop(["単勝"], axis=1))[:, 1])
    
    #各成分の特徴量の強さの表示 デフォルト20
    def feature_importance(self, X, n_display=20):
        importances = pd.DataFrame({"features":X.columns, 
                                  "importance":self.model.feature_importances_})
        return importances.sort_values('importance', ascending=False)[:n_display]
    
    #閾値を通して1と判定されたものだけをpred_tableとして出力する
    def pred_table(self, X, threshold=0.6, bet_only = True):
        pred_table = X.copy()[['馬番',"単勝"]]
        pred_table['pred'] = self.predict(X, threshold)
        pred_table["score"] = self.proba
        return pred_table[pred_table["pred"] == 1]
  
    # umabanはint型で入力する必要がある
    def bet(self, race_id, kind, umaban, amount):
        if kind == "tansho":
            rt_a = self.tansho.loc[race_id]
            return_ = (rt_a['win'] == umaban) * amount/100 * rt_a['return']
        elif kind == "fukusho":
            rt_a = self.fukusho.loc[race_id]
            return_ = ((rt_a[["win_0","win_1","win_2"]] == umaban).values * \
            rt_a[['return_0',"return_1","return_2"]]).sum() * amount/100
        elif kind == "umaren":
            rt_a = self.umaren.loc[race_id]
            return_ = (set(rt_a[["win_0","win_1"]]) == set(umaban)) * rt_a["return"] *\
            amount/100
        elif kind == "umatan":
            rt_a = self.umatan.loc[race_id]
            return_ = (list(rt_a[["win_0","win_1"]]) == list(umaban)) * rt_a["return"] *\
            amount/100
        elif kind == "wide":
            rt_a = self.wide.loc[race_id]
            return_ = (rt_a[["win_0","win_1"]].apply(lambda x:set(x)==set(umaban),axis=1)) *\
            rt_a["return"] / 100 * amount
        elif kind == "sanrentan":
            rt_a = self.sanrentan.loc[race_id]
            return_ = (list(rt_a[["wins_0","wins_1","wins_2"]])==list(umaban))*\
            rt_a["return"] / 100 * amount 
        elif kind == 'sanrenpuku':
            rt_a = self.sanrenpuku.loc[race_id]
            return_ = (set(rt_a[["wins_0","wins_1","wins_2"]])==set(umaban))*\
            rt_a["return"]/100 * amount
        elif not (return_ >= 0):
            return_ = amount
        return return_
                

    #items=True　にすることで、項目名の確認が可能になる。
    #ただし、Trueの状態では、gain関数に利用することができないのでFalseの必要がある  
    def fukusho_return(self, X, threshold=0.6, items=False):
        pred_table = self.pred_table(X, threshold)
        n_bets = len(pred_table)
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_list.append(np.sum([\
                self.bet(race_id,"fukusho",umaban,1) for umaban in preds["馬番"]\
                                      ]))
        return_rate = np.sum(return_list) / n_bets
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        n_hits = np.sum([x>0 for x in return_list])
        
        if items == True:    
            return {"n_bets":n_bets, "return_rate":return_rate, "n_hits":n_hits, "std":std}
        else:
            return n_bets, return_rate, n_hits, std
    
    def tansho_return(self, X, threshold=0.6, items=False):
        pred_table = self.pred_table(X, threshold)
        self.sample = pred_table
        n_bets = len(pred_table)
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_list.append(
                np.sum([self.bet(race_id,"tansho",umaban,1) for umaban in preds["馬番"]])
            )
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        if items == True:
            return {"n_bets":n_bets, "return_rate":return_rate, "n_hits":n_hits, "std":std}
        else:
            return n_bets, return_rate, n_hits, std     
 
    def tansho_return_proper(self, X, threshold=0.6, items=False):
        pred_table = self.pred_table(X, threshold)
        n_bets = len(pred_table)
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_list.append(
                np.sum(preds.apply(lambda x:self.bet(
                race_id, "tansho", x["馬番"], 1/x["単勝"]), axis=1)))
        bet_money = (1 / pred_table["単勝"]).sum()
        std = np.std(return_list) * np.sqrt(len(return_list)) / bet_money
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / bet_money
        if items == True:
            return {"n_bets":n_bets, "return_rate":return_rate, "n_hits":n_hits, "std":std}
        else:
            return n_bets, return_rate, n_hits, std

    
    def umaren_box(self, X, threshold=0.6, n_aite=5, items=False):
        pred_table = self.pred_table(X, threshold)
        n_bets = 0
        return_list = []
        
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                continue
            elif len(preds_jiku) >= 2:
                for umaban in combinations(preds_jiku["馬番"], 2):
                    return_ += self.bet(race_id, "umaren", umaban, 1)
                    n_bets += 1
                return_list.append(return_)
                
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        if items == True:    
            return {"n_bets":n_bets, "return_rate":return_rate, "n_hits":n_hits, "std":std}
        else:
            return n_bets, return_rate, n_hits, std

    
    def umatan_box(self, X, threshold=0.6, n_aite=5, items=False):
        pred_table = self.pred_table(X, threshold, bet_only=False)
        n_bets = 0
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query("pred == 1")
            if len(preds_jiku) == 1:
                continue
            elif len(preds_jiku) >= 2:
                for umaban in permutations(preds_jiku["馬番"], 2):
                    return_ += self.bet(race_id, "umatan", umaban, 1)
                    n_bets += 1
                return_list.append(return_)
            
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets            
        if items == True:
            return {"n_bets":n_bets, "return_rate":return_rate, "n_hits":n_hits, "std":std}
        else:
            return n_bets, return_rate, n_hits, std

        
    def wide_box(self, X, threshold=0.6, n_aite=5, items=False):
        pred_table = self.pred_table(X, threshold, bet_only=False)
        n_bets = 0
            
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query("pred == 1")
            if len(preds_jiku) == 1:
                continue
            elif len(preds_jiku) >= 2:
                for umaban in combinations(presd_jiku["馬番"], 2):
                    return_ += self.bet(race_id, "wide", umaban, 1)
                    n_bets += 1
                return_list.append(return_)
                    
        std = np.sum(return_list) * np.sqrt(len(return_list)) / n_bets
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        if items == True:
            return {"n_bets":n_bets, "return_rate":return_rate, "n_hits":n_hits, "std":std}
        else:
            return n_bets, return_rate, n_hits, std
            
    def sanrentan_box(self, X, threshold=0.6, items=False):
        pred_table = self.pred_table(X, threshold)
        n_bets = 0
            
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            if len(prefs) < 3:
                continue
            else:
                for umaban in permutations(preds["馬番"], 3):
                    return_ += self.bet(race_id, "sanrentan", umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
                
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        if items == True:
            return {"n_bets":n_bets, "return_rate":return_rate, "n_hits":n_hits, "std":std}
        else:
            return n_bets, return_rate, n_hits, std
            
    def sanrenpuku_box(self, X, threshold=0.6, items=False):
        pred_table = self.pred_table(X, threshold)
        n_bets = 0
        return_list = []
            
        for race_id, preds in pred_table.groupbu(level=0):
            return_ = 0
            if len(preds) < 3:
                continue
            else:
                for umaban in combinations(preds["馬番"], 3):
                    return_ += self.bet(race_id ,"sanrenpuku", umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        if items == True:
            return {"n_bets":n_bets, "return_rate":return_rate, "n_hits":n_hits, "std":std}
        else:
            return n_bets, return_rate, n_hits, std
        
    def umaren_nagashi(self, X, threshold=0.6, n_aite=5, items=False):
        pred_table = self.pred_table(X, threshold, bet_only=False)
        n_bets = 0
        return_lsit = []
            
        for race_id, preds in pred_table.groupbu(level=0):
            return_ = 0
            preds_ijku = preds.query("pred == 1")
            if len(preds_jiku) == 1:
                preds_aite = preds.sort_values("score", ascending=False)\
                .iloc[1:(n_aite+1)]["馬番"]
                return_ = preds_aite.map(
                race_id, "umaban", [preds_jiku["馬番"].values[0], x], 1).sum()
                n_bets += n_aite
                return_list.append(return_)
            elif len(preds_jiku) >= 2:
                for umaban in combinations(preds_jiku["馬番"], 2):
                    return_ += self.bet(race_id, "umaban", umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        n_hits = np.sum([x>0 for x in return_list])    
        return_rate = np.sum(return_list) / n_bets 
        if items == True:
            return {"n_bets":n_bets, "return_rate":return_rate, "n_hits":n_hits, "std":std}
        else:
            return n_bets, return_rate, n_hits, std
        
    def umatan_nagashi(self, X, threshold=0.6, n_aite=5, items=False):
        pred_table = self.pred_table(X, threshold, bet_only=False)
        n_bets = 0
        return_lsit = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query("pred == 1")
            if len(preds_jiku) == 1:
                preds_aite = preds.sort_values("score", ascending=False).\
                iloc[1: (n_aite+1)]["馬番"]
                return_ = preds_aite.map(
                lambda x: self.bet(race_id, "umatan", [preds_jiku["馬番"].values[0], x], 1)).sum()
                n_bets += n_aite
            elif len(preds_jiku) >= 2:
                for umaban in permutations(preds_jiku["馬番"], 2):
                    return_ += self.bet(race_id, "umatan", umaban, 1)
                    n_bets += 1
            return_list.append(return_)
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        if items == True:
            return {"n_bets":n_bets, "return_rate":return_rate, "n_hits":n_hits, "std":std}
        else:
            return n_bets, return_rate, n_hits, std
        
    def wide_nagashi(self, X, threshold=0.6, n_aite=5, items=False):
        pred_table = self.pred_table(X, threshpld, bet_only=False)
        n_bets = 0
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query("pred == 1")
            if len(preds_jiku) == 1:
                preds_aite = preds.sort_values("scoer", ascending=False).iloc[1:(n_aite+1)]["馬番"]
                return_ = preds_aite.map(lambda x: self.bet(race_id, "wide", [preds_jiku["馬番"].values[0], x], 1)).sum()
                n_bets += len(preds_aite)
                return_list.append(return_)
            elif len(preds_jiku) >= 2:
                for umaban in combinations(preds_jiku["馬番"], 2):
                    return_ += self.bet(race_id, "wide", umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        if items == True:
            return {"n_bets":n_bets, "return_rate":return_rate, "n_hits":n_hits, "std":std}
        else:
            return n_bets, return_rate, n_hits, std
        
    def sanrentan_nagshi(self, X, thresholf=1.5, n_aite=7, items=False):
        pred_table = self.pred_table(X, thresholod, bet_only=False)
        n_bets = 0
        return_ = []
        for race_id, preds in pred_table.groupbu(level=0):
            preds_jiku = preds.query("pred == 1")
            if len(preds_jiku) == 1:
                continue
            elif len(preds_jiku) == 2:
                preds_aite = preds.sort_values("score", ascending=False).\
                iloc[2: (n_aite+2)]["馬番"]
                return_ = preds_aite.map(lambda x: self.bet(race_id, "sanrentan",np.append\
                                                                (preds_jiku["馬番"].values, x), 1)).sum()
                n_bets += len(preds_aite)
                return_list.append(return_)
            elif len(preds_jiku) >= 3:
                return_ = 0
                for umaban in permutations(preds_jiku["馬番"], 3):
                    return_ += self.bet(race_id, "sanrentan", umaban, 1)
                    n_bets += 1
                return_list.append(return_)
                
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_lsit) / n_bets
        if items == True:
            return {"n_bets":n_bets, "return_rate":return_rate, "n_hits":n_hits, "std":std}
        else:
            return n_bets, return_rate, n_hits, std
            

peds

In [10]:
class Peds:
    def __init__(self, peds):
        self.peds = peds
        self.peds_e = pd.DataFrame() #after label encoding and transforming into category
    
    def encode(self):
        df = self.peds.copy()
        for column in df.columns:
            df[column] = LabelEncoder().fit_transform(df[column].fillna('Na'))
        self.peds_e = df.astype('category')
    
    # Peds.read_pickle(["pickle_path"])
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)
        #df = pd.concat([pd.read_pickle(path) for path in path_list])
        #return cls(df)
    
    @staticmethod
    def scrape(horse_id_list):
        peds_dict = {}
        for horse_id in tqdm(horse_id_list):
            try:
                url = "https://db.netkeiba.com/horse/ped/" + horse_id
                df = pd.read_html(url)[0]
                
                #重複を削除して1列のSeries型データに直す
                generations = {}
                for i in reversed(range(5)):
                    generations[i] = df[i]
                    df.drop([i], axis = 1, inplace = True)
                    df = df.drop_duplicates()
                ped = pd.concat([generations[i] for i in range(5)]).rename(horse_id)
                
                peds_dict[horse_id] = ped.reset_index(drop = True)
                time.sleep(0.7)
                
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break
        #return peds
                
        #列名をpeds_0, ..., peds_61にする
        peds_df = pd.concat([peds_dict[key] for key in peds_dict], axis=1).T.add_prefix('peds_')
    
        return peds_df

In [50]:
#過去レースデータの前処理
r = Results(results_data)
r.preprocessing(regression=True, ranking=False)
#馬の過去レース情報の追加
hr = Horse_Results.read_pickle(["pickle_file/horse_results_19.pickle",
                   'pickle_file/horse_results_20.pickle',
                   'pickle_file/horse_results_21.pickle'])
r.merge_horse_results(hr)
#馬の親データの追加、カテゴリデータとしてエンコード
P = Peds(peds_data)
P.encode()
r.merge_peds(P.peds_e)
#質的データのエンコード
r.process_categorical()
#テストデータ、訓練データ、学習データに分割
train, test = split_data(r.data_c)
train, valid = split_data(train)

X_train = train.drop(["rank","date","単勝","second","rls","着順","data"],axis=1)
y_train = train["rank"]
X_test = test.drop(["rank","date","単勝","second","rls","着順","data"],axis=1)
y_test = test["rank"]
X_valid = valid.drop(["rank","date","単勝","second","rls","着順","data"],axis=1)
y_valid = valid["rank"]

  0%|          | 0/111 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]

In [17]:
#optunaによって出力されたハイパーパラメータ
params = {
 'objective': 'binary',
 'random_state': 100,
 'feature_pre_filter': False,
 'lambda_l1': 9.490245203532942e-07,
 'lambda_l2': 6.421168438428032,
 'num_leaves': 36,
 'feature_fraction': 0.4,
 'bagging_fraction': 1.0,
 'bagging_freq': 0,
 'min_child_samples': 5}

In [55]:
#学習データをフィッティング
lgb_clf = lgb.LGBMClassifier(**params)
lgb_clf.fit(X_train.values, y_train.values)

LGBMClassifier(bagging_fraction=1.0, bagging_freq=0, feature_fraction=0.4,
               feature_pre_filter=False, lambda_l1=9.490245203532942e-07,
               lambda_l2=6.421168438428032, min_child_samples=5, num_leaves=36,
               objective='binary', random_state=100)

In [56]:
#予測を行うレースidを入力
race_id = ["202106050811"]　#予測を行いたいレースIDを入力　2021年6月5日京都競馬場で行われる第11レース
st = ShutubaTable.scrape(race_id, "2021/12/26")　

#出馬テーブルの整形
st.preprocessing()
st.merge_horse_results(hr)
st.merge_peds(P.peds_e)
st.process_categorical(r.le_horse, r.le_jockey, r.data_pe)

me = ModelEvaluator(lgb_clf,["pickle_file/Return_tables_all.pickle"])

#予測
scores = me.predict_proba(st.data_c.drop(['date'], axis=1), train=False)
pred = st.data_c[['馬番']].copy()
pred['score'] = scores
pred.loc['202106050811'].sort_values('score', ascending=False)



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,馬番,score
202106050811,7,2.643638
202106050811,9,1.499066
202106050811,10,0.815996
202106050811,2,0.27056
202106050811,11,0.216243
202106050811,16,0.129985
202106050811,13,0.115631
202106050811,5,0.025744
202106050811,6,-0.203321
202106050811,12,-0.28255


In [None]:
regressionのFalseで特徴量を削除する
