# import

In [1]:
import pandas as pd
from tqdm.notebook import tqdm as tqdm
import requests
from bs4 import BeautifulSoup
import re
import time
import urllib.request
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score,roc_curve, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import average_precision_score
import numpy as np
import matplotlib.pyplot as plt
from graphviz import *
from sklearn.preprocessing import LabelEncoder
import optuna.integration.xgboost as xgb_o
import optuna.integration.lightgbm as lgb_o
import scipy as sp
import lightgbm as lgb
import optuna
import sklearn
from scipy.special import comb
from itertools import combinations
import copy
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score
from scipy.special import comb
from itertools import permutations
import datetime
import lxml
import seaborn as sns
from hyperopt import hp, tpe, Trials, fmin,STATUS_OK
import fasttext as ft

# path

In [2]:
path_ubu = '/home/hipro/デスクトップ/Horse/Data/20_21'
path_mac2 = '/Users/rince/Desktop/Horse/Data/saishin2/'
path_mac = '/Users/rince/Desktop/Horse/Data/saishin/'
path_win = '/Users/Owner/Desktop/program/Horse/Data/saishin/'
path_win2 = '/Users/Owner/Desktop/program/Horse/Data/saishin2/'

# funcs

In [3]:

def split_data(df, test_size=0.2, rank_learning=True):
    df_ = df.copy()
    if not rank_learning:
        df_['rank'] = df_['rank'].map(lambda x:1 if x<4 else 0)
    sorted_id_list = df_.sort_values("date").index.unique()
    train_id_list = sorted_id_list[: round(len(sorted_id_list) * (1 - test_size))]
    test_id_list = sorted_id_list[round(len(sorted_id_list) * (1 - test_size)) :]
    train = df_.loc[train_id_list]#.drop(['date'], axis=1)
    test = df_.loc[test_id_list]#.drop(['date'], axis=1)
    return train, test

def rus_data(df, test_size=0.2):
    train, test = split_data(df,test_size=test_size)
    x_train = train.drop(['rank', 'date','単勝'], axis=1)
    y_train = train['rank']
    x_test = test.drop(['rank', 'date','単勝'], axis=1)
    y_test = test['rank']
    
    rus = RandomUnderSampler(random_state=0)
    x_resampled, y_resampled = rus.fit_resample(x_train, y_train)
    return x_resampled, y_resampled, x_test, y_test

def load_csv(load_path):
    df = pd.read_csv(load_path, index_col=0)
    return df

def gain(return_func, x_, n_samples=100,lower=50,t_range=[0.5,3.5]):
    gain = {}
    for i in range(n_samples):
        threshold = t_range[1] * (i/n_samples) + t_range[0] *(1-i/n_samples)
        n_bets, return_rate, n_hits,std = return_func(x_, threshold)
        if n_bets > lower:
            gain[threshold] = {'return_rate':return_rate,'n_hits':n_hits,'std':std,'n_bets':n_bets}
    return pd.DataFrame(gain).T

place_dict = {
    '札幌':'01',  '函館':'02',  '福島':'03',  '新潟':'04',  '東京':'05', 
    '中山':'06',  '中京':'07',  '京都':'08',  '阪神':'09',  '小倉':'10'
}

race_type_dict = {
    '芝': '芝', 'ダ': 'ダート', '障': '障害'
}

def plot(g,label=''):
    plt.fill_between(g.index,y1 = g['return_rate'] - g['std'],y2=g['return_rate']+g['std'],alpha=0.3)
    plt.plot(g.index,g['return_rate'],label=label)
    plt.grid(True)
    
def update_data(old, new):
    """
    Parameters:
    ----------
    old : pandas.DataFrame
        古いデータ
    new : pandas.DataFrame
        新しいデータ
    """

    filtered_old = old[~old.index.isin(new.index)]
    return pd.concat([filtered_old, new])

def scrape_race_results(race_id_list, pre_race_results={}):
    race_results = pre_race_results
    for race_id in race_id_list:
        if race_id in race_results.keys():
            continue
        try:
            time.sleep(0.5)
            url = "https://db.netkeiba.com/race/" + race_id
            race_results[race_id] = pd.read_html(url)[0]
        except IndexError:
            continue
        except Exception as e:
            print(e)
            break
        except:
            break
    return race_results

def plot_importances(xgb_model, x_test):
    importances = pd.DataFrame(
    {'features' : x_test.columns, 'importances' : xgb_model.feature_importances_})
    print(importances.sort_values('importances', ascending=False)[:20])
    
def xgb_pred(x_train, y_train, x_test, y_test):
    param_dist = {'objective':'binary:logistic',
                  'n_estimators':14,
                  'use_label_encoder':False,
                 'max_depth':4,
                 'random_state':100}
    
    best_params = {'booster': 'gbtree', 
                   'objective': 'binary:logistic',
                   'use_label_encoder':False,
                   'eval_metric': 'rmse', 
                   'random_state': 100, 
                   'use_label_encoder':False,
                   'eta': 0.13449222415941048,
                   'max_depth': 3,
                   'lambda': 0.7223936363734638, 
                   'n_estimators': 14, 
                   'reg_alpha': 0.7879044553842869,
                   'reg_lambda': 0.7780344172793093,
                   'importance_type': 'gain'}
    xgb_model = xgb.XGBClassifier(**best_params)
    hr_pred = xgb_model.fit(x_train.astype(float), np.array(y_train), eval_metric='logloss').predict(x_test.astype(float))
    print("---------------------")
    y_proba_train = xgb_model.predict_proba(x_train)[:,1]
    y_proba = xgb_model.predict_proba(x_test)[:,1]
    print('AUC train:',roc_auc_score(y_train,y_proba_train))    
    print('AUC test :',roc_auc_score(y_test,y_proba))
    print(classification_report(np.array(y_test), hr_pred))
    xgb.plot_importance(xgb_model) 
    plot_importances(xgb_model, x_test)
    return xgb_model

def lgb_pred(x_train, y_train, x_test, y_test):
    param_dist = {
        'objective' : 'binary',
          'random_state':100,
                 }
    best_params = {'objective': 'binary',
     'metric': 'l1',
     'verbosity': -1,
     'boosting_type': 'gbdt',
     'feature_pre_filter': False,
     'lambda_l1': 0.001101158293733924,
     'lambda_l2': 7.419556660834531e-07,
     'num_leaves': 254,
     'feature_fraction': 1.0,
     'bagging_fraction': 0.9773374137350906,
     'bagging_freq': 1,
     'min_child_samples': 5,
    #  'num_iterations': 200,
    #  'early_stopping_round': 50,
     'categorical_column': [4,
                            5,94,95,96,97,  98,  99,  100,  101,  102,  103,  104,  105,  106,  107,  108,  109,  110,  111,  112,  113,  114,  115,  116,  117,  118,  119,  120,  121,  122,  123,  124,  125,  126,  127,  128,  129,  130,  131,  132,  133,  134,  135,  136,  137,  138,  139,  140,  141,  142,  143,  144,  145,  146,  147,  148,  149,  150,  151,  152,  153,  154,
      155]
                  }

    lgb_model = lgb.LGBMClassifier(**best_params)
    hr_pred = lgb_model.fit(x_train.astype(float), np.array(y_train), eval_metric='logloss').predict(x_test.astype(float))
    print("---------------------")
    y_proba_train = lgb_model.predict_proba(x_train.astype(float))[:,1]
    y_proba = lgb_model.predict_proba(x_test.astype(float))[:,1]
    print('AUC train:',roc_auc_score(y_train,y_proba_train))    
    print('AUC test :',roc_auc_score(y_test,y_proba))
    print(classification_report(np.array(y_test), hr_pred))
    plt.clf()
    lgb.plot_importance(lgb_model) 
    plot_importances(lgb_model, x_test)
    return lgb_model

def make_data(data_,test_rate=0.8,is_rus=True):
    data_ = data_.sort_values('date')
    x_ = data_.drop(['rank','date','単勝'],axis=1)
    y_ = data_['rank']

    test_rate = int(test_rate*len(x_))
    x_train, x_test = x_.iloc[:test_rate],x_.iloc[test_rate:]
    y_train, y_test = y_.iloc[:test_rate],y_.iloc[test_rate:]
    if is_rus:
        rus = RandomUnderSampler(random_state=0)
        x_resampled, y_resampled = rus.fit_resample(x_train, y_train)
        return x_resampled, y_resampled, x_test, y_test
    else:
        return x_train,y_train,x_test,y_test

def make_check_data(data_,test_rate=0.8):
    data_ = data_.sort_values('date')
    x_ = data_.drop(['rank','date'],axis=1)
    y_ = data_['rank']

    test_rate = int(test_rate*len(x_))
    x_train, x_check = x_.iloc[:test_rate],x_.iloc[test_rate:]
    y_train, y_check = y_.iloc[:test_rate],y_.iloc[test_rate:]

    return x_check,y_check

def grid_search(x_train,y_train,x_test,y_test):
    trains = xgb.DMatrix(x_train.astype(float), label=y_train)
    tests = xgb.DMatrix(x_test.astype(float), label=y_test)

    base_params = {
        'booster': 'gbtree',
        'objective':'binary:logistic',
        'eval_metric': 'rmse',
        'random_state':100,
        'use_label_encoder':False
    }

    watchlist = [(trains, 'train'), (tests, 'eval')]
    tmp_params = copy.deepcopy(base_params)
    
#     インナー関数
    def optimizer(trial):
        eta = trial.suggest_uniform('eta', 0.01, 0.3)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        __lambda = trial.suggest_uniform('lambda', 0.7, 2)
        n_estimators = trial.suggest_int('n_estimators', 3, 20)
        learning_rate = trial.suggest_uniform('lambda', 0.01, 1)
        reg_alpha = trial.suggest_uniform('reg_alpha', 0.01, 1)
        reg_lambda = trial.suggest_uniform('reg_lambda', 0.01, 1)
        importance_type = trial.suggest_categorical('importance_type',
                                                    ['gain', 'weight', 'cover','total_gain','total_cover'])

        tmp_params['eta'] = eta
        tmp_params['max_depth'] = max_depth
        tmp_params['lambda'] = __lambda
        tmp_params['n_estimators'] = n_estimators
        tmp_params['learning_rate'] = learning_rate
        tmp_params['reg_alpha'] = reg_alpha
        tmp_params['reg_lambda'] = reg_lambda
        tmp_params['importance_type'] = importance_type
        model = xgb.train(tmp_params, trains, num_boost_round=50)
        predicts = model.predict(tests)
        r2 = r2_score(y_test, predicts)
        print(f'#{trial.number}, Result: {r2}, {trial.params}')
        return r2
    
def predict(race_id,p,hr,r,return_tables,lgb_clf,date):
    data =  ShutubaTable.scrape([str(race_id)], date)
    st = ShutubaTable(data)
    st.preprocessing()
    st.merge_horse_results(hr)
    st.merge_peds(p.peds_e)
    st.process_categorical(r.le_horse, r.le_jockey, r.data_pe)
    return_tables.rename(columns={'0':0,'1':1,'2':2,'3':3},inplace=True)
    me_st = ModelEvaluator(lgb_clf, return_tables)

    
    #予測
    scores = me_st.predict_proba(st.data_c.drop(['date'],axis=1),train=False)
    pred = st.data_c[['馬番']].copy()
    pred['scores'] = scores
    print(pred.loc[race_id].sort_values('scores',ascending=False))
    

# race_id 命名規則

race_id 202105040802\
yyyy_pp_xx_xxrr\
y : year\
p : palce\
x : 謎\
r : race番号

# r.data_c['単勝'] == st.data_c[オッズ]

# classes

In [215]:

class HorseResults:
    def __init__(self, horse_results):
        self.horse_results = horse_results[['日付', '着順', '賞金', '着差', '通過',
                                            '開催', '距離']]
        self.preprocessing()
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.concat([pd.read_pickle(path) for path in path_list])
        return cls(df)
    @staticmethod
    def scrape(horse_id_list):
        #horse_idをkeyにしてDataFrame型を格納
        horse_results = {}
        for horse_id in tqdm(horse_id_list):
#         for horse_id in horse_id_list:
            try:
                url = 'https://db.netkeiba.com/horse/' + horse_id
                df = pd.read_html(url)[3]
                #受賞歴がある馬の場合、3番目に受賞歴テーブルが来るため、4番目のデータを取得する
                if df.columns[0]=='受賞歴':
                    df = pd.read_html(url)[4]
                df.index = [horse_id] * len(df)
                horse_results[horse_id] = df
                time.sleep(0.5)
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる        
        horse_results_df = pd.concat([horse_results[key] for key in horse_results])

        return horse_results_df
    
    
    #省略
        
    def preprocessing(self):
        df = self.horse_results.copy()

        # 着順に数字以外の文字列が含まれているものを取り除く
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)

        df["date"] = pd.to_datetime(df["日付"])
        df.drop(['日付'], axis=1, inplace=True)
        
        #賞金のNaNを0で埋める
        df['賞金'].fillna(0, inplace=True)
        
        #1着の着差を0にする
        df['着差'] = df['着差'].map(lambda x: 0 if x<0 else x)
        
        #レース展開データ
        #n=1: 最初のコーナー位置, n=4: 最終コーナー位置
        def corner(x, n):
            if type(x) != str:
                return x
            elif n==4:
                return int(re.findall(r'\d+', x)[-1])
            elif n==1:
                return int(re.findall(r'\d+', x)[0])
        df['first_corner'] = df['通過'].map(lambda x: corner(x, 1))
        df['final_corner'] = df['通過'].map(lambda x: corner(x, 4))
        
        df['final_to_rank'] = df['final_corner'] - df['着順']
        df['first_to_rank'] = df['first_corner'] - df['着順']
        df['first_to_final'] = df['first_corner'] - df['final_corner']
        
        #開催場所
        df['開催'] = df['開催'].str.extract(r'(\D+)')[0].map(place_dict).fillna('11')
        #race_type
        df['race_type'] = df['距離'].str.extract(r'(\D+)')[0].map(race_type_dict)
        #距離
        df['course_len'] = df['距離'].str.extract(r'(\d+)').astype(int) // 100
        df.drop(['距離'], axis=1, inplace=True)
        
        #インデックス名を与える
        df.index.name = 'horse_id'
    
        self.horse_results = df
        self.target_list = ['着順', '賞金', '着差', 'first_corner',
                            'first_to_rank', 'first_to_final','final_to_rank']
        
        
    def average(self, horse_id_list, date, n_samples='all'):
        target_df = self.horse_results.query('index in @horse_id_list')
        
        #過去何走分取り出すか指定
        if n_samples == 'all':
            filtered_df = target_df[target_df['date'] < date]
        elif n_samples > 0:
            filtered_df = target_df[target_df['date'] < date].\
                sort_values('date', ascending=False).groupby(level=0).head(n_samples)
        else:
            raise Exception('n_samples must be >0')
          
        self.average_dict = {}
        self.average_dict['non_category'] = filtered_df.groupby(level=0)[self.target_list]\
            .mean().add_suffix('_{}R'.format(n_samples))
        for column in ['course_len', 'race_type', '開催']:
            self.average_dict[column] = filtered_df.groupby(['horse_id', column])\
                [self.target_list].mean().add_suffix('_{}_{}R'.format(column, n_samples)).fillna(0)

    
    def merge(self, results, date, n_samples='all'):
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        self.average(horse_id_list, date, n_samples)
        merged_df = df.merge(self.average_dict['non_category'], left_on='horse_id',
                             right_index=True, how='left')
        for column in ['course_len','race_type', '開催']:
            merged_df = merged_df.merge(self.average_dict[column], 
                                        left_on=['horse_id', column],
                                        right_index=True, how='left').fillna(0)
        return merged_df
    
    def merge_all(self, results, n_samples='all'):
        date_list = results['date'].unique()
        merged_df = pd.concat(
            [self.merge(results, date, n_samples) for date in tqdm(date_list)]
        )
        return merged_df

class Return:

    def __init__(self, return_tables):
        self.return_tables = return_tables
    
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.concat([pd.read_pickle(path) for path in path_list])
        return cls(df)

    @staticmethod
    def scrape(race_id_list):
        """
        払い戻し表データをスクレイピングする関数

        Parameters:
        ----------
        race_id_list : list
            レースIDのリスト

        Returns:
        ----------
        return_tables_df : pandas.DataFrame
            全払い戻し表データをまとめてDataFrame型にしたもの
        """

        return_tables = {}
        for race_id in tqdm(race_id_list):
            try:
                url = "https://db.netkeiba.com/race/" + race_id

                #普通にスクレイピングすると複勝やワイドなどが区切られないで繋がってしまう。
                #そのため、改行コードを文字列brに変換して後でsplitする
                f = urllib.request.urlopen(url)
                html = f.read()
                html = html.replace(b'<br />', b'br')
                dfs = pd.read_html(html)

                #dfsの1番目に単勝〜馬連、2番目にワイド〜三連単がある
                df = pd.concat([dfs[1], dfs[2]])

                df.index = [race_id] * len(df)
                return_tables[race_id] = df
                time.sleep(0.5)
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる
        return_tables_df = pd.concat([return_tables[key] for key in return_tables])
        return return_tables_df
    
    
    
   
    @property
    def fukusho(self):
        fukusho = self.return_tables[self.return_tables[0]=='複勝'][[1,2]]
        wins = fukusho[1].str.split('br', expand=True)[[0,1,2]]
        
        wins.columns = ['win_0', 'win_1', 'win_2']
        returns = fukusho[2].str.split('br', expand=True)[[0,1,2]]
        returns.columns = ['return_0', 'return_1', 'return_2']
        
        df = pd.concat([wins, returns], axis=1)
        for column in df.columns:
            df[column] = df[column].str.replace(',', '')
        return df.fillna(0).astype(int)
    
    @property
    def tansho(self):
        tansho = self.return_tables[self.return_tables[0]=='単勝'][[1,2]]
        tansho.columns = ['win', 'return']
        
        for column in tansho.columns:
            tansho[column] = pd.to_numeric(tansho[column], errors='coerce')
            
        return tansho
    
    @property
    def umaren(self):
        umaren = self.return_tables[self.return_tables[0]=='馬連'][[1,2]]
        wins = umaren[1].str.split('-', expand=True)[[0,1]].add_prefix('win_')
        return_ = umaren[2].rename('return')  
        df = pd.concat([wins, return_], axis=1)        
        return df.apply(lambda x: pd.to_numeric(x, errors='coerce'))
    
    @property
    def umatan(self):
        umatan = self.return_tables[self.return_tables[0]=='馬単'][[1,2]]
        wins = umatan[1].str.split('→', expand=True)[[0,1]].add_prefix('win_')
        return_ = umatan[2].rename('return')  
        df = pd.concat([wins, return_], axis=1)        
        return df.apply(lambda x: pd.to_numeric(x, errors='coerce'))
    
    @property
    def wide(self):
        wide = self.return_tables[self.return_tables[0]=='ワイド'][[1,2]]
        wins = wide[1].str.split('br', expand=True)[[0,1,2]]
        wins = wins.stack().str.split('-', expand=True).add_prefix('win_')
        return_ = wide[2].str.split('br', expand=True)[[0,1,2]]
        return_ = return_.stack().rename('return')
        df = pd.concat([wins, return_], axis=1)
        return df.apply(lambda x: pd.to_numeric(x.str.replace(',',''), errors='coerce'))
    
    @property
    def sanrentan(self):
        rentan = self.return_tables[self.return_tables[0]=='三連単'][[1,2]]
        wins = rentan[1].str.split('→', expand=True)[[0,1,2]].add_prefix('win_')
        return_ = rentan[2].rename('return')
        df = pd.concat([wins, return_], axis=1) 
        return df.apply(lambda x: pd.to_numeric(x, errors='coerce'))
    
    @property
    def sanrenpuku(self):
        renpuku = self.return_tables[self.return_tables[0]=='三連複'][[1,2]]
        wins = renpuku[1].str.split('-', expand=True)[[0,1,2]].add_prefix('win_')
        return_ = renpuku[2].rename('return')
        df = pd.concat([wins, return_], axis=1) 
        return df.apply(lambda x: pd.to_numeric(x, errors='coerce'))
    
class ModelEvaluator:

    
    def __init__(self, model, return_tables):
        self.model = model
        self.rt = Return(return_tables)
        self.fukusho = self.rt.fukusho
        self.tansho = self.rt.tansho
        self.umaren = self.rt.umaren
        self.umatan = self.rt.umatan
        self.wide = self.rt.wide
        self.sanrenpuku = self.rt.sanrenpuku
        self.sanrentan = self.rt.sanrentan

    
    #3着以内に入る確率を予測
    def predict_proba(self, X, train=True, std=True, minmax=False):
        if train:
            proba = pd.Series(
                self.model.predict_proba(X.drop(['単勝'], axis=1))[:, 1], index=X.index
            )
        else:
            proba = pd.Series(
                self.model.predict_proba(X, axis=1)[:, 1], index=X.index
            )
        if std:
            #レース内で標準化して、相対評価する。「レース内偏差値」みたいなもの。
            standard_scaler = lambda x: (x - x.mean()) / x.std()
            proba = proba.groupby(level=0).transform(standard_scaler)
        if minmax:
            #データ全体を0~1にする
            proba = (proba - proba.min()) / (proba.max() - proba.min())
        return proba
    
    #0か1かを予測
    def predict(self, X, threshold=0.5):
        y_pred = self.predict_proba(X)
        self.proba = y_pred
        return [0 if p<threshold else 1 for p in y_pred]
    
    def score(self, y_true, X):
        return roc_auc_score(y_true, self.predict_proba(X))
    
    def feature_importance(self, X, n_display=20):
        importances = pd.DataFrame({"features": X.columns, 
                                    "importance": self.model.feature_importances_})
        return importances.sort_values("importance", ascending=False)[:n_display]
    
    def pred_table(self, X, threshold=0.5, bet_only=True):
        pred_table = X.copy()[['馬番', '単勝']]
        pred_table['pred'] = self.predict(X, threshold)
        pred_table['score'] = self.proba
        if bet_only:
            return pred_table[pred_table['pred']==1][['馬番', '単勝', 'score','pred']]
        else:
            return pred_table[['馬番', '単勝', 'score', 'pred']]
        
    def bet(self, race_id, kind, umaban, amount):
        if kind == 'fukusho':
            rt_1R = self.fukusho.loc[race_id]
            return_ = (rt_1R[['win_0', 'win_1', 'win_2']]==umaban).values * \
                rt_1R[['return_0', 'return_1', 'return_2']].values * amount/100
            return_ = np.sum(return_)
        if kind == 'tansho':
            rt_1R = self.tansho.loc[race_id]
            return_ = (rt_1R['win']==umaban) * rt_1R['return'] * amount/100
        if kind == 'umaren':
            rt_1R = self.umaren.loc[race_id]
            return_ = (set(rt_1R[['win_0', 'win_1']]) == set(umaban)) \
                * rt_1R['return']/100 * amount
        if kind == 'umatan':
            rt_1R = self.umatan.loc[race_id]
            return_ = (list(rt_1R[['win_0', 'win_1']]) == list(umaban))\
                * rt_1R['return']/100 * amount
        if kind == 'wide':
            rt_1R = self.wide.loc[race_id]
            return_ = (rt_1R[['win_0', 'win_1']].\
                           apply(lambda x: set(x)==set(umaban), axis=1)) \
                * rt_1R['return']/100 * amount
            return_ = return_.sum()
        if kind == 'sanrentan':
            rt_1R = self.sanrentan.loc[race_id]
            return_ = (list(rt_1R[['win_0', 'win_1', 'win_2']]) == list(umaban)) * \
                rt_1R['return']/100 * amount
        if kind == 'sanrenpuku':
            rt_1R = self.sanrenpuku.loc[race_id]
            return_ = (set(rt_1R[['win_0', 'win_1', 'win_2']]) == set(umaban)) \
                * rt_1R['return']/100 * amount
        if not (return_ >= 0):
                return_ = amount
        return return_
        
    def fukusho_return(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        n_bets = len(pred_table)
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_list.append(np.sum([
                self.bet(race_id, 'fukusho', umaban, 1) for umaban in preds['馬番']
            ]))
        return_rate = np.sum(return_list) / n_bets
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        n_hits = np.sum([x>0 for x in return_list])
        return n_bets, return_rate, n_hits, std
    
    def tansho_return(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        self.sample = pred_table
        n_bets = len(pred_table)
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_list.append(
                np.sum([self.bet(race_id, 'tansho', umaban, 1) for umaban in preds['馬番']])
            )
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def tansho_return_proper(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        n_bets = len(pred_table)
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_list.append(
                np.sum(preds.apply(lambda x: self.bet(
                    race_id, 'tansho', x['馬番'], 1/x['単勝']), axis=1)))
        
        bet_money = (1 / pred_table['単勝']).sum()
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / bet_money
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / bet_money
        return n_bets, return_rate, n_hits, std
    
    def umaren_box(self, X, threshold=0.5, n_aite=5):
        pred_table = self.pred_table(X, threshold)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                continue
            elif len(preds_jiku) >= 2:
                for umaban in combinations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'umaren', umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def umatan_box(self, X, threshold=0.5, n_aite=5):
        pred_table = self.pred_table(X, threshold, bet_only = False)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                continue   
            elif len(preds_jiku) >= 2:
                for umaban in permutations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'umatan', umaban, 1)
                    n_bets += 1
            return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def wide_box(self, X, threshold=0.5, n_aite=5):
        pred_table = self.pred_table(X, threshold, bet_only = False)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                continue
            elif len(preds_jiku) >= 2:
                for umaban in combinations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'wide', umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std  
        
    def sanrentan_box(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            if len(preds)<3:
                continue
            else:
                for umaban in permutations(preds['馬番'], 3):
                    return_ += self.bet(race_id, 'sanrentan', umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def sanrenpuku_box(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            if len(preds)<3:
                continue
            else:
                for umaban in combinations(preds['馬番'], 3):
                    return_ += self.bet(race_id, 'sanrenpuku', umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def umaren_nagashi(self, X, threshold=0.5, n_aite=5):
        pred_table = self.pred_table(X, threshold, bet_only = False)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                preds_aite = preds.sort_values('score', ascending = False)\
                    .iloc[1:(n_aite+1)]['馬番']
                return_ = preds_aite.map(
                    lambda x: self.bet(
                        race_id, 'umaren', [preds_jiku['馬番'].values[0], x], 1
                    )
                ).sum()
                n_bets += n_aite
                return_list.append(return_)
            elif len(preds_jiku) >= 2:
                for umaban in combinations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'umaren', umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def umatan_nagashi(self, X, threshold=0.5, n_aite=5):
        pred_table = self.pred_table(X, threshold, bet_only = False)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                preds_aite = preds.sort_values('score', ascending = False)\
                    .iloc[1:(n_aite+1)]['馬番']
                return_ = preds_aite.map(
                    lambda x: self.bet(
                        race_id, 'umatan', [preds_jiku['馬番'].values[0], x], 1
                    )
                ).sum()
                n_bets += n_aite
                
            elif len(preds_jiku) >= 2:
                for umaban in permutations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'umatan', umaban, 1)
                    n_bets += 1
            return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def wide_nagashi(self, X, threshold=0.5, n_aite=5):
        pred_table = self.pred_table(X, threshold, bet_only = False)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                preds_aite = preds.sort_values('score', ascending = False)\
                    .iloc[1:(n_aite+1)]['馬番']
                return_ = preds_aite.map(
                    lambda x: self.bet(
                        race_id, 'wide', [preds_jiku['馬番'].values[0], x], 1
                    )
                ).sum()
                n_bets += len(preds_aite)
                return_list.append(return_)
            elif len(preds_jiku) >= 2:
                for umaban in combinations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'wide', umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def sanrentan_nagashi(self, X, threshold = 1.5, n_aite=7):
        pred_table = self.pred_table(X, threshold, bet_only = False)
        n_bets = 0
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                continue
            elif len(preds_jiku) == 2:
                preds_aite = preds.sort_values('score', ascending = False)\
                    .iloc[2:(n_aite+2)]['馬番']
                return_ = preds_aite.map(
                    lambda x: self.bet(
                        race_id, 'sanrentan',
                        np.append(preds_jiku['馬番'].values, x),
                        1
                    )
                ).sum()
                n_bets += len(preds_aite)
                return_list.append(return_)
            elif len(preds_jiku) >= 3:
                return_ = 0
                for umaban in permutations(preds_jiku['馬番'], 3):
                    return_ += self.bet(race_id, 'sanrentan', umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
class DataProcessor:
    
    def __init__(self):
        self.data = pd.DataFrame() #raw data
        self.data_p = pd.DataFrame() #after preprocessing
        self.data_h = pd.DataFrame() #after merging horse_results
        self.data_pe = pd.DataFrame() #after merging peds
        self.data_c = pd.DataFrame() #after processing categorical features
        
    #馬の過去成績データの追加
    def merge_horse_results(self, hr, n_samples_list=[5, 9, 'all']):
        self.data_h = self.data_p.copy()
        for n_samples in n_samples_list:
            self.data_h = hr.merge_all(self.data_h, n_samples=n_samples)
        self.data_h.drop(['開催'], axis=1, inplace=True)
            
    #血統データ追加
    def merge_peds(self, peds):
        self.data_pe = self.data_h.merge(peds, left_on='horse_id', right_index=True,how='left')
#         重複データを削除
        self.data_pe = self.data_pe[~self.data_pe.duplicated()]
        self.no_peds = self.data_pe[self.data_pe['peds_0'].isnull()]['horse_id'].unique()
#         print("type :",type(self.no_peds)) ndarray
#         Peds.scrape()
        if len(self.no_peds) > 0:
            print('scrape peds at horse_id_list "no_peds"')
            
        #カテゴリ変数の処理
    def process_categorical(self, le_horse, le_jockey,results_m):
        df = self.data_pe.copy()
        
        #ラベルエンコーディング。horse_id, jockey_idを0始まりの整数に変換
        mask_horse = df['horse_id'].isin(le_horse.classes_)
        new_horse_id = df['horse_id'].mask(mask_horse).dropna().unique()
        le_horse.classes_ = np.concatenate([le_horse.classes_, new_horse_id])
        df['horse_id'] = le_horse.transform(df['horse_id'])
        
        mask_jockey = df['jockey_id'].isin(le_jockey.classes_)
        new_jockey_id = df['jockey_id'].mask(mask_jockey).dropna().unique()
        le_jockey.classes_ = np.concatenate([le_jockey.classes_, new_jockey_id])
        df['jockey_id'] = le_jockey.transform(df['jockey_id'])
#         pedsデータのラベルエンコーディング

#         for column in p.peds_e.columns:
# #             self.le_peds_dict[column] = LabelEncoder().fit_transform(df[column].fillna('Na'))
# #             mask_peds = df[column].isin(p.le_peds[column].classes_)
#             new_peds_id = df[column].dropna().unique()
# #             p.le_peds[column].classes_ = np.concatenate([p.le_peds[column].classes_, new_peds_id])
#             df[column] = p.le_peds[column].transform(df[column])
        
        
        #horse_id, jockey_idをpandasのcategory型に変換
        df['horse_id'] = df['horse_id'].astype('category')
        df['jockey_id'] = df['jockey_id'].astype('category')
        
        #そのほかのカテゴリ変数をpandasのcategory型に変換してからダミー変数化
        #列を一定にするため
        weathers = results_m['weather'].unique()
        race_types = results_m['race_type'].unique()
        ground_states = results_m['ground_state'].unique()
        sexes = results_m['性'].unique()
        df['weather'] = pd.Categorical(df['weather'], weathers)
        df['race_type'] = pd.Categorical(df['race_type'], race_types)
        df['ground_state'] = pd.Categorical(df['ground_state'], ground_states)
        df['性'] = pd.Categorical(df['性'], sexes)
        df = pd.get_dummies(df, columns=['weather', 'race_type', 'ground_state', '性'])
        
        self.data_c = df    
    
class ShutubaTable(DataProcessor):
    
    
    def __init__(self, shutuba_tables):
        super(ShutubaTable, self).__init__()
        self.data = shutuba_tables
    
    @classmethod
    def scrape(cls, race_id_list, date):
        data = pd.DataFrame()
        for race_id in tqdm(race_id_list):
            url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + race_id
            df = pd.read_html(url)[0]
            df = df.T.reset_index(level=0, drop=True).T

            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")

            texts = soup.find('div', attrs={'class': 'RaceData01'}).text
            texts = re.findall(r'\w+', texts)
            for text in texts:
                if 'm' in text:
                    df['course_len'] = [int(re.findall(r'\d+', text)[0])] * len(df)
                if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                    df["weather"] = [text] * len(df)
                if text in ["良", "稍重", "重","稍"]:
                    df["ground_state"] = [text] * len(df)
                if '不' in text:
                    df["ground_state"] = ['不良'] * len(df)
                if '芝' in text:
                    df['race_type'] = ['芝'] * len(df)
                if '障' in text:
                    df['race_type'] = ['障害'] * len(df)
                if 'ダ' in text:
                    df['race_type'] = ['ダート'] * len(df)
            df['date'] = [date] * len(df)

            # horse_id
            horse_id_list = []
            horse_td_list = soup.find_all("td", attrs={'class': 'HorseInfo'})
            for td in horse_td_list:
                horse_id = re.findall(r'\d+', td.find('a')['href'])[0]
                horse_id_list.append(horse_id)
            # jockey_id
            jockey_id_list = []
            jockey_td_list = soup.find_all("td", attrs={'class': 'Jockey'})
            for td in jockey_td_list:
                jockey_id = re.findall(r'\d+', td.find('a')['href'])[0]
                jockey_id_list.append(jockey_id)
            df['horse_id'] = list(map(lambda x: int(x),horse_id_list)) 
            df['jockey_id'] = jockey_id_list

            df.index = [race_id] * len(df)
#             win 環境だとなぜかintに直せない.floatならつかえる
            df.index = df.index.astype(int)
            data = data.append(df)

            
        return data
                
    def preprocessing(self):
        df = self.data.copy()
        
        df["性"] = df["性齢"].map(lambda x: str(x)[0])
        df["年齢"] = df["性齢"].map(lambda x: str(x)[1:]).astype(int)

#         体重変化をデータから消した
        # 馬体重を体重と体重変化に分ける
        df = df[df["馬体重(増減)"] != '--']
        df["体重"] = df["馬体重(増減)"].str.split("(", expand=True)[0].astype(int)
        df["体重変化"] = df["馬体重(増減)"].str.split("(", expand=True)[1].str[:-1].replace('前計不',0).astype(int)


        
        df["date"] = pd.to_datetime(df["date"])
        
        df['枠'] = df['枠'].astype(int)
        df['馬番'] = df['馬番'].astype(int)
        df['斤量'] = df['斤量'].astype(int)
        df['開催'] = df.index.map(lambda x:str(x)[4:6])
        df['n_horse'] = df.index.map(lambda x: len(df.loc[x]))

        # 不要な列を削除
        df = df[['枠', '馬番', '斤量', 'course_len', 'weather','race_type',
        'ground_state', 'date', 'horse_id', 'jockey_id', '性', '年齢','開催','n_horse','体重','体重変化']]
        
        self.data_p = df.rename(columns={'枠': '枠番'})
        
class Results(DataProcessor):
    def __init__(self, results):
        super(Results, self).__init__()
        self.data = results
        self.le_peds = None
        
        
    @staticmethod
    def scrape(race_id_list):
        #race_idをkeyにしてDataFrame型を格納
        race_results = {}
        for race_id in tqdm(race_id_list):
            time.sleep(0.5)
            try:
                url = "https://db.netkeiba.com/race/" + race_id
                #メインとなるテーブルデータを取得
                df = pd.read_html(url)[0]
                html = requests.get(url)
                html.encoding = "EUC-JP"
                soup = BeautifulSoup(html.text, "html.parser")

                #天候、レースの種類、コースの長さ、馬場の状態、日付をスクレイピング
                texts = (
                    soup.find("div", attrs={"class": "data_intro"}).find_all("p")[0].text
                    + soup.find("div", attrs={"class": "data_intro"}).find_all("p")[1].text
                )
                info = re.findall(r'\w+', texts)
                for text in info:
                    if text in ["芝", "ダート"]:
                        df["race_type"] = [text] * len(df)
                    if "障" in text:
                        df["race_type"] = ["障害"] * len(df)
                    if "m" in text:
                        df["course_len"] = [int(re.findall(r"\d+", text)[0])] * len(df)
                    if text in ["良", "稍重", "重", "不良"]:
                        df["ground_state"] = [text] * len(df)
                    if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                        df["weather"] = [text] * len(df)
                    if "年" in text:
                        df["date"] = [text] * len(df)

                #馬ID、騎手IDをスクレイピング
                horse_id_list = []
                horse_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/horse")}
                )
                for a in horse_a_list:
                    horse_id = re.findall(r"\d+", a["href"])
                    horse_id_list.append(horse_id[0])
                jockey_id_list = []
                jockey_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/jockey")}
                )
                for a in jockey_a_list:
                    jockey_id = re.findall(r"\d+", a["href"])
                    jockey_id_list.append(jockey_id[0])
                df["horse_id"] = horse_id_list
                df["jockey_id"] = jockey_id_list

                #インデックスをrace_idにする
                df.index = [race_id] * len(df)

                race_results[race_id] = df
            #存在しないrace_idを飛ばす
            except IndexError:
                continue
            #wifiの接続が切れた時などでも途中までのデータを返せるようにする
            except Exception as e:
                print(e)
                break
            #Jupyterで停止ボタンを押した時の対処
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる
        race_results_df = pd.concat([race_results[key] for key in race_results])

        return race_results_df
        
    #前処理    
    def preprocessing(self):
        df = self.data.copy()

        # 着順に数字以外の文字列が含まれているものを取り除く
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)
#         rank学習の場合はそのまま
#         df['rank'] = df['着順'].map(lambda x:1 if x<4 else 0)
        df['rank'] = df['着順']

        # 性齢を性と年齢に分ける
        df["性"] = df["性齢"].map(lambda x: str(x)[0])
        df["年齢"] = df["性齢"].map(lambda x: str(x)[1:]).astype(int)

        # 馬体重を体重と体重変化に分ける
        df["体重"] = df["馬体重"].str.split("(", expand=True)[0].astype(int)
        df["体重変化"] = df["馬体重"].str.split("(", expand=True)[1].str[:-1].astype(int)

        # データをint, floatに変換
        df["単勝"] = df["単勝"].astype(float)
        df["course_len"] = df["course_len"].astype(float) // 100

        # 不要な列を削除
        df.drop(["タイム", "着差", "調教師", "性齢", "馬体重", '馬名', '騎手', '人気', '着順'],
                axis=1, inplace=True)

        df["date"] = pd.to_datetime(df["date"], format="%Y年%m月%d日")
        
        #開催場所
        df['開催'] = df.index.map(lambda x:str(x)[4:6])
        df['n_horse'] = df.index.map(lambda x: len(df.loc[x]))
        
        self.data_p = df
    
    #カテゴリ変数の処理
    def process_categorical(self):
        self.le_horse = LabelEncoder().fit(self.data_pe['horse_id'])
        self.le_jockey = LabelEncoder().fit(self.data_pe['jockey_id'])
#         self.le_peds = p.le_peds_dict
        super().process_categorical(self.le_horse, self.le_jockey,self.data_pe)
        
class Peds:

    def __init__(self, peds):
        self.peds = peds
        self.peds_cat = pd.DataFrame() #after label encoding and transforming into category
        self.peds_re = pd.DataFrame()
        self.peds_vec = pd.DataFrame()
    
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.concat([pd.read_pickle(path) for path in path_list])
        return cls(df)
    
    @staticmethod
    def scrape(horse_id_list):
        peds_dict = {}
        for horse_id in tqdm(horse_id_list):
#         for horse_id in horse_id_list:
            try:
                url = "https://db.netkeiba.com/horse/ped/" + horse_id
            
                df = pd.read_html(url)[0]

                #重複を削除して1列のSeries型データに直す
                generations = {}
                for i in reversed(range(5)):
                    generations[i] = df[i]
                    df.drop([i], axis=1, inplace=True)
                    df = df.drop_duplicates()
                ped = pd.concat([generations[i] for i in range(5)]).rename(horse_id)

                peds_dict[horse_id] = ped.reset_index(drop=True)
                time.sleep(0.5)
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #列名をpeds_0, ..., peds_61にする
        peds_df = pd.concat([peds_dict[key] for key in peds_dict],
                            axis=1).T.add_prefix('peds_')
        peds_df.index =peds_df.index.astype(int)

        return peds_df
    
    
#     血統データが正規化されたいないデータに対して, 正規化する関数
    def regularize_peds(self):
        peds = self.peds.copy()
        error_idx_list = []
        for idx in tqdm(peds.index):
            for col in peds.columns:
            #     漢字 : 一-龥
                code_regex = re.compile('[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％一-龥\d]')
                try:
                    cleaned_text = code_regex.sub('', peds[col].loc[idx])
                    one_word = "".join(cleaned_text.split())
                    p_alphabet = re.compile('[a-zA-Z]+')
                    p_katakana = re.compile(r'[ァ-ヶー]+')

                    peds[col].loc[idx] = one_word
                    if (not p_alphabet.fullmatch(one_word)) and not (p_katakana.fullmatch(one_word)):
                        peds[col].loc[idx] = re.sub('[a-zA-Z]+', '', one_word)
                except:
                    error_idx_list.append(idx)
        self.error_idx_list_r = error_idx_list
        self.peds_re = peds

    
    def categorize(self):
        df = self.peds.copy()
        self.le_peds_dict = {}
        
        
        for column in df.columns:
            
            self.le_peds_dict[column] = LabelEncoder()
            df[column] = self.le_peds_dict[column].fit_transform(df[column].fillna('Na'))
#             df[column] = self.le_peds_dict[column]
        self.peds_cat = df.astype('category')
        self.le_peds = self.le_peds_dict
        
        
#         血統データをベクトル化する関数
# peds_re は 正規化済み血統データを仮定
# model_ft : fasttextモデル
    def vectorize(self,peds_re,model_ft):
        df = peds_re.copy()
        error_idx_list = []
        for idx in tqdm(df.index):
            text = ','.join(df.loc[idx].tolist())
            df.loc[idx] = model_ft[text]
#             except:
#                 error_idx_list.append(idx)
        self.error_idx_list_v = error_idx_list
        self.peds_vec = df.astype('float')
#     def vectorize(self,peds_re,model_ft):
#         df = peds_re.copy()
        
#         for idx in tqdm(df.index):
#             for column in df.columns:
#                 horse_name = df[column].loc[idx]
#                 df[column].loc[idx] = model_ft[horse_name][0]

#         self.peds_vec = df.astype('float')

class Simulater():
    
    
    def __init__(self, model):
        self.model = model
        self.return_tables = None
        self.pred_df = None
    

    #     当日のデータでシミュレートするとあかん
    def return_table(self, race_id_list):
        return_tables = Return.scrape(race_id_list)
        return_tables.rename(columns={'0':0,'1':1,'2':2,'3':3},inplace=True)
        self.return_tables = return_tables
    
    
    def return_table_today(self,race_id_list):
        return_tables = {}
        for race_id in tqdm(race_id_list):
            try:
                url = 'https://race.netkeiba.com/race/result.html?race_id='+race_id+'&amp;rf=race_submenu'
                dfs = pd.read_html(url)
                df = pd.concat([dfs[1], dfs[2]])
                df.index = [race_id] * len(df)
                return_tables[race_id] = df
                time.sleep(0.5)
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break
            #pd.DataFrame型にして一つのデータにまとめる
        return_tables_df = pd.concat([return_tables[key] for key in return_tables])
        self.return_tables = return_tables_df
    
   
    def return_pred_table(self,st,return_tables):
        me_st = ModelEvaluator(self.model, return_tables)
        #予測
        scores = me_st.predict_proba(st.data_c.drop(['date'],axis=1),train=False)
        pred = st.data_c[['馬番']].copy()
        pred['scores'] = scores
        pred.index = pred.index.astype(int)
        return pred
        
        
    def show_results(self , st ,race_id_list , return_tables,bet = 100):
        acc_dict = {'単勝':0,'複勝':0,'ワイド':0}
        return_dict = {'単勝':0,'複勝':0,'ワイド':0}
        target_race_dict = {}
        self.pred_df = self.return_pred_table(st,return_tables)
        tansho_list = []
        fukusho_list = []
        wide_list =[]

        for race_id in race_id_list:
            df_  = self.return_tables.loc[race_id]
            print("-------------------")
            print("predict")
            pred_df = self.pred_df.loc[int(race_id)]
            pred_df = pred_df.sort_values('scores',ascending=False)
            print(pred_df.iloc[:3])
            print("actual")
            print(self.return_tables.loc[race_id])
            pred_1 = str(pred_df['馬番'].iloc[0])
            pred_2 = str(pred_df['馬番'].iloc[1])


            if  pred_1 == df_[df_[0]=='単勝'][1].values[0]:
                acc_dict['単勝'] += 1
                profit = int(df_[df_[0]=='単勝'][2].values[0].replace('円','').replace(',',''))
                return_dict['単勝'] += profit
                acc_dict['複勝'] += 1
                return_index = df_[df_[0]=='複勝'][1].str.split(' ')[0].index(str(pred_1))
                profit = int(df_[df_[0]=='複勝'][2].str.split('円')[0][return_index].replace(',',''))
                return_dict['複勝'] += profit 
                tansho_list.append(race_id[-2:])
                fukusho_list.append(race_id[-2:])

            elif pred_1 in df_[df_[0]=='複勝'][1].str.split(' ')[0]:
                acc_dict['複勝'] += 1
                return_index = df_[df_[0]=='複勝'][1].str.split(' ')[0].index(str(pred_1))
                profit = int(df_[df_[0]=='複勝'][2].str.split('円')[0][return_index].replace(',',''))
                return_dict['複勝'] += profit 
                fukusho_list.append(race_id[-2:])
                

            for i in range(len(df_[df_[0]=='ワイド'][1].str.split(' ')[0])//2):
                if set([pred_1,pred_2])==set(df_[df_[0]=='ワイド'][1].str.split(' ')[0][i:i+2]):
                    if i!=0:
                        return_index = i-1
                    else:
                        return_index = i
                    profit = int(df_[df_[0]=='ワイド'][2].str.split('円')[0][return_index].replace(',',''))
                    return_dict['ワイド'] += profit
                    print("profit",profit)
                    acc_dict['ワイド'] += 1
                    wide_list.append(race_id[-2:])
                    break
        
        
        for i, key in enumerate(acc_dict):
            return_dict[key] -= bet * len(race_id_list)
        
        print("---------------------")
        print("単勝")
        print("的中率 :",acc_dict['単勝'],'/',len(race_id_list))
        print("収支   :",return_dict['単勝'],'円')
        print("的中レース",tansho_list)
        print("---------------------")
        print("複勝")
        print("的中率 :",acc_dict['複勝'],'/',len(race_id_list))
        print("収支   :",return_dict['複勝'],'円')
        print("的中レース",fukusho_list)
        print("---------------------")
        print("ワイド")
        print("的中率 :",acc_dict['ワイド'],'/',len(race_id_list))
        print("収支   :",return_dict['ワイド'],'円')
        print("的中レース",wide_list)
             
class RankSimulater(Simulater):
    
    
    def return_pred_table(self,data_c,is_long=False):
        
        #予測
        if not is_long:
            scores = pd.Series(self.model.predict(data_c.drop(['date'],axis=1)),index=data_c.index)
        else:
            scores = pd.Series(self.model.predict(data_c.drop(['date','rank','単勝'],axis=1)),index=data_c.index)
        pred = data_c[['馬番']].copy()
        pred['scores'] = scores
        pred = pred.sort_values('scores',ascending=False)
        return pred
    
    
#     odds以上の馬券しか買わない
    def show_long_results(self, data_c, return_tables, kaime='tansho', odds=2.0, bet = 100):
        if kaime=='tansho':
            pass
        elif kaime=='fukusho':
            pass
        elif kaime=='wide':
            pass
        elif kaime=='wide_3_box':
            pass
        elif kaime=='umaren':
            pass
        elif kaime=='umatan':
            pass
        elif kaime=='sanrentan':
            pass
        elif kaime=='sanrenpuku':
            pass
        else:
            print("No such kaime.")

            
    def calc_tansho(self,data_c,return_tables,odds=2.0,bet=100):
#         data_c = r.data_cを仮定
        acc_dict = {'単勝':0,'複勝':0,'ワイド':0}
        return_dict = {'単勝':0,'複勝':0,'ワイド':0}
        tansho_list = []
        race_id_list = list(set(data_c.index))
        not_bet_count = 0
        for race_id in race_id_list: # race_id : int
            pred_df = self.return_pred_table(data_c.loc[race_id],is_long=True)
            df_  = return_tables.loc[race_id]
            pred_df = pred_df.loc[race_id]
            pred_df = pred_df.sort_values('scores',ascending=False)
            pred_1 = pred_df['馬番'].iloc[0]
            pred_2 = pred_df['馬番'].iloc[1]
#             上位２着の予測スコアが同じなら賭けない
            score_1 = pred_df['scores'].iloc[0]
            score_2 = pred_df['scores'].iloc[1]

            
            pred_odds = data_c[data_c['馬番']==pred_1].loc[race_id]['単勝']
            try:
                rank = data_c[data_c['rank']==1].loc[race_id]['馬番']
            except:
                continue


            if type(rank)!=pd.core.series.Series:
                if  pred_1 == rank:
                    if pred_odds>=odds and score_1!= score_2:
                        acc_dict['単勝'] += 1
                        profit = pred_odds*bet
                        return_dict['単勝'] += profit
                        tansho_list.append(race_id)
                    else: #odds　低い or 出力の信頼性がないときは買わない
                        not_bet_count += 1
#                     odds低かったら買わない
                elif data_c[data_c['馬番']==int(pred_1)].loc[race_id]['単勝']<odds:
                    not_bet_count+=1
            else:
                if  pred_1 == rank.values[0] or pred_1 == rank.values[1]:
                    if pred_odds>=odds and score_1!= score_2:
                        acc_dict['単勝'] += 1
                        profit = pred_odds*bet
                        return_dict['単勝'] += profit
                        tansho_list.append(race_id)
                    else: #odds　低い or 出力の信頼性がないときは買わない
                        not_bet_count += 1
                elif data_c[data_c['馬番']==int(pred_1)].loc[race_id]['単勝']<odds:
                    not_bet_count+=1

        real_race_len = len(race_id_list) - not_bet_count
        return_dict['単勝'] -= bet * real_race_len
        print("not_bet_count",not_bet_count)
        print("---------------------")
        print("単勝")
        print("的中率 :",acc_dict['単勝'],'/',real_race_len)
        print("的中% :",'{:.2f}'.format(acc_dict['単勝']/real_race_len*100),'%')
        print("収支   :",return_dict['単勝'],'円')
#         print("的中レース",tansho_list)

    
    def calc_tansho_top3(self,data_c,return_tables,odds=2.0,bet=100):
        acc_dict = {'単勝':0,'複勝':0,'ワイド':0}
        return_dict = {'単勝':0,'複勝':0,'ワイド':0}
        tansho_list = []
        race_id_list = list(set(data_c.index))
        not_bet_count = 0
        for race_id in race_id_list: # race_id : int
            pred_df = self.return_pred_table(data_c.loc[race_id],is_long=True)
            df_  = return_tables.loc[race_id]
            pred_df = pred_df.loc[race_id]
            pred_df = pred_df.sort_values('scores',ascending=False)
            pred_1 = pred_df['馬番'].iloc[0]
            pred_2 = pred_df['馬番'].iloc[1]
            pred_3 = pred_df['馬番'].iloc[2]
#             上位２着の予測スコアが同じなら賭けない
            score_1 = pred_df['scores'].iloc[0]
            score_2 = pred_df['scores'].iloc[1]
        
        
            odds_tmp = return_tables.loc[race_id].iloc[0][2].split('br')
            real_odds = int(odds_tmp[0])/100
            
            
            
            rank_tmp = df_.iloc[0][1].split('br')
            rank = int(rank_tmp[0])
            # df_.iloc[0]が単勝
            # df_.iloc[1]が複勝, etc..
            # df_.iloc[x][1] が１着の馬番
            # df_.iloc[x][2] がodds
            # df_.iloc[x][3] が人気

            if  pred_1 == rank or pred_2 == rank or pred_3==rank:
                if real_odds>=odds and score_1!= score_2:
                    acc_dict['単勝'] += 1
                    profit = real_odds*bet
                    return_dict['単勝'] += profit
                    tansho_list.append(race_id)
                else: #odds　低い or 出力の信頼性がないときは買わない
                    not_bet_count += 1
        
#         top3 全てに賭けるから賭け金の3倍
        real_race_len = len(race_id_list) - not_bet_count
        return_dict['単勝'] -= 3*bet * real_race_len
        print("not_bet_count",not_bet_count)
        print("---------------------")
        print("単勝")
        print("的中率 :",acc_dict['単勝'],'/',len(race_id_list)-not_bet_count)
        print("的中% :",'{:.2f}'.format(acc_dict['単勝']/len(race_id_list)*100),'%')
        print("収支   :",return_dict['単勝'],'円')
        
    
    def calc_fukusho(self,data_c,return_tables,odds=2.0,bet=100):
#         data_c = r.data_cを仮定
        acc_dict = {'単勝':0,'複勝':0,'ワイド':0}
        return_dict = {'単勝':0,'複勝':0,'ワイド':0}
        race_id_list = list(set(data_c.index))
        not_bet_count = 0
        
        
        for race_id in race_id_list: # race_id : int
            pred_df = self.return_pred_table(data_c.loc[race_id],is_long=True)
            df_  = return_tables.loc[race_id]
            pred_df = pred_df.loc[race_id]
            pred_df = pred_df.sort_values('scores',ascending=False)
            pred_1 = str(pred_df['馬番'].iloc[0])
            pred_2 = str(pred_df['馬番'].iloc[1])
#             上位２着の予測スコアが同じなら賭けない
            score_1 = pred_df['scores'].iloc[0]
            score_2 = pred_df['scores'].iloc[1]
            
            
            
            # df_.iloc[0]が単勝
            # df_.iloc[1]が複勝, etc..
            # df_.iloc[x][1] が１着の馬番
            # df_.iloc[x][2] がodds
            # df_.iloc[x][3] が人気
#             # 一着にのみかける
# ############### 確定した odds と 単勝 odds が混在している, よくない
            if pred_1 in df_[df_[0]=='複勝'][1].str.split('br').tolist()[0] and score_1!= score_2:
                return_index = df_[df_[0]=='複勝'][1].str.split('br').tolist()[0].index(pred_1)
                real_odds = int(df_[df_[0]=='複勝'][2].str.split('br').tolist()[0][return_index].replace(',',''))/100
                
                
                if real_odds>=odds:    
                    acc_dict['複勝'] += 1
                    profit = real_odds*bet
                    return_dict['複勝'] += profit 
                else:
                    not_bet_count+=1
#             odds が低かったら賭けない
            elif data_c[data_c['馬番']==int(pred_1)].loc[race_id]['単勝']<odds:
                not_bet_count+=1
            
            
            

        real_race_len = len(race_id_list) - not_bet_count
        return_dict['複勝'] -= bet * real_race_len

        print("---------------------")
        print("not_bet_count",not_bet_count)
        print("複勝")
        print("的中率 :",acc_dict['複勝'],'/',real_race_len)
        print("的中% :",'{:.2f}'.format((acc_dict['複勝']/real_race_len)*100),'%')
        print("収支   :",return_dict['複勝'],'円')
        
        
    def calc_wide(self,data_c,return_tables,odds=2.0,bet=100):
        acc_dict = {'単勝':0,'複勝':0,'ワイド':0}
        return_dict = {'単勝':0,'複勝':0,'ワイド':0}
        wide_list = []
        race_id_list = data_c.index.tolist()
        
        for i in range(len(df_[df_[0]=='ワイド'][1].str.split(' ')[0])//2):
            if set([pred_1,pred_2])==set(df_[df_[0]=='ワイド'][1].str.split(' ')[0][i:i+2]):
                if i!=0:
                    return_index = i-1
                else:
                    return_index = i

            profit = int(df_[df_[0]=='ワイド'][2].str.split('円')[0][return_index].replace(',',''))
            return_dict['ワイド'] += profit
            print("profit",profit)
            acc_dict['ワイド'] += 1
            wide_list.append(race_id[-2:])
            break
            
            
    def calc_wide_3box(self,data_c,return_tables,odds=2.0,bet=100):
        pass
            
    
    def calc_sanrenpuku(self,data_c,return_tables,bet=100):
        acc_dict = {'三連複':0}
        return_dict = {'三連複':0}
        sanrenpuku_list = []
        race_id_list = list(set(data_c.index))
        not_bet_count = 0
        
        
        for race_id in race_id_list: # race_id : int
            pred_df = self.return_pred_table(data_c.loc[race_id],is_long=True)
            df_  = return_tables.loc[race_id]
            pred_df = pred_df.loc[race_id]
            pred_df = pred_df.sort_values('scores',ascending=False)
            pred_1 = pred_df['馬番'].iloc[0]
            pred_2 = pred_df['馬番'].iloc[1]
            try:
                pred_3 = pred_df['馬番'].iloc[2]
            except:
                print("race_id",race_id)
                print("pred_df",pred_df)
#             上位２着の予測スコアが同じなら賭けない
            score_1 = pred_df['scores'].iloc[0]
            score_2 = pred_df['scores'].iloc[1] 
            
#             data_cから観測できる odds は100をかけた時の ×odds だが, return_tables の オッズは, 100円をかけた時の払い戻し金額
            odds_tmp = df_[df_[0]=='三連複'][2].values[0].replace(',','').split('br')
            if len(odds_tmp)==1:
                odds = int(odds_tmp[0])
            else:
                odds = int(odds_tmp[0])
                odds2 = int(odds_tmp[1])

            if score_1 != score_2:
#                 当たってた時
                try:
                    if [int(i) for i in df_[df_[0]=='三連複'][1].values[0].replace(' ','').split('-')] == sorted([pred_1,pred_2,pred_3]):
                        acc_dict['三連複'] += 1
                        profit = (bet/100)*odds
                        return_dict['三連複'] += profit
                except:
                    print()
                    print('race_id',race_id)
            else:
                not_bet_count += 1
            
        real_race_len = len(race_id_list) - not_bet_count
        return_dict['三連複'] -= bet * real_race_len
#         この辺のロジック同じだから, 関数でまとめたい
        print("---------------------")
        print("not_bet_count",not_bet_count)
        print("三連複")
        print("的中率 :",acc_dict['三連複'],'/',real_race_len)
        print("的中% :",'{:.2f}'.format((acc_dict['三連複']/real_race_len)*100),'%')
        print("収支   :",return_dict['三連複'],'円')
    
    def calc_sanrenpuku_box(self,data_c,return_tables,odds=2.0,bet=100):
        pass
    
    
    def calc_sanrentan(self,data_c,return_tables,bet=100):
        acc_dict = {'三連単':0}
        return_dict = {'三連単':0}
        sanrenpuku_list = []
        race_id_list = list(set(data_c.index))
        not_bet_count = 0
        
        
        for race_id in race_id_list: # race_id : int
            pred_df = self.return_pred_table(data_c.loc[race_id],is_long=True)
            df_  = return_tables.loc[race_id]
            pred_df = pred_df.loc[race_id]
            pred_df = pred_df.sort_values('scores',ascending=False)
            pred_1 = pred_df['馬番'].iloc[0]
            pred_2 = pred_df['馬番'].iloc[1]
            try:
                pred_3 = pred_df['馬番'].iloc[2]
            except:
                print("race_id",race_id)
                print("pred_df",pred_df)
#             上位２着の予測スコアが同じなら賭けない
            score_1 = pred_df['scores'].iloc[0]
            score_2 = pred_df['scores'].iloc[1] 
            
#             data_cから観測できる odds は100をかけた時の ×odds だが, return_tables の オッズは, 100円をかけた時の払い戻し金額
            odds_tmp = df_[df_[0]=='三連単'][2].values[0].replace(',','').split('br')
            if len(odds_tmp)==1:
                odds = int(odds_tmp[0])
            else:
                odds = int(odds_tmp[0])
                odds2 = int(odds_tmp[1])

            if score_1 != score_2:
#                 当たってた時
                try:
                    if [int(i) for i in df_[df_[0]=='三連単'][1].values[0].replace(' ','').split('→')] == [pred_1,pred_2,pred_3]:
                        acc_dict['三連単'] += 1
                        profit = (bet/100)*odds
                        return_dict['三連単'] += profit
                except:
                    print()
                    print('race_id',race_id)
            else:
                not_bet_count += 1
            
        real_race_len = len(race_id_list) - not_bet_count
        return_dict['三連単'] -= bet * real_race_len
#         この辺のロジック同じだから, 関数でまとめたい
        print("---------------------")
        print("not_bet_count",not_bet_count)
        print("三連単")
        print("的中率 :",acc_dict['三連単'],'/',real_race_len)
        print("的中% :",'{:.2f}'.format((acc_dict['三連単']/real_race_len)*100),'%')
        print("収支   :",return_dict['三連単'],'円')
    
    
    def show_results_today(self , st ,race_id_list ,bet = 100):
        acc_dict = {'単勝':0,'複勝':0,'ワイド':0}
        return_dict = {'単勝':0,'複勝':0,'ワイド':0}
        tansho_list = []
        fukusho_list = []
        wide_list =[]

        for race_id in race_id_list:
            self.pred_df = self.return_pred_table(st.data_c.loc[int(race_id)])
#             self.return_tables.index =  self.return_tables.index.astype(int)
            df_  = self.return_tables.loc[race_id]
            print("-------------------")
            print("predict")
            pred_df = self.pred_df.loc[int(race_id)]
            pred_df = pred_df.sort_values('scores',ascending=False)
            print(pred_df.iloc[:3])
            print("actual")
            print(self.return_tables.loc[race_id])
            pred_1 = str(pred_df['馬番'].iloc[0])
            pred_2 = str(pred_df['馬番'].iloc[1])


            if  pred_1 == df_[df_[0]=='単勝'][1].values[0]:
                acc_dict['単勝'] += 1
                profit = int(df_[df_[0]=='単勝'][2].values[0].replace('円','').replace(',',''))
                return_dict['単勝'] += profit
                acc_dict['複勝'] += 1
                return_index = df_[df_[0]=='複勝'][1].str.split(' ')[0].index(str(pred_1))
                profit = int(df_[df_[0]=='複勝'][2].str.split('円')[0][return_index].replace(',',''))
                return_dict['複勝'] += profit 
                tansho_list.append(race_id[-2:])
                fukusho_list.append(race_id[-2:])

            elif pred_1 in df_[df_[0]=='複勝'][1].str.split(' ')[0]:
                acc_dict['複勝'] += 1
                return_index = df_[df_[0]=='複勝'][1].str.split(' ')[0].index(str(pred_1))
                profit = int(df_[df_[0]=='複勝'][2].str.split('円')[0][return_index].replace(',',''))
                return_dict['複勝'] += profit 
                fukusho_list.append(race_id[-2:])
                

            for i in range(len(df_[df_[0]=='ワイド'][1].str.split(' ')[0])//2):
                if set([pred_1,pred_2])==set(df_[df_[0]=='ワイド'][1].str.split(' ')[0][i:i+2]):
                    if i!=0:
                        return_index = i-1
                    else:
                        return_index = i
                    profit = int(df_[df_[0]=='ワイド'][2].str.split('円')[0][return_index].replace(',',''))
                    return_dict['ワイド'] += profit
                    print("profit",profit)
                    acc_dict['ワイド'] += 1
                    wide_list.append(race_id[-2:])
                    break
        
        
        for i, key in enumerate(acc_dict):
            return_dict[key] -= bet * len(race_id_list)
        
        print("---------------------")
        print("単勝")
        print("的中率 :",acc_dict['単勝'],'/',len(race_id_list))
        print("的中% :",'{:.2f}'.format(acc_dict['単勝']/len(race_id_list)*100),'%')
        print("収支   :",return_dict['単勝'],'円')
        print("的中レース",tansho_list)
        print("---------------------")
        print("複勝")
        print("的中率 :",acc_dict['複勝'],'/',len(race_id_list))
        print("的中% :",'{:.2f}'.format(acc_dict['複勝']/len(race_id_list)*100),'%')
        print("収支   :",return_dict['複勝'],'円')
        print("的中レース",fukusho_list)
        print("---------------------")
        print("ワイド")
        print("的中率 :",acc_dict['ワイド'],'/',len(race_id_list))
        print("的中% :",'{:.2f}'.format(acc_dict['ワイド']/len(race_id_list)*100),'%')
        print("収支   :",return_dict['ワイド'],'円')
        print("的中レース",wide_list)
        
class LearnLGBM():
    
    
    def __init__(self):
        self.model = None
        
        
    def get_train_data(self):
        pass
    
    
        
    

回収率 \
(profit - real_race_len*bet) /real_race_len * bet

In [93]:
df_ = return_tables.loc[202001010101]

In [165]:
[int(i) for i in df_[df_[0]=='三連複'][1].values[0].replace(' ','').split('-')]

[2, 3, 6]

In [149]:
'9,060br9,810'.replace(',','').split('br')

['9060', '9810']

In [171]:
r.data_c.iloc[0]['単勝']

3.8

In [190]:
df_[df_[0]=='三連単'][1].values[0].replace(' ','').split('→')

['12', '10', '6br12', '10', '11']

# sorted(['3','2','6'])

In [163]:
type(r.data_c['馬番'].iloc[0])

numpy.int64

# Simulate


In [223]:
sl = RankSimulater(lgb_rank)
start_time = time.time()
for odds in [1.1,2.0,3.0,4.0,5.0,6.0,7.0]:
    print("odds",odds)
    sl.calc_tansho(r.data_c.iloc[-1000:].fillna(0),return_tables,odds=odds)
    print("time", time.time() - start_time)

odds 1.1
not_bet_count 0
---------------------
単勝
的中率 : 13 / 71
的中% : 18.31 %
収支   : 3920.0 円
time 0.8535747528076172
odds 2.0
not_bet_count 6
---------------------
単勝
的中率 : 12 / 65
的中% : 18.46 %
収支   : 4350.0 円
time 1.6458847522735596
odds 3.0
not_bet_count 16
---------------------
単勝
的中率 : 9 / 55
的中% : 16.36 %
収支   : 4640.0 円
time 2.434467077255249
odds 4.0
not_bet_count 29
---------------------
単勝
的中率 : 6 / 42
的中% : 14.29 %
収支   : 4840.0 円
time 3.224350929260254
odds 5.0
not_bet_count 37
---------------------
単勝
的中率 : 5 / 34
的中% : 14.71 %
収支   : 5240.0 円
time 4.0129499435424805
odds 6.0
not_bet_count 38
---------------------
単勝
的中率 : 5 / 33
的中% : 15.15 %
収支   : 5340.0 円
time 4.82038688659668
odds 7.0
not_bet_count 44
---------------------
単勝
的中率 : 3 / 27
的中% : 11.11 %
収支   : 4630.0 円
time 5.676491975784302


In [226]:
60/38

1.5789473684210527

In [225]:
sl = RankSimulater(lgb_rank)
sl.calc_sanrenpuku(r.data_c.iloc[-1000:].fillna(0),return_tables)

---------------------
not_bet_count 0
三連複
的中率 : 3 / 71
的中% : 4.23 %
収支   : 400.0 円


In [224]:
sl = RankSimulater(lgb_rank)
sl.calc_sanrentan(r.data_c.iloc[-1000:].fillna(0),return_tables)

---------------------
not_bet_count 0
三連単
的中率 : 0 / 71
的中% : 0.00 %
収支   : -7100 円


In [180]:
df_ = return_tables.loc[202206030211]

In [181]:
df_[df_[0]=='三連複'][1].values[0].replace(' ','').split('-')

['6', '10', '12br10', '11', '12']

In [170]:
return_tables.loc[202209020404]

Unnamed: 0,0,1,2,3
202209020404,単勝,4,410,2
202209020404,複勝,4br15br11,120br180br110,2br3br1
202209020404,枠連,2 - 7,1300,5
202209020404,馬連,4 - 15,2510,8
202209020404,ワイド,4 - 15br4 - 11br11 - 15,620br170br360,7br1br2
202209020404,馬単,4 → 15,4120,11
202209020404,三連複,4 - 11 - 15,1060,1
202209020404,三連単,4 → 15 → 11,8530,17


# load data

In [63]:
results = load_csv(path_mac+'results.csv')
horse_results = load_csv(path_mac+'horse_results.csv')
peds = load_csv(path_mac+'peds.csv')
# 何回やってもロードすると, nanが出る
peds.fillna('nan',inplace=True)
return_tables = load_csv(path_mac+'return.csv')
return_tables.rename(columns={'0':0,'1':1,'2':2,'3':3},inplace=True)

# 日付に注意

In [6]:
date = '2022/12/31'

# race_id_list

In [6]:


# 202206030101
race_id_list = ['2022060301{}'.format(str(i).zfill(2)) for i in range(1,13)]
race_id_list += ['2022090201{}'.format(str(i).zfill(2)) for i in range(1,13)]
race_id_list += ['2022070205{}'.format(str(i).zfill(2)) for i in range(1,13)]

race_id_list += ['2022060207{}'.format(str(i).zfill(2)) for i in range(1,13)]
race_id_list += ['2022090112{}'.format(str(i).zfill(2)) for i in range(1,13)]
# race_id_list += ['2022100204{}'.format(str(i).zfill(2)) for i in range(1,13)]


race_id_list += ['2022060208{}'.format(str(i).zfill(2)) for i in range(1,13)]
race_id_list += ['2022070204{}'.format(str(i).zfill(2)) for i in range(1,13)]

race_id_list += ['2022060301{}'.format(str(i).zfill(2)) for i in range(1,13)]
race_id_list += ['2022090201{}'.format(str(i).zfill(2)) for i in range(1,13)]
race_id_list += ['2022070205{}'.format(str(i).zfill(2)) for i in range(1,13)]

race_id_list += ['2022060302{}'.format(str(i).zfill(2)) for i in range(1,13)]
race_id_list += ['2022090202{}'.format(str(i).zfill(2)) for i in range(1,13)]
race_id_list += ['2022070206{}'.format(str(i).zfill(2)) for i in range(1,13)]

race_id_list += ['2022060303{}'.format(str(i).zfill(2)) for i in range(1,13)]
race_id_list += ['2022090203{}'.format(str(i).zfill(2)) for i in range(1,13)]
# race_id_list += ['2022070206{}'.format(str(i).zfill(2)) for i in range(1,13)]

race_id_list += ['2022060304{}'.format(str(i).zfill(2)) for i in range(1,13)]
race_id_list += ['2022090204{}'.format(str(i).zfill(2)) for i in range(1,13)]

# Results scraping

In [7]:
# race_id_list = results.index.astype('str')

results = Results.scrape(race_id_list)

results.to_csv(path_mac+'results_new.csv')

  0%|          | 0/204 [00:00<?, ?it/s]

# Horse_results scraping

In [8]:
horse_id_list = results['horse_id'].astype(str).unique()
horse_results = HorseResults.scrape(horse_id_list)
# save_path = '/Users/rince/Desktop/Horse/Data/horse_2020.csv'
horse_results.to_csv(path_mac+'horse_results_new.csv')

  0%|          | 0/2260 [00:00<?, ?it/s]

# Peds scraping

In [13]:
peds_2021 = Peds.scrape(horse_id_list)
pe_2021 = Peds(peds_2021)
pe_2021.regularize_peds()
pe_2021.peds_re.to_csv(path_mac+'peds_new.csv')

  0%|          | 0/2260 [00:00<?, ?it/s]

  0%|          | 0/2260 [00:00<?, ?it/s]

# Return scraping

In [14]:
returns_2021 = Return.scrape(race_id_list)
returns_2021.to_csv(path_mac+'returns_new.csv')

  0%|          | 0/204 [00:00<?, ?it/s]

# update

In [15]:
new_results = update_data(load_csv(path_mac+'results.csv'), load_csv(path_mac+'results_new.csv'))
new_horse_results = update_data(load_csv(path_mac+'horse_results.csv'), load_csv(path_mac+'horse_results_new.csv'))
new_peds = update_data(load_csv(path_mac+'peds.csv'), load_csv(path_mac+'peds_new.csv'))
new_return = update_data(load_csv(path_mac+'return.csv'), load_csv(path_mac+'returns_new.csv'))

# save

In [16]:
new_results.to_csv(path_mac2+'results.csv')
new_horse_results.to_csv(path_mac2+'horse_results.csv')
new_peds.to_csv(path_mac2+'peds.csv')
new_return.to_csv(path_mac2+'return.csv')

# 今後の方針
1. XGB試してみる
2. ME 自己流につくりかえる
3. シミュレーションとか, 自分流に変える.

# rank　学習

In [84]:
# peds_id = results['horse_id'].astype(str).unique()
# peds_tmp = Peds.scrape(peds_id)
# new_peds = update_data(peds, peds_tmp)
# ここで初めて学習データを作る
date = '2022/12/31'

pe = Peds(peds)
# pe.regularize_peds()
pe.vectorize(pe.peds,model_ft)


# pe.categorize()
r = Results(results)
#前処理
r.preprocessing()
#馬の過去成績データ追加
# 過去聖遺跡データも最新にupdateする
# horse_id_list = data['horse_id'].astype(str).unique()
# horse_results_tmp = HorseResults.scrape(horse_id_list)
# new_horse_results = update_data(horse_results,horse_results_tmp)
hr = HorseResults(horse_results)
r.merge_horse_results(hr)

r.merge_peds(pe.peds_vec)

# r.merge_peds(pe.peds_cat)

#カテゴリ変数の処理
# pedsは既にカテゴリ化したdataをconcatしているので, ここでカテゴリ化せずとも良い
r.process_categorical()

  0%|          | 0/237 [00:00<?, ?it/s]

  0%|          | 0/237 [00:00<?, ?it/s]

  0%|          | 0/237 [00:00<?, ?it/s]

In [85]:
# 欠損値は 0 とした
train, test = split_data(r.data_c.fillna(0),test_size=0.2,rank_learning=False)
# x_train = train.drop(['rank', 'date','体重','体重変化','単勝'], axis=1)
x_train = train.drop(['rank', 'date','単勝'], axis=1)
y_train = train['rank']

x_test = test.drop(['rank', 'date','単勝'], axis=1)
y_test = test['rank']

train_query = x_train.groupby(x_train.index).size()
test_query = x_test.groupby(x_test.index).size()

In [207]:
# best parameters: {'lambdarank_truncation_level': 7, 'learning_rate': 0.06729249537901785}

# 血統データベクトル前の最適パラメタ
# lgbm_params =  {
#     'objective': 'lambdarank',
#     'metric': 'ndcg',
#     'lambdarank_truncation_level': 2,
#     #     上位３着を考慮
#     'ndcg_eval_at': [1,2,3],
#     'learning_rate': 0.09841058786136925,
#     'boosting_type': 'gbdt',
#     'random_state': 777
# }
# 'lambdarank_truncation_level': 2, 'learning_rate': 0.06748036714102541
        
# lgbm_params = {
#     'lambdarank_truncation_level': 10,
#     'metric': 'ndcg',
#     'objective': 'lambdarank',
#     'ndcg_eval_at': [1,2,3],
#     'learning_rate': 0.016371907499492487,
#     'boosting_type': 'gbdt',
#     'random_state': 777
# }
lgbm_params = {
    'lambdarank_truncation_level': 2,
    'metric': 'ndcg',
    'objective': 'lambdarank',
    'ndcg_eval_at': [1,2,3],
    'learning_rate': 0.06748036714102541,
    'boosting_type': 'gbdt',
    'random_state': 777
}

 #学習 
train = lgb.Dataset(x_train, y_train, group=train_query)
valid = lgb.Dataset(x_test, y_test, reference=train, group=test_query)

lgb_rank = lgb.train(
   lgbm_params,
   train,
   num_boost_round=100,
#    valid_sets=valid,
   valid_names=['train'],
#    early_stopping_rounds=20,
#    verbose_eval=5
)

# early stopping -> test data ないと怒られる

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 45706
[LightGBM] [Info] Number of data points in the train set: 82319, number of used features: 172


# Grid search

In [202]:
x_test

Unnamed: 0,枠番,馬番,斤量,course_len,horse_id,jockey_id,年齢,体重,体重変化,n_horse,...,race_type_ダート,race_type_芝,race_type_障害,ground_state_稍重,ground_state_良,ground_state_不良,ground_state_重,性_牡,性_牝,性_セ
202106040907,2,2,54.0,22.0,12477,120,3,486,2,9,...,0,1,0,1,0,0,0,1,0,0
202106040907,6,6,57.0,22.0,4070,74,4,482,0,9,...,0,1,0,1,0,0,0,1,0,0
202106040907,8,8,54.0,22.0,11380,161,3,480,-2,9,...,0,1,0,1,0,0,0,1,0,0
202106040907,3,3,54.0,22.0,8542,8,3,496,-12,9,...,0,1,0,1,0,0,0,1,0,0
202106040907,4,4,57.0,22.0,7064,61,4,438,-14,9,...,0,1,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202206030408,5,7,52.0,18.0,9517,132,4,462,6,13,...,1,0,0,1,0,0,0,0,1,0
202206030408,1,1,57.0,18.0,6192,1,5,506,8,13,...,1,0,0,1,0,0,0,0,0,1
202206030408,7,10,52.0,18.0,3743,126,6,456,8,13,...,1,0,0,1,0,0,0,0,1,0
202206030408,6,9,55.0,18.0,3605,58,6,432,-2,13,...,1,0,0,1,0,0,0,0,1,0


In [204]:
train = lgb.Dataset(x_train, y_train, group=train_query)
valid = lgb.Dataset(x_test.iloc[:10599], y_test.iloc[:10599], reference=train, group=test_query)

In [205]:
def score(params):
    print("Training start:")

    N_boost_round = []
    Score = []

    lgb_results={}  #履歴格納用
    train = lgb.Dataset(x_train, y_train, group=train_query)
    valid = lgb.Dataset(x_test, y_test, reference=train, group=test_query)

    
    lgb_clf = lgb.train(
       params,
       train,
       num_boost_round=1000,
       valid_sets=valid,
       valid_names=['valid'],
       early_stopping_rounds=20,
       verbose_eval=5,
       evals_result=lgb_results
    )
#     return lgb_results
    return {'loss': -1.0 * lgb_results['valid']['ndcg@3'][lgb_clf.best_iteration], 'status': STATUS_OK}

def optimize(trials):
#探索スペース
    space = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'ndcg_eval_at': [1,2,3],
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.1),
#         なぜか, uniformだと strに変換されてしまう
#         lambda_rank_truncation_levelは int型
#         よって, int以外はstrに勝手に変換されてしまい, エラーとなったのではないか
        'lambdarank_truncation_level': hp.choice('lambdarank_truncation_level',[ 1,2
                                                                                ,4,6,8,10]),
#         best paramsの返り値は, choiceだとindexか？
#         n_estimaterとか サーチしてみたい
#         'n_estimators': hp.choice('n_estimators',[ 1,10,100,500,750]),
        'boosting_type': 'gbdt',
        'random_state': 777,
    }

    max_evals = 50      #探索回数(25くらいで十分)
    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)

    print("best parameters:", best)

#     return {'loss': -1.0 * lgb_results['ndcg@3'][lgb_clf.best_iteration], 'status': STATUS_OK}



In [206]:
trials = Trials()
optimize(trials)

Training start:                                       
  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]






You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45706                    
[LightGBM] [Info] Number of data points in the train set: 82319, number of used features: 172
  0%|          | 0/50 [00:01<?, ?trial/s, best loss=?]





Training until validation scores don't improve for 20 rounds
[5]	valid's ndcg@1: 0.514094	valid's ndcg@2: 0.502846	valid's ndcg@3: 0.49499
[10]	valid's ndcg@1: 0.515436	valid's ndcg@2: 0.501073	valid's ndcg@3: 0.496077
[15]	valid's ndcg@1: 0.516107	valid's ndcg@2: 0.495868	valid's ndcg@3: 0.494235
[20]	valid's ndcg@1: 0.511409	valid's ndcg@2: 0.4987	valid's ndcg@3: 0.495594
Early stopping, best iteration is:                    
[1]	valid's ndcg@1: 0.519463	valid's ndcg@2: 0.505975	valid's ndcg@3: 0.502451
Training start:                                                                   
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45706                                                
[LightGBM] [Info] Number of data points in the train set: 82319, number of used features: 172
Training until validation scores don't improve for 20 rounds                      
[5]	valid's ndcg@1: 0.512081	valid's ndcg@2: 0.496227	valid's ndcg@3: 0.489634    
[10]	

# milestone

# 実際に予測するときの手順

In [15]:
race_id_list = ['2022050108{}'.format(str(i).zfill(2)) for i in range(1,13)]
race_id_list += ['2022090104{}'.format(str(i).zfill(2)) for i in range(1,13)]
race_id_list += ['2022100204{}'.format(str(i).zfill(2)) for i in range(1,13)]
data =  ShutubaTable.scrape(race_id_list, date)

  0%|          | 0/36 [00:00<?, ?it/s]

In [16]:

# 中京
# 202207010301

# race_id = 202205010211
# race_id_list = ['2022070102{}'.format(str(i).zfill(2)) for i in range(1,13)]
# data =  ShutubaTable.scrape([str(race_id)], date)
data =  ShutubaTable.scrape(race_id_list, date)

# race_id_list = ['2020010106{}'.format(str(i).zfill(2)) for i in range(1,13)]
st = ShutubaTable(data)


#前処理
st.preprocessing()

#馬の過去成績データ追加
st.merge_horse_results(hr)

#血統データ追加
st.merge_peds(p.peds_e)

#カテゴリ変数の処理
st.process_categorical(r.le_horse, r.le_jockey, r.data_pe)
# sl = RankSimulater(lgb_rank)
# sl.return_pred_table(st.data_c)

  0%|          | 0/36 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

scrape peds at horse_id_list "no_peds"


# 重要度

In [88]:
importances = pd.DataFrame(
{'features' : x_train.columns, 'importances' : lgb_rank.feature_importance()})
print(importances.sort_values('importances', ascending=False)[:30])

               features  importances
4              horse_id         1846
5             jockey_id          554
25      賞金_race_type_5R          128
24      着順_race_type_5R           53
10                着順_5R           50
32             賞金_開催_5R           41
52      着順_race_type_9R           36
38                着順_9R           34
80    着順_race_type_allR           32
6                    年齢           30
11                賞金_5R           27
53      賞金_race_type_9R           24
40                着差_9R           21
73   着順_course_len_allR           20
7                    体重           18
18     賞金_course_len_5R           12
12                着差_5R           12
68              着差_allR           10
45     着順_course_len_9R            9
75   着差_course_len_allR            4
60             賞金_開催_9R            4
82    着差_race_type_allR            3
9               n_horse            3
46     賞金_course_len_9R            3
150             peds_56            2
66              着順_allR            2
8

# Rank Simulate

In [41]:
sl = RankSimulater(lgb_rank)
sl.return_table_today(race_id_list)
# return_tables.rename(columns={'0':0,'1':1,'2':2,'3':3},inplace=True)
sl.show_results(st ,race_id_list)

  0%|          | 0/36 [00:00<?, ?it/s]

-------------------
predict
              馬番    scores
202205010801  10 -1.234598
202205010801   3 -1.313891
202205010801   6 -1.316911
actual
                0                1             2          3
202205010801   単勝               14          230円        1人気
202205010801   複勝          14 16 3  130円220円150円  1人気6人気2人気
202205010801   枠連              7 8          620円        2人気
202205010801   馬連            14 16        1,220円        4人気
202205010801  ワイド  14 16 3 14 3 16  510円310円760円  4人気1人気9人気
202205010801   馬単            14 16        1,740円        5人気
202205010801  3連複          3 14 16        2,340円        6人気
202205010801  3連単          14 16 3        9,840円       16人気
-------------------
predict
              馬番    scores
202205010802  16 -1.251063
202205010802  12 -1.396060
202205010802   5 -1.604587
actual
                0                  1             2          3
202205010802   単勝                 16          270円        1人気
202205010802   複勝           16 15 14  130円190円130円

# fastText

流れ
1. fasttext用の血統データの学習データを作る (血統の情報のみ, index ヘッダはいらない)
2. fasttext学習
3. 学習モデルを使って, 血統データをベクトル化
4. ベクトル化して r.data_cに concat
5. 学習


教師あり, 教師なしでも生成されるベクトルは等しい

# model_ft 作成

In [59]:
# 相対パスしかできない
# dim : 出力の次元
# minn : n_gramの最小単位
# maxn : n_gramの最大単位
model_ft = ft.train_unsupervised('test.txt',dim=62,minn=2,maxn=14)

In [77]:
# model_txt['hoge'] で 'hoge'の単語ベクトル入手
model_ft[model_ft.words[1]]

array([ 3.23876448e-05,  1.15252602e-04, -9.46200526e-05,  1.97496520e-05,
       -3.14811296e-05,  6.10383358e-05,  3.84694722e-05,  4.08472006e-05,
       -2.62596750e-05, -6.39620412e-05,  2.14720822e-05, -4.69113438e-05,
        3.77046381e-05, -1.26615938e-04,  4.62611060e-05, -4.64162804e-05,
       -1.04648252e-05,  8.02415016e-05,  5.22616428e-05,  2.21860992e-05,
       -1.75977038e-05, -8.26951291e-05,  3.14370882e-05,  6.86578787e-05,
       -3.35702607e-05,  1.14919050e-04, -8.21495541e-06, -9.01657186e-05,
       -7.84629883e-05,  2.17205616e-05, -1.27823092e-04,  7.07987565e-05,
        2.46920517e-05,  2.06759105e-05,  1.44077581e-04,  2.31686881e-05,
       -3.09964562e-05, -7.95884553e-05, -4.59835537e-05, -1.93069845e-05,
        3.55003340e-06,  1.18724784e-04, -6.99495213e-05, -5.45399816e-05,
       -7.00177625e-05,  4.58251998e-05, -5.90208510e-05,  1.51029690e-05,
        1.06203879e-05, -4.25494000e-05, -5.48500502e-05,  1.97607969e-05,
       -1.11221507e-05, -

In [82]:
model_ft.get_subwords(model_ft.words[1])[-1].shape

(8503,)

In [76]:
model_ft.words

['</s>',
 'ディープインパクト,クロウキャニオン,サンデーサイレンス,ウインドインハーヘア,フレンチデピュティ,クロカミ,Halo,WishingWell,Alzao,Burghclere,DeputyMinister,Mitterand,Caerleon,ミルド,HailtoReason,Cosmah,Understanding,MountainFlower,Lyphard,LadyRebecca,Busted,Highclere,ViceRegent,MintCopy,HoldYourPeace,LaredoLass,Nijinsky,Foreseer,DesertWine,MargieBelle,Turnto,Nothirdchance,CosmicBomb,Almahmoud,PromisedLand,PrettyWays,Montparnasse,Edelweiss,NorthernDancer,Goofed,SirIvor,Pocahontas,Crepello,SansleSou,QueensHussar,Highlight,NorthernDancer,VictoriaRegina,BuntysFlight,Shakney,SpeakJohn,BlueMoon,BoldRuler,FortunateIsle,NorthernDancer,FlamingPage,RoundTable,RegalGleam,Damascus,AnneCampbell,VaguelyNoble,Margravine',
 'ディープインパクト,ロベルタ,サンデーサイレンス,ウインドインハーヘア,ブライアンズタイム,グレースアドマイヤ,Halo,WishingWell,Alzao,Burghclere,Roberto,KelleysDay,トニービン,バレークイーン,HailtoReason,Cosmah,Understanding,MountainFlower,Lyphard,LadyRebecca,Busted,Highclere,HailtoReason,Bramalea,Graustark,GoldenTrail,カンパラ,SevernBridge,SadlersWells,SunPrincess,Turnto,Nothirdchance,CosmicBo

In [37]:
model_ft.get_input_vector(ind=10)

array([ 6.5724351e-03,  2.1741162e-03, -6.2224795e-03,  7.9450803e-03,
        9.2663774e-03,  1.2499948e-03, -1.9003255e-03, -1.4434209e-03,
       -7.0348015e-04, -2.3646757e-03,  8.4775481e-03,  7.6517521e-04,
        1.2443915e-04,  6.6868594e-04, -2.2353258e-03, -3.1006148e-03,
       -5.1603146e-04, -6.8897572e-03,  8.9530461e-03,  6.7916275e-03,
        2.7701540e-03, -1.4256138e-03,  9.4705867e-03, -9.9947751e-03,
       -4.6277489e-03, -7.6256958e-03,  7.5081405e-03, -3.4463892e-03,
       -3.8665808e-03,  1.7099102e-03,  9.7447438e-03, -7.7348817e-03,
       -2.9115018e-04, -3.1618846e-03,  6.5541668e-03, -7.9466682e-03,
       -5.2352938e-05, -8.1277534e-04, -1.2927600e-03, -9.0491874e-03,
       -4.8794332e-03, -1.3042025e-03, -8.9057656e-03, -2.9853662e-03,
        6.7775971e-03,  9.9755134e-03, -2.7005693e-03,  5.2640764e-03,
       -6.6113472e-03,  5.9692696e-04, -9.7892229e-03, -4.3281284e-03,
        6.6364333e-03, -9.9991390e-04,  3.3466963e-03, -7.2530257e-03,
      

model[model.words[1]] と model.get_input_vector(ind=1) は等価

In [111]:
model[test_str]

# model[model.words[1]] と model.get_input_vector(ind=1) は等価

array([ 8.64855465e-05,  1.37111065e-05,  1.41594879e-04,  3.69198642e-05,
        9.37314871e-06,  9.83630889e-05, -4.32917550e-05, -5.60286717e-05,
       -1.21071007e-05,  3.47241585e-05, -1.29177488e-05,  5.48821408e-05,
       -7.11681787e-05,  1.35873206e-05, -6.51547089e-05,  1.05369854e-05,
        2.46712134e-05, -2.98814448e-05, -6.97223822e-06,  5.47772688e-05,
       -4.34648828e-05, -6.77032876e-05,  3.82750259e-05,  4.62639291e-05,
        3.87809414e-05, -5.79457264e-05, -3.11739132e-05, -3.45420995e-05,
        2.56179737e-05,  1.88591548e-05, -1.06936168e-04, -3.09621441e-06,
       -3.30380026e-05, -2.44859002e-05,  2.54371498e-05,  2.28005192e-05,
       -1.14125714e-05, -7.71405212e-06, -2.62292688e-05,  4.95023669e-05,
        6.83483158e-05,  7.41472240e-06, -7.45871466e-06, -1.99570986e-05,
       -8.77055936e-06,  6.14155870e-05, -3.37384336e-05, -7.03690312e-05,
       -6.21120780e-05, -3.50524570e-05, -2.38443281e-05,  3.41939740e-05,
       -5.05409917e-05, -

In [63]:
# 文字列のベクトル表現
model.get_sentence_vector(test_str)

array([ 0.22636686,  0.0358811 ,  0.3706047 ,  0.09663288,  0.02452924,
        0.2574478 , -0.11331306, -0.1466456 , -0.03169499,  0.0908782 ,
       -0.03380894,  0.14364257, -0.18627687,  0.03555746, -0.17052874,
        0.0275799 ,  0.06457247, -0.07821366, -0.01824913,  0.14337559,
       -0.11376097, -0.1772068 ,  0.10017442,  0.12109151,  0.10149854,
       -0.15165876, -0.08159366, -0.09040555,  0.0670519 ,  0.04935378,
       -0.27988228, -0.00810147, -0.08646543, -0.06408333,  0.06658382,
        0.05967693, -0.02987295, -0.02018429, -0.06864916,  0.12956837,
        0.17888325,  0.01940197, -0.0195243 , -0.05223562, -0.02295211,
        0.16074347, -0.08830138, -0.18417585, -0.16256206, -0.09174566,
       -0.06240372,  0.0894906 , -0.1322762 , -0.00837872, -0.09055168,
        0.076395  ,  0.22093119,  0.10093257, -0.09220136, -0.19917955,
        0.11172937,  0.15837023], dtype=float32)

In [39]:
# これまで入力した行列を返す関数
model.get_input_matrix()

array([[-0.01349934,  0.01271002, -0.01006453, ...,  0.00531607,
        -0.0106207 ,  0.00814016],
       [ 0.01048684,  0.00816879, -0.00584027, ...,  0.01594336,
         0.00641512,  0.01121091],
       [-0.01389093, -0.00994238, -0.01586624, ...,  0.00450218,
         0.00770794,  0.00581788],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)

In [78]:
model_ft.get_line(model_ft.words[1])

(['ディープインパクト,クロウキャニオン,サンデーサイレンス,ウインドインハーヘア,フレンチデピュティ,クロカミ,Halo,WishingWell,Alzao,Burghclere,DeputyMinister,Mitterand,Caerleon,ミルド,HailtoReason,Cosmah,Understanding,MountainFlower,Lyphard,LadyRebecca,Busted,Highclere,ViceRegent,MintCopy,HoldYourPeace,LaredoLass,Nijinsky,Foreseer,DesertWine,MargieBelle,Turnto,Nothirdchance,CosmicBomb,Almahmoud,PromisedLand,PrettyWays,Montparnasse,Edelweiss,NorthernDancer,Goofed,SirIvor,Pocahontas,Crepello,SansleSou,QueensHussar,Highlight,NorthernDancer,VictoriaRegina,BuntysFlight,Shakney,SpeakJohn,BlueMoon,BoldRuler,FortunateIsle,NorthernDancer,FlamingPage,RoundTable,RegalGleam,Damascus,AnneCampbell,VaguelyNoble,Margravine',
  '</s>'],
 [])

In [129]:
model.get_output_matrix()

array([], shape=(0, 62), dtype=float32)

In [70]:
S1 = model.get_word_vector('サトノダイヤモンド')

In [74]:
S2 = model.get_word_vector('ディープインパクト')

In [75]:
import numpy as np
np.dot(S1,S2)

-5.7157024e-05

In [107]:
# model[test_str]と等価
model.get_word_vector(test_str)

array([ 8.64855465e-05,  1.37111065e-05,  1.41594879e-04,  3.69198642e-05,
        9.37314871e-06,  9.83630889e-05, -4.32917550e-05, -5.60286717e-05,
       -1.21071007e-05,  3.47241585e-05, -1.29177488e-05,  5.48821408e-05,
       -7.11681787e-05,  1.35873206e-05, -6.51547089e-05,  1.05369854e-05,
        2.46712134e-05, -2.98814448e-05, -6.97223822e-06,  5.47772688e-05,
       -4.34648828e-05, -6.77032876e-05,  3.82750259e-05,  4.62639291e-05,
        3.87809414e-05, -5.79457264e-05, -3.11739132e-05, -3.45420995e-05,
        2.56179737e-05,  1.88591548e-05, -1.06936168e-04, -3.09621441e-06,
       -3.30380026e-05, -2.44859002e-05,  2.54371498e-05,  2.28005192e-05,
       -1.14125714e-05, -7.71405212e-06, -2.62292688e-05,  4.95023669e-05,
        6.83483158e-05,  7.41472240e-06, -7.45871466e-06, -1.99570986e-05,
       -8.77055936e-06,  6.14155870e-05, -3.37384336e-05, -7.03690312e-05,
       -6.21120780e-05, -3.50524570e-05, -2.38443281e-05,  3.41939740e-05,
       -5.05409917e-05, -

In [7]:
import random
random.randint(97,122)

119

# ランダム文字列でテスト

In [13]:
word_list = []
for i in range(62):
    rand_name = ''
    for j in range(4):
        rand_name += chr(random.randint(97,122))
    word_list.append(rand_name)

In [14]:
test_str = ",".join(word_list)

In [158]:
import gensim 
gen_model = gensim.models.KeyedVectors.load_word2vec_format('fastText/ketto_model.vec', binary=False)

# most_similarメソッドを使って演算
# positiveに足し合わせるデータをリストで渡し、negativeに差し引くデータをリストで渡す。

gen_model.most_similar(
    positive=[ "ゴールドシップ"],
#     negative=["ディープインパクト"]
)

[('ポイントフラッグ', 0.951915442943573),
 ('カレイメモワール', 0.8899344801902771),
 ('リヤンドファミユ', 0.8506640791893005),
 ('パストラリズム', 0.8343662619590759),
 ('コスモスカイライン', 0.8293111324310303),
 ('ドリームジャーニー', 0.828325092792511),
 ('ハッシュバンバン', 0.8259212374687195),
 ('ナカヤマフェスタ', 0.8249091506004333),
 ('タイセイレジェンド', 0.8150109648704529),
 ('オーシャンブルー', 0.8144512176513672)]

In [159]:
gen_model.most_similar(
    positive=[ "ディープインパクト"],
#     negative=["ディープインパクト"]
)

[('ウインドインハーヘア', 0.8664582371711731),
 ('ローリエ', 0.7226738333702087),
 ('オンヴェラ', 0.7160682082176208),
 ('アイスドール', 0.7097043395042419),
 ('アローム', 0.7028155326843262),
 ('サトノアラジン', 0.7014816403388977),
 ('クロノロジスト', 0.6986632943153381),
 ('ピンクアリエス', 0.6985989212989807),
 ('ナイトマジック', 0.6953324675559998),
 ('ヘヴンリークルーズ', 0.6944254636764526)]

In [15]:
model_txt = ft.train_unsupervised('fastText/text.txt',minn=2,maxn=14)

In [17]:
len(model_txt.words)

9368

In [49]:
coords

array([[0.]], dtype=float32)

In [46]:
vectors  = model_txt['キズナ'].reshape(1, -1)

In [38]:
vectors.shape

(100,)

In [41]:
vectors.reshape(1, -1)

array([[-0.0507768 , -0.03158796,  0.3719879 , -0.43337092,  0.04407024,
        -0.04416358,  0.56797636,  0.3727986 ,  0.52248025, -0.4481369 ,
         0.2606985 , -0.20405166, -0.3789636 ,  0.09112091, -0.27974096,
         0.18359385, -0.2400678 ,  0.74059075,  0.3779946 , -0.7949204 ,
        -0.6549442 ,  0.00354731, -0.62105113, -0.04626863, -0.10678367,
         0.35455292, -0.48053965,  0.8271895 ,  1.0350835 ,  0.06584841,
        -0.2702481 , -0.17578961,  0.2406562 , -0.01046936, -0.09063246,
         0.296278  , -0.76684433, -0.2213059 ,  1.2849661 , -0.22702964,
        -0.1260167 , -0.45229167, -0.30201095, -0.08013313,  0.38698265,
        -0.25812092,  0.1184909 ,  0.177462  ,  0.0502384 ,  0.12209569,
        -0.7180075 ,  0.3745045 , -0.82095605,  0.40734512, -0.3441998 ,
         0.7743839 , -0.1306628 , -0.6589422 ,  0.10314472,  0.17247966,
        -0.01692947,  0.15154183,  0.7737049 , -0.86726075,  0.38742578,
        -0.29168054, -0.21751773,  0.30418798, -0.2

In [16]:
model_txt['キズナ']

array([-0.4545997 , -0.08741464,  0.31173626,  0.13845074, -0.76570636,
        0.07802508,  0.64899915,  0.6832365 ,  1.4973719 , -0.3659583 ,
       -0.10207128, -0.51044196, -0.40104282, -0.064634  , -0.6910132 ,
        0.6079259 , -0.05301953,  0.8091327 , -0.10024333, -0.18845513,
       -0.21310434,  0.01393075, -0.61000365,  0.5268892 , -0.14041117,
       -0.10406648, -0.71680737,  0.81365883,  1.0771564 ,  0.00515136,
        0.0345313 ,  0.13858005,  0.79001707, -0.2743086 , -0.191795  ,
       -0.30741683, -0.23364004, -0.44694647,  0.4207134 , -0.55999047,
       -0.10824247, -0.3253643 , -0.05974116, -0.49804798,  0.06789133,
       -0.37461394, -0.02879322,  0.10663368, -0.19275875,  0.8938551 ,
       -0.52291214,  0.39916897, -0.7857111 ,  0.4856257 , -0.40032613,
        1.27588   ,  0.22451466, -1.2030284 ,  0.00870536,  0.34859702,
        0.05455235,  0.12034642,  0.60126066, -0.76645064,  0.4021038 ,
       -0.59574425,  0.27805454, -0.1309369 , -0.70244116,  0.45

In [15]:
model_txt['ウインドインハーヘア']

array([1.1273632], dtype=float32)

In [177]:
'キセキ' in model_txt

False

In [197]:
model_txt['ggggg']

array([-0.10564891], dtype=float32)

In [253]:
model_txt.get_subwords('キセキ')

(['<キ', '<キセ', '<キセキ', '<キセキ>', 'キセ', 'キセキ', 'キセキ>', 'セキ', 'セキ>', 'キ>'],
 array([1668417, 1684405,  727383,  376875,  705019, 1474665, 1347925,
        1148731, 1613759,  471987]))

In [252]:
model_txt['キセキ']

array([2.3261814], dtype=float32)

In [203]:
model['アカイイト']

array([ 6.5716173e-05,  1.8187744e-03, -9.1101311e-04,  2.1235285e-04,
        7.0718746e-04,  1.0516392e-03,  1.1351848e-04,  8.2758488e-04,
        2.7018306e-03, -9.5881027e-05,  2.9185105e-05, -1.0434553e-03,
       -7.9279731e-04,  9.7809790e-04,  1.0020308e-03, -9.9704921e-06,
        1.7076595e-03,  2.0725920e-04,  1.9977726e-03,  2.2366480e-03,
        2.1179956e-03,  2.2501138e-03, -1.3245193e-03,  8.5473980e-04,
        2.1319906e-04,  1.9163315e-03, -1.8604699e-04, -2.4861936e-03,
       -2.0311947e-04,  1.5762139e-03, -1.0374290e-03, -3.4588840e-04,
        1.3026310e-03, -7.2811241e-04,  5.4169149e-04, -1.7128785e-03,
       -4.9187336e-04,  8.6225322e-05,  1.0082403e-03,  1.5551767e-03,
       -9.9105854e-04, -5.5738556e-04,  6.8321481e-04, -8.1636908e-04,
        1.1570966e-03, -6.2821846e-04,  1.1838707e-03,  2.3540622e-03,
       -4.7980080e-04, -1.3543551e-03, -9.9284640e-05, -5.7554425e-04,
        6.3399930e-04, -1.7117684e-03,  2.8442516e-04,  9.7960467e-04,
      

In [254]:
model_txt.get_subwords(model.words[1])

(['<デ',
  '<ディ',
  '<ディー',
  '<ディープ',
  '<ディープイ',
  '<ディープイン',
  '<ディープインパ',
  '<ディープインパク',
  '<ディープインパクト',
  '<ディープインパクト,',
  '<ディープインパクト,ク',
  '<ディープインパクト,クロ',
  '<ディープインパクト,クロウ',
  'ディ',
  'ディー',
  'ディープ',
  'ディープイ',
  'ディープイン',
  'ディープインパ',
  'ディープインパク',
  'ディープインパクト',
  'ディープインパクト,',
  'ディープインパクト,ク',
  'ディープインパクト,クロ',
  'ディープインパクト,クロウ',
  'ディープインパクト,クロウキ',
  'ィー',
  'ィープ',
  'ィープイ',
  'ィープイン',
  'ィープインパ',
  'ィープインパク',
  'ィープインパクト',
  'ィープインパクト,',
  'ィープインパクト,ク',
  'ィープインパクト,クロ',
  'ィープインパクト,クロウ',
  'ィープインパクト,クロウキ',
  'ィープインパクト,クロウキャ',
  'ープ',
  'ープイ',
  'ープイン',
  'ープインパ',
  'ープインパク',
  'ープインパクト',
  'ープインパクト,',
  'ープインパクト,ク',
  'ープインパクト,クロ',
  'ープインパクト,クロウ',
  'ープインパクト,クロウキ',
  'ープインパクト,クロウキャ',
  'ープインパクト,クロウキャニ',
  'プイ',
  'プイン',
  'プインパ',
  'プインパク',
  'プインパクト',
  'プインパクト,',
  'プインパクト,ク',
  'プインパクト,クロ',
  'プインパクト,クロウ',
  'プインパクト,クロウキ',
  'プインパクト,クロウキャ',
  'プインパクト,クロウキャニ',
  'プインパクト,クロウキャニオ',
  'イン',
  'インパ',
  'インパク',
  'インパクト',
  'インパクト,',
  'インパクト,ク',
  'インパクト,クロ',
  'インパクト,クロウ'

In [None]:
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

# 今後の方針
1. とりあえず, Peds class で, 馬名 -> 馬名正規化
2. 仕様は決めていないが, 学習ずみ, fasttext モデルで血統をベクトル化
3. 

In [228]:
horse_results

Unnamed: 0,日付,開催,天気,R,レース名,映像,頭数,枠番,馬番,オッズ,...,着差,ﾀｲﾑ指数,通過,ペース,上り,馬体重,厩舎ｺﾒﾝﾄ,備考,勝ち馬(2着馬),賞金
2017105800,2021/07/08,名古屋,曇,7.0,C12組,,10.0,4.0,4.0,6.6,...,,**,4-3,0.0-39.5,,479(-1),,,ナツミフルーリー,
2017105800,2021/06/30,名古屋,曇,5.0,C14組,,10.0,3.0,3.0,12.2,...,0.7,**,5-6-5,0.0-40.3,39.4,480(-4),,,マルカブレーブ,10.2
2017105800,2021/06/16,名古屋,雨,5.0,美穂ちゃんバースデイ,,11.0,2.0,2.0,13.5,...,2.7,**,10-10-8,0.0-38.2,38.4,484(+3),,,タイキスピネル,2.6
2017105800,2021/06/03,名古屋,晴,6.0,C14組,,10.0,8.0,9.0,3.8,...,1.0,**,3-4-4,0.0-39.6,40.1,481(-6),,,トーホウボッカ,
2017105800,2021/05/19,名古屋,雨,5.0,C15組,,11.0,8.0,11.0,29.8,...,0.5,**,3-3-3,0.0-38.4,38.6,487(0),,,キングリアリティー,10.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017106185,2020/03/15,1阪神6,晴,6.0,3歳1勝クラス,,16.0,4.0,8.0,8.5,...,1.7,**,11-13,34.2-38.5,38.9,474(-2),,,サトノラファール,
2017106185,2020/02/29,1阪神1,曇,6.0,3歳1勝クラス,,13.0,8.0,13.0,6.1,...,0.6,**,4-4,36.5-35.4,35.6,476(-8),,,ジェネティクス,290.0
2017106185,2020/02/09,2京都4,晴,3.0,3歳1勝クラス,,12.0,1.0,1.0,6.8,...,0.9,**,10-11,35.6-37.1,36.8,484(+10),,,アウトウッズ,73.0
2017106185,2019/11/23,5東京7,雨,9.0,カトレア賞(1勝クラス),,16.0,4.0,8.0,65.4,...,6.5,**,7-8,33.9-37.1,42.7,474(0),,,デュードヴァン,
