In [6]:
import time
import warnings

import numpy as np
import optuna.integration.lightgbm as lgb
import pandas as pd
import tensorflow as tf

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from tensorflow import feature_column
from tensorflow.keras import layers

df = pd.read_csv('horsedata.csv', encoding='shift-jis')

# 前走成績
df = df.dropna(how='any')

df['days'] = pd.to_datetime(df['days'])
name_days_df = df[["horsename", "days", "pop",
                   "odds", "rank3", "rank4", "3ftime", "result"]].sort_values(['horsename', 'days'])

name_list = name_days_df['horsename'].unique()
df_list = []

for name in name_list:
    name_df = name_days_df[name_days_df['horsename'] == name]
    shift_name_df = name_df[["pop", "odds", "rank3",
                             "rank4", "3ftime", "result"]].shift(1)
    shift_name_df['horsename'] = name
    df_list.append(shift_name_df)

df_before = pd.concat(df_list)
df_before['days'] = name_days_df['days']

df_before = df_before.rename(columns={'pop': 'pre_pop', 'odds': 'pre_odds', 'rank3': 'pre_rank3',
                                      'rank4': 'pre_rank4', '3ftime': 'pre_3ftime', 'result': 'pre_result'})

df = pd.merge(df, df_before, on=['horsename', 'days'], how='inner')


# 騎手のコースごとの連対率
df.loc[df['result'] >= 3, 'result'] = 0
df.loc[df['result'] == 2, 'result'] = 1

table_jockey = pd.pivot_table(
    df, index='jocky', columns='place', values='result', aggfunc='mean', dropna=False)
table_jockey = table_jockey.fillna(0)

table_jockey = pd.DataFrame(table_jockey)
table_jockey = table_jockey.round(4)
table_jockey = table_jockey.add_prefix('jockey_')


Collecting tensorflow
  Downloading tensorflow-2.12.0-cp310-cp310-win_amd64.whl (1.9 kB)
Collecting tensorflow-intel==2.12.0
  Downloading tensorflow_intel-2.12.0-cp310-cp310-win_amd64.whl (272.8 MB)
     -------------------------------------- 272.8/272.8 MB 2.0 MB/s eta 0:00:00
Collecting jax>=0.3.15
  Using cached jax-0.4.10.tar.gz (1.3 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting astunparse>=1.6.0
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers>=2.0
  Using cached flatbuffers-23.5.26-py2.py3-none-any.whl (26 kB)
Collecting termcolor>=1.1.0
  Using cached termcolor-2.3.0-py3-none-any.whl (6.9 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4

  table_jockey = pd.pivot_table(


In [7]:
#父親のコース、距離、馬場の連対率と脚質

df.loc[df['result'] >= 3, 'result'] = 0
df.loc[df['result'] == 2, 'result'] = 1

index = 'father'

table_father_place = pd.pivot_table(df, index=index, columns='place', values='result', aggfunc='mean',
                                    dropna=False)
table_father_distance = pd.pivot_table(df, index=index, columns='distance', values='result', aggfunc='mean',
                                       dropna=False)
table_father_turf = pd.pivot_table(df, index=index, columns='turf', values='result', aggfunc='mean',
                                   dropna=False)
table_father_condition = pd.pivot_table(df, index=index, columns='condition', values='result', aggfunc='mean',
                                        dropna=False)

table_father = pd.merge(table_father_place, table_father_distance, on=index, how='left')
table_father = pd.merge(table_father, table_father_turf, on=index, how='left')
table_father = pd.merge(table_father, table_father_condition, on=index, how='left')

table_father1 = table_father.fillna(0)

df['legtype'] = df['legtype'].map({'逃げ': 0, '先行': 1, '差し': 2, '追込': 3, '自在': 4})
legtypes = df.groupby(index).legtype.apply(lambda x: x.mode()).reset_index()

legtype = pd.DataFrame(legtypes)
legtype['legtype'] = legtype['legtype'].map({0: '逃げ', 1: '先行', 2: '差し', 3: '追込', 4: '自在'})

legtype = legtype.drop('level_1', axis=1)

time_3f = df.groupby(index).mean()['3ftime']
time3f = pd.DataFrame(time_3f)

father = pd.merge(table_father1, legtype, on=index, how='left')
father = pd.merge(father, time3f, on=index, how='left')

father = father.round(3)
father = father.add_prefix('{}_'.format(index))

  time_3f = df.groupby(index).mean()['3ftime']


In [8]:
#特徴量の生成

d_ranking = lambda x: 1 if x in [1, 2] else 0
df['flag'] = df['result'].map(d_ranking)

drop_list = ['result', 'rank3', 'rank4', '3ftime', 'time']
df = df.drop(drop_list, axis=1)

df['odds_hi'] = (df['odds'] / df['pop'])
df['re_odds_hi'] = (df['pre_odds'] / df['pre_pop'])
df['odds_hi*2'] = df['odds_hi'] ** 2
df['re_odds_hi*2'] = df['re_odds_hi'] ** 2
df['re_3_to_4time'] = (df['pre_rank4'] - df['pre_rank3'])
df['re_3_to_4time_hi*2'] = (df['pre_rank4'] / df['pre_rank3']) ** 2
df['father_3f_to_my'] = (df['father_3ftime'] - df['pre_3ftime'])
df['fathertype_3f_to_my'] = (df['fathertype_3ftime'] - df['pre_3ftime'])
df['re_pop_now_pop'] = (df['pre_pop'] - df['pop'])
df['re_odds_now_odds'] = (df['pre_odds'] - df['odds'])
df['re_result_to_pop'] = (df['pre_result'] - df['pre_pop'])

feature_list = ['odds_hi', 're_odds_hi', 'odds_hi*2', 're_odds_hi*2', 're_3_to_4time', 're_3_to_4time_hi*2',
                        'father_3f_to_my', 'fathertype_3f_to_my', 're_pop_now_pop', 're_odds_now_odds',
                        're_result_to_pop']
for feature in feature_list:
    df[feature] = df[feature].replace([np.inf, -np.inf], np.nan)
    df[feature] = df[feature].fillna(0)

KeyError: 'father_3ftime'

In [None]:
#LightGBMでの実装
cat_cols = ['place', 'class', 'turf', 'distance', 'weather', 'condition', 'sex', 'father', 'mother',
                    'fathertype', 'fathermon', 'legtype', 'jocky', 'trainer', 'father_legtype']
for c in cat_cols:
    le = LabelEncoder()
    le.fit(df[c])
    df[c] = le.transform(df[c])

In [None]:
df['days'] = pd.to_datetime(df['days'])
df = df.dropna(how='any')

df_pred = df[df['days'] >= datetime(2021, 11, 7)]
df_pred_droped = df_pred.drop(['flag', 'days', 'horsename', 'raceid', 'odds', 'pop'], axis=1)

df = df[df['days'] < datetime(2021, 11, 7)]

train_x = df.drop(['flag', 'days', 'horsename', 'raceid', 'odds', 'pop'], axis=1)
train_y = df['flag']

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y,
                                                    stratify=train_y,
                                                    random_state=0, test_size=0.3, shuffle=True)

cat_cols = ['place', 'class', 'turf', 'distance', 'weather', 'condition', 'sex', 'father', 'mother',
            'fathertype', 'fathermon', 'legtype', 'jocky', 'trainer', 'father_legtype']

lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=cat_cols)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, categorical_feature=cat_cols)

params = {
    'task': 'predict',
    'objective': 'binary',
    'verbosity': -1,
}

model = lgb.train(
    params,
    lgb_train,
    categorical_feature=cat_cols,
    valid_sets=lgb_eval,
    num_boost_round=100,
    early_stopping_rounds=20,
)
best_params = model.params

model = lgb.train(
    best_params,
    lgb_train,
    categorical_feature=cat_cols,
    valid_sets=lgb_eval,
    num_boost_round=100,  # 100
    early_stopping_rounds=20,  # 20
)

predict_proba = model.predict(df_pred_droped, num_iteration=model.best_iteration)

gbm_predict = pd.DataFrame({"raceid": df_pred['raceid'],
                        "gbm_pred": predict_proba})

In [None]:
#TensorFlowでの実装
scaler = StandardScaler()
sc = scaler.fit(df[num_data])

scalered_df = pd.DataFrame(sc.transform(df[num_data]), columns=num_data, index=df.index)
df.update(scalered_df)

# ここからTensorFlow
feature_columns = []

num_data = datalist.num_datas

for header in num_data:
    feature_columns.append(feature_column.numeric_column(header))

horsenum = feature_column.numeric_column('horsenum')
horsenum_buckets = feature_column.bucketized_column(horsenum, [2, 4, 6, 8, 10, 12, 14, 16, 18])
feature_columns.append(horsenum_buckets)

cat_data = ['place', 'class', 'turf', 'weather', 'condition', 'sex', 'father', 'mother', 'fathermon',
            'fathertype', 'legtype', 'jocky', 'trainer']

for cat in cat_data:
    category = feature_column.categorical_column_with_vocabulary_list(cat, list(df[cat].unique()))
    feature_columns.append(feature_column.embedding_column(category, dimension=8))

feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('flag')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

df['days'] = pd.to_datetime(df['days'])
df = df.dropna(how='any')

df_pred = df[df['days'] >= datetime(2021, 11, 7)]
df_pred_droped = df_pred.drop(['flag', 'days', 'horsename', 'raceid', 'odds', 'pop'], axis=1)

df = df[df['days'] < datetime(2021, 11, 7)]
df = df.drop(['days', 'horsename', 'raceid', 'odds', 'pop'], axis=1)

train, test = train_test_split(df, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)

batch_size = 32
train_ds = self.df_to_dataset(train, batch_size=batch_size)
val_ds = self.df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = self.df_to_dataset(test, shuffle=False, batch_size=batch_size)

pred_ds = tf.data.Dataset.from_tensor_slices(dict(df_pred_droped))
pred_ds = pred_ds.batch(batch_size=batch_size)

model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=5)

# loss, accuracy = model.evaluate(test_ds)

predictions = model.predict(pred_ds)
predict = [i for i in predictions]

d = {
    "raceid": df_pred['raceid'],
    "tf_pred": predict
}

tf_predict = pd.DataFrame(data=d)

In [None]:
#平均をとりフラグの作成

df_pred = main_df[main_df['days'] >= datetime(2021, 11, 7)]

df = pd.merge(gbm_predict, tf_predict, on='raceid', how='left')

# gbm_pred, tf_pred
df['new_mark_flag'] = '×'
df['new_flag'] = 0

# # 0.5が1個以上のフラグ作成。〇
df['new_mark_flag'].mask((df['gbm_pred'] >= 0.5) | (df['tf_pred'] >= 0.5), '〇', inplace=True)

# 0.5が2個以上のフラグ作成。◎
df['new_mark_flag'].mask((df['gbm_pred'] >= 0.5) & (df['tf_pred'] >= 0.5), '◎', inplace=True)

# 0.5以上をフラグ追加
df['new_flag'].mask(((df['gbm_pred'] * 0.5) + (df['tf_pred'] * 0.5)) >= 0.5, 1, inplace=True)

df = pd.merge(df_pred, df, on='raceid', how='left')

SyntaxError: invalid syntax (2465742824.py, line 7)