In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 目標は回収率１００％超えにすること。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
from itertools import cycle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
!pip install eli5
from eli5.sklearn import PermutationImportance
from keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [31]:
# データの読み込み
# パスは適宜変更してください
keiba_data = pd.read_csv('/content/drive/MyDrive/競馬.csv', encoding = "shift-jis")

In [None]:
keiba_data

In [33]:
# Unnamed0のカラムは日付、レース番号、馬番号なのでrenameして分割。
keiba_data.rename(columns={"Unnamed: 0":"date_num"},inplace=True)
keiba_data["date_num"]=keiba_data["date_num"].astype(str)
keiba_data["race_num"]=keiba_data["date_num"].str[0:12].astype(int)
keiba_data["horse_num"]=keiba_data["date_num"].str[12:14].astype(int)
keiba_data.drop(columns=["date_num"],inplace=True)
# 扱いやすいようにrace_numとhorse_numは一番左に配置。
keiba_data=keiba_data.reindex(columns=["race_num","horse_num",'age', 'c_weight', 'course', 'date', 'field', 'gender', 'head_count',
       'horse_name', 'j_weight', 'jackie', 'odds', 'popu', 'race', 'race_name',
       'rank', 'trainerA', 'trainerB', 'weight', 'year'])

#欠損値の確認


In [None]:
keiba_data.isnull().sum()

In [None]:
keiba_data.head()

In [35]:
#欠損値処理
keiba_data["c_weight"].fillna(0,inplace=True)
keiba_data["j_weight"].fillna(keiba_data["j_weight"].mean(),inplace=True)
keiba_data["weight"].fillna(keiba_data["weight"].mean(),inplace=True)
keiba_data.dropna(subset=["odds"],inplace=True)
keiba_data.dropna(subset=["popu"],inplace=True)
keiba_data.dropna(subset=["race_name"],inplace=True)

In [36]:
#labelencoderを使って、カテゴリ変数を変換。
le=LabelEncoder()
keiba_categorical = keiba_data[["gender","field","horse_name","course","head_count","trainerA","trainerB","race","jackie","race_name"]].apply(le.fit_transform)
keiba_categorical = keiba_categorical.rename(columns={"race_name":"race_name_c","filed":"field_c","gender":"gender_c","horse_name":"horse_name_c","course":"course_c","head_count":"head_count_c","trainerA":"trainerA_c","trainerB":"trainerB_c","jackie":"jackie_c"})
keiba_data = pd.concat([keiba_data,keiba_categorical],axis=1)
# 変換前と不要な列を削除
keiba_data.drop(columns=["race_num","horse_num","date","year","race_name","race","trainerA","trainerB","course","field","gender","jackie","head_count","horse_name"],inplace=True)

In [37]:
# 特徴量生成
# １つ目はにoddsとpopuの積
# ２つ目は前回の体重
keiba_data["odds_popu"]=keiba_data["odds"]*keiba_data["popu"]
keiba_data["pre_weight"]=keiba_data["weight"]-keiba_data["c_weight"]

In [None]:
# rankの確認
keiba_data["rank"].unique()

In [None]:
#中止、失格の数をそれぞれ確認。

a=keiba_data["rank"][keiba_data["rank"]=="中止"].count()
b=keiba_data["rank"][keiba_data["rank"]=="失格"].count()
print(a,b)

In [40]:
#中止、失格の行は全て削除する。

delete_index = keiba_data.index[(keiba_data["rank"]=="中止") | (keiba_data["rank"]=="失格")]
keiba_data.drop(delete_index,inplace=True)

In [41]:
# 1,2,3着かそれ以外かを分割して、２値分類問題にする。
keiba_data["rank"]=keiba_data["rank"].astype(int)
keiba_data = keiba_data.assign(target = (keiba_data['rank'] <= 3).astype(int))

In [42]:
keiba_data

Unnamed: 0,age,c_weight,j_weight,odds,popu,rank,weight,gender_c,horse_name_c,course_c,head_count_c,trainerA_c,trainerB_c,jackie_c,race_name_c,odds_popu,pre_weight,target
0,2,0.0,54.0,4.3,2.0,1,468.0,1,30089,17,4,4,621,379,1461,8.6,468.0,1
1,2,0.0,54.0,5.2,4.0,2,460.0,2,191,17,4,2,426,699,1461,20.8,460.0,1
2,2,2.0,54.0,3.1,1.0,3,442.0,2,4764,17,4,4,165,407,1461,3.1,440.0,1
3,2,-4.0,54.0,23.0,8.0,4,482.0,2,56823,17,4,2,327,428,1461,184.0,486.0,0
4,2,-20.0,54.0,21.9,6.0,5,484.0,1,52788,17,4,2,446,739,1461,131.4,504.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535589,4,-2.0,55.0,243.4,16.0,12,424.0,1,12382,6,6,2,617,431,1470,3894.4,426.0,0
535590,3,2.0,52.0,69.2,12.0,13,472.0,1,4913,6,6,2,732,374,1470,830.4,470.0,0
535591,4,0.0,55.0,39.5,9.0,14,444.0,1,50537,6,6,2,797,403,1470,355.5,444.0,0
535592,4,-2.0,53.0,27.5,7.0,15,468.0,1,30314,6,6,2,654,95,1470,192.5,470.0,0


In [43]:
# 目的変数との分割
X = keiba_data.drop(['rank','target'], axis=1)
y = keiba_data['target']

In [44]:
# 標準化してないtrainデータとtestデータの分割
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X, y, test_size=0.33, random_state=0)

In [45]:
# 標準化
ss = StandardScaler()
X_ss = ss.fit_transform(X)

In [46]:
# 標準化したtrainデータとtestデータの分割
X_train, X_test, y_train, y_test = train_test_split(X_ss, y, test_size=0.33, random_state=0)

モデルの構築

In [47]:
# ニューラルネットワークに必要なモジュールのimport

import keras
from keras.models import Model,Sequential
from keras.layers import Input, Dense, Activation, Dropout
from keras.layers import Conv2D, GlobalAveragePooling2D
from keras.layers import BatchNormalization, Add, Dense
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
from keras.initializers import he_normal
import keras.backend as K

In [48]:
def create_model(optimizer='adam', init='glorot_normal'): 
    
    model = Sequential()
    model.add(Dense(16, input_dim=X.shape[1], kernel_initializer=init, activation='relu'))
    model.add(Dense(8, kernel_initializer=init, activation='relu'))
    model.add(Dense(4, kernel_initializer=init, activation='relu'))
    model.add(Dense(1, kernel_initializer=init, activation='sigmoid')) #最後の活性化関数は２値分類なのでsigmoidを使用。多値分類だとsoftmaxを使用するとよい。sigmoidの一般化がsoftmax。

    #2値分類なのでbinary_crossentropyを使う
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

model = create_model()

In [None]:
model.summary()

In [None]:
X_train.shape

In [None]:
#checkpointとearystoppingの設定
es = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
cp = ModelCheckpoint(filepath = 'best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

history = model.fit(x=X_train, y=y_train, 
          validation_data=(X_test, y_test), 
          batch_size=512, 
          epochs=100, 
          shuffle=True,
          callbacks=[es, cp]
         )

In [None]:
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(len(acc))

plt.figure()               
plt.title("accuracy")
plt.plot(epochs, acc, label="train_accuracy")
plt.plot(epochs, val_acc, label="val_accuracy")

plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

y_pred = model.predict(X_test)

fpr, tpr, thresholds = roc_curve(y_test, y_pred)

plt.plot(fpr, tpr, marker='o')
plt.xlabel('FPR: False positive rate')
plt.ylabel('TPR: True positive rate')
plt.grid()

In [62]:
pred = model.predict(X_test)
print(roc_auc_score(y_test, pred))

0.8123455644872033
