In [23]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import hashlib
import math, os
from tqdm import tqdm

# 関数、クラス　定義

In [24]:
pd.set_option('display.max_columns', 40)

In [25]:
# トレーニング用の入力データの選択
#X_columns = ['locality', 'age', 'rank', 'leg', 'racing piont', \
#             'S', 'B', 'Nige', 'Maki', 'Sashi', 'Ma', \
#             '1st', '2nd', '3rd', 'Chakugai', 'win', '2ren', '3ren']
X_columns = ['rank', 'racing piont', '1st', '2nd', '3rd', 'Chakugai', 'win', '2ren', '3ren']


In [26]:
def zscore(x, axis = None):
    xmean = x.mean(axis=axis, keepdims=True)
    xstd  = np.std(x, axis=axis, keepdims=True)
    zscore = (x-xmean)/xstd
    return zscore

In [27]:
def zscore_cor(x, y, axis = None):
    xmean = x.mean(axis=axis, keepdims=True)
    xstd  = np.std(x, axis=axis, keepdims=True)
    zscore = (y-xmean)/xstd
    return zscore

In [28]:
# Webスクレイピングで取得した戦績データをファイルから読み取り、データフレームに変換＋データ前処理
def get_df_train(places):
    
    init_flag = True
    for place in places:
        print('loading data for ' + place)
        filename = "data/" + place + "_train_data.csv"
        df_train = pd.read_csv(filename, encoding="SHIFT_JIS", header=0, nrows=None)

        targets = []
        name_ids = []
        localities = []

        for index, row in df_train.iterrows():

            # 1位を予想するため One-Hot表現にする
            result = row['result']
            if result == 1:
                target = 1
            else:
                target = 0
            targets.append(target)    

            # 名前をハッシュを使ってID化
            name = row['name']
            name_hash = hashlib.md5(name.encode()).hexdigest()
            name_id = name_hash[-8:]
            name_ids.append(name_id)

            # 　ランクの例外処理
            if row['rank'] == 'SS':
                df_train.loc[index, 'rank'] = '0'
            elif row['rank'] == 'L1':
                df_train.loc[index, 'rank'] = '6'

            # 出身地を地区毎にグループ化
            prefecture = row['prefecture']
            if prefecture in {'1', '2', '3', '5'}:
                locality = '1' #北東北
            elif prefecture in {'4', '6', '7'}:
                locality = '2' #南東北
            elif prefecture in {'8', '9'}:
                locality = '3' #茨栃
            elif prefecture in {'11', '13'}:
                locality = '4' #埼京
            elif prefecture in {'10', '15', '19', '20'}:
                locality = '5' #上信越
            elif prefecture in {'12', '14', '22'}:
                locality = '6' #南関東
            elif prefecture in {'16', '17', '21', '23', '24'}:
                locality = '7' #中部
            elif prefecture in {'18', '25', '26', '27', '28', '29', '30'}:
                locality = '8' #近畿
            elif prefecture in {'31', '32', '33', '34', '35'}:
                locality = '9' #中国
            elif prefecture in {'36', '37', '38', '39'}:
                locality = '10' #四国
            elif prefecture in {'40', '41', '42', '43', '44', '45', '46', '47'}:
                locality = '11' #九州
            else:
                locality = '12' #外国

            localities.append(locality)

        # 前処理したデータのデータフレームへの置き換え
        df_train['target'] = targets
        df_train['name_id'] = name_ids
        df_train['locality'] = localities

        # カラムの順番入れ替え（見やすさのため）
        columns = list(df_train.columns)
        columns.remove('name_id')
        columns.insert(columns.index("name") + 1, "name_id")
        columns.remove('locality')
        columns.insert(columns.index("prefecture") + 1, "locality")

        df_train = df_train.loc[:,columns]
        
        if init_flag:
            df_train_concat = df_train
            init_flag = False
        else:
            df_train_concat = pd.concat([df_train_concat, df_train])
    
    return df_train_concat

In [55]:
# Webスクレイピングで取得した戦績データをファイルから読み取り、データフレームに変換＋データ前処理
def get_df_predict(filename):
    
    init_flag = True
    print('loading data for predict')
    df_predict = pd.read_csv(filename, encoding="SHIFT_JIS", header=0, nrows=None)

    targets = []
    name_ids = []
    localities = []

    for index, row in df_predict.iterrows():

        # 名前をハッシュを使ってID化
        name = row['name']
        name_hash = hashlib.md5(name.encode()).hexdigest()
        name_id = name_hash[-8:]
        name_ids.append(name_id)

        # 　ランクの例外処理
        if row['rank'] == 'SS':
            df_predict.loc[index, 'rank'] = '0'
        elif row['rank'] == 'L1':
            df_predict.loc[index, 'rank'] = '6'

        # 出身地を地区毎にグループ化
        prefecture = row['prefecture']
        if prefecture in {'1', '2', '3', '5'}:
            locality = '1' #北東北
        elif prefecture in {'4', '6', '7'}:
            locality = '2' #南東北
        elif prefecture in {'8', '9'}:
            locality = '3' #茨栃
        elif prefecture in {'11', '13'}:
            locality = '4' #埼京
        elif prefecture in {'10', '15', '19', '20'}:
            locality = '5' #上信越
        elif prefecture in {'12', '14', '22'}:
            locality = '6' #南関東
        elif prefecture in {'16', '17', '21', '23', '24'}:
            locality = '7' #中部
        elif prefecture in {'18', '25', '26', '27', '28', '29', '30'}:
            locality = '8' #近畿
        elif prefecture in {'31', '32', '33', '34', '35'}:
            locality = '9' #中国
        elif prefecture in {'36', '37', '38', '39'}:
            locality = '10' #四国
        elif prefecture in {'40', '41', '42', '43', '44', '45', '46', '47'}:
            locality = '11' #九州
        else:
            locality = '12' #外国

        localities.append(locality)

    # 前処理したデータのデータフレームへの置き換え
    df_predict['name_id'] = name_ids
    df_predict['locality'] = localities

    # カラムの順番入れ替え（見やすさのため）
    columns = list(df_predict.columns)
    columns.remove('name_id')
    columns.insert(columns.index("name") + 1, "name_id")
    columns.remove('locality')
    columns.insert(columns.index("prefecture") + 1, "locality")

    df_predict = df_predict.loc[:,columns]

    return df_predict

In [30]:
def get_train_test_data(df_train):
    X = []
    target = []
        
    # 各レース毎に
    grouped = df_train.groupby(['date', 'place', 'race_num'])
    for race_name, group in tqdm(grouped):
        #print(race_name)
        racer_count = group.shape[0]
        # もし、９輪ではないレースは、トレーニングの対象から外す（モデルを固めるため）
        if racer_count != 9:
            continue
        X.append(group[X_columns].values)
        target.append(group['target'].values)

    X = np.array(X, dtype='float')
    X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])
    d_ = np.array(target)

    X_train, X_test, d_train, d_test = train_test_split(X, d_, test_size = 0.2)

    return X_train, X_test, d_train, d_test


In [31]:
def get_predict_data(df_predict):
    X = []
    race_info = []
        
    # 各レース毎に
    grouped = df_predict.groupby(['date', 'place', 'race_num'])
    for race_name, group in tqdm(grouped):
        #print(race_name)
        racer_count = group.shape[0]
        # もし、９輪ではないレースは、トレーニングの対象から外す（モデルを固めるため）
        if racer_count != 9:
            continue
        X.append(group[X_columns].values)
        race_info.append(race_name)

    X = np.array(X, dtype='float')
    X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])

    return X, race_info

## クラス定義　Deeep Learning Network

In [32]:
class DNN(object):
    def __init__(self, n_in, n_hiddens, n_out):
        self.n_in = n_in
        self.n_hiddens = n_hiddens
        self.n_out = n_out
        self.weights = []
        self.biases = []

        self._x = None
        self._y = None
        self._t = None
        self._keep_prob = None
        self._sess = None
        self._history = {
            'accuracy': [],
            'loss': []
        }

    def weight_variable(self, shape):
        # He 初期化
        n_sum = 1
        for n in shape:
            n_sum *= n
        stddev = math.sqrt(2.0 / n_sum)
        print('stddev: ', stddev)
        initial = tf.truncated_normal(shape, stddev=stddev)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        initial = tf.zeros(shape)
        return tf.Variable(initial)

    def inference(self, x, keep_prob):
        # 入力層 - 隠れ層、隠れ層 - 隠れ層
        for i, n_hidden in enumerate(self.n_hiddens):
            if i == 0:
                input = x
                input_dim = self.n_in
            else:
                input = output
                input_dim = self.n_hiddens[i-1]

            self.weights.append(self.weight_variable([input_dim, n_hidden]))
            self.biases.append(self.bias_variable([n_hidden]))

            input = tf.layers.batch_normalization(input)
            h = tf.nn.relu(tf.matmul(input, self.weights[-1]) + self.biases[-1])
            output = tf.nn.dropout(h, keep_prob)

        # 隠れ層 - 出力層
        self.weights.append(self.weight_variable([self.n_hiddens[-1], self.n_out]))
        self.biases.append(self.bias_variable([self.n_out]))

        y = tf.nn.softmax(tf.matmul(output, self.weights[-1]) + self.biases[-1])
        
        return y

    def predict(self, X_pred, p_keep=1.0):
        # 予測
        x = tf.placeholder(tf.float32, shape=[None, self.n_in])
        keep_prob = tf.placeholder(tf.float32)
        y_pred = self.inference(x, keep_prob)
        
        sess = tf.Session()
        init = tf.global_variables_initializer()
        sess.run(init)
        
        Y_pred = np.empty((X_pred.shape[0], 9))
        for i in range(len(X_pred)):
            X_ = X_pred[i].reshape(1, X_pred.shape[1])
            prob = y_pred.eval(session=sess, feed_dict={
                x: X_,
                keep_prob: p_keep
            })
            Y_pred[i] = prob
        
        return Y_pred
    
    def loss(self, y, t):
        # クロスエントロピー  Nan 問題回避のためのコードに変更
        #cross_entropy = tf.reduce_mean(-tf.reduce_sum(t * tf.log(y), axis=1))
        cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=t, logits=y))
        #return cross_entropy
        # L2 正則化
        l2_decay = 0.0001
        l2_losses = [tf.nn.l2_loss(w) for w in self.weights]
        l2_loss = l2_decay * tf.add_n(l2_losses)
        loss = cross_entropy + l2_loss
        return loss

    def training(self, loss):
        optimizer = tf.train.AdamOptimizer()
        train_step = optimizer.minimize(loss)
        return train_step

    def accuracy(self, y, t):
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(t, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        return accuracy

    def fit(self, X_train, Y_train, nb_epoch=100, batch_size=100, p_keep=0.5, verbose=1):
        x = tf.placeholder(tf.float32, shape=[None, self.n_in])
        t = tf.placeholder(tf.float32, shape=[None, self.n_out])
        keep_prob = tf.placeholder(tf.float32)

        self._x = x
        self._t = t
        self._keep_prob = keep_prob

        y = self.inference(x, keep_prob)
        loss = self.loss(y, t)
        train_step = self.training(loss)
        accuracy = self.accuracy(y, t)

        sess = tf.Session()
        
        # TensorBoardで追跡する変数を定義
        with tf.name_scope('summary'):
            tf.summary.scalar('loss', loss)
            merged = tf.summary.merge_all()
            writer = tf.summary.FileWriter('./logs', sess.graph)
        
        init = tf.global_variables_initializer()
        sess.run(init)

        writer.close()
        
        self._y = y
        self._sess = sess

        N_train = len(X_train)
        n_batches = N_train // batch_size

        for epoch in range(nb_epoch):
            X_, Y_ = shuffle(X_train, Y_train)

            for i in range(n_batches):
                start = i * batch_size
                end = start + batch_size

                sess.run(train_step, feed_dict={
                    x: X_[start:end],
                    t: Y_[start:end],
                    keep_prob: p_keep
                })
            loss_ = loss.eval(session=sess, feed_dict={
                x: X_train,
                t: Y_train,
                keep_prob: 1.0
            })
            accuracy_ = accuracy.eval(session=sess, feed_dict={
                x: X_train,
                t: Y_train,
                keep_prob: 1.0
            })
            self._history['loss'].append(loss_)
            self._history['accuracy'].append(accuracy_)

            if verbose:
                print('epoch:', epoch,
                      ' loss:', loss_,
                      ' accuracy:', accuracy_)

        saver = tf.train.Saver()
        saver.save(sess, "model/training_model")
                
        return self._history

    def evaluate(self, X_test, Y_test):
        accuracy = self.accuracy(self._y, self._t)
        return accuracy.eval(session=self._sess, feed_dict={
            self._x: X_test,
            self._t: Y_test,
            self._keep_prob: 1.0
        })

In [33]:
def plot(history):

    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()  # 2つのプロットを関連付ける

    ax1.plot(history['loss'], label='loss', color='orange')
    ax1.set_ylabel('loss')
    ax1.set_ylim(0, 2.5)
    ax1.legend(loc='best', bbox_to_anchor=(1.01, 0.71, 0.322, .100), borderaxespad=0.,)

    ax2.plot(history['accuracy'], label='accuracy', color='dodgerblue')
    ax2.set_ylabel('accuracy')
    ax2.set_ylim(0, 1.0)
    ax2.legend(loc='best', bbox_to_anchor=(1.01, 0.8, 0.4, .100), borderaxespad=0.,)

    plt.savefig("output.jpg", bbox_inches='tight')
    plt.show()


# 学習

### クローラーで取得データを読み込む

In [34]:
places = []
for filename in os.listdir('data/'):
    place = filename.split('_')[0]
    places.append(place)
print(places)

# クロスエントロピーが Nan になる場所を除外 (いわき平、熊本)
places.remove('iwakitaira')
places.remove('kumamoto')

df_train = get_df_train(places)

['aomori', 'beppu', 'chiba', 'fukui', 'gifu', 'hakodate', 'hiratsuka', 'hiroshima', 'hofu', 'ito', 'iwakitaira', 'kawasaki', 'keiokaku', 'kishiwada', 'kochi', 'kokura', 'komatsushima', 'kumamoto', 'kurume', 'maebashi', 'matsudo', 'matsusaka', 'matsuyama', 'mukomachi', 'nagoya', 'nara', 'odawara', 'ogaki', 'omiya', 'sasebo', 'seibuen', 'shizuoka', 'tachikawa', 'takamatsu', 'takeo', 'tamano', 'toride', 'toyama', 'toyohashi', 'utsunomiya', 'wakayama', 'yahiko', 'yokkaichi']
loading data for aomori


  if self.run_code(code, result):


loading data for beppu
loading data for chiba
loading data for fukui
loading data for gifu
loading data for hakodate
loading data for hiratsuka
loading data for hiroshima
loading data for hofu
loading data for ito
loading data for kawasaki
loading data for keiokaku
loading data for kishiwada


  if self.run_code(code, result):


loading data for kochi
loading data for kokura
loading data for komatsushima
loading data for kurume
loading data for maebashi
loading data for matsudo
loading data for matsusaka
loading data for matsuyama
loading data for mukomachi


  if self.run_code(code, result):


loading data for nagoya
loading data for nara
loading data for odawara
loading data for ogaki
loading data for omiya
loading data for sasebo
loading data for seibuen
loading data for shizuoka
loading data for tachikawa
loading data for takamatsu
loading data for takeo
loading data for tamano
loading data for toride
loading data for toyama
loading data for toyohashi
loading data for utsunomiya
loading data for wakayama


  if self.run_code(code, result):


loading data for yahiko


  if self.run_code(code, result):


loading data for yokkaichi


In [45]:
print("Generating Training/Test Data")
X_train, X_test, Y_train, Y_test = get_train_test_data(df_train)

Generating Training/Test Data


100%|████████████████████████████████████████████████████████████████████████| 146536/146536 [01:03<00:00, 2299.59it/s]


### 列方向にデータを標準化:（数値 - 平均) / 標準偏差

In [48]:

X_train_z = zscore(X_train, axis=0)
X_test_z = zscore_cor(X_train, X_test, axis=0)

### モデルの定義

In [None]:
model = DNN(n_in = X_train.shape[1], n_hiddens=[256, 512, 512], n_out=9)

### 学習

In [50]:
print("Training ...")
history = model.fit(X_train_z, Y_train, nb_epoch = 20, batch_size=32, p_keep=0.8)

accuracy = model.evaluate(X_test_z, Y_test)
print('accuracy: ', accuracy)

plot(history)

Training ...
stddev:  0.009820927516479826
stddev:  0.00390625
stddev:  0.0027621358640099515
stddev:  0.020833333333333332
epoch: 0  loss: 2.0557158  accuracy: 0.31981388
epoch: 1  loss: 2.049768  accuracy: 0.32785642
epoch: 2  loss: 2.0491657  accuracy: 0.32858756
epoch: 3  loss: 2.0529008  accuracy: 0.32505152
epoch: 4  loss: 2.0365078  accuracy: 0.33968762
epoch: 5  loss: 2.0313203  accuracy: 0.34566966
epoch: 6  loss: 2.02795  accuracy: 0.34755734
epoch: 7  loss: 2.0280712  accuracy: 0.34733135
epoch: 8  loss: 2.0236275  accuracy: 0.3514922
epoch: 9  loss: 2.0219772  accuracy: 0.3531007
epoch: 10  loss: 2.0167193  accuracy: 0.35750082
epoch: 11  loss: 2.018298  accuracy: 0.35663676
epoch: 12  loss: 2.0194106  accuracy: 0.3547092
epoch: 13  loss: 2.016806  accuracy: 0.35702226
epoch: 14  loss: 2.0200057  accuracy: 0.3533001
epoch: 15  loss: 2.016092  accuracy: 0.3585776
epoch: 16  loss: 2.016062  accuracy: 0.35770023
epoch: 17  loss: 2.0163982  accuracy: 0.35912263
epoch: 18  loss:

# 予測

### クローラーで取得したデータを読み込む

In [56]:
for filename in os.listdir('predict/'):
    df_predict = get_df_predict('predict/' + filename)
print("Generating Predict Data")
X_pred, race_info = get_predict_data(df_predict)

loading data for predict


In [53]:

# 列方向にデータを標準化:（数値 - 平均) / 標準偏差
X_pred_z = zscore_cor(X_train, X_pred, axis=0)

Generating Predict Data


100%|████████████████████████████████████████████████████████████████████████████████| 68/68 [00:00<00:00, 2199.51it/s]


In [54]:
print("Predicting ...")
Y_pred = model.predict(X_pred)

for index in range(len(Y_pred)):
    print('レース', race_info[index])
    print('予想順位（車番左から１ -> 9位）', np.argsort(-Y_pred[index]) + 1)
    print('勝率予測', Y_pred[index])

Predicting ...
stddev:  0.009820927516479826
stddev:  0.00390625
stddev:  0.0027621358640099515
stddev:  0.020833333333333332
レース (20180928, 'aomori', 1)
予想順位（車番左から１ -> 9位） [8 9 3 2 6 5 1 4 7]
勝率予測 [0.11105618 0.11121772 0.11122823 0.11091369 0.1110792  0.1110834
 0.11088657 0.11129215 0.11124282]
レース (20180928, 'aomori', 2)
予想順位（車番左から１ -> 9位） [8 2 3 9 5 1 6 4 7]
勝率予測 [0.1110535  0.11123874 0.11122317 0.110903   0.11108829 0.11104236
 0.11089164 0.11133846 0.11122084]
レース (20180928, 'aomori', 3)
予想順位（車番左から１ -> 9位） [8 9 3 2 5 6 1 7 4]
勝率予測 [0.11104006 0.11121042 0.11124181 0.11088446 0.11108008 0.11105745
 0.1109013  0.11133311 0.11125126]
レース (20180928, 'aomori', 4)
予想順位（車番左から１ -> 9位） [8 2 9 3 5 6 1 4 7]
勝率予測 [0.11104326 0.11124472 0.11121678 0.11091257 0.11110825 0.11105936
 0.11089316 0.11129594 0.11122599]
レース (20180928, 'aomori', 5)
予想順位（車番左から１ -> 9位） [8 9 2 3 5 1 6 7 4]
勝率予測 [0.11108845 0.11122432 0.11121502 0.11086198 0.11110114 0.11105837
 0.11091642 0.11130004 0.11123422]
レース (

 0.11089599 0.11131157 0.11121595]
レース (20180928, 'toyama', 11)
予想順位（車番左から１ -> 9位） [8 2 9 3 5 6 1 7 4]
勝率予測 [0.11104558 0.11126039 0.11122594 0.11085167 0.11111929 0.11105021
 0.11086798 0.11134032 0.11123863]
