In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import hashlib

In [None]:
# Allow up to 32 columns to be shown when a DataFrame is rendered.
pd.options.display.max_columns = 32

In [None]:
# Load the training data. The file is read as Shift-JIS and row 0 supplies the column names.
df_train = pd.read_csv(
    "data/hakodate_train_data.csv",
    encoding="SHIFT_JIS",
    header=0,
    nrows=None,  # None reads every row; pass an integer to load only the first rows
)

In [None]:
# Derive the training target from each row's 'result' value: target = (10 - result) / 45.
# If the nine results of a race are exactly 1..9, the nine targets are 9/45 .. 1/45 and sum to 1.
targets = ((10 - df_train['result']) / 45).tolist()

df_train['target'] = targets

In [None]:
# Turn each name into an ID using a hash: the last 8 hex characters of the MD5 digest
# of the name's encoded bytes.
name_ids = [
    hashlib.md5(name.encode()).hexdigest()[-8:]
    for name in df_train['name']
]

df_train['name_id'] = name_ids

In [None]:
# Recode the two letter-only rank labels to digit strings: 'SS' -> '0' and 'L1' -> '6'.
# Every other rank value is left unchanged.
# A single vectorized replace is used instead of writing into df_train through .loc while
# iterating over it with iterrows(); the result is the same and re-running the cell is harmless.
df_train['rank'] = df_train['rank'].replace({'SS': '0', 'L1': '6'})

In [None]:
# Show the distinct values present in the 'rank' column.
rank_column = df_train['rank']
rank_column.unique()

In [None]:
# Group each row's place of origin (prefecture code) into one of 11 regional blocks.
#
# Prefecture codes:
#    1 Hokkaido   2 Aomori    3 Iwate      4 Miyagi     5 Akita     6 Yamagata   7 Fukushima
#    8 Ibaraki    9 Tochigi  10 Gunma     11 Saitama   12 Chiba    13 Tokyo     14 Kanagawa
#   15 Niigata   16 Toyama   17 Ishikawa  18 Fukui     19 Yamanashi 20 Nagano   21 Gifu
#   22 Shizuoka  23 Aichi    24 Mie       25 Shiga     26 Kyoto    27 Osaka     28 Hyogo
#   29 Nara      30 Wakayama 31 Tottori   32 Shimane   33 Okayama  34 Hiroshima 35 Yamaguchi
#   36 Tokushima 37 Kagawa   38 Ehime     39 Kochi     40 Fukuoka  41 Saga      42 Nagasaki
#   43 Kumamoto  44 Oita     45 Miyazaki  46 Kagoshima 47 Okinawa
locality_members = {
    '1': (1, 2, 3, 5),                    # Northern Tohoku (this grouping also holds Hokkaido)
    '2': (4, 6, 7),                       # Southern Tohoku
    '3': (8, 9),                          # Ibaraki / Tochigi
    '4': (11, 13),                        # Saitama / Tokyo
    '5': (10, 15, 19, 20),                # Joshinetsu
    '6': (12, 14, 22),                    # Southern Kanto
    '7': (16, 17, 21, 23, 24),            # Chubu
    '8': (18, 25, 26, 27, 28, 29, 30),    # Kinki
    '9': (31, 32, 33, 34, 35),            # Chugoku
    '10': (36, 37, 38, 39),               # Shikoku
    '11': (40, 41, 42, 43, 44, 45, 46, 47),  # Kyushu
}
locality_by_prefecture = {
    code: locality
    for locality, codes in locality_members.items()
    for code in codes
}

# Any value that is not one of the listed codes (including missing values) falls into '11',
# exactly as the final `else` branch of the original if/elif chain did.
fallback_locality = '11'


def _locality_for(prefecture):
    """Return the locality label ('1'..'11') for one prefecture value.

    The value is compared numerically, so 1, 1.0 and '1' all resolve to the same
    locality. This matters because read_csv may parse the column as integers
    rather than strings, in which case a string-only comparison never matches.
    """
    try:
        code = float(prefecture)
    except (TypeError, ValueError):
        return fallback_locality
    # A float key such as 1.0 finds the integer key 1; NaN finds nothing.
    return locality_by_prefecture.get(code, fallback_locality)


localities = [_locality_for(prefecture) for prefecture in df_train['prefecture']]

df_train['locality'] = localities

In [None]:
# Build a column order in which each derived column sits directly after its source column:
# 'name_id' right after 'name', and 'locality' right after 'prefecture'.
columns = df_train.columns.tolist()
for derived, source in (('name_id', 'name'), ('locality', 'prefecture')):
    columns.remove(derived)
    columns.insert(columns.index(source) + 1, derived)
print(columns)

In [None]:
# Apply the column order held in `columns`, then render the frame.
df_train = df_train[columns]
display(df_train)

In [None]:
# Feature columns fed to the model, in the order they are laid out in the input vector.
# The names must match the DataFrame's column labels exactly.
X_columns = [
    'locality', 'age', 'rank', 'leg', 'racing piont',
    'S', 'B',
    'Nige', 'Maki', 'Sashi', 'Ma',
    '1st', '2nd', '3rd', 'Chakugai',
    'win', '2ren', '3ren',
]

In [None]:
# Training settings.
# NOTE(review): iters_num and batch_size are not referenced by any cell visible here.
iters_num = 100
batch_size = 100
plot_interval = 1  # stride used when plotting the accuracy history

# One example is a whole race: 9 rows, each with len(X_columns) features, flattened
# into a single input vector. The output has one unit per row of the race.
num_racers = 9
num_inputs = num_racers * len(X_columns)

x = tf.placeholder(tf.float32, [None, num_inputs])  # flattened race features
d = tf.placeholder(tf.float32, [None, num_racers])  # target value per row of the race
W = tf.Variable(tf.random_normal([num_inputs, num_racers], stddev=0.01))
b = tf.Variable(tf.zeros([num_racers]))
logits = tf.matmul(x, W) + b
y = tf.nn.softmax(logits)

# Cross entropy between the targets and the softmax output.
# It is computed from log_softmax(logits) rather than log(softmax(logits)): the latter
# yields -inf (and then NaN in the product and the gradients) once a probability
# underflows to 0, while log_softmax stays finite.
cross_entropy = -tf.reduce_sum(d * tf.nn.log_softmax(logits), axis=1)
loss = tf.reduce_mean(cross_entropy)
train = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

# Per-example hit/miss: does the highest-scoring output match the highest target?
correct = tf.equal(tf.argmax(y, 1), tf.argmax(d, 1))
# Accuracy: mean of the hit/miss flags.
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

In [None]:
# Bucket the rows by (date, place, race_num) and report how many groups were formed.
race_keys = ['date', 'place', 'race_num']
grouped = df_train.groupby(race_keys)
print(len(grouped))

In [None]:
# Train on one race (group) at a time and record the accuracy measured after each update.
accuracies = []
race_count = 0
for race_name, group in grouped:
    print(race_name)
    racer_count = group.shape[0]
    # The model input is sized for exactly 9 rows per race; skip groups of any other size.
    if racer_count != 9:
        continue

    # Flatten the race into a single example: one row of features, one row of targets.
    X = group[X_columns].values.reshape(1, -1)
    target = group['target'].values.reshape(1, -1)
    feed = {x: X, d: target}

    sess.run(train, feed_dict=feed)
    # Evaluate after the update, on the same race that was just trained on.
    correct_val, accuracy_val = sess.run([correct, accuracy], feed_dict=feed)
    print(correct_val)
    accuracies.append(accuracy_val)
    # race_count + 1 is the 1-based number of the race just trained on
    # (no variable named `i` exists in this notebook, so it cannot be used here).
    print('Generation: ' + str(race_count + 1) + '. 正解率 = ' + str(accuracy_val))

    race_count += 1

# Plot every plot_interval-th accuracy against its race index. The y-values are sliced
# with the same stride as the x-range so both always have the same length.
lists = range(0, race_count, plot_interval)
plt.plot(lists, accuracies[::plot_interval])
plt.title("accuracy")
plt.ylim(0, 1.0)
plt.show()
