In [6]:
import pandas as pd
import numpy as np
import pandas.core
from pandas import DataFrame
from sklearn.model_selection import StratifiedKFold
import chainer
import chainer.links as L
import chainer.functions as F
from chainer import Sequential
import matplotlib.pyplot as plt


split_num = 10


In [24]:
def data_extraction(data:pandas.core.frame.DataFrame, label:str) -> pandas.core.frame.DataFrame:
    """
    Extract only the data of label of second argument from the inputted data.

    @param data: the inputted data
    @label: label you want to extract
    @return: the extracted data
    """
    return data[data.label == label]


def data_column_conversion(data:pandas.core.frame.DataFrame, label:str) -> pandas.core.frame.DataFrame:
    """
    Extract only the data of label of second argument from the inputted data.

    @param data: the extracted data
    @label: label you want to converse
    @return: the conversed data
    """
    data = data.assign(W = (label == 'W') + 0,D = (label == 'D') + 0,L = (label == 'L') + 0)
    data = data.drop("label",axis=1)
    return data


def data_randomization(data:pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame:
    """
    Randomize data.

    @param data: the conversed data you want to separate
    @return: the randomized data
    """
    return data.sample(n = len(data))


def data_separate(data:pandas.core.frame.DataFrame, split_num:int) -> list:
    """
    Separate data.
    
    @param data: the data you want to separate
    @param split_num: the division number
    """
    data_separate = []
    for i in range(split_num):
        data_separate.append(data[i::split_num])
    return data_separate


def divided_data_making_for_stratified_sampling(data:pandas.core.frame.DataFrame, split_num:int, label:str) -> list:
    data = data_column_conversion(data,label)
    data = data_randomization(data)
    separated_data_list = data_separate(data, split_num)
    return separated_data_list


def data_list_wdl_merge(data_list1:list, data_list2:list, data_list3:list)-> list:
    list_size = len(data_list1)
    merged_data_list = []
    for i in range(list_size):
        merged_data_list.append(pd.concat([data_list1[i],data_list2[i],data_list3[i]]))
    return merged_data_list


def assign_group_numbers_to_data(data_list:list) -> list:
    list_size = len(data_list)
    for i in range(list_size):
        data_list[i].assign(separate_num=i)
    print(data_list[i]['separate_num'])
    return data_list


def data_list_put_together(data_list:list) -> pandas.core.frame.DataFrame:
    list_size = len(data_list)
    data = data_list[0]
    for i in range(1,list_size):
        data = data.append(data_list[i])
    return data


def making_dataset_list_train(data:pandas.core.frame.DataFrame, split_num:int) -> list:
    train_data_list = []
    for i in range(split_num):
        train_data_list.append(data[data['separate_num'] != i])
    for i in range(split_num):
        train_data_list[i] = train_data_list[i].drop(['separate_num'], axis = 1)
        train_data_list = data_randomization(train_data_list[i])
    return train_data_list


def making_dataset_list_val(data:pandas.core.frame.DataFrame, split_num:int) -> list:
    val_data_list = []
    for i in range(split_num):
        val_data_list.append(data[data['separate_num'] == i])
    for i in range(split_num):
        val_data_list[i] = train_data_list[i].drop(['separate_num'], axis = 1)
        val_data_list = data_randomization(train_data_list[i])
    return val_data_list


def making_dataset_list_x(data_list:list) -> list:
    list_size = len(data_list)
    for i in range(list_size):
        data_list[i].drop(["W","D","L"],axis=1)
    return data_list


def making_dataset_list_y(data_list:list) -> list:
    list_size = len(data_list)
    data_list = []
    for i in range(list_size):
        data_list.append(data_list[i][["W","D","L"]])
    return data_list


def translate_pandas_to_numpy_x(data_list:list) -> list:
    list_size = len(data_list)
    for i in range(list_size):
        data_list[i] = data_list[i].values.astype('float32')
    return data_list


def translate_pandas_to_numpy_y(data_list:list) -> list:
    list_size = len(data_list)
    for i in range(list_size):
        data_list[i] = data_list[i].values
    return data_list


def data_processing_for_stratified_sampling(data:pandas.core.frame.DataFrame, split_num:int) -> pandas.core.frame.DataFrame:
    win_data = data_extraction(data, 'W')
    draw_data = data_extraction(data, 'D')
    lose_data = data_extraction(data, 'L')
    win_separated_data_list = divided_data_making_for_stratified_sampling(win_data, split_num, 'W')
    draw_separated_data_list = divided_data_making_for_stratified_sampling(draw_data, split_num, 'D')
    lose_separated_data_list = divided_data_making_for_stratified_sampling(lose_data, split_num, 'L')
    separated_data_list = data_list_wdl_merge(win_separated_data_list
                                          , draw_separated_data_list
                                          , lose_separated_data_list)
    separated_data_list = assign_group_numbers_to_data(separated_data_list)
    integrated_data = data_list_put_together(separated_data_list)
    return integrated_data


def making_x_train_data_list_for_kfold(data:pandas.core.frame.DataFrame, split_num:int) -> list:
    train_data_list = making_dataset_list_train(data, split_num)
    x_train_data_list = making_dataset_list_x(train_data)
    return translate_pandas_to_numpy_x(x_train_data_list)


def making_x_val_data_list_for_kfold(data:pandas.core.frame.DataFrame, split_num:int) -> list:
    val_data_list = making_dataset_list_val(data, split_num)
    x_val_data_list = making_dataset_list_x(val_data)
    return translate_pandas_to_numpy_x(x_val_data_list)


def making_y_train_data_list_for_kfold(data:pandas.core.frame.DataFrame, split_num:int) -> list:
    train_data_list = making_dataset_list_train(data, split_num)
    y_train_data_list = making_dataset_list_y(train_data)
    return translate_pandas_to_numpy_y(y_train_data_list)


def making_y_val_data_list_for_kfold(data:pandas.core.frame.DataFrame, split_num:int) -> list:
    val_data_list = making_dataset_list_val(data, split_num)
    y_val_data_list = making_dataset_list_y(val_data)
    return translate_pandas_to_numpy_y(y_val_data_list)


def RPS(y_true, y_pred):
    output = 0.
    data_num = len(y_true)
    for i in range(data_num):
        times = len(y_true[i]) - 1 
        cumulative_sum = 0.
        score = 0.
        for time in range(times):
            cumulative_sum += y_true[i,time] - y_pred[i,time]
            score += cumulative_sum ** 2
        score /= times
        output += score
    
    output /= data_num
    return output

In [25]:
# data input 
path = 'trainrat_new.txt'
data = pd.read_csv(path,sep=' ')

# dataset making for stratified sampling and k-fold
data = data_processing_for_stratified_sampling(data,split_num)
x_train = making_x_train_data_list_for_kfold(data,split_num)
x_test = making_x_test_data_list_for_kfold(data,split_num)
y_train = making_y_train_data_list_for_kfold(data,split_num)
y_test = making_y_test_data_list_for_kfold(data,split_num)



KeyError: 'separate_num'

In [7]:

split_num = 10


# data input 
path = 'trainrat_new.txt'
data = pd.read_csv(path,sep=' ')


# Data extraction,Win
def data_exrtaction_win(data :):
win_data = data[data.label == "W"]
win_data = win_data.assign(W=1,D=0,L=0) 
win_data = win_data.drop("label",axis=1)
win_data = win_data.sample(n=len(win_data))# random sort

# Data extraction,Draw
draw_data = data[data.label == "D"]
draw_data = draw_data.assign(W=0,D=1,L=0)
draw_data = draw_data.drop("label",axis=1)
draw_data = draw_data.sample(n=len(draw_data))# random sort

# Data extraction,Lose
lose_data = data[data.label == "L"]
lose_data = lose_data.assign(W=0,D=0,L=1) 
lose_data = lose_data.drop("label",axis=1)
lose_data = lose_data.sample(n=len(lose_data))# random sort



# Data separate and making dataset
win_data_separate = []
draw_data_separate = []
lose_data_separate = []
all_data_separate = []
wdl_separate = []
for i in range(split_num):
    win_data_separate.append(win_data[i::split_num])
    draw_data_separate.append(draw_data[i::split_num])
    lose_data_separate.append(lose_data[i::split_num])
    all_data_separate.append(all_data[i::split_num])
    # merge for stratified sampling
    wdl_separate.append(pd.concat([win_data_separate[i],draw_data_separate[i],lose_data_separate[i]]))
    # assign a number to make final input data
    wdl_separate[i] = wdl_separate[i].assign(separate_num=i)
    all_data_separate[i] = all_data_separate[i].assign(separate_num=i)

# integrate everything once
wdl_separate_merge = wdl_separate[0]
all_data_separate_merge = all_data_separate[0]
for i in range(1,split_num):
    wdl_separate_merge = wdl_separate_merge.append(wdl_separate[i])
    all_data_separate_merge = all_data_separate_merge.append(all_data_separate[i])

    
    
# make final input data
x_train = []
y_train = []
x_val = []
y_val = []
xAll_train = []
yAll_train = []
xAll_val = []
yAll_val = []
for i in range(split_num):
    x_train.append(wdl_separate_merge[wdl_separate_merge['separate_num'] != i])
    x_val.append(wdl_separate_merge[wdl_separate_merge['separate_num'] == i])
    xAll_train.append(all_data_separate_merge[all_data_separate_merge['separate_num'] != i])
    xAll_val.append(all_data_separate_merge[all_data_separate_merge['separate_num'] == i])
for i in range(split_num):
    # delete separate_num
    x_train[i] = x_train[i].drop(['separate_num'],axis=1)
    x_val[i] = x_val[i].drop(['separate_num'],axis=1)
    xAll_train[i] = xAll_train[i].drop(['separate_num'],axis=1)
    xAll_val[i] = xAll_val[i].drop(['separate_num'],axis=1)
    # random sort
    x_train[i] = x_train[i].sample(n=len(x_train[i]))
    x_val[i] = x_val[i].sample(n=len(x_val[i]))
    xAll_train[i] = xAll_train[i].sample(n=len(xAll_train[i]))
    xAll_val[i] = xAll_val[i].sample(n=len(xAll_val[i]))
    
    # separate x and y
    y_train.append(x_train[i][["W","D","L"]])
    y_val.append(x_val[i][["W","D","L"]])
    yAll_train.append(xAll_train[i][["W","D","L"]])
    yAll_val.append(xAll_val[i][["W","D","L"]])
    x_train[i] = x_train[i].drop(["W","D","L"],axis=1)
    x_val[i] = x_val[i].drop(["W","D","L"],axis=1)
    xAll_train[i] = xAll_train[i].drop(["W","D","L"],axis=1)
    xAll_val[i] = xAll_val[i].drop(["W","D","L"],axis=1)
    
    #translate pandas to numpy
    x_train[i] = x_train[i].values.astype('float32') 
    x_val[i] = x_val[i].values.astype('float32') 
    y_train[i] = y_train[i].values
    y_val[i] = y_val[i].values


28186


In [None]:
import chainer
import chainer.links as L
import chainer.functions as F
from chainer import Sequential
import matplotlib.pyplot as plt

learning_rate_list = [0.01]
batchsize_list = [64]

# craete model
# net としてインスタンス化
n_input = 8
n_hidden = 5
n_output = 3

n_epoch = 60

for learning_rate in learning_rate_list:
    for batchsize in batchsize_list:
        # create 10 model for 10-fold-crossvalidation
        optimizer = []
        net = []

        for i in range(split_num):
            net.append(Sequential(
                L.Linear(n_input, n_hidden), F.relu,
                L.Linear(n_hidden, n_hidden), F.relu,
                L.Linear(n_hidden, n_hidden), F.relu,
                L.Linear(n_hidden, n_output), F.softmax)
            )
            optimizer.append(chainer.optimizers.Adam(alpha=learning_rate))
            optimizer[i].setup(net[i])

        # ログの保存用
        results_train_data = []
        results_valid_data = []


        for data_num in range(len(x_train)):
            # ログの保存用
            results_train = {
                'loss': [],
                'accuracy': []
            }
            results_valid = {
                'loss': [],
                'accuracy': []
            }
            iteration = 0
            for epoch in range(n_epoch):
                # 各バッチ毎の目的関数の出力と分類精度の保存用
                loss_list = []
                #accuracy_list = []

                for i in range(0, len(x_train[data_num]), batchsize):
                    # バッチを準備
                    x_train_batch = x_train[data_num][i:i+batchsize,:]
                    y_train_batch = y_train[data_num][i:i+batchsize,:]

                    # 予測値を出力
                    y_train_batch_pred = net[data_num](x_train_batch)
                    # 目的関数を適用し、分類精度を計算
                    loss_train_batch = RPS(y_train_batch, y_train_batch_pred)

                    loss_list.append(loss_train_batch.array)

                    # 勾配のリセットと勾配の計算
                    net[data_num].cleargrads()
                    loss_train_batch.backward()

                    # パラメータの更新
                    optimizer[data_num].update()

                    # カウントアップ
                    iteration += 1
    
                # 訓練データに対する目的関数の出力と分類精度を集計
                loss_train = np.mean(loss_list)

                # 1エポック終えたら、検証データで評価
                # 検証データで予測値を出力
                with chainer.using_config('train', False), chainer.using_config('enable_backprop', False):
                    y_val_pred = net[data_num](x_val[data_num])

                # 目的関数を適用し、分類精度を計算
                loss_val = RPS(y_val_pred, y_val[data_num])

                # 結果の表示
                print('epoch: {}, iteration: {}, loss (train): {:.4f}, loss (valid): {:.4f}'.format(
                    epoch, iteration, loss_train, loss_val.array))

                # ログを保存
                results_train['loss'] .append(loss_train)
                #results_train['accuracy'] .append(accuracy_train)
                results_valid['loss'].append(loss_val.array)
                #results_valid['accuracy'].append(accuracy_val.array)

            results_train_data.append(results_train)
            results_valid_data.append(results_valid)

        results_train_data_all = []
        results_valid_data_all = []
        results_train_data_all = np.zeros(n_epoch)
        results_valid_data_all = np.zeros(n_epoch)
        # 目的関数の出力 (loss)
        for i in range(split_num):   
            results_train_data_all += results_train_data[i]['loss']
            results_valid_data_all += results_valid_data[i]['loss']

        print('learning_rate: {}, batch_size: {}'.format(learning_rate,batchsize))
        plt.plot(results_train_data_all / split_num, label='train')  # label で凡例の設定
        plt.plot(results_valid_data_all / split_num, label='valid')  # label で凡例の設定
        plt.legend()  # 凡例の表示
        plt.figure()

        print('train: {}'.format(results_train_data_all / split_num))
        print('valid: {}'.format(results_valid_data_all / split_num))


epoch: 0, iteration: 397, loss (train): 0.2134, loss (valid): 0.2079
epoch: 1, iteration: 794, loss (train): 0.2100, loss (valid): 0.2075
epoch: 2, iteration: 1191, loss (train): 0.2096, loss (valid): 0.2075
epoch: 3, iteration: 1588, loss (train): 0.2095, loss (valid): 0.2075
epoch: 4, iteration: 1985, loss (train): 0.2093, loss (valid): 0.2077
epoch: 5, iteration: 2382, loss (train): 0.2093, loss (valid): 0.2075
epoch: 6, iteration: 2779, loss (train): 0.2093, loss (valid): 0.2075
epoch: 7, iteration: 3176, loss (train): 0.2092, loss (valid): 0.2074
epoch: 8, iteration: 3573, loss (train): 0.2092, loss (valid): 0.2074
epoch: 9, iteration: 3970, loss (train): 0.2092, loss (valid): 0.2074
epoch: 10, iteration: 4367, loss (train): 0.2091, loss (valid): 0.2074
epoch: 11, iteration: 4764, loss (train): 0.2091, loss (valid): 0.2074
epoch: 12, iteration: 5161, loss (train): 0.2091, loss (valid): 0.2074
epoch: 13, iteration: 5558, loss (train): 0.2091, loss (valid): 0.2073
epoch: 14, iterati

epoch: 55, iteration: 22232, loss (train): 0.2089, loss (valid): 0.2070
epoch: 56, iteration: 22629, loss (train): 0.2089, loss (valid): 0.2070
epoch: 57, iteration: 23026, loss (train): 0.2089, loss (valid): 0.2069
epoch: 58, iteration: 23423, loss (train): 0.2087, loss (valid): 0.2069
epoch: 59, iteration: 23820, loss (train): 0.2090, loss (valid): 0.2066
epoch: 0, iteration: 397, loss (train): 0.2123, loss (valid): 0.2091
epoch: 1, iteration: 794, loss (train): 0.2104, loss (valid): 0.2085
epoch: 2, iteration: 1191, loss (train): 0.2101, loss (valid): 0.2084
epoch: 3, iteration: 1588, loss (train): 0.2099, loss (valid): 0.2084
epoch: 4, iteration: 1985, loss (train): 0.2098, loss (valid): 0.2083
epoch: 5, iteration: 2382, loss (train): 0.2097, loss (valid): 0.2085
epoch: 6, iteration: 2779, loss (train): 0.2096, loss (valid): 0.2084
epoch: 7, iteration: 3176, loss (train): 0.2096, loss (valid): 0.2083
epoch: 8, iteration: 3573, loss (train): 0.2095, loss (valid): 0.2082
epoch: 9, it

epoch: 50, iteration: 20247, loss (train): 0.2085, loss (valid): 0.2104
epoch: 51, iteration: 20644, loss (train): 0.2084, loss (valid): 0.2102
epoch: 52, iteration: 21041, loss (train): 0.2084, loss (valid): 0.2103
epoch: 53, iteration: 21438, loss (train): 0.2084, loss (valid): 0.2106
epoch: 54, iteration: 21835, loss (train): 0.2084, loss (valid): 0.2104
epoch: 55, iteration: 22232, loss (train): 0.2084, loss (valid): 0.2106
epoch: 56, iteration: 22629, loss (train): 0.2083, loss (valid): 0.2102
epoch: 57, iteration: 23026, loss (train): 0.2085, loss (valid): 0.2106
epoch: 58, iteration: 23423, loss (train): 0.2084, loss (valid): 0.2103
epoch: 59, iteration: 23820, loss (train): 0.2084, loss (valid): 0.2105
epoch: 0, iteration: 397, loss (train): 0.2134, loss (valid): 0.2103
epoch: 1, iteration: 794, loss (train): 0.2103, loss (valid): 0.2097
epoch: 2, iteration: 1191, loss (train): 0.2097, loss (valid): 0.2098
epoch: 3, iteration: 1588, loss (train): 0.2094, loss (valid): 0.2097
ep

epoch: 45, iteration: 18262, loss (train): 0.2090, loss (valid): 0.2130
epoch: 46, iteration: 18659, loss (train): 0.2092, loss (valid): 0.2130
epoch: 47, iteration: 19056, loss (train): 0.2092, loss (valid): 0.2129
epoch: 48, iteration: 19453, loss (train): 0.2091, loss (valid): 0.2128
epoch: 49, iteration: 19850, loss (train): 0.2092, loss (valid): 0.2138
epoch: 50, iteration: 20247, loss (train): 0.2092, loss (valid): 0.2132
epoch: 51, iteration: 20644, loss (train): 0.2090, loss (valid): 0.2127
epoch: 52, iteration: 21041, loss (train): 0.2092, loss (valid): 0.2139
epoch: 53, iteration: 21438, loss (train): 0.2092, loss (valid): 0.2131
epoch: 54, iteration: 21835, loss (train): 0.2091, loss (valid): 0.2131
epoch: 55, iteration: 22232, loss (train): 0.2091, loss (valid): 0.2127
epoch: 56, iteration: 22629, loss (train): 0.2090, loss (valid): 0.2130
epoch: 57, iteration: 23026, loss (train): 0.2091, loss (valid): 0.2126
epoch: 58, iteration: 23423, loss (train): 0.2090, loss (valid):