In [1]:
import torch
import numpy as np
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt

from CharToIndex import CharToIndex
from MyDatasets import Cross_Validation
from MyCustomLayer import WeightedTenHotEncodeLayer

import time
import math

In [2]:
# Data locations.  Forward slashes are used because Python's file APIs accept
# them on Windows as well as on Linux/macOS; the previous backslash-separated
# literals only resolved on Windows.
chars_file_path = "data/tegaki_katsuji/all_chars_3812.npy"
data_file_path = "data/tegaki_katsuji/tegaki_distance.npz"

# Character <-> index table built from the character list file.
tokens = CharToIndex(chars_file_path)

# NOTE(security): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code.  Only load archives that come from a trusted source.
data = np.load(data_file_path, allow_pickle=True)

# Hyper-parameters shared by the cells below.
EMBEDDING_DIM = 10
HIDDEN_SIZE = 128
BATCH_SIZE = 64
VOCAB_SIZE = len(tokens)  # number of entries reported by the character table

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [3]:
class _Distanced_TenHot_Dataset_sest5(torch.utils.data.Dataset):
    def __init__(self,data,chars_file_path,device=torch.device('cpu')):
        self.data = data
        self.val_idx = []
        self.ans_idx = []
        self.char2index = CharToIndex(chars_file_path)
        self.length = len(data['answer'])-4
        self.device = device

        values = data['value']
        for chars in values:
            indexes = []
            for idx in map(self.char2index.get_index,chars):
                indexes.append(idx)
            self.val_idx.append(indexes)

        answers = data['answer']
        for idx in map(self.char2index.get_index,answers):
            self.ans_idx.append(idx)


        #距離値付きのten_hot_encodeにvalueを変換
        distances = data['distance']
        self.distanced_ten_hot_encoded_value = np.zeros(shape=(values.shape[0],VOCAB_SIZE),dtype=np.float32)
        for row,indicies in enumerate(self.val_idx):
            for id_distance,id_value in enumerate(indicies):
                self.distanced_ten_hot_encoded_value[row][id_value]=distances[row][id_distance]


    def __len__(self):
        return self.length


    def __getitem__(self,idx):
        assert idx < self.length
        out_val = self.distanced_ten_hot_encoded_value[idx:idx+5]
        out_ans = self.ans_idx[idx+4]
        return torch.tensor(out_val).to(self.device),torch.tensor(out_ans).to(self.device)

In [4]:
# Module-level replay of the preprocessing done in the dataset constructor, so
# that the intermediate lists can be inspected by the following cells.
char2index = CharToIndex(chars_file_path)
length = len(data['answer'])-8  # NOTE(review): not read by the cells shown here

# Candidate characters of every row, mapped to character-table indices.
values = data['value']
val_idx = [[char2index.get_index(ch) for ch in chars] for chars in values]

# Ground-truth character of every row, mapped the same way.
answers = data['answer']
ans_idx = [char2index.get_index(ch) for ch in answers]

# One distance value per candidate, aligned with `values`.
distances = data['distance']

[31mERROR: No such char --> [0mb'\xe3\x82\x91'
[31mERROR: No such char --> [0mb'\xe7\xb8\x8a'


In [5]:
# One row per entry of `values`, one column per vocabulary entry.  A candidate's
# column receives its distance value; every other column stays 0.
distanced_ten_hot_encoded_value = np.zeros((values.shape[0], VOCAB_SIZE), dtype=np.float32)

for sample_no, candidate_ids in enumerate(val_idx):
    for rank, vocab_id in enumerate(candidate_ids):
        # Diagnostic: report any row that has a candidate at position 10
        # (i.e. an eleventh candidate).
        if rank == 10:
            print(vocab_id)
        distanced_ten_hot_encoded_value[sample_no, vocab_id] = distances[sample_no][rank]

    

In [6]:
import random

# Spot check of the encoding: for a run of consecutive rows, print the
# highest-valued candidate characters followed by the ground-truth character.
SAMPLES_TO_SHOW = 10
TOP_K = 10

# The start row has to be drawn from the encoded rows (not from the vocabulary
# size) and must leave room for the whole run, otherwise the loop can index
# past the end of the array.
n_rows = distanced_ten_hot_encoded_value.shape[0]
start_row = random.randrange(max(1, n_rows - SAMPLES_TO_SHOW + 1))

for item in range(start_row, min(start_row + SAMPLES_TO_SHOW, n_rows)):
    # argsort of the negated row yields column indices from largest to smallest value.
    hotted_indices = np.argsort(-distanced_ten_hot_encoded_value[item])[:TOP_K]
    for idx in hotted_indices:
        print(tokens.get_decoded_char(idx), end='')
    print('\t',tokens.get_decoded_char(ans_idx[item]))

広法伝宏玄庄去右芸店	 広
゛４いぃＨリＭＸＹ糺	 い
分今令劣冷合台会Ｇ什	 分
野時貯第路努好踏畔男	 野
へヘぺべベ八人入久バ	 へ
適通過週遍遇道遮逼糸	 適
用風円肉周丹舟月痢何	 用
さ土をざェエき士ょよ	 さ
れ札礼花丸托ね孔社九	 れ
てマ々で２スセユュえ	 て


In [7]:

def timeSince(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<m>m <s>s'``.

    Args:
        since: start timestamp on the ``time.time()`` clock.

    Returns:
        A string with whole minutes and the remaining seconds; the fractional
        part of the seconds is dropped by the ``%d`` conversion.
    """
    elapsed = time.time() - since
    whole_minutes = math.floor(elapsed / 60)
    leftover_seconds = elapsed - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def show_ans_pred(answers,predictions):
    """Print every answer/prediction pair on one line, then a newline.

    For each pair the decoded answer character, the decoded predicted character
    (padded to width 2) and a check/cross mark are written, separated from the
    next pair by a space.  Nothing is returned.

    Args:
        answers: iterable of elements exposing ``.item()`` (ground-truth indices).
        predictions: iterable of elements exposing ``.item()`` (predicted indices).
    """
    for truth, guess in zip(answers, predictions):
        truth_id = truth.item()
        guess_id = guess.item()
        if truth_id == guess_id:
            mark = '✓'
        else:
            mark = '✗'
        truth_char = tokens.get_decoded_char(truth_id)
        guess_char = tokens.get_decoded_char(guess_id)
        print(f'{truth_char}{guess_char:2} {mark}',end=' ')
    print()



def train(model,train_dataloader,learning_rate=0.001,optimizer=None):
    """Run one training pass over ``train_dataloader``, updating ``model`` in place.

    Args:
        model: module mapping a batch ``x`` to scores with the classes along
            dimension 1.
        train_dataloader: iterable of ``(x, y)`` batches, ``y`` holding class
            indices.
        learning_rate: step size of the Adam optimizer that is created when
            ``optimizer`` is not given.
        optimizer: optional optimizer to use instead.  Passing the same
            instance on every call keeps its internal state (e.g. Adam's
            moment estimates) across epochs; with the default ``None`` a fresh
            Adam optimizer is built on each call.

    Returns:
        Tuple ``(loss, accuracy)``: the mean of the per-batch losses and the
        fraction of samples predicted correctly.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    criterion = nn.CrossEntropyLoss()
    if optimizer is None:
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    running_loss = 0.0
    n_correct = 0
    n_samples = 0
    n_batches = 0

    model.train()
    for x, y in train_dataloader:
        optimizer.zero_grad()      # reset gradients
        output = model(x)
        loss = criterion(output, y)  # compute loss
        # A new graph is built on every forward pass, so there is no need to
        # retain it after the backward pass.
        loss.backward()            # back-propagation
        optimizer.step()           # weight update

        running_loss += loss.item()
        prediction = output.detach().argmax(dim=1)  # predicted class per sample
        # Count samples instead of assuming every batch has the size of the
        # first one; this stays correct when the last batch is smaller.
        n_correct += prediction.eq(y).sum().item()
        n_samples += y.size(0)
        n_batches += 1

    if n_batches == 0 or n_samples == 0:
        raise ValueError('train_dataloader yielded no samples')

    loss_result = running_loss / n_batches
    accuracy_result = n_correct / n_samples

    return loss_result,accuracy_result


def eval(model,valid_dataloader,is_show_ans_pred=False):
    """Return the fraction of samples in ``valid_dataloader`` predicted correctly.

    NOTE: this name shadows the built-in ``eval``; it is kept because other
    cells call it under this name.

    Args:
        model: module mapping a batch ``x`` to scores with the classes along
            dimension 1.
        valid_dataloader: iterable of ``(x, y)`` batches, ``y`` holding class
            indices.
        is_show_ans_pred: when true, print the answer/prediction pairs of every
            batch via ``show_ans_pred``.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    n_correct = 0
    n_samples = 0

    model.eval()
    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
        for x, y in valid_dataloader:
            output = model(x)
            prediction = output.argmax(dim=1)  # predicted class per sample
            # Count samples rather than assuming a fixed batch size.
            n_correct += prediction.eq(y).sum().item()
            n_samples += y.size(0)
            if is_show_ans_pred:
                # show_ans_pred prints by itself and returns nothing.
                show_ans_pred(y, prediction)

    if n_samples == 0:
        raise ValueError('valid_dataloader yielded no samples')

    return n_correct / n_samples


class Proofreader(nn.Module):
    """Bidirectional RNN that classifies a sequence into ``output_size`` classes.

    The input sequence is run through a bidirectional ``nn.RNN``; the RNN output
    at the last time step goes through dropout and a linear layer that produces
    one score per class.
    """

    def __init__(self, input_size, hidden_dim, output_size,n_layers):
        """Build the network and move it to the GPU when one is available.

        Args:
            input_size: number of features per time step of the input.
            hidden_dim: hidden size of the RNN (per direction).
            output_size: number of output classes.
            n_layers: number of stacked RNN layers.
        """
        super(Proofreader, self).__init__()

        self.input_size = input_size
        self.output_size = output_size
        self.hidden_dim = hidden_dim
        self.n_layers  = n_layers
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # The RNN consumes `input_size` features (previously `output_size` was
        # used, which only worked when both were equal) and is built with
        # `n_layers` layers so that it matches the shape made by init_hidden.
        self.rnn = nn.RNN(input_size, self.hidden_dim, num_layers=n_layers,
                          batch_first=True, bidirectional=True)
        # Forward and backward outputs are concatenated, hence hidden_dim*2.
        self.fc = nn.Linear(self.hidden_dim*2, output_size)
        self.dropout = torch.nn.Dropout(p=0.5)
        self.to(self.device)

    def init_hidden(self, batch_size):
        """Return a zero initial hidden state of shape (n_layers*2, batch_size, hidden_dim)."""
        hidden = torch.zeros(self.n_layers*2, batch_size, self.hidden_dim)
        return hidden

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_size) to scores (batch, output_size)."""
        batch_size = x.size(0)
        # Create the initial state on the input's device so the module keeps
        # working after being moved with .to(...).
        hidden = self.init_hidden(batch_size).to(x.device)

        out, _ = self.rnn(x.float(), hidden)
        # Keep only the last time step.
        # NOTE(review): at the last step the backward direction has processed
        # a single element only - confirm this is the intended summary.
        out = out[:,-1,:]
        out = self.dropout(out)
        out = self.fc(out)

        return out



def get_correct_char(model,valid_dataloader,correct_char):
    """Evaluate ``model`` and count correct predictions per class.

    Args:
        model: module mapping a batch ``x`` to scores with the classes along
            dimension 1.
        valid_dataloader: iterable of ``(x, y)`` batches, ``y`` holding class
            indices.
        correct_char: indexable counter (e.g. an integer tensor) with one slot
            per class.  It is updated in place: the slot of every correctly
            predicted sample's class is incremented by one.

    Returns:
        Tuple ``(accuracy, correct_char)``: the fraction of samples predicted
        correctly and the same counter object that was passed in.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    n_correct = 0
    n_samples = 0

    model.eval()
    # Evaluation only: no gradient graph is needed.
    with torch.no_grad():
        for x, y in valid_dataloader:
            prediction = model(x).argmax(dim=1)  # predicted class per sample
            hit_mask = prediction.eq(y)
            # Count samples rather than assuming a fixed batch size.
            n_correct += hit_mask.sum().item()
            n_samples += y.size(0)

            # Plain Python ints are used as indices so the counter may live on
            # a different device than the labels.
            for label in y[hit_mask].tolist():
                correct_char[label] += 1

    if n_samples == 0:
        raise ValueError('valid_dataloader yielded no samples')

    return n_correct / n_samples,correct_char



In [8]:
from DistancedDatasets import Distanced_TenHot_Dataset_sest5 as MyDataset
tegaki_dataset = MyDataset(data,chars_file_path,device=device)

final_accuracies = []
final_losses = []
# Per-character count of correct validation predictions, accumulated over folds.
correct_char=torch.zeros(len(tokens),dtype=torch.int)

cross_validation = Cross_Validation(tegaki_dataset)
k_num = cross_validation.k_num # default is 10
k_num = 1 # override: only the first fold is run


## Training
for fold in range(k_num):
    train_dataset,valid_dataset = cross_validation.get_datasets(k_idx=fold)

    print(f'Cross Validation: k=[{fold+1}/{k_num}]')

    train_dataloader=DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True,drop_last=True) # only the training data is shuffled
    valid_dataloader=DataLoader(valid_dataset,batch_size=BATCH_SIZE,shuffle=False,drop_last=True)
    # A fresh model is trained for every fold.
    model = Proofreader(VOCAB_SIZE, hidden_dim=HIDDEN_SIZE, output_size=VOCAB_SIZE, n_layers=1)
    # model.load_state_dict(torch.load("data/tegaki_katsuji/pre_trained_model.pth"))

    epochs = 1
    acc_record=[]
    loss_record=[]
    start = time.time() # start of the current timing interval

    for epoch in range(1,epochs+1):
        # Progress display: a ten-slot bar that fills up within every block of
        # ten epochs.  A separate name is used so the fold index is not shadowed.
        step = (epoch-1)%10
        pro_bar = ('=' * step) + (' ' * (10 - step))
        print('\r[{0}] {1}%'.format(pro_bar, step / 10 * 100.), end='')


        loss,acc = train(model,train_dataloader,learning_rate=0.01)

        valid_acc = eval(model,valid_dataloader)
        loss_record.append(loss)
        acc_record.append(valid_acc)


        if epoch%10==0:
            print(f'\repoch:[{epoch:3}/{epochs}] | {timeSince(start)} - loss: {loss:.7},  accuracy: {acc:.7},  valid_acc: {valid_acc:.7}')
            start = time.time() # restart the timing interval

    # Accumulate the per-character hit counts of this fold; the accuracy
    # returned alongside is not needed here.
    final_valid_acc,correct_char=get_correct_char(model,valid_dataloader,correct_char)


    # The leading '\r' returns to the start of the line so the summary
    # overwrites the progress bar instead of being appended to it.
    print(f'\rfinal_loss: {loss_record[-1]:.7},   final_accuracy:{acc_record[-1]:.7}\n\n')

    final_accuracies.append(acc_record[-1])
    final_losses.append(loss_record[-1])

print('=================================================')
print(f'accuracies: {final_accuracies}')
print(f'losses: {final_losses}')

print(f'accu average: {np.mean(final_accuracies)}')
print(f'loss average: {np.mean(final_losses)}')

[31mERROR: No such char --> [0mb'\xe3\x82\x91'
[31mERROR: No such char --> [0mb'\xe7\xb8\x8a'
Cross Validation: k=[1/1]
[          ] 0.0%final_loss: 2.934099,   final_accuracy:0.671875


accuracies: [0.671875]
losses: [2.93409939456325]
accu average: 0.671875
loss average: 2.93409939456325
