# 80. ID番号への変換

In [23]:
import numpy as np
import re
from functools import reduce
from nltk.tokenize import word_tokenize
from collections import defaultdict

In [4]:
def load_data(dir_name, file_name):
    with open(f'{dir_name}{file_name}') as f:
        X = list()
        Y = list()
        for line in f:
            line = line.strip()
            splited_line = line.split('\t')
            X.append(splited_line[0])
            Y.append(splited_line[1])
        return np.asarray(X), np.asarray(Y)

def save_file_npy(dir_name, file_name, x):
    np.save(f'{dir_name}{file_name}', x)
        
def load_file_npy(dir_name, file_name):
    return np.load(f'{dir_name}{file_name}')

def chr2num(y):
    converter = {'b':0, 't':1, 'e':2, 'm':3}
    return np.asarray([converter[article_type] for article_type in y])

In [221]:
xtrain, ytrain = load_data('data/', 'train.txt')
xvalid, yvalid = load_data('data/', 'valid.txt')
xtest, ytest = load_data('data/', 'test.txt')

In [242]:
class PreprocessTools:
    def __init__(self):
        self.word_count = defaultdict(int)       
        self.word_transformer = defaultdict(int)
        self.vocab_size = -1
        
    def tokenize(self, data):
        return [[word for word in word_tokenize(txt)] for txt in data]

    def make_word_transformar(self, train_data:list):
        for data in train_data:
            for word in data:
                self.word_count[word] += 1
        sorted_word_count = sorted(self.word_count.items(), key=lambda x: x[1], reverse=True)
        for idx, (word, count) in enumerate(sorted_word_count):
            if count < 2:
                break
            else:
                self.word_transformer[word] = idx + 1
        self.vocab_size = len(self.word_transformer) + 1

    def txt2ids(self, txt_list:list):
        txt_ids = list()
        for txt in txt_list:
            ids = list()
            for word in txt:
                ids.append(self.word_transformer[word])
            txt_ids.append(ids)
        return txt_ids


    def ids2vec(self, txt_ids:list):
        txt_vec = list()
        identity = np.identity(self.vocab_size)
        for ids in txt_ids:
            txt_vec.append(identity[ids])
        return txt_vec

In [243]:
preprocess = PreprocessTools()
xtrain, ytrain = load_data('data/', 'train.txt')
xvalid, yvalid = load_data('data/', 'valid.txt')
xtest, ytest = load_data('data/', 'test.txt')
xtrain = preprocess.tokenize(xtrain)
xvalid = preprocess.tokenize(xvalid)
xtest = preprocess.tokenize(xtest)

In [244]:
preprocess.make_word_transformar(xtrain)

In [245]:
xtrain_ids = preprocess.txt2ids(xtrain)
xvalid_ids = preprocess.txt2ids(xvalid)
xtest_ids = preprocess.txt2ids(xtest)

In [246]:
for word, ids in zip(xtrain[:10], xtrain_ids[:10]):
    print(word, ids)

['Kindred', 'Healthcare', 'to', 'buy', 'Gentiva', 'for', 'about', '$', '573', 'mln'] [5065, 3395, 2, 181, 3396, 13, 164, 19, 0, 220]
['US', 'to', 'boost', 'ground', ',', 'naval', 'forces', 'in', 'NATO', 'countries'] [15, 2, 586, 3397, 1, 0, 4067, 6, 5066, 5067]
['Robert', 'Pattinson', '-', 'Robert', 'Pattinson', 'Brushes', 'Off', 'Kristen', 'Stewart', "'s", 'Cheating', '...'] [237, 945, 11, 237, 945, 0, 385, 669, 1259, 4, 3398, 3]
['Piers', 'Morgan', 'Delivers', 'One', 'Final', 'Blow', 'To', 'Gun', 'Violence', 'In', 'Last', 'Show'] [5068, 399, 6690, 185, 1074, 5069, 16, 3399, 2225, 20, 785, 161]
['Here', 'We', 'Go', ':', "'Star", 'Wars', 'Episode', 'VII', "'", 'Kicks', 'Off', 'Filming', 'at', 'Pinewood'] [400, 196, 639, 7, 549, 210, 295, 587, 5, 5070, 385, 1371, 22, 0]
['Amazon', 'gets', 'in', 'the', 'game', ':', 'Retailer', 'beats', 'Google', 'to', 'buy', 'hit', 'console', 'broadcasting', '...'] [169, 330, 6, 17, 1619, 7, 0, 609, 82, 2, 181, 245, 5071, 0, 3]
['FOREX-Euro', 'retreats',

# 81. RNNによる予測

In [247]:
# one-hotにはしない
#xtrain_vec = preprocess.ids2vec(xtrain_ids) 
#xvalid_vec = preprocess.ids2vec(xvalid_ids)
#xtest_vec = preprocess.ids2vec(xtest_ids)

In [None]:
y_train = chr2num(y_train)
y_valid = chr2num(y_valid)
y_test = chr2num(y_test)

In [248]:
#save_file_json('work/', 'train_x', xtrain_vec)
#save_file_json('work/', 'train_y', ytrain)
#save_file_json('work/', 'valid_x', xvalid_vec)
#save_file_json('work/', 'valid_y', yvalid)
#save_file_json('work/', 'test_x', xtest_vec)
#save_file_json('work/', 'test_y', ytest)

In [249]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Inputs: input, h_0

- input of shape (seq_len, batch, input_size): tensor containing the features of the input sequence. The input can also be a packed variable length sequence. See torch.nn.utils.rnn.pack_padded_sequence() or torch.nn.utils.rnn.pack_sequence() for details.

- h_0 of shape (num_layers * num_directions, batch, hidden_size): tensor containing the initial hidden state for each element in the batch. Defaults to zero if not provided. If the RNN is bidirectional, num_directions should be 2, else it should be 1.

## Outputs: output, h_n

- output of shape (seq_len, batch, num_directions * hidden_size): tensor containing the output features (h_t) from the last layer of the RNN, for each t. If a torch.nn.utils.rnn.PackedSequence has been given as the input, the output will also be a packed sequence.

    For the unpacked case, the directions can be separated using output.view(seq_len, batch, num_directions, hidden_size), with forward and backward being direction 0 and 1 respectively. Similarly, the directions can be separated in the packed case.

- h_n of shape (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for t = seq_len.

    Like output, the layers can be separated using h_n.view(num_layers, num_directions, batch, hidden_size).

In [809]:
vocab_size = preprocess.vocab_size
dw, dh = 300, 50
torch.manual_seed(1234)
embed = nn.Embedding(vocab_size, dw, padding_idx=0) #idx 0 は 0埋め
rnn = nn.RNN(dw, dh, num_layers=2, bidirectional=True, batch_first=True)
linear = nn.Linear(50, 4, bias=True)
softmax = nn.LogSoftmax(dim=1) # dim=-1 or 1
input = torch.LongTensor([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]])

In [810]:
output, hn = rnn(embed(input))
print(hn.shape)
x = linear(hn[-1])
print(x)
print(softmax(x))

torch.Size([4, 2, 50])
tensor([[ 0.3656,  0.1161, -0.2724,  0.4356],
        [ 0.1910,  0.0066, -0.2242,  0.0248]], grad_fn=<AddmmBackward>)
tensor([[-1.2179, -1.4674, -1.8559, -1.1479],
        [-1.2056, -1.3900, -1.6208, -1.3718]], grad_fn=<LogSoftmaxBackward>)


# 82. 確率的勾配降下法による学習

In [994]:
class MyRNN(torch.nn.Module):
    def __init__(self, vocab_size, dw=300, dh=50, L=4, num_layers=1, bidirectional=False, rnn_bias=True, word_vec=None):
        super(MyRNN, self).__init__()
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.dw, self.dh = dw, dh
        if word_vec:
            self.embed = nn.from_pretrained(word_vec)
        else:
            self.embed = nn.Embedding(vocab_size, dw, padding_idx=0)
            #self.embed = nn.Embedding.from_pretrained(torch.nn.init.xavier_uniform_(nn.Parameter(torch.FloatTensor(vocab_size, dw))), padding_idx=0)
            #self.embed.weight[0] = torch.zeros(dw)
        self.rnn = nn.RNN(dw, dh, bias=rnn_bias, num_layers=num_layers, bidirectional=bidirectional, batch_first=True, nonlinearity='tanh')
        if bidirectional:
            self.linear = nn.Linear(2 * dh, L, bias=True)
        else:
            self.linear = nn.Linear(dh, L, bias=True)
        self.softmax = nn.LogSoftmax(dim=1) # dim=-1 or 1
        
    '''
    x: ids (not one hot vector)
    '''
    def forward(self, x):
        x = self.embed(x)
        _, hidden = self.rnn(x)
        hidden = hidden.view(self.num_layers, 2 if self.bidirectional else 1, -1, self.dh)
        last_hidden = hidden[-1]
        if self.bidirectional:
            x = self.linear(torch.cat([last_hidden[0], last_hidden[1]], dim=1))
        else:
            x = self.linear(last_hidden[0])
        x = self.softmax(x)
        return x 

In [995]:
from torch.utils.tensorboard import SummaryWriter
import tqdm
from torch.utils.data import DataLoader

In [996]:
class MyDataSets(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = [torch.LongTensor(data) for data in x]
        self.y = [torch.LongTensor([data]) for data in y]

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [997]:
def collate_fn(batch):
    x = [data[0] for data in batch]
    x = nn.utils.rnn.pad_sequence(x, batch_first=True)
    y = torch.LongTensor([data[1] for data in batch])
    return x, y

In [998]:
def execution(data_x, data_y, op, criterion, model, batch_size=1, is_train=True, use_gpu=False):
    if is_train: model.train()
    else: model.eval()
    ndata = len(data_x)
    dataset = MyDataSets(data_x, data_y)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    sum_loss, acc_score = 0, 0
    for batch_x, batch_y in data_loader:
        op.zero_grad()
        out = model(batch_x)
        loss = criterion(out, batch_y)
        if is_train:
            loss.backward()
            op.step()
        sum_loss += loss.data.item() * len(batch_x)
        pred = torch.argmax(out, dim=1)
        acc_score += np.sum((pred == batch_y).cpu().detach().numpy())
    return sum_loss / ndata, acc_score / ndata * 100

In [999]:
preprocess = PreprocessTools()
x_train, y_train = load_data('data/', 'train.txt')
x_valid, y_valid = load_data('data/', 'valid.txt')
x_test, y_test = load_data('data/', 'test.txt')
x_train = preprocess.tokenize(x_train)
x_valid = preprocess.tokenize(x_valid)
x_test = preprocess.tokenize(x_test)
preprocess.make_word_transformar(x_train)
x_train = preprocess.txt2ids(x_train)
x_valid = preprocess.txt2ids(x_valid)
x_test = preprocess.txt2ids(x_test)
y_train = chr2num(y_train)
y_valid = chr2num(y_valid)
y_test = chr2num(y_test)

In [1002]:
vocab_size = preprocess.vocab_size
torch.manual_seed(1234)
model = MyRNN(vocab_size, dw=300, dh=50, L=4, num_layers=1, bidirectional=False)
ntrain = len(x_train)
nepoch = 10 
op = optim.Adam(model.parameters(), lr=0.001)
#criterion = nn.NLLLoss(ignore_index=0) ignore_indexしちゃダメ...
criterion = nn.NLLLoss() 

In [1003]:
print(model)

MyRNN(
  (embed): Embedding(9866, 300, padding_idx=0)
  (rnn): RNN(300, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=4, bias=True)
  (softmax): LogSoftmax()
)


In [1004]:
train_writer = SummaryWriter(log_dir='./work/logs/train')
valid_writer = SummaryWriter(log_dir='./work/logs/valid')
logger = list()
for epoch in tqdm.notebook.tqdm(range(nepoch)):
    train_loss, train_acc = execution(x_train, y_train, op, criterion, model, batch_size=1)
    train_writer.add_scalar("loss", train_loss, epoch) 
    train_writer.add_scalar("accuracy", train_acc, epoch)
    with torch.no_grad():
        valid_loss, valid_acc = execution(x_valid, y_valid, op, criterion, model, batch_size=1, is_train=False)
        valid_writer.add_scalar("loss", valid_loss, epoch)
        valid_writer.add_scalar("accuracy", valid_acc, epoch)
    logger.append({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
    print({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
train_writer.close()
valid_writer.close()

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

{'epoch': 0, 'train_loss': 0.8678550714299027, 'train_acc': 69.76591760299625, 'valid_loss': 0.6977099206982257, 'valid_acc': 76.92883895131087}
{'epoch': 1, 'train_loss': 0.5909373537211745, 'train_acc': 79.68164794007491, 'valid_loss': 0.6242007071808092, 'valid_acc': 79.70037453183521}
{'epoch': 2, 'train_loss': 0.4399147464107955, 'train_acc': 84.49438202247191, 'valid_loss': 0.6134451217897069, 'valid_acc': 80.0}
{'epoch': 3, 'train_loss': 0.34077258804525096, 'train_acc': 88.22097378277154, 'valid_loss': 0.6018027626990414, 'valid_acc': 81.72284644194757}
{'epoch': 4, 'train_loss': 0.28248478134771454, 'train_acc': 90.53370786516854, 'valid_loss': 0.6021872756818932, 'valid_acc': 81.42322097378278}
{'epoch': 5, 'train_loss': 0.24080713388487376, 'train_acc': 92.20973782771536, 'valid_loss': 0.5551284337183678, 'valid_acc': 82.24719101123596}
{'epoch': 6, 'train_loss': 0.2059694091253388, 'train_acc': 93.27715355805243, 'valid_loss': 0.6041555308879872, 'valid_acc': 82.77153558052

# 83. ミニバッチ化・GPU上での学習

# 84. 単語ベクトルの導入

# 85. 双方向RNN・多層化

# 86. 畳み込みニューラルネットワーク (CNN)

# 87. 確率的勾配降下法によるCNNの学習

# 88. パラメータチューニング

# 89. 事前学習済み言語モデルからの転移学習