# 80. ID番号への変換

In [3]:
import numpy as np
import re
from functools import reduce
from nltk.tokenize import word_tokenize
from collections import defaultdict
import json

In [4]:
def load_data(path):
    with open(path, mode='r') as f:
        X = list()
        Y = list()
        for line in f:
            line = line.strip()
            splited_line = line.split('\t')
            X.append(splited_line[0])
            Y.append(splited_line[1])
        return X, Y

def save_file_json(path, data):
    with open(path, mode='w') as out_file:
        out_file.write(json.dumps(data)+'\n')
        
def load_file_json(path):
    with open(path, mode='r') as in_file:
        data = json.load(in_file)
    return data

def chr2num(y):
    converter = {'b':0, 't':1, 'e':2, 'm':3}
    return [converter[article_type] for article_type in y]

In [5]:
class PreprocessTools:
    def __init__(self, vocab_path=None):
        self.word_count = defaultdict(int)       
        if vocab_path:
            self.word_transformer = load_file_json(vocab_path)
            self.vocab_size = len(self.word_transformer) + 1
        else:
            self.word_transformer = dict()
            self.vocab_size = -1
        
    def tokenize(self, data):
        return [[word for word in word_tokenize(txt)] for txt in data]

    def make_word_transformar(self, train_data:list):
        for data in train_data:
            for word in data:
                self.word_count[word] += 1
        sorted_word_count = sorted(self.word_count.items(), key=lambda x: x[1], reverse=True)
        for idx, (word, count) in enumerate(sorted_word_count):
            if count < 2:
                break
            else:
                self.word_transformer[word] = idx + 1
        self.vocab_size = len(self.word_transformer) + 1

    def txt2ids(self, txt_list:list):
        txt_ids = list()
        for txt in txt_list:
            ids = list()
            for word in txt:
                if word in self.word_transformer:
                    ids.append(self.word_transformer[word])
                else:
                    ids.append(0)
            txt_ids.append(ids)
        return txt_ids


    def ids2vec(self, txt_ids:list):
        txt_vec = list()
        identity = np.identity(self.vocab_size)
        for ids in txt_ids:
            txt_vec.append(identity[ids])
        return txt_vec

In [6]:
preprocess = PreprocessTools()
x_train, y_train = load_data('data/train.txt')
x_valid, y_valid = load_data('data/valid.txt')
x_test, y_test = load_data('data/test.txt')
x_train = preprocess.tokenize(x_train)
x_valid = preprocess.tokenize(x_valid)
x_test = preprocess.tokenize(x_test)

In [7]:
preprocess.make_word_transformar(x_train)

In [18]:
x_train_ids = preprocess.txt2ids(x_train)
x_valid_ids = preprocess.txt2ids(x_valid)
x_test_ids = preprocess.txt2ids(x_test)

In [19]:
for word, ids in zip(x_train[:10], x_train_ids[:10]):
    print(word, ids)

['Kindred', 'Healthcare', 'to', 'buy', 'Gentiva', 'for', 'about', '$', '573', 'mln'] [5065, 3395, 2, 181, 3396, 13, 164, 19, 0, 220]
['US', 'to', 'boost', 'ground', ',', 'naval', 'forces', 'in', 'NATO', 'countries'] [15, 2, 586, 3397, 1, 0, 4067, 6, 5066, 5067]
['Robert', 'Pattinson', '-', 'Robert', 'Pattinson', 'Brushes', 'Off', 'Kristen', 'Stewart', "'s", 'Cheating', '...'] [237, 945, 11, 237, 945, 0, 385, 669, 1259, 4, 3398, 3]
['Piers', 'Morgan', 'Delivers', 'One', 'Final', 'Blow', 'To', 'Gun', 'Violence', 'In', 'Last', 'Show'] [5068, 399, 6690, 185, 1074, 5069, 16, 3399, 2225, 20, 785, 161]
['Here', 'We', 'Go', ':', "'Star", 'Wars', 'Episode', 'VII', "'", 'Kicks', 'Off', 'Filming', 'at', 'Pinewood'] [400, 196, 639, 7, 549, 210, 295, 587, 5, 5070, 385, 1371, 22, 0]
['Amazon', 'gets', 'in', 'the', 'game', ':', 'Retailer', 'beats', 'Google', 'to', 'buy', 'hit', 'console', 'broadcasting', '...'] [169, 330, 6, 17, 1619, 7, 0, 609, 82, 2, 181, 245, 5071, 0, 3]
['FOREX-Euro', 'retreats',

In [20]:
print(preprocess.vocab_size)

9866


In [21]:
save_file_json('work/vocab.json', preprocess.word_transformer)

# 81. RNNによる予測

In [22]:
# one-hotにはしない
#xtrain_vec = preprocess.ids2vec(xtrain_ids) 
#xvalid_vec = preprocess.ids2vec(xvalid_ids)
#xtest_vec = preprocess.ids2vec(xtest_ids)

In [23]:
y_train = chr2num(y_train)
y_valid = chr2num(y_valid)
y_test = chr2num(y_test)

In [24]:
save_file_json('work/train_x.json', {'data':x_train_ids})
save_file_json('work/train_y.json', {'data': y_train})
save_file_json('work/valid_x.json', {'data': x_valid_ids})
save_file_json('work/valid_y.json', {'data': y_valid})
save_file_json('work/test_x.json', {'data': x_test_ids})
save_file_json('work/test_y.json', {'data': y_test})

In [5]:
x_train = load_file_json('work/train_x.json')['data']
y_train = np.asarray(load_file_json('work/train_y.json')['data'])
x_valid = load_file_json('work/valid_x.json')['data']
y_valid = np.asarray(load_file_json('work/valid_y.json')['data'])
x_test = load_file_json('work/test_x.json')['data']
y_test = np.asarray(load_file_json('work/test_y.json')['data'])

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Inputs: input, h_0

- input of shape (seq_len, batch, input_size): tensor containing the features of the input sequence. The input can also be a packed variable length sequence. See torch.nn.utils.rnn.pack_padded_sequence() or torch.nn.utils.rnn.pack_sequence() for details.

- h_0 of shape (num_layers * num_directions, batch, hidden_size): tensor containing the initial hidden state for each element in the batch. Defaults to zero if not provided. If the RNN is bidirectional, num_directions should be 2, else it should be 1.

## Outputs: output, h_n

- output of shape (seq_len, batch, num_directions * hidden_size): tensor containing the output features (h_t) from the last layer of the RNN, for each t. If a torch.nn.utils.rnn.PackedSequence has been given as the input, the output will also be a packed sequence.

    For the unpacked case, the directions can be separated using output.view(seq_len, batch, num_directions, hidden_size), with forward and backward being direction 0 and 1 respectively. Similarly, the directions can be separated in the packed case.

- h_n of shape (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for t = seq_len.

    Like output, the layers can be separated using h_n.view(num_layers, num_directions, batch, hidden_size).

In [144]:
vocab_size = preprocess.vocab_size
dw, dh = 300, 50
torch.manual_seed(1234)
embed = nn.Embedding(vocab_size, dw, padding_idx=0) #idx 0 は 0埋め
num_layers = 2
bidirectional = True
rnn = nn.RNN(dw, dh, num_layers=num_layers, bidirectional=bidirectional, batch_first=True)
input = torch.LongTensor([[0, 1, 2, 3, 4]])
linear = nn.Linear(50, 4, bias=True)
softmax = nn.LogSoftmax(dim=1) # dim=-1 or 1

In [145]:
output, hidden = rnn(embed(input))
print(hidden)
hidden = hidden.view(num_layers, 2 if bidirectional else 1, -1, dh)
#print(hidden)
last_hidden = hidden[-1]
print(last_hidden)
#x = linear(hidden[-1])
#print(x)
#print(softmax(x))

tensor([[[-0.9588,  0.7610, -0.9565,  0.5900, -0.5844,  0.9816,  0.9628,
          -0.5733, -0.9226, -0.8944,  0.1460,  0.4036,  0.9598,  0.8089,
           0.9755, -0.9994,  0.9671,  0.1083,  0.9839, -0.7825,  0.9294,
           0.9097, -0.8783, -0.7987, -0.9839, -0.9838,  0.7480, -0.9795,
          -0.9946,  0.8387, -0.4521, -0.9794, -0.3257,  0.4931, -0.9273,
          -0.8774, -0.4283, -0.9119, -0.9689, -0.8910,  0.6624,  0.9643,
          -0.2032, -0.2346,  0.0353,  0.8976, -0.2835, -0.9466, -0.8547,
          -0.7617]],

        [[ 0.6467,  0.4122,  0.2868,  0.2072,  0.1445, -0.1215,  0.5725,
           0.0861,  0.1255, -0.3936,  0.6288,  0.1832, -0.1401,  0.2997,
           0.1401, -0.3450, -0.6161, -0.1151, -0.4304,  0.0187, -0.7646,
           0.2113,  0.8125, -0.4816, -0.5212,  0.2364,  0.0617, -0.3991,
          -0.4392, -0.6603,  0.2891, -0.1834,  0.2687,  0.2681, -0.4468,
          -0.3341,  0.1834,  0.1835, -0.4578,  0.2792,  0.6250,  0.0511,
          -0.1108, -0.4308, -

In [115]:
output, hidden = rnn(embed(input))
print(hidden.shape)
hidden = hidden.view(num_layers, 2 if bidirectional else 1, -1, dh)
print(hidden)
last_hidden = hidden[-1]
#x = linear(hidden[-1])
#print(x)
#print(softmax(x))

torch.Size([2, 2, 50])
tensor([[[[-0.9588,  0.7610, -0.9565,  0.5900, -0.5844,  0.9816,  0.9628,
           -0.5733, -0.9226, -0.8944,  0.1460,  0.4036,  0.9598,  0.8089,
            0.9755, -0.9994,  0.9671,  0.1083,  0.9839, -0.7825,  0.9294,
            0.9097, -0.8783, -0.7987, -0.9839, -0.9838,  0.7480, -0.9795,
           -0.9946,  0.8387, -0.4521, -0.9794, -0.3257,  0.4931, -0.9273,
           -0.8774, -0.4283, -0.9119, -0.9689, -0.8910,  0.6624,  0.9643,
           -0.2032, -0.2346,  0.0353,  0.8976, -0.2835, -0.9466, -0.8547,
           -0.7617],
          [ 0.8426, -0.0087,  0.9774,  0.8538,  0.4782, -0.6959,  0.4698,
            0.6362,  0.2218,  0.9951,  0.9583,  0.9620,  0.6493, -0.8816,
           -0.3704, -0.9910,  0.7911, -0.6577,  0.8689, -0.5620,  0.9889,
            0.4303, -0.9841, -0.6123, -0.6164, -0.8283,  0.5802,  0.9361,
            0.5248, -0.9203,  0.7948,  0.9841, -0.4951,  0.9653,  0.3215,
            0.0211, -0.8799,  0.8807,  0.8267, -0.6232,  1.0000,  0.

# 82. 確率的勾配降下法による学習

In [40]:
class MyRNN(torch.nn.Module):
    def __init__(self, vocab_size, dw=300, dh=50, L=4, num_layers=1, bidirectional=False, rnn_bias=True, PATH=None):
        super(MyRNN, self).__init__()
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.dw, self.dh = dw, dh
        if PATH:
            self.embed = nn.from_pretrained(PATH)
        else:
            m = nn.Embedding(vocab_size, dw, padding_idx=0)
            nn.init.normal_(m.weight, mean=0, std=dw ** -0.5)
            nn.init.constant_(m.weight[0], 0)
            self.embed = m
        self.rnn = nn.RNN(dw, dh, bias=rnn_bias, num_layers=num_layers, bidirectional=bidirectional, batch_first=True, nonlinearity='relu')
        if bidirectional:
            self.linear = nn.Linear(2 * dh, L, bias=True)
        else:
            self.linear = nn.Linear(dh, L, bias=True)
        self.softmax = nn.LogSoftmax(dim=1) # dim=-1 or 1
        
    '''
    x: ids (not one hot vector)
    '''
    def forward(self, x, x_lengths):
        x = self.embed(x)
        packed = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True, enforce_sorted=False)
        _, hidden = self.rnn(packed)
        hidden = hidden.view(self.num_layers, 2 if self.bidirectional else 1, -1, self.dh)
        last_hidden = hidden[-1]
        if self.bidirectional:
            x = self.linear(torch.cat([last_hidden[0], last_hidden[1]], dim=1))
        else:
            x = self.linear(last_hidden[0])
        x = self.softmax(x)
        return x 
    
    def update_from_word2vec(self, w2v, transformer):
        for word, idx in transformer.items():
            with torch.no_grad():
                if word in w2v:
                    self.embed.weight[idx].copy_(torch.from_numpy(w2v[word]))

In [6]:
from torch.utils.tensorboard import SummaryWriter
import tqdm
from torch.utils.data import DataLoader

In [42]:
class MyDataSets(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = [torch.LongTensor(data) for data in x]
        self.y = [torch.LongTensor([data]) for data in y]

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [43]:
def collate_fn(batch):
    x = [data[0] for data in batch]
    x = nn.utils.rnn.pad_sequence(x, batch_first=True)
    y = torch.LongTensor([data[1] for data in batch])
    lengths = torch.LongTensor([len(data[0]) for data in batch])
    return x, y, lengths

In [44]:
def execution(data_x, data_y, op, criterion, model, batch_size=1, is_train=True, use_gpu=False):
    if is_train: model.train()
    else: model.eval()
    ndata = len(data_x)
    dataset = MyDataSets(data_x, data_y)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    sum_loss, acc_score = 0, 0
    for batch_x, batch_y, batch_lengths in data_loader:
        op.zero_grad()
        out = model(batch_x, batch_lengths)
        loss = criterion(out, batch_y)
        if is_train:
            loss.backward()
            op.step()
        sum_loss += loss.data.item() * len(batch_x)
        pred = torch.argmax(out, dim=1)
        acc_score += np.sum((pred == batch_y).cpu().detach().numpy())
    return sum_loss / ndata, acc_score / ndata * 100

In [45]:
x_train = load_file_json('work/train_x.json')['data']
y_train = np.asarray(load_file_json('work/train_y.json')['data'])
x_valid = load_file_json('work/valid_x.json')['data']
y_valid = np.asarray(load_file_json('work/valid_y.json')['data'])
x_test = load_file_json('work/test_x.json')['data']
y_test = np.asarray(load_file_json('work/test_y.json')['data'])

In [68]:
vocab_size = preprocess.vocab_size
torch.manual_seed(1234)
model = MyRNN(vocab_size, dw=300, dh=50, L=4, num_layers=1, bidirectional=False)
nn.utils.clip_grad_norm_(model.parameters(), 0.1)
ntrain = len(x_train)
nepoch = 10 
batch_size = 1
op = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.NLLLoss() 

In [69]:
print(model)

MyRNN(
  (embed): Embedding(9866, 300, padding_idx=0)
  (rnn): RNN(300, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=4, bias=True)
  (softmax): LogSoftmax()
)


In [70]:
train_writer = SummaryWriter(log_dir='./work/logs/train')
valid_writer = SummaryWriter(log_dir='./work/logs/valid')
logger = list()
for epoch in tqdm.notebook.tqdm(range(nepoch)):
    train_loss, train_acc = execution(x_train, y_train, op, criterion, model, batch_size=batch_size)
    train_writer.add_scalar("loss", train_loss, epoch) 
    train_writer.add_scalar("accuracy", train_acc, epoch)
    with torch.no_grad():
        valid_loss, valid_acc = execution(x_valid, y_valid, op, criterion, model, batch_size=batch_size, is_train=False)
        valid_writer.add_scalar("loss", valid_loss, epoch)
        valid_writer.add_scalar("accuracy", valid_acc, epoch)
    logger.append({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
    print({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
train_writer.close()
valid_writer.close()

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

{'epoch': 0, 'train_loss': 1.0372647139848967, 'train_acc': 59.21348314606741, 'valid_loss': 0.887252983872252, 'valid_acc': 69.06367041198503}
{'epoch': 1, 'train_loss': 0.8374215004742438, 'train_acc': 70.91760299625469, 'valid_loss': 0.6876947393141969, 'valid_acc': 76.70411985018727}
{'epoch': 2, 'train_loss': 0.7466804328128738, 'train_acc': 74.11048689138576, 'valid_loss': 0.7361666740501925, 'valid_acc': 72.65917602996255}
{'epoch': 3, 'train_loss': 0.735462963356367, 'train_acc': 74.6067415730337, 'valid_loss': 0.6611646328217783, 'valid_acc': 76.17977528089888}
{'epoch': 4, 'train_loss': 0.6526657013838769, 'train_acc': 77.4812734082397, 'valid_loss': 0.9591464588454356, 'valid_acc': 63.97003745318352}
{'epoch': 5, 'train_loss': nan, 'train_acc': 70.78651685393258, 'valid_loss': nan, 'valid_acc': 42.54681647940075}
{'epoch': 6, 'train_loss': nan, 'train_acc': 42.041198501872664, 'valid_loss': nan, 'valid_acc': 42.54681647940075}



KeyboardInterrupt: 

# 83. ミニバッチ化・GPU上での学習

In [188]:
%%file src/gpu.py

import numpy as np
from functools import reduce
from nltk.tokenize import word_tokenize
from collections import defaultdict
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from torch.utils.data import DataLoader

class PreprocessTools:
    def __init__(self, vocab_path=None):
        self.word_count = defaultdict(int)       
        if vocab_path:
            self.word_transformer = load_file_json(vocab_path)
            self.vocab_size = len(self.word_transformer) + 1
        else:
            self.word_transformer = defaultdict(int)
            self.vocab_size = -1
        
    def tokenize(self, data):
        return [[word for word in word_tokenize(txt)] for txt in data]

    def make_word_transformar(self, train_data:list):
        for data in train_data:
            for word in data:
                self.word_count[word] += 1
        sorted_word_count = sorted(self.word_count.items(), key=lambda x: x[1], reverse=True)
        for idx, (word, count) in enumerate(sorted_word_count):
            if count < 2:
                break
            else:
                self.word_transformer[word] = idx + 1
        self.vocab_size = len(self.word_transformer) + 1

    def txt2ids(self, txt_list:list):
        txt_ids = list()
        for txt in txt_list:
            ids = list()
            for word in txt:
                ids.append(self.word_transformer[word])
            txt_ids.append(ids)
        return txt_ids


    def ids2vec(self, txt_ids:list):
        txt_vec = list()
        identity = np.identity(self.vocab_size)
        for ids in txt_ids:
            txt_vec.append(identity[ids])
        return txt_vec
    
    
def load_data(path):
    with open(path, mode='r') as f:
        X = list()
        Y = list()
        for line in f:
            line = line.strip()
            splited_line = line.split('\t')
            X.append(splited_line[0])
            Y.append(splited_line[1])
        return X, Y

def save_file_json(path, data):
    with open(path, mode='w') as out_file:
        out_file.write(json.dumps(data)+'\n')
        
def load_file_json(path):
    with open(path, mode='r') as in_file:
        data = json.load(in_file)
    return data

def chr2num(y):
    converter = {'b':0, 't':1, 'e':2, 'm':3}
    return [converter[article_type] for article_type in y]

class MyRNN(torch.nn.Module):
    def __init__(self, vocab_size, dw=300, dh=50, L=4, num_layers=1, bidirectional=False, rnn_bias=True, PATH=None):
        super(MyRNN, self).__init__()
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.dw, self.dh = dw, dh
        if PATH:
            self.embed = nn.from_pretrained(PATH)
        else:
            m = nn.Embedding(vocab_size, dw, padding_idx=0)
            nn.init.normal_(m.weight, mean=0, std=dw ** -0.5)
            nn.init.constant_(m.weight[0], 0)
            self.embed = m
        self.rnn = nn.RNN(dw, dh, bias=rnn_bias, num_layers=num_layers, bidirectional=bidirectional, batch_first=True, nonlinearity='tanh')
        if bidirectional:
            self.linear = nn.Linear(2 * dh, L, bias=True)
        else:
            self.linear = nn.Linear(dh, L, bias=True)
        self.softmax = nn.LogSoftmax(dim=1) # dim=-1 or 1
        
    '''
    x: ids (not one hot vector)
    '''
    def forward(self, x, x_lengths):
        x = self.embed(x)
        packed = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True, enforce_sorted=False)
        _, hidden = self.rnn(packed)
        hidden = hidden.view(self.num_layers, 2 if self.bidirectional else 1, -1, self.dh)
        last_hidden = hidden[-1]
        if self.bidirectional:
            x = self.linear(torch.cat([last_hidden[0], last_hidden[1]], dim=1))
        else:
            x = self.linear(last_hidden[0])
        x = self.softmax(x)
        return x 

class MyDataSets(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = [torch.LongTensor(data) for data in x]
        self.y = [torch.LongTensor([data]) for data in y]

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]
    
def collate_fn(batch):
    x = [data[0] for data in batch]
    x = nn.utils.rnn.pad_sequence(x, batch_first=True)
    y = torch.LongTensor([data[1] for data in batch])
    lengths = torch.LongTensor([len(data[0]) for data in batch])
    return x, y, lengths
    
def execution(data_x, data_y, op, criterion, model, batch_size=1, is_train=True, use_gpu=False):
    if is_train: model.train()
    else: model.eval()
    ndata = len(data_x)
    dataset = MyDataSets(data_x, data_y)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    sum_loss, acc_score = 0, 0
    for batch_x, batch_y, batch_lengths in data_loader:
        op.zero_grad()
        out = model(batch_x, batch_lengths)
        loss = criterion(out, batch_y)
        if is_train:
            loss.backward()
            op.step()
        sum_loss += loss.data.item() * len(batch_x)
        pred = torch.argmax(out, dim=1)
        acc_score += np.sum((pred == batch_y).cpu().detach().numpy())
    return sum_loss / ndata, acc_score / ndata * 100


if __name__ == "__main__":
    preprocess = PreprocessTools('work/vocab.json')
    
    x_train = load_file_json('work/train_x.json')['data']
    y_train = np.asarray(load_file_json('work/train_y.json')['data'])
    x_valid = load_file_json('work/valid_x.json')['data']
    y_valid = np.asarray(load_file_json('work/valid_y.json')['data'])
    x_test = load_file_json('work/test_x.json')['data']
    y_test = np.asarray(load_file_json('work/test_y.json')['data'])


    vocab_size = preprocess.vocab_size
    torch.manual_seed(1234)
    model = MyRNN(vocab_size, dw=300, dh=50, L=4, num_layers=2, bidirectional=True)
    ntrain = len(x_train)
    nepoch = 10 
    batch_size = 128 
    op = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.NLLLoss() 

    train_writer = SummaryWriter(log_dir='./work/logs/train')
    valid_writer = SummaryWriter(log_dir='./work/logs/valid')
    logger = list()
    for epoch in tqdm.tqdm(range(nepoch)):
        train_loss, train_acc = execution(x_train, y_train, op, criterion, model, batch_size=batch_size)
        train_writer.add_scalar("loss", train_loss, epoch) 
        train_writer.add_scalar("accuracy", train_acc, epoch)
        with torch.no_grad():
            valid_loss, valid_acc = execution(x_valid, y_valid, op, criterion, model, batch_size=batch_size, is_train=False)
            valid_writer.add_scalar("loss", valid_loss, epoch)
            valid_writer.add_scalar("accuracy", valid_acc, epoch)
        logger.append({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
        print({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
    train_writer.close()
    valid_writer.close()

Overwriting src/gpu.py


# 84. 単語ベクトルの導入

In [253]:
from gensim.models import KeyedVectors

unable to import 'smart_open.gcs', disabling that module


In [254]:
w2v = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

In [314]:
vocab_size = preprocess.vocab_size
torch.manual_seed(1234)
model = MyRNN(vocab_size, dw=300, dh=50, L=4, num_layers=1, bidirectional=False)
model.update_from_word2vec(w2v, preprocess.word_transformer)
ntrain = len(x_train)
nepoch = 10 
batch_size = 128 
#op = optim.SGD(model.parameters(), lr=0.1)
op = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.NLLLoss() 

In [315]:
print(model)

MyRNN(
  (embed): Embedding(9866, 300, padding_idx=0)
  (rnn): RNN(300, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=4, bias=True)
  (softmax): LogSoftmax()
)


In [316]:
train_writer = SummaryWriter(log_dir='./work/logs/train')
valid_writer = SummaryWriter(log_dir='./work/logs/valid')
logger = list()
for epoch in tqdm.notebook.tqdm(range(nepoch)):
    train_loss, train_acc = execution(x_train, y_train, op, criterion, model, batch_size=batch_size)
    train_writer.add_scalar("loss", train_loss, epoch) 
    train_writer.add_scalar("accuracy", train_acc, epoch)
    with torch.no_grad():
        valid_loss, valid_acc = execution(x_valid, y_valid, op, criterion, model, batch_size=batch_size, is_train=False)
        valid_writer.add_scalar("loss", valid_loss, epoch)
        valid_writer.add_scalar("accuracy", valid_acc, epoch)
    logger.append({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
    print({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
train_writer.close()
valid_writer.close()

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

{'epoch': 0, 'train_loss': 1.0841823437687164, 'train_acc': 50.468164794007485, 'valid_loss': 0.8035261861840437, 'valid_acc': 73.85767790262172}
{'epoch': 1, 'train_loss': 0.5676666072245394, 'train_acc': 79.05430711610487, 'valid_loss': 0.46076675584253746, 'valid_acc': 83.07116104868913}
{'epoch': 2, 'train_loss': 0.3240814438919896, 'train_acc': 87.34082397003745, 'valid_loss': 0.45489098219835805, 'valid_acc': 83.59550561797752}
{'epoch': 3, 'train_loss': 0.2447720053490628, 'train_acc': 89.84082397003745, 'valid_loss': 0.41074375164196286, 'valid_acc': 85.0187265917603}
{'epoch': 4, 'train_loss': 0.20364386380723354, 'train_acc': 90.53370786516854, 'valid_loss': 0.4525810466946734, 'valid_acc': 85.2434456928839}
{'epoch': 5, 'train_loss': 0.16895339485634578, 'train_acc': 91.44194756554307, 'valid_loss': 0.39696553704443943, 'valid_acc': 85.76779026217228}
{'epoch': 6, 'train_loss': 0.14188933785488542, 'train_acc': 92.61235955056179, 'valid_loss': 0.4342009868291433, 'valid_acc'

# 85. 双方向RNN・多層化

In [58]:
class MyRNN(torch.nn.Module):
    def __init__(self, vocab_size, dw=300, dh=50, L=4, num_layers=1, bidirectional=False, dropout=0.0, rnn_bias=True, PATH=None):
        super(MyRNN, self).__init__()
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.dw, self.dh = dw, dh
        if PATH:
            self.embed = nn.from_pretrained(PATH)
        else:
            m = nn.Embedding(vocab_size, dw, padding_idx=0)
            nn.init.normal_(m.weight, mean=0, std=dw ** -0.5)
            nn.init.constant_(m.weight[0], 0)
            self.embed = m
        self.rnn = nn.RNN(dw, dh, bias=rnn_bias, num_layers=num_layers, bidirectional=bidirectional, batch_first=True, nonlinearity='relu', dropout=dropout)
        if bidirectional:
            self.linear = nn.Linear(2 * dh, L, bias=True)
        else:
            self.linear = nn.Linear(dh, L, bias=True)
        self.softmax = nn.LogSoftmax(dim=1) # dim=-1 or 1
        
    '''
    x: ids (not one hot vector)
    '''
    def forward(self, x, x_lengths):
        x = self.embed(x)
        packed = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True, enforce_sorted=False)
        _, hidden = self.rnn(packed)
        hidden = hidden.view(self.num_layers, 2 if self.bidirectional else 1, -1, self.dh)
        last_hidden = hidden[-1]
        if self.bidirectional:
            x = self.linear(torch.cat([last_hidden[0], last_hidden[1]], dim=1))
        else:
            x = self.linear(last_hidden[0])
        x = self.softmax(x)
        return x 
    
    def update_from_word2vec(self, w2v, transformer):
        for word, idx in transformer.items():
            with torch.no_grad():
                if word in w2v:
                    self.embed.weight[idx].copy_(torch.from_numpy(w2v[word]))

In [61]:
vocab_size = preprocess.vocab_size
torch.manual_seed(1234)
model = MyRNN(vocab_size, dw=300, dh=50, L=4, num_layers=2, bidirectional=True, dropout=0.6)
model.update_from_word2vec(w2v, preprocess.word_transformer)
ntrain = len(x_train)
nepoch = 30 
batch_size = 128 
op = optim.Adagrad(model.parameters(), lr=0.01, lr_decay=0.001)
criterion = nn.NLLLoss() 
train_writer = SummaryWriter(log_dir='./work/logs/train')
valid_writer = SummaryWriter(log_dir='./work/logs/valid')
logger = list()
for epoch in tqdm.notebook.tqdm(range(nepoch)):
    train_loss, train_acc = execution(x_train, y_train, op, criterion, model, batch_size=batch_size)
    train_writer.add_scalar("loss", train_loss, epoch) 
    train_writer.add_scalar("accuracy", train_acc, epoch)
    with torch.no_grad():
        valid_loss, valid_acc = execution(x_valid, y_valid, op, criterion, model, batch_size=batch_size, is_train=False)
        valid_writer.add_scalar("loss", valid_loss, epoch)
        valid_writer.add_scalar("accuracy", valid_acc, epoch)
    logger.append({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
    print({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
train_writer.close()
valid_writer.close()

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

{'epoch': 0, 'train_loss': 0.5621581544813592, 'train_acc': 78.89513108614233, 'valid_loss': 0.4073159295521425, 'valid_acc': 84.49438202247191}
{'epoch': 1, 'train_loss': 0.2761372826965561, 'train_acc': 90.54307116104869, 'valid_loss': 0.32408394719777484, 'valid_acc': 89.21348314606742}
{'epoch': 2, 'train_loss': 0.15865980392314968, 'train_acc': 94.85955056179776, 'valid_loss': 0.38206236023134954, 'valid_acc': 89.58801498127342}
{'epoch': 3, 'train_loss': 0.09868653422214566, 'train_acc': 96.76029962546816, 'valid_loss': 0.31603187080402945, 'valid_acc': 90.0374531835206}
{'epoch': 4, 'train_loss': 0.06191470492352149, 'train_acc': 98.01498127340824, 'valid_loss': 0.354854448614049, 'valid_acc': 90.187265917603}
{'epoch': 5, 'train_loss': 0.04207206202970908, 'train_acc': 98.76404494382022, 'valid_loss': 0.423645007476378, 'valid_acc': 90.11235955056179}
{'epoch': 6, 'train_loss': 0.029736371623694004, 'train_acc': 99.14794007490637, 'valid_loss': 0.43740582595603744, 'valid_acc':

# 86. 畳み込みニューラルネットワーク (CNN)

In [12]:
x_train = load_file_json('work/train_x.json')['data']
y_train = np.asarray(load_file_json('work/train_y.json')['data'])
x_valid = load_file_json('work/valid_x.json')['data']
y_valid = np.asarray(load_file_json('work/valid_y.json')['data'])
x_test = load_file_json('work/test_x.json')['data']
y_test = np.asarray(load_file_json('work/test_y.json')['data'])

In [301]:
vocab_size = preprocess.vocab_size
dw, dh, w_sz = 300, 50, 3
torch.manual_seed(1234)
embed = nn.Embedding(vocab_size, dw, padding_idx=0)
kernel_size = [w_sz, dw]
cnn = nn.Conv2d(in_channels=1, out_channels=dh, kernel_size=kernel_size, padding=(w_sz-2, 0), stride=1)
g = nn.ReLU()
pool = torch.max
linear = nn.Linear(dh, 4, bias=True)
softmax = nn.LogSoftmax(dim=1) # dim=-1 or 1
input = torch.LongTensor([[[0, 1, 2, 3, 4]], [[5, 6, 7, 8, 9]]])
print(input.shape)

torch.Size([2, 1, 5])


In [302]:
print(cnn.weight.size())

torch.Size([50, 1, 3, 300])


In [306]:
mask = torch.FloatTensor([[[1,1,1,0,0]], [[1,1,1,1,0]]])
x = embed(input)
x = cnn(x)
x = x.view(x.shape[:3])
x = x * mask
x = g(x)
x = pool(x, dim=2).values
x = linear(x)
x = softmax(x)
print(x)

tensor([[-1.0314, -1.1941, -1.7095, -1.8355],
        [-1.1559, -0.9727, -1.7407, -2.0267]], grad_fn=<LogSoftmaxBackward>)


# 87. 確率的勾配降下法によるCNNの学習

In [13]:
class MyCNN(torch.nn.Module):
    def __init__(self, vocab_size, dw=300, dh=50, w_sz=3, L=4, dropout=0.0):
        super(MyCNN, self).__init__()
        self.dw, self.dh, self.w_sz = dw, dh, w_sz
        
        m = nn.Embedding(vocab_size, dw, padding_idx=0)
        nn.init.normal_(m.weight, mean=0, std=dw ** -0.5)
        nn.init.constant_(m.weight[0], 0)
        self.embed = m
        
        self.cnn = nn.Conv2d(in_channels=1, out_channels=dh, kernel_size=[w_sz, dw], padding=(w_sz-2, 0), stride=1)
        self.g = nn.ReLU()
        self.pool = torch.max
        self.linear = nn.Linear(dh, 4, bias=True)
        self.dropout = nn.Dropout2d(p=dropout)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, x, x_lengths):
        x = self.embed(x)
        x = self.cnn(x)
        x = self.g(x)
        x = x.view(x.shape[:3])
        x = self.pool(x, dim=2).values
        x = self.linear(x)
        x = self.softmax(x)
        return x 
    
    def update_from_word2vec(self, w2v, transformer):
        for word, idx in transformer.items():
            with torch.no_grad():
                if word in w2v:
                    self.embed.weight[idx].copy_(torch.from_numpy(w2v[word]))

In [79]:
class MyDataSets(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = [torch.LongTensor(data) for data in x]
        self.y = [torch.LongTensor([data]) for data in y]

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

def collate_fn(batch):
    x = [data[0] for data in batch]
    x = nn.utils.rnn.pad_sequence(x, batch_first=True)
    y = torch.LongTensor([data[1] for data in batch])
    lengths = torch.LongTensor([len(data[0]) for data in batch])
    x = x.view(-1, 1, torch.max(lengths))
    return x, y, lengths

def execution(data_x, data_y, op, criterion, model, batch_size=1, is_train=True, use_gpu=False):
    if is_train: model.train()
    else: model.eval()
    ndata = len(data_x)
    dataset = MyDataSets(data_x, data_y)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    sum_loss, acc_score = 0, 0
    for batch_x, batch_y, batch_lengths in data_loader:
        op.zero_grad()
        out = model(batch_x, batch_lengths)
        loss = criterion(out, batch_y)
        if is_train:
            loss.backward()
            op.step()
        sum_loss += loss.data.item() * len(batch_x)
        pred = torch.argmax(out, dim=1)
        acc_score += np.sum((pred == batch_y).cpu().detach().numpy())
    return sum_loss / ndata, acc_score / ndata * 100

In [None]:
x_train = load_file_json('work/train_x.json')['data']
y_train = np.asarray(load_file_json('work/train_y.json')['data'])
x_valid = load_file_json('work/valid_x.json')['data']
y_valid = np.asarray(load_file_json('work/valid_y.json')['data'])
x_test = load_file_json('work/test_x.json')['data']
y_test = np.asarray(load_file_json('work/test_y.json')['data'])

vocab_size = preprocess.vocab_size
torch.manual_seed(1234)
model = MyCNN(vocab_size, dw=300, dh=50, w_sz=3, L=4)
nn.utils.clip_grad_norm_(model.parameters(), 0.1)
ntrain = len(x_train)
nepoch = 10 
batch_size = 1
op = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.NLLLoss() 

train_writer = SummaryWriter(log_dir='./work/logs/cnn/train')
valid_writer = SummaryWriter(log_dir='./work/logs/cnn/valid')
logger = list()
for epoch in tqdm.notebook.tqdm(range(nepoch)):
    train_loss, train_acc = execution(x_train, y_train, op, criterion, model, batch_size=batch_size)
    train_writer.add_scalar("loss", train_loss, epoch) 
    train_writer.add_scalar("accuracy", train_acc, epoch)
    with torch.no_grad():
        valid_loss, valid_acc = execution(x_valid, y_valid, op, criterion, model, batch_size=batch_size, is_train=False)
        valid_writer.add_scalar("loss", valid_loss, epoch)
        valid_writer.add_scalar("accuracy", valid_acc, epoch)
    logger.append({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
    print({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
train_writer.close()
valid_writer.close()

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

{'epoch': 0, 'train_loss': 0.9389110220776485, 'train_acc': 64.7565543071161, 'valid_loss': 0.7810873905657651, 'valid_acc': 72.28464419475655}
{'epoch': 1, 'train_loss': 0.6317997540631297, 'train_acc': 77.02247191011236, 'valid_loss': 0.5567348639960557, 'valid_acc': 79.02621722846442}
{'epoch': 2, 'train_loss': 0.40004950408328493, 'train_acc': 85.40262172284643, 'valid_loss': 0.4443720965145491, 'valid_acc': 82.84644194756554}
{'epoch': 3, 'train_loss': 0.2403459146221051, 'train_acc': 91.74157303370787, 'valid_loss': 0.40862589928987436, 'valid_acc': 85.99250936329588}
{'epoch': 4, 'train_loss': 0.13932250821607933, 'train_acc': 95.75842696629213, 'valid_loss': 0.38565428323961776, 'valid_acc': 86.59176029962546}
{'epoch': 5, 'train_loss': 0.07791281996948243, 'train_acc': 98.22097378277154, 'valid_loss': 0.3847782454515154, 'valid_acc': 88.08988764044943}


# 88. パラメータチューニング

In [7]:
from gensim.models import KeyedVectors
w2v = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

unable to import 'smart_open.gcs', disabling that module


In [405]:
class MyCNN(torch.nn.Module):
    def __init__(self, vocab_size, dw=300, dh=50, w_sz=3, L=4, dropout=0.0):
        super(MyCNN, self).__init__()
        self.dw, self.dh, self.w_sz = dw, dh, w_sz
        
        m = nn.Embedding(vocab_size, dw, padding_idx=0)
        nn.init.normal_(m.weight, mean=0, std=dw ** -0.5)
        nn.init.constant_(m.weight[0], 0)
        self.embed = m
        
        self.cnn = nn.Conv2d(in_channels=1, out_channels=dh, kernel_size=[w_sz, dw], padding=(w_sz-2, 0), stride=1)
        self.g = nn.ReLU()
        self.pool = torch.max
        self.linear = nn.Linear(dh, 4, bias=True)
        self.dropout = nn.Dropout2d(p=0.2)
        self.softmax = nn.LogSoftmax(dim=1) # dim=-1 or 1
        
    def forward(self, x, x_lengths):
        max_len = x_lengths.max()
        mask = torch.FloatTensor([[[ 1.0 if i < seq_len else 0.0 for i in range(max_len)]] for seq_len in x_lengths])
        x = self.embed(x)
        x = self.cnn(x)
        x = x.view(x.shape[:3])
        x = x * mask
        
        
        x = self.g(x)
        x = self.dropout(x)
        x = self.pool(x, dim=2).values
        x = self.linear(x)
        x = self.dropout(x)
        x = self.softmax(x)
        return x 
    
    def update_from_word2vec(self, w2v, transformer):
        for word, idx in transformer.items():
            with torch.no_grad():
                if word in w2v:
                    self.embed.weight[idx].copy_(torch.from_numpy(w2v[word]))
                    
class MyDataSets(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = [torch.LongTensor(data) for data in x]
        self.y = [torch.LongTensor([data]) for data in y]

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

def collate_fn(batch):
    x = [data[0] for data in batch]
    x = nn.utils.rnn.pad_sequence(x, batch_first=True)
    y = torch.LongTensor([data[1] for data in batch])
    lengths = torch.LongTensor([len(data[0]) for data in batch])
    x = x.view(-1, 1, torch.max(lengths))
    return x, y, lengths

def execution(data_x, data_y, op, criterion, model, batch_size=1, is_train=True, use_gpu=False):
    if is_train: model.train()
    else: model.eval()
    ndata = len(data_x)
    dataset = MyDataSets(data_x, data_y)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    sum_loss, acc_score = 0, 0
    for batch_x, batch_y, batch_lengths in data_loader:
        op.zero_grad()
        out = model(batch_x, batch_lengths)
        loss = criterion(out, batch_y)
        if is_train:
            loss.backward()
            op.step()
        sum_loss += loss.data.item() * len(batch_x)
        pred = torch.argmax(out, dim=1)
        acc_score += np.sum((pred == batch_y).cpu().detach().numpy())
    return sum_loss / ndata, acc_score / ndata * 100

In [409]:
x_train = load_file_json('work/train_x.json')['data']
y_train = np.asarray(load_file_json('work/train_y.json')['data'])
x_valid = load_file_json('work/valid_x.json')['data']
y_valid = np.asarray(load_file_json('work/valid_y.json')['data'])
x_test = load_file_json('work/test_x.json')['data']
y_test = np.asarray(load_file_json('work/test_y.json')['data'])


vocab_size = preprocess.vocab_size
torch.manual_seed(1234)
model = MyCNN(vocab_size, dw=300, dh=50, w_sz=3, L=4)
model.update_from_word2vec(w2v, preprocess.word_transformer)
nepoch = 10 
batch_size = 256 
op = optim.Adagrad(model.parameters(), lr=0.01, lr_decay=0.001)
criterion = nn.NLLLoss() 


train_writer = SummaryWriter(log_dir='./work/logs/cnn/train')
valid_writer = SummaryWriter(log_dir='./work/logs/cnn/valid')
logger = list()
max_valid = -1
max_model_param = None
for epoch in tqdm.notebook.tqdm(range(nepoch)):
    train_loss, train_acc = execution(x_train, y_train, op, criterion, model, batch_size=batch_size)
    train_writer.add_scalar("loss", train_loss, epoch) 
    train_writer.add_scalar("accuracy", train_acc, epoch)
    with torch.no_grad():
        valid_loss, valid_acc = execution(x_valid, y_valid, op, criterion, model, batch_size=batch_size, is_train=False)
        valid_writer.add_scalar("loss", valid_loss, epoch)
        valid_writer.add_scalar("accuracy", valid_acc, epoch)
        
    if max_valid < valid_acc:
        max_valid = valid_acc
        max_model_param = model.state_dict()
        
    logger.append({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
    print({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
train_writer.close()
valid_writer.close()

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

{'epoch': 0, 'train_loss': 0.5757558084605785, 'train_acc': 77.11610486891387, 'valid_loss': 0.3003921103834659, 'valid_acc': 91.01123595505618}
{'epoch': 1, 'train_loss': 0.3131626569376456, 'train_acc': 86.86329588014982, 'valid_loss': 0.24776309576820346, 'valid_acc': 92.20973782771536}
{'epoch': 2, 'train_loss': 0.24898011490200342, 'train_acc': 88.82022471910112, 'valid_loss': 0.2316487668605333, 'valid_acc': 92.35955056179776}
{'epoch': 3, 'train_loss': 0.20590350836627047, 'train_acc': 90.25280898876404, 'valid_loss': 0.22737948575269865, 'valid_acc': 92.43445692883896}
{'epoch': 4, 'train_loss': 0.18683609850844193, 'train_acc': 90.87078651685393, 'valid_loss': 0.22482745501767384, 'valid_acc': 92.73408239700375}
{'epoch': 5, 'train_loss': 0.16657901802536254, 'train_acc': 91.03932584269663, 'valid_loss': 0.22434093769123492, 'valid_acc': 92.58426966292134}
{'epoch': 6, 'train_loss': 0.14319769961110662, 'train_acc': 92.43445692883896, 'valid_loss': 0.22556115971522384, 'valid_

In [410]:
model.load_state_dict(max_model_param)
with torch.no_grad():
    test_loss, test_acc = execution(x_test, y_test, op, criterion, model, batch_size=batch_size, is_train=False)
    print(test_acc)

91.53558052434457


In [393]:
class MyCl(torch.nn.Module):
    def __init__(self, vocab_size, dw=300, dh=50, w_sz=3, L=4, num_layers=1, bidirectional=False, dropout=0.2):
        super(MyCl, self).__init__()
        self.dw, self.dh, self.w_sz = dw, dh, w_sz
        
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        
        self.rnn = nn.RNN(dw, dh, num_layers=num_layers, bidirectional=bidirectional, batch_first=True, nonlinearity='relu', dropout=dropout)
        
        m = nn.Embedding(vocab_size, dw, padding_idx=0)
        nn.init.normal_(m.weight, mean=0, std=dw ** -0.5)
        nn.init.constant_(m.weight[0], 0)
        self.embed = m
        
        self.cnn = nn.Conv2d(in_channels=1, out_channels=dh, kernel_size=[w_sz, 2 * dh], padding=(w_sz-2, 0), stride=1)
        self.g = nn.ReLU()
        self.pool = torch.max
        self.linear = nn.Linear(dh, 4, bias=True)
        self.dropout = nn.Dropout2d(p=dropout)
        self.softmax = nn.LogSoftmax(dim=1) # dim=-1 or 1
        
        
    def forward(self, x, x_lengths):
        x = self.embed(x)
        packed = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True, enforce_sorted=False)
        x, _ = self.rnn(packed)
        x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        max_len = x_lengths.max()
        mask = torch.FloatTensor([[ [1.0] if i < seq_len else [0.0] for i in range(max_len)] for seq_len in x_lengths])
        x = x * mask
        x = x.view(-1, 1, max_len, 2 * dh)
        x = self.cnn(x)
        x = x.view(x.shape[:3])
        x = self.g(x)
        x = self.dropout(x)
        x = pool(x, dim=2).values
        x = self.linear(x)
        x = self.dropout(x)
        x = self.softmax(x)
        return x 
    
    def update_from_word2vec(self, w2v, transformer):
        for word, idx in transformer.items():
            with torch.no_grad():
                if word in w2v:
                    self.embed.weight[idx].copy_(torch.from_numpy(w2v[word]))

                    
class MyDataSets(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = [torch.LongTensor(data) for data in x]
        self.y = [torch.LongTensor([data]) for data in y]

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

def collate_fn(batch):
    x = [data[0] for data in batch]
    x = nn.utils.rnn.pad_sequence(x, batch_first=True)
    y = torch.LongTensor([data[1] for data in batch])
    lengths = torch.LongTensor([len(data[0]) for data in batch])
    return x, y, lengths

def execution(data_x, data_y, op, criterion, model, batch_size=1, is_train=True, use_gpu=False):
    if is_train: model.train()
    else: model.eval()
    ndata = len(data_x)
    dataset = MyDataSets(data_x, data_y)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    sum_loss, acc_score = 0, 0
    for batch_x, batch_y, batch_lengths in data_loader:
        op.zero_grad()
        out = model(batch_x, batch_lengths)
        loss = criterion(out, batch_y)
        if is_train:
            loss.backward()
            op.step()
        sum_loss += loss.data.item() * len(batch_x)
        pred = torch.argmax(out, dim=1)
        acc_score += np.sum((pred == batch_y).cpu().detach().numpy())
    return sum_loss / ndata, acc_score / ndata * 100

In [396]:
x_train = load_file_json('work/train_x.json')['data']
y_train = np.asarray(load_file_json('work/train_y.json')['data'])
x_valid = load_file_json('work/valid_x.json')['data']
y_valid = np.asarray(load_file_json('work/valid_y.json')['data'])
x_test = load_file_json('work/test_x.json')['data']
y_test = np.asarray(load_file_json('work/test_y.json')['data'])


vocab_size = preprocess.vocab_size
torch.manual_seed(1234)
model = MyCl(vocab_size, dw=300, dh=50, w_sz=3, L=4, num_layers=2, bidirectional=True, dropout=0.3)
model.update_from_word2vec(w2v, preprocess.word_transformer)
nepoch = 30 
batch_size = 256
op = optim.Adagrad(model.parameters(), lr=0.01, lr_decay=0.001)
criterion = nn.NLLLoss() 


train_writer = SummaryWriter(log_dir='./work/logs/cnn/train')
valid_writer = SummaryWriter(log_dir='./work/logs/cnn/valid')
logger = list()
for epoch in tqdm.notebook.tqdm(range(nepoch)):
    train_loss, train_acc = execution(x_train, y_train, op, criterion, model, batch_size=batch_size)
    train_writer.add_scalar("loss", train_loss, epoch) 
    train_writer.add_scalar("accuracy", train_acc, epoch)
    with torch.no_grad():
        valid_loss, valid_acc = execution(x_valid, y_valid, op, criterion, model, batch_size=batch_size, is_train=False)
        valid_writer.add_scalar("loss", valid_loss, epoch)
        valid_writer.add_scalar("accuracy", valid_acc, epoch)
    logger.append({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
    print({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
train_writer.close()
valid_writer.close()

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

{'epoch': 0, 'train_loss': 0.7617954755990246, 'train_acc': 64.78464419475655, 'valid_loss': 0.45093154049991224, 'valid_acc': 82.47191011235955}
{'epoch': 1, 'train_loss': 0.5248867922731106, 'train_acc': 74.05430711610487, 'valid_loss': 0.386541759141822, 'valid_acc': 84.49438202247191}
{'epoch': 2, 'train_loss': 0.43306670436698397, 'train_acc': 76.45131086142321, 'valid_loss': 0.3535459771584929, 'valid_acc': 85.76779026217228}
{'epoch': 3, 'train_loss': 0.35876306201634783, 'train_acc': 79.70973782771536, 'valid_loss': 0.29925455965576103, 'valid_acc': 89.9625468164794}
{'epoch': 4, 'train_loss': 0.2839015465997132, 'train_acc': 82.7621722846442, 'valid_loss': 0.31662483724315515, 'valid_acc': 90.86142322097378}
{'epoch': 5, 'train_loss': 0.23849713913956833, 'train_acc': 84.70973782771536, 'valid_loss': 0.4425045225839043, 'valid_acc': 90.56179775280899}
{'epoch': 6, 'train_loss': 0.21623792997683478, 'train_acc': 85.07490636704121, 'valid_loss': 0.41337351468618444, 'valid_acc':

# 89. 事前学習済み言語モデルからの転移学習

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/98/87/ef312eef26f5cecd8b17ae9654cdd8d1fae1eb6dbd87257d6d73c128a4d0/transformers-4.3.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 3.6MB/s eta 0:00:01
Collecting sacremoses (from transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 11.6MB/s eta 0:00:01     |███████████████████████████████▉| 880kB 11.6MB/s eta 0:00:01
[?25hCollecting regex!=2019.12.17 (from transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/0a/d5/ae173868b6525c6f18f9a684c8842c0673cfc630430fcb48d8c6eb817f2e/regex-2020.11.13-cp37-cp37m-macosx_10_9_x86_64.whl (284kB)
[K     |████████████████████████████████| 286kB 9.0MB/s eta 0:00:01
[?25hCollecting tokenizers<0.11,>=0.10.1 (from transformers)
[?25l  Downloading https:

In [20]:
import numpy as np
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch import cuda
from transformers import BertTokenizer, BertModel
import tqdm

In [104]:
class MyBertModel(torch.nn.Module):
    def __init__(self, L=4, dropout=0.2):
        super(MyBertModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(p=dropout)
        self.softmax = nn.LogSoftmax(dim=1)
        self.linear = nn.Linear(768, L)
        
        
    def forward(self, inputs):
        out = self.bert(inputs['ids'], attention_mask=inputs['mask'])
        out = self.linear(self.dropout(out['pooler_output']))
        out = self.softmax(out)
        return out

                    
class MyDataSets(Dataset):
    def __init__(self, X, Y, tokenizer, max_len):
        self.X = X
        self.Y = Y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.Y)

    def __getitem__(self, idx):
        text = self.X[idx]
        inputs = self.tokenizer.encode_plus(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          pad_to_max_length=True,
          truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
          'ids': torch.LongTensor(ids),
          'mask': torch.LongTensor(mask),
          'labels': torch.LongTensor(self.Y[idx])
        }

def execution(dataset, op, criterion, model, batch_size=1, is_train=True, use_gpu=False):
    if is_train: model.train()
    else: model.eval()
    ndata = len(dataset)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=is_train)
    sum_loss, acc_score = 0, 0
    for data in data_loader:
        op.zero_grad()
        labels = data['labels'].reshape(-1)
        out = model(data)
        loss = criterion(out, labels)
        if is_train:
            loss.backward()
            op.step()
        sum_loss += loss.data.item() * len(labels)
        pred = torch.argmax(out, dim=1)
        acc_score += np.sum((pred == labels).cpu().detach().numpy())
    return sum_loss / ndata, acc_score / ndata * 100

In [114]:
x_train, _ = load_data('data/train.txt')
x_valid, _ = load_data('data/valid.txt')
x_test, _ = load_data('data/test.txt')
y_train = np.asarray(load_file_json('work/train_y.json')['data']).reshape(-1, 1)
y_valid = np.asarray(load_file_json('work/valid_y.json')['data']).reshape(-1, 1)
y_test = np.asarray(load_file_json('work/test_y.json')['data']).reshape(-1, 1)

max_len = 24
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset_train = MyDataSets(x_train, y_train, tokenizer, max_len)
dataset_valid = MyDataSets(x_valid, y_valid, tokenizer, max_len)
dataset_test = MyDataSets(x_test, y_test, tokenizer, max_len)

In [110]:
print(model)

MyBertModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [106]:
torch.manual_seed(1234)
model = MyBertModel(L=4, dropout=0.2)
nepoch = 10 
batch_size = 256 
op = optim.Adagrad(model.parameters(), lr=0.01, lr_decay=0.001)
criterion = nn.NLLLoss() 


#train_writer = SummaryWriter(log_dir='./work/logs/cnn/train')
#valid_writer = SummaryWriter(log_dir='./work/logs/cnn/valid')
logger = list()
max_valid = -1
max_model_param = None
for epoch in tqdm.notebook.tqdm(range(nepoch)):
    train_loss, train_acc = execution(dataset_train, op, criterion, model, batch_size=batch_size)
    #train_writer.add_scalar("loss", train_loss, epoch) 
    #train_writer.add_scalar("accuracy", train_acc, epoch)
    with torch.no_grad():
        valid_loss, valid_acc = execution(dataset_valid, op, criterion, model, batch_size=batch_size, is_train=False)
        #valid_writer.add_scalar("loss", valid_loss, epoch)
        #valid_writer.add_scalar("accuracy", valid_acc, epoch)
        
    if max_valid < valid_acc:
        max_valid = valid_acc
        max_model_param = model.state_dict()
        
    logger.append({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
    print({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
#train_writer.close()
#valid_writer.close()

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




KeyboardInterrupt: 

In [113]:
%%file src/gpu89.py

import numpy as np
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch import cuda
from transformers import BertTokenizer, BertModel
import tqdm
import json


def load_data(path):
    with open(path, mode='r') as f:
        X = list()
        for line in f:
            line = line.strip()
            splited_line = line.split('\t')
            X.append(splited_line[0])
        return X

def load_file_json(path):
    with open(path, mode='r') as in_file:
        data = json.load(in_file)
    return data

class MyBertModel(torch.nn.Module):
    def __init__(self, L=4, dropout=0.2):
        super(MyBertModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(p=dropout)
        self.softmax = nn.LogSoftmax(dim=1)
        self.linear = nn.Linear(768, L)
        
        
    def forward(self, inputs):
        out = self.bert(inputs['ids'], attention_mask=inputs['mask'])
        out = self.linear(self.dropout(out['pooler_output']))
        out = self.softmax(out)
        return out

                    
class MyDataSets(Dataset):
    def __init__(self, X, Y, tokenizer, max_len):
        self.X = X
        self.Y = Y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.Y)

    def __getitem__(self, idx):
        text = self.X[idx]
        inputs = self.tokenizer.encode_plus(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          pad_to_max_length=True,
          truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
          'ids': torch.LongTensor(ids),
          'mask': torch.LongTensor(mask),
          'labels': torch.LongTensor(self.Y[idx])
        }

def execution(dataset, op, criterion, model, batch_size=1, is_train=True, use_gpu=False):
    if is_train: model.train()
    else: model.eval()
    ndata = len(dataset)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=is_train)
    sum_loss, acc_score = 0, 0
    for data in data_loader:
        op.zero_grad()
        labels = data['labels'].reshape(-1)
        out = model(data)
        loss = criterion(out, labels)
        if is_train:
            loss.backward()
            op.step()
        sum_loss += loss.data.item() * len(labels)
        pred = torch.argmax(out, dim=1)
        acc_score += np.sum((pred == labels).cpu().detach().numpy())
    return sum_loss / ndata, acc_score / ndata * 100

if __name__ == "__main__":
    x_train = load_data('data/train.txt')
    x_valid = load_data('data/valid.txt')
    x_test = load_data('data/test.txt')
    y_train = np.asarray(load_file_json('work/train_y.json')['data']).reshape(-1, 1)
    y_valid = np.asarray(load_file_json('work/valid_y.json')['data']).reshape(-1, 1)
    y_test = np.asarray(load_file_json('work/test_y.json')['data']).reshape(-1, 1)

    max_len = 24
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    dataset_train = MyDataSets(x_train, y_train, tokenizer, max_len)
    dataset_valid = MyDataSets(x_valid, y_valid, tokenizer, max_len)
    dataset_test = MyDataSets(x_test, y_test, tokenizer, max_len)
    
    torch.manual_seed(1234)
    model = MyBertModel(L=4, dropout=0.2)
    nepoch = 10 
    batch_size = 256 
    op = optim.AdamW(model.parameters(), lr=0.00001)
    criterion = nn.NLLLoss() 


    logger = list()
    max_valid = -1
    max_model_param = None
    for epoch in tqdm.tqdm(range(nepoch)):
        train_loss, train_acc = execution(dataset_train, op, criterion, model, batch_size=batch_size)
        with torch.no_grad():
            valid_loss, valid_acc = execution(dataset_valid, op, criterion, model, batch_size=batch_size, is_train=False)

        if max_valid < valid_acc:
            max_valid = valid_acc
            max_model_param = model.state_dict()

        logger.append({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
        print({'epoch':epoch, 'train_loss':train_loss, 'train_acc':train_acc, 'valid_loss':valid_loss, 'valid_acc':valid_acc})
    
    model.load_state_dict(max_model_param)
    with torch.no_grad():
        test_loss, test_acc = execution(dataset_test, op, criterion, model, batch_size=batch_size, is_train=False)
        print(test_acc)

Overwriting src/gpu89.py
