In [181]:
import gensim
# Load the pretrained word vectors from a binary word2vec-format file into the
# global `model`, which txt2vec() reads later.
model = gensim.models.KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)

unable to import 'smart_open.gcs', disabling that module


In [188]:
# Number of entries in the loaded vocabulary.
# NOTE(review): `.vocab` is the gensim 3.x attribute; gensim 4 replaced it with
# `key_to_index` -- confirm against the installed gensim version.
len(model.vocab)

3000000

# 70. 単語ベクトルの和による特徴量

In [4]:
import numpy as np
import re
from functools import reduce
from nltk.tokenize import word_tokenize
#from nltk.stem.porter import PorterStemmer

In [5]:
def load_data(dir_name, file_name):
    """Read a tab-separated text/label file into two NumPy arrays.

    Each non-blank line is expected to look like ``text<TAB>label``; only the
    first two tab-separated fields of a line are used.

    Args:
        dir_name: Directory prefix. It is concatenated directly with
            ``file_name``, so it must already end with a path separator.
        file_name: Name of the file inside ``dir_name``.

    Returns:
        Tuple ``(X, Y)`` of 1-D arrays holding the texts and the labels in
        file order.

    Raises:
        IndexError: If a non-blank line contains no tab separator.
    """
    X = []
    Y = []
    # Explicit encoding so the result does not depend on the platform default.
    with open(f'{dir_name}{file_name}', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no sample.
                continue
            fields = line.split('\t')
            X.append(fields[0])
            Y.append(fields[1])
    return np.asarray(X), np.asarray(Y)

def txt2vec(x):
    """Convert each text into the mean of its word vectors.

    Every text is tokenized with ``word_tokenize`` and the vectors of the
    tokens present in the global ``model`` are averaged.

    Args:
        x: Iterable of raw text strings.

    Returns:
        Array with exactly one row per input text. A text without any
        in-vocabulary token is represented by an all-zero vector so that the
        rows stay aligned with the separately stored labels.
    """
    vec_x_list = []
    for text in x:
        words = word_tokenize(text)
        # No stemming on purpose: without it more tokens were found in the
        # word-vector vocabulary.
        words_vec = [model[word] for word in words if word in model]
        if words_vec:
            vec_x_list.append(sum(words_vec) / len(words_vec))
        else:
            # Report the tokens, but still emit a row: skipping the sample
            # would shift every following feature row relative to its label.
            print(words)
            # assumes the stored vectors are float32 (word2vec default) -- TODO confirm
            vec_x_list.append(np.zeros(model.vector_size, dtype=np.float32))
    return np.asarray(vec_x_list)

def save_file_npy(dir_name, file_name, x):
    """Write array ``x`` with ``np.save`` to the path ``dir_name + file_name``.

    ``dir_name`` and ``file_name`` are joined by plain concatenation, so
    ``dir_name`` has to carry its own trailing separator.
    """
    target_path = '{}{}'.format(dir_name, file_name)
    np.save(target_path, x)
        
def load_file_npy(dir_name, file_name):
    """Return the array stored at ``dir_name + file_name`` via ``np.load``.

    The two arguments are concatenated as-is; ``file_name`` must include the
    ``.npy`` extension.
    """
    source_path = '{}{}'.format(dir_name, file_name)
    loaded = np.load(source_path)
    return loaded

def chr2num(y):
    """Map category letters to integer class ids.

    The mapping is ``b -> 0``, ``t -> 1``, ``e -> 2``, ``m -> 3``; any other
    label raises ``KeyError``.

    Args:
        y: Iterable of single-letter category labels.

    Returns:
        NumPy array with one integer id per label, in input order.
    """
    label_ids = {'b': 0, 't': 1, 'e': 2, 'm': 3}
    numeric_labels = []
    for article_type in y:
        numeric_labels.append(label_ids[article_type])
    return np.asarray(numeric_labels)

In [301]:
# Read the three tab-separated splits from data/ as (texts, labels) array pairs.
train_x, train_y = load_data('data/', 'train.txt')
valid_x, valid_y = load_data('data/', 'valid.txt')
test_x, test_y = load_data('data/', 'test.txt')

In [302]:
# Sanity check: texts and labels of every split should have the same length.
print(train_x.shape, train_y.shape)
print(valid_x.shape, valid_y.shape)
print(test_x.shape, test_y.shape)

(10680,) (10680,)
(1335,) (1335,)
(1335,) (1335,)


In [303]:
# Replace each split's raw texts with the feature matrix returned by txt2vec.
# NOTE: the *_x names are rebound here, so re-running this cell would pass
# vectors instead of text to txt2vec; reload the raw data first.
train_x = txt2vec(train_x)
valid_x = txt2vec(valid_x)
test_x = txt2vec(test_x)

In [304]:
# Convert the label letters of each split to integer class ids.
# NOTE: the *_y names are rebound here; re-running the cell on the converted
# integers would fail the letter lookup inside chr2num.
train_y = chr2num(train_y)
valid_y = chr2num(valid_y)
test_y = chr2num(test_y)

In [305]:
# Cache features and labels under work/ so later sections can start from disk.
# np.save appends the ".npy" extension, which is why the loads use "*.npy".
save_file_npy('work/', 'train_x', train_x)
save_file_npy('work/', 'train_y', train_y)
save_file_npy('work/', 'valid_x', valid_x)
save_file_npy('work/', 'valid_y', valid_y)
save_file_npy('work/', 'test_x', test_x)
save_file_npy('work/', 'test_y', test_y)

# 71. 単層ニューラルネットワークによる予測

In [6]:
# Reload the cached feature/label arrays from work/ as NumPy arrays.
train_x = load_file_npy('work/', 'train_x.npy')
train_y = load_file_npy('work/', 'train_y.npy')
valid_x = load_file_npy('work/', 'valid_x.npy')
valid_y = load_file_npy('work/', 'valid_y.npy')
test_x = load_file_npy('work/', 'test_x.npy')
test_y = load_file_npy('work/', 'test_y.npy')

In [307]:
# Confirm the feature matrix has one 300-dimensional row per training label.
print(train_x.shape, train_y.shape)

(10680, 300) (10680,)


In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#from torch.autograd import Variable

In [317]:
# Fix the torch RNG seed, then draw a random 300x4 weight matrix
# (300 input features, 4 article categories).
torch.manual_seed(0)
W = torch.randn(300, 4)
print(W)

tensor([[-1.1258, -1.1524, -0.2506, -0.4339],
        [ 0.8487,  0.6920, -0.3160, -2.1152],
        [ 0.3223, -1.2633,  0.3500,  0.3081],
        ...,
        [ 1.1469, -0.1733,  0.0637, -1.2699],
        [-0.6212, -0.2381,  0.0892,  1.8008],
        [-2.0627,  0.3222, -1.1390,  1.2418]])


In [318]:
# Wrap the NumPy arrays as tensors. The names are rebound, so from here on
# train_x/train_y are tensors until they are reloaded from disk, and this cell
# cannot be re-run without reloading (torch.from_numpy needs an ndarray).
train_x = torch.from_numpy(train_x)
train_y = torch.from_numpy(train_y)

In [319]:
# Class probabilities for the first training example: softmax(x_1 W).
print(F.softmax(torch.matmul(train_x[:1], W), dim=1))

tensor([[0.0073, 0.9393, 0.0504, 0.0030]])


In [320]:
# Same for the first four examples; dim=1 normalises each row separately.
print(F.softmax(torch.matmul(train_x[:4], W), dim=1))

tensor([[0.0073, 0.9393, 0.0504, 0.0030],
        [0.2980, 0.6627, 0.0371, 0.0021],
        [0.0568, 0.7394, 0.0224, 0.1814],
        [0.1399, 0.5424, 0.0614, 0.2563]])


# 72. 損失と勾配の計算

In [426]:
# Re-create W from the same seed, this time as an nn.Parameter with
# requires_grad=True so that autograd records its gradient.
torch.manual_seed(0)
W = nn.Parameter(torch.randn(300, 4), requires_grad=True)
print(W)

Parameter containing:
tensor([[-1.1258, -1.1524, -0.2506, -0.4339],
        [ 0.8487,  0.6920, -0.3160, -2.1152],
        [ 0.3223, -1.2633,  0.3500,  0.3081],
        ...,
        [ 1.1469, -0.1733,  0.0637, -1.2699],
        [-0.6212, -0.2381,  0.0892,  1.8008],
        [-2.0627,  0.3222, -1.1390,  1.2418]], requires_grad=True)


In [427]:
# Loss criterion plus the first four training examples and their labels.
criterion = nn.CrossEntropyLoss()
v_train_x = train_x[:4]
v_train_y = train_y[:4]

In [428]:
# Inspect the four selected feature rows.
print(v_train_x)

tensor([[ 0.0170, -0.0622, -0.0194,  ..., -0.0100,  0.1557, -0.0435],
        [-0.0488,  0.0571,  0.1176,  ...,  0.0959,  0.1008,  0.0121],
        [ 0.0923,  0.0072, -0.2906,  ...,  0.0138,  0.0388, -0.0379],
        [ 0.0472, -0.0069, -0.0391,  ..., -0.0337,  0.0504, -0.1157]])


In [429]:
# Cross-entropy loss over the four selected examples.
# The logits get their own name instead of overwriting v_train_x, so the
# inputs stay intact and this cell can be re-run.
v_logits = torch.matmul(v_train_x, W)
loss = criterion(v_logits, v_train_y)

In [430]:
# Loss over the four-example batch.
print(loss)

tensor(3.4065, grad_fn=<NllLossBackward>)


In [431]:
# Backpropagate and show the gradient of the loss with respect to W.
loss.backward()
print(W.grad)

tensor([[ 0.0115,  0.0194, -0.0339,  0.0029],
        [-0.0103, -0.0047, -0.0004,  0.0154],
        [-0.0262, -0.0441,  0.0811, -0.0108],
        ...,
        [-0.0178,  0.0115,  0.0053,  0.0010],
        [-0.0151,  0.0673, -0.0184, -0.0338],
        [-0.0068, -0.0309,  0.0360,  0.0017]])


In [432]:
# Re-create W from the same seed so the next gradient is computed on a fresh
# Parameter rather than on top of the gradient accumulated above.
torch.manual_seed(0)
W = nn.Parameter(torch.randn(300, 4), requires_grad=True)
print(W)

Parameter containing:
tensor([[-1.1258, -1.1524, -0.2506, -0.4339],
        [ 0.8487,  0.6920, -0.3160, -2.1152],
        [ 0.3223, -1.2633,  0.3500,  0.3081],
        ...,
        [ 1.1469, -0.1733,  0.0637, -1.2699],
        [-0.6212, -0.2381,  0.0892,  1.8008],
        [-2.0627,  0.3222, -1.1390,  1.2418]], requires_grad=True)


In [433]:
# Cross-entropy loss for the first training example only.
out = torch.matmul(train_x[:1], W)
loss = criterion(out, train_y[:1])

In [434]:
# Loss for the single example.
print(loss)

tensor(5.8259, grad_fn=<NllLossBackward>)


In [435]:
# Backpropagate and show the gradient of the single-example loss w.r.t. W.
loss.backward()
print(W.grad)

tensor([[ 1.2397e-04,  1.5989e-02,  8.5838e-04, -1.6971e-02],
        [-4.5319e-04, -5.8451e-02, -3.1380e-03,  6.2042e-02],
        [-1.4114e-04, -1.8203e-02, -9.7726e-04,  1.9321e-02],
        ...,
        [-7.3013e-05, -9.4169e-03, -5.0556e-04,  9.9955e-03],
        [ 1.1338e-03,  1.4623e-01,  7.8504e-03, -1.5521e-01],
        [-3.1683e-04, -4.0864e-02, -2.1938e-03,  4.3374e-02]])


# 73. 確率的勾配降下法による学習

In [None]:
import tqdm

In [436]:
# Reload every split as NumPy arrays: train_x/train_y were rebound to tensors
# earlier, while train() indexes with a NumPy permutation and then calls
# torch.from_numpy on each batch.
train_x = load_file_npy('work/', 'train_x.npy')
train_y = load_file_npy('work/', 'train_y.npy')
valid_x = load_file_npy('work/', 'valid_x.npy')
valid_y = load_file_npy('work/', 'valid_y.npy')
test_x = load_file_npy('work/', 'test_x.npy')
test_y = load_file_npy('work/', 'test_y.npy')

In [446]:
# Initialise the trainable weight matrix from a fixed seed before training.
torch.manual_seed(0)
W = nn.Parameter(torch.randn(300, 4), requires_grad=True)
print(W)

Parameter containing:
tensor([[-1.1258, -1.1524, -0.2506, -0.4339],
        [ 0.8487,  0.6920, -0.3160, -2.1152],
        [ 0.3223, -1.2633,  0.3500,  0.3081],
        ...,
        [ 1.1469, -0.1733,  0.0637, -1.2699],
        [-0.6212, -0.2381,  0.0892,  1.8008],
        [-2.0627,  0.3222, -1.1390,  1.2418]], requires_grad=True)


In [447]:
def train(train_x, train_y, op, criterion, batch_size=4, nepoch=100):
    """Train the global weight matrix ``W`` with mini-batch gradient descent.

    The data is reshuffled at the start of every epoch and processed in slices
    of ``batch_size``; each slice is multiplied by ``W`` and scored with
    ``criterion``.

    Args:
        train_x: NumPy feature array, indexed by a NumPy permutation.
        train_y: NumPy label array aligned with ``train_x``.
        op: Optimizer whose ``zero_grad``/``step`` are called once per batch
            (presumably built over ``W`` -- verify at the call site).
        criterion: Callable ``criterion(logits, labels)`` returning the loss.
        batch_size: Number of examples per update.
        nepoch: Number of passes over the data.

    Returns:
        List with one entry per epoch: the batch losses weighted by batch
        length, divided by the number of training examples.
    """
    ntrain = len(train_x)
    epoch_losses = []
    for _ in tqdm.notebook.tqdm(range(nepoch)):
        order = np.random.permutation(ntrain)
        running_loss = 0
        for start in range(0, ntrain, batch_size):
            idx = order[start:start + batch_size]
            inputs = torch.from_numpy(train_x[idx])
            targets = torch.from_numpy(train_y[idx])
            logits = torch.matmul(inputs, W)
            loss = criterion(logits, targets)
            op.zero_grad()
            loss.backward()
            op.step()
            # Weight by batch length so a short final batch is not over-counted.
            running_loss += loss.item() * len(logits)
        epoch_losses.append(running_loss / ntrain)
    return epoch_losses

In [467]:
# Training configuration: plain SGD on W with cross-entropy loss, updating
# after every single example (batch_size=1).
ntrain = len(train_x)
nepoch = 100
op = optim.SGD([W], lr=0.01)
criterion = nn.CrossEntropyLoss()
# Pass nepoch explicitly so that editing it above actually changes the run;
# otherwise train() silently falls back to its own default.
result = train(train_x, train_y, op, criterion, batch_size=1, nepoch=nepoch)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [468]:
# Weights after training.
print(W)

Parameter containing:
tensor([[-2.0231, -0.1074,  0.4542, -1.2848],
        [ 0.4140,  1.0014, -0.1300, -2.1738],
        [ 0.8591,  0.7397, -2.5430,  0.6604],
        ...,
        [ 0.3355, -0.0249, -0.7943,  0.2516],
        [ 1.9430, -1.6062, -0.3233,  1.0196],
        [-2.2287, -1.0866, -0.4814,  2.1579]], requires_grad=True)


In [469]:
# Mean training loss of every 10th epoch.
print(result[::10])

[0.28822114222264317, 0.2729065997679739, 0.26329753994712024, 0.25645157239017474, 0.25113904970215967, 0.24712463183076697, 0.24390258440257187, 0.24111985022297414, 0.2389436593737681, 0.23713426657275935]


# 74. 正解率の計測

In [24]:
from sklearn.metrics import accuracy_score

In [474]:
# Accuracy on the test split: predict the class with the highest probability.
test_logits = torch.matmul(torch.from_numpy(test_x), W)
test_probs = F.softmax(test_logits, dim=-1)
pred_y = torch.argmax(test_probs, dim=1)
test_acc = accuracy_score(test_y, pred_y)
print(f'test_acc={test_acc}')

test_acc=0.9071161048689138


In [475]:
# Accuracy on the training split. Prediction needs no gradients, so it runs
# under no_grad like the evaluation code later in the notebook.
with torch.no_grad():
    pred_y = torch.argmax(F.softmax(torch.matmul(torch.from_numpy(train_x), W), dim=-1), dim=1)
# The value is an accuracy, so label it as one (it was mislabelled "train_loss").
print(f'train_acc={accuracy_score(train_y, pred_y)}')

train_loss=0.9195692883895131


# 75. 損失と正解率のプロット

In [19]:
from torch.utils.tensorboard import SummaryWriter
import tqdm

In [20]:
def execution(data_x, data_y, op, criterion, batch_size=4, nepoch=100, train=True):
    """Run one shuffled pass over the data and return the mean loss.

    Uses the global weight matrix ``W``. With ``train=True`` every batch is
    followed by a backward pass and an optimizer step; otherwise the loss is
    only evaluated under ``torch.no_grad()``.

    Args:
        data_x: NumPy feature array, indexed by a NumPy permutation.
        data_y: NumPy label array aligned with ``data_x``.
        op: Optimizer; ``zero_grad`` is called for every batch in both modes,
            ``step`` only when training.
        criterion: Callable ``criterion(logits, labels)`` returning the loss.
        batch_size: Number of examples per batch.
        nepoch: Not used by this function.
        train: Whether to update the weights.

    Returns:
        Batch losses weighted by batch length, divided by ``len(data_x)``.
    """
    ndata = len(data_x)
    order = np.random.permutation(ndata)
    total_loss = 0
    for start in range(0, ndata, batch_size):
        op.zero_grad()
        idx = order[start:start + batch_size]
        inputs = torch.from_numpy(data_x[idx])
        targets = torch.from_numpy(data_y[idx])
        if not train:
            # Evaluation only: no graph is built and the weights stay as they are.
            with torch.no_grad():
                loss = criterion(torch.matmul(inputs, W), targets)
        else:
            loss = criterion(torch.matmul(inputs, W), targets)
            loss.backward()
            op.step()
        total_loss += loss.item() * len(inputs)
    return total_loss / ndata

In [21]:
# Reset the weights from a fixed seed and rebuild the optimizer over the new W
# (an optimizer created for an earlier W would not update this one).
torch.manual_seed(0)
W = nn.Parameter(torch.randn(300, 4), requires_grad=True)
print(W)
ntrain = len(train_x)
nepoch = 100
op = optim.SGD([W], lr=0.01)
criterion = nn.CrossEntropyLoss()

Parameter containing:
tensor([[-1.1258, -1.1524, -0.2506, -0.4339],
        [ 0.8487,  0.6920, -0.3160, -2.1152],
        [ 0.3223, -1.2633,  0.3500,  0.3081],
        ...,
        [ 1.1469, -0.1733,  0.0637, -1.2699],
        [-0.6212, -0.2381,  0.0892,  1.8008],
        [-2.0627,  0.3222, -1.1390,  1.2418]], requires_grad=True)


In [None]:
# Train for nepoch epochs; after each epoch record the train/validation loss
# and accuracy both in Python lists and as TensorBoard scalars.
train_loss_list = []
valid_loss_list = []
writer = SummaryWriter(log_dir='./work/logs')
try:
    for epoch in tqdm.notebook.tqdm(range(nepoch)):
        train_loss = execution(train_x, train_y, op, criterion, batch_size=1)
        writer.add_scalar("train_loss", train_loss, epoch)
        train_loss_list.append(train_loss)

        # train=False: the loss is evaluated without updating W.
        valid_loss = execution(valid_x, valid_y, op, criterion, batch_size=1, nepoch=100, train=False)
        writer.add_scalar("valid_loss", valid_loss, epoch)
        valid_loss_list.append(valid_loss)

        with torch.no_grad():
            pred_y = torch.argmax(F.softmax(torch.matmul(torch.from_numpy(train_x), W), dim=-1), dim=1)
            train_acc = accuracy_score(train_y, pred_y)
            writer.add_scalar("train_acc_score", train_acc, epoch)

            pred_y = torch.argmax(F.softmax(torch.matmul(torch.from_numpy(valid_x), W), dim=-1), dim=1)
            valid_acc = accuracy_score(valid_y, pred_y)
            writer.add_scalar("valid_acc_score", valid_acc, epoch)
finally:
    # Close the writer even when the long-running loop is interrupted, so the
    # scalars logged so far are flushed to disk.
    writer.close()

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

In [26]:
# Per-epoch mean losses collected by the training loop (training, then validation).
print(train_loss_list)
print(valid_loss_list)

[0.5028027535337544, 0.4530537145898094, 0.4212120260656263, 0.39885072301978136, 0.3817836359072561, 0.36883020925704163, 0.35804143832598584, 0.3490760947505988, 0.3414108556378632, 0.3348980882975398, 0.32918650068763544, 0.3240989345345004, 0.3195082837662921, 0.3153567257739548, 0.3116891590673217, 0.3080970557812238, 0.3048600675543025, 0.302021585000974, 0.2993079356373766, 0.2968831482188698]
[0.45987449703108707, 0.4238755054391452, 0.3987418100730527, 0.3810014982517295, 0.3682995367835792, 0.3582362641225979, 0.34990171425655586, 0.3433129606341169, 0.33751936203865257, 0.3334261171332633, 0.3291596764830163, 0.32494030997966145, 0.32244329215217704, 0.31896338794413254, 0.3166717988324684, 0.3146938183171857, 0.312967406040082, 0.3105635773455405, 0.30909733356401886, 0.30747632840379246]


# 76. チェックポイント

# 77. ミニバッチ化

In [None]:
def train(train_x, train_y, op, criterion, batch_size=4, nepoch=100):
    """Mini-batch training of the global weight matrix ``W``.

    The signature matches the earlier ``train`` definition in this notebook:
    parameters without defaults come first (the previous ordering was a
    syntax error).

    Args:
        train_x: NumPy feature array.
        train_y: NumPy array of integer class labels aligned with ``train_x``.
        op: Optimizer; ``zero_grad`` and ``step`` are called once per batch.
        criterion: Loss taking raw logits and integer labels, e.g.
            ``nn.CrossEntropyLoss`` (which applies log-softmax itself).
        batch_size: Number of examples per update.
        nepoch: Number of passes over the data.

    Returns:
        List with the mean training loss of each epoch.
    """
    # Local import: the module-level name `tqdm` is the package, which is not
    # callable; tqdm.auto picks the notebook or console progress bar.
    from tqdm.auto import tqdm as progress_bar

    ntrain = len(train_x)
    loss_list = []
    for epoch in progress_bar(range(1, nepoch + 1)):
        # Reshuffle every epoch so the batches differ between epochs.
        perm = np.random.permutation(ntrain)
        sum_loss = 0
        for i in range(0, ntrain, batch_size):
            batch_idx = perm[i:i + batch_size]
            batch_x = torch.from_numpy(train_x[batch_idx])
            batch_y = torch.from_numpy(train_y[batch_idx]).long()
            # Feed logits, not softmax output: the criterion normalises
            # internally, so a softmax here would be applied twice.
            logits = torch.matmul(batch_x, W)
            loss = criterion(logits, batch_y)
            op.zero_grad()
            loss.backward()
            op.step()
            # Weight by batch length so a short final batch is not over-counted.
            sum_loss += loss.item() * len(batch_x)
        loss_list.append(sum_loss / ntrain)
    return loss_list

# 78. GPU上での学習

In [136]:
# Check whether a CUDA device is usable from this PyTorch installation.
print(torch.cuda.is_available())

False


# 79. 多層ニューラルネットワーク