In [1]:
from typing import List, Tuple

import gensim
import pandas as pd
import torch
import numpy as np

from module import Word2VecIdConverter, sentence2words

In [3]:
torch.cuda.is_available()

True

In [4]:
train_df = pd.read_csv('data/train.txt', sep='\t')
train_df.shape

(10672, 8)

In [5]:
converter = Word2VecIdConverter('data/mapping.csv', 'data/GoogleNews-vectors-negative300.bin')
n_words = converter.get_n_words()

In [2]:
w2v = gensim.models.KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)
weights = w2v.wv.syn0
weights.shape

(3000000, 300)

In [6]:
results = map(sentence2words, train_df.title)
results = map(converter.word2id, results)
X = list(map(lambda x: torch.Tensor(x).long(), results))
len(X)

10672

In [7]:
y = torch.Tensor(train_df.category.map({'b': 0, 't': 1, 'e': 2, 'm': 3}).to_list()).long()

In [25]:
class RNN(torch.nn.Module):
    def __init__(self, vocab_size: int, input_size: int, hidden_size: int, output_size: int, weights: np.ndarray):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = torch.nn.Embedding(vocab_size, input_size)
        self.embedding.weight = torch.nn.Parameter(torch.from_numpy(weights))
        self.embedding.weight.requires_grad = False
        self.rnn = torch.nn.RNN(input_size, hidden_size, bidirectional=True)
        self.linear = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x: torch.Tensor, hidden: torch.Tensor) -> torch.Tensor:
        x = self.embedding(x)
        x, _ = self.rnn(x, hidden)
        x = self.linear(x[:, -1])
        return torch.log_softmax(x, dim=1)

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [21]:
dw = 300
dh = 50
n_class = 4
rnn = RNN(vocab_size=n_words, input_size=dw, hidden_size=dh, output_size=n_class, weights=weights).cuda()

In [22]:
n_train_size = 10000
X_train = X[:n_train_size]
y_train = y[:n_train_size]
#X_train = X
#y_train = y

In [23]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(rnn.parameters(), lr=0.01)

In [26]:
n_epochs = 10
batch_size = 256
for epoch in range(1, n_epochs+1):
    optimizer.step()
    for i in range(0, len(X_train), batch_size):
        X_pad = torch.nn.utils.rnn.pad_sequence(X_train[i:i+batch_size], batch_first=True).cuda()
        h_0 = torch.zeros(2*X_pad.shape[1]*dh).reshape(2, X_pad.shape[1], dh).cuda()

        y_pred = rnn(X_pad, h_0)
        loss = criterion(y_pred, y_train[i:i+batch_size].cuda())
        loss.backward()
        optimizer.step()
    print(f'epoch {epoch}: {loss.item()}')
print('DONE')

epoch 1: 1.2523601055145264
epoch 2: 1.3445429801940918
epoch 3: 1.3777530193328857
epoch 4: 1.5240954160690308
epoch 5: 1.6282480955123901
epoch 6: 2.114105463027954
epoch 7: 1.8348824977874756
epoch 8: 1.5840672254562378
epoch 9: 1.745245337486267
epoch 10: 1.5938433408737183
DONE


In [27]:
y_pred.argmax(dim=1)

tensor([0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')