In [5]:
from typing import List, Tuple

import gensim
import pandas as pd
import torch
import numpy as np

from module import Word2VecIdConverter, sentence2words

In [2]:
# Check that PyTorch can see a CUDA device: later cells call .cuda() unconditionally.
torch.cuda.is_available()

True

In [3]:
# Read the tab-separated training file; the trailing expression displays (rows, columns).
train_df = pd.read_csv('data/train.txt', delimiter='\t')
train_df.shape

(10672, 8)

In [4]:
# Build the converter used below (via `converter.word2id`) to turn words into ids.
# NOTE(review): Word2VecIdConverter comes from the local `module`; how it uses the mapping
# CSV and the word2vec binary is not visible from this notebook - confirm against its source.
converter = Word2VecIdConverter('data/mapping.csv', 'data/GoogleNews-vectors-negative300.bin')
# Kept for later: passed to the model as its `vocab_size` argument.
n_words = converter.get_n_words()

In [6]:
# Load the pretrained GoogleNews word2vec vectors (binary word2vec format).
w2v = gensim.models.KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)
# `load_word2vec_format` already returns a KeyedVectors object, so the embedding matrix is
# read from `.vectors` directly. The `.wv` / `.syn0` aliases are deprecated and no longer
# exist in gensim 4.
weights = w2v.vectors
# Displayed as (number of vectors, vector dimension).
weights.shape

(3000000, 300)

In [7]:
# Tokenise every title, convert the tokens to ids and wrap each id sequence in its own
# integer tensor (sequences keep their individual lengths; padding happens later).
# `torch.tensor(..., dtype=torch.long)` builds the integer tensor directly instead of
# creating a float32 tensor first and casting it, which would round large ids.
X = [
    torch.tensor(converter.word2id(sentence2words(title)), dtype=torch.long)
    for title in train_df.title
]
len(X)

10672

In [8]:
# Encode the category letters as integer class ids.
category_ids = train_df.category.map({'b': 0, 't': 1, 'e': 2, 'm': 3})
# A category outside the mapping becomes NaN, which would otherwise be cast silently to a
# meaningless integer label; fail early with the offending values instead.
if category_ids.isna().any():
    unknown = train_df.category[category_ids.isna()].unique().tolist()
    raise ValueError(f'unmapped category values: {unknown}')
y = torch.tensor(category_ids.to_list(), dtype=torch.long)

In [9]:
class CNN(torch.nn.Module):
    """Convolutional text classifier on top of frozen pretrained word embeddings.

    Pipeline: embedding lookup -> single-channel 2D convolution -> 2D max pooling ->
    flatten -> linear layer -> log-softmax over classes.

    Args:
        vocab_size: Kept for interface compatibility. The embedding table takes its
            size from ``weights``, exactly as the weights previously replaced the
            freshly created table.
        input_size: Embedding dimension; must equal ``weights.shape[1]``.
        hidden_size: Number of output channels of the convolution.
        output_size: Number of classes.
        kernel_size: Side length of the square convolution kernel; also used as the
            (square) max-pooling window.
        weights: Pretrained embedding matrix of shape (n_vectors, input_size).
        seq_len: Padded sequence length the linear layer is sized for. The default
            of 17 yields 5 pooled rows for ``kernel_size=3``, matching the previously
            hard-coded layer size. Any length that pools to the same number of rows
            is accepted by ``forward``.

    Raises:
        ValueError: If ``weights`` is not 2D with ``input_size`` columns, or if the
            convolution/pooling would produce an empty feature map.
    """

    def __init__(self,
                 vocab_size: int,
                 input_size: int,
                 hidden_size: int,
                 output_size: int,
                 kernel_size: int,
                 weights: np.ndarray,
                 seq_len: int = 17):
        super(CNN, self).__init__()
        if weights.ndim != 2 or weights.shape[1] != input_size:
            raise ValueError(
                f'weights must have shape (n_vectors, {input_size}), got {tuple(weights.shape)}'
            )
        self._hidden_size = hidden_size
        self._kernel_size = kernel_size

        # Output size of a stride-1, unpadded convolution followed by non-overlapping
        # max pooling with the same window: floor((n - k + 1) / k) along each axis.
        pooled_rows = (seq_len - kernel_size + 1) // kernel_size
        pooled_cols = (input_size - kernel_size + 1) // kernel_size
        if pooled_rows < 1 or pooled_cols < 1:
            raise ValueError(
                f'kernel_size={kernel_size} is too large for seq_len={seq_len} '
                f'and input_size={input_size}'
            )

        # Use the pretrained matrix directly as a frozen embedding table instead of
        # allocating a random table first and overwriting it.
        self.embedding = torch.nn.Embedding.from_pretrained(
            torch.from_numpy(weights), freeze=True
        )
        self.conv = torch.nn.Conv2d(
            in_channels=1,
            out_channels=hidden_size,
            kernel_size=kernel_size,
            stride=1,
            padding=0,
            groups=1,
            bias=True
        )
        self.linear = torch.nn.Linear(hidden_size * pooled_rows * pooled_cols, output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return per-class log-probabilities of shape (batch, output_size).

        Args:
            x: Integer id tensor of shape (batch, padded_length).

        Raises:
            ValueError: If the pooled feature map does not match the size the linear
                layer was built for (e.g. the batch was padded to a different length).
        """
        x = self.embedding(x)   # (batch, length, embedding_dim)
        x = x.unsqueeze(1)      # add the single input channel expected by Conv2d
        x = self.conv(x)
        x = torch.max_pool2d(x, kernel_size=(self._kernel_size, self._kernel_size))
        # Flatten per sample. Unlike view(-1, n), this can never fold a shape mismatch
        # into the batch dimension.
        x = x.reshape(x.size(0), -1)
        if x.size(1) != self.linear.in_features:
            raise ValueError(
                f'pooled feature size {x.size(1)} does not match the expected '
                f'{self.linear.in_features}; check the padded sequence length'
            )
        x = self.linear(x)
        return torch.log_softmax(x, dim=1)

In [10]:
# Hyper-parameters: embedding width (dw), number of convolution channels (dh),
# number of target classes and the square kernel size.
dw, dh = 300, 50
n_class, kernel_size = 4, 3
# Build the model with arguments in signature order, then move it to the GPU.
cnn = CNN(n_words, dw, dh, n_class, kernel_size, weights)
cnn = cnn.cuda()

In [11]:
# Adam over the model parameters with a learning rate of 1e-3, and the
# cross-entropy criterion used by the training loop.
optimizer = torch.optim.Adam(params=cnn.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

In [12]:
# Use the first `n_train_size` examples for training.
n_train_size = 9600
X_train = X[:n_train_size]
y_train = y[:n_train_size]
# Zero-pad every id sequence to the length of the longest training sequence so the
# examples can be stacked into one (n_examples, max_length) tensor.
X_pad = torch.nn.utils.rnn.pad_sequence(X_train, batch_first=True)

batch_size = 128
n_epochs = 30

# Copy the padded ids and the labels to the GPU once, instead of re-transferring each
# slice every epoch (and the labels twice per batch).
X_pad_gpu = X_pad.cuda()
y_train_gpu = y_train.cuda()

for epoch in range(1, n_epochs+1):
    # running_loss accumulates one loss value per mini-batch, so the printed number is
    # a sum over batches; `correct` counts correctly classified training examples.
    running_loss = 0.0
    correct = 0
    for i in range(0, len(X_train), batch_size):
        X_batch = X_pad_gpu[i:i+batch_size]
        y_batch = y_train_gpu[i:i+batch_size]
        optimizer.zero_grad()
        y_pred = cnn(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        # Accuracy uses the predictions made before this step's weight update.
        correct += (y_pred.argmax(1) == y_batch).sum().item()
        running_loss += loss.item()

    print(f'epoch {epoch}- loss: {running_loss}, accuracy: {correct/y_train.shape[0]}')
print('DONE')

epoch 1- loss: 86.28493183851242, accuracy: 0.5902083333333333
epoch 2- loss: 47.82752722501755, accuracy: 0.7822916666666667
epoch 3- loss: 36.54272583127022, accuracy: 0.8284375
epoch 4- loss: 31.49660536646843, accuracy: 0.8507291666666666
epoch 5- loss: 28.259257674217224, accuracy: 0.8661458333333333
epoch 6- loss: 25.99043382704258, accuracy: 0.87625
epoch 7- loss: 24.14384587109089, accuracy: 0.8865625
epoch 8- loss: 22.235573932528496, accuracy: 0.8972916666666667
epoch 9- loss: 20.305023714900017, accuracy: 0.9083333333333333
epoch 10- loss: 18.557565093040466, accuracy: 0.9178125
epoch 11- loss: 16.99018655717373, accuracy: 0.9280208333333333
epoch 12- loss: 15.558308206498623, accuracy: 0.9369791666666667
epoch 13- loss: 14.233071625232697, accuracy: 0.9427083333333334
epoch 14- loss: 13.001017540693283, accuracy: 0.9492708333333333
epoch 15- loss: 11.856257431209087, accuracy: 0.9561458333333334
epoch 16- loss: 10.798105597496033, accuracy: 0.961875
epoch 17- loss: 9.830118

In [13]:
# Predicted class ids for the first ten examples of the last mini-batch left in `y_pred`.
torch.argmax(y_pred, dim=1)[:10]

tensor([3, 2, 0, 3, 2, 0, 2, 2, 0, 2], device='cuda:0')