In [13]:
from typing import List, Tuple

import pandas as pd
import torch
import numpy as np

from module import Word2IdConverter, sentence2words

In [14]:
train_df = pd.read_csv('data/train.txt', sep='\t')
train_df.shape

(10672, 8)

In [15]:
converter = Word2IdConverter('data/mapping.csv')
n_words = converter.get_n_words()

In [16]:
results = map(sentence2words, train_df.title)
results = map(converter.word2id, results)
X = list(map(lambda x: torch.Tensor(x).long(), results))
len(X)

10672

In [17]:
y = torch.Tensor(train_df.category.map({'b': 0, 't': 1, 'e': 2, 'm': 3}).to_list()).long()

In [18]:
class CNN(torch.nn.Module):
    def __init__(self,
                 vocab_size: int,
                 input_size: int,
                 hidden_size: int,
                 output_size: int,
                 kernel_size: int):
        super(CNN, self).__init__()
        self._hidden_size = hidden_size
        self._kernel_size = kernel_size
        self.embedding = torch.nn.Embedding(vocab_size, input_size)
        self.conv = torch.nn.Conv2d(
            in_channels=1,
            out_channels=hidden_size,
            kernel_size=kernel_size,
            stride=1,
            padding=0,
            groups=1,
            bias=True
        )
        self.linear = torch.nn.Linear(hidden_size*5*99, output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.embedding(x)
        x = x.unsqueeze(1)
        x = self.conv(x)
        x = torch.max_pool2d(x, kernel_size=(self._kernel_size, self._kernel_size))
        x = x.view(-1, self._hidden_size*5*99)
        x = self.linear(x)
        return torch.log_softmax(x, dim=1)

In [37]:
dw = 300
dh = 50
n_class = 4
kernel_size = 3
cnn = CNN(vocab_size=n_words, input_size=dw, hidden_size=dh, output_size=n_class, kernel_size=kernel_size)

In [38]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(cnn.parameters(), lr=0.01)

In [39]:
n_train_size = 1000
X_train = X[:n_train_size]
y_train = y[:n_train_size]

n_epochs = 10
for epoch in range(1, n_epochs+1):
    optimizer.zero_grad()
    X_pad = torch.nn.utils.rnn.pad_sequence(X_train, batch_first=True)
    y_pred = cnn(X_pad)
    loss = criterion(y_pred, y_train)
    loss.backward()
    optimizer.step()
    accuracy = (y_pred.argmax(1) == y_train).sum().item() / y_train.shape[0]
    print(f'epoch {epoch}- loss: {loss.item()}, accuracy: {accuracy}')
print('DONE')

epoch 1- loss: 1.4820444583892822, accuracy: 0.145
epoch 2- loss: 20.87087059020996, accuracy: 0.435
epoch 3- loss: 70.64411926269531, accuracy: 0.392
epoch 4- loss: 14.961731910705566, accuracy: 0.435
epoch 5- loss: 45.07423782348633, accuracy: 0.392
epoch 6- loss: 31.241729736328125, accuracy: 0.07
epoch 7- loss: 26.482818603515625, accuracy: 0.435
epoch 8- loss: 33.544498443603516, accuracy: 0.392
epoch 9- loss: 28.559917449951172, accuracy: 0.103
epoch 10- loss: 34.77330780029297, accuracy: 0.435
DONE


In [40]:
y_pred.argmax(dim=1)[:10]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])