In [120]:
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
import torch

from module import Word2IdConverter, sentence2words

In [72]:
# Read the tab-delimited training file into a DataFrame and display its
# (rows, columns) shape as the cell result.
train_df = pd.read_csv(
    'data/train.txt',
    delimiter='\t',
)
train_df.shape

(10672, 8)

In [73]:
# Instantiate the project's Word2IdConverter from 'data/mapping.csv'.
# NOTE(review): presumably this file holds the word -> id table; confirm in module.py.
converter = Word2IdConverter('data/mapping.csv')
# Count reported by the converter; it is passed as `vocab_size` to the RNN's
# embedding layer further down, so it must be greater than every id the
# converter can emit — TODO confirm against Word2IdConverter.
n_words = converter.get_n_words()

In [303]:
# Turn every title into a 1-D LongTensor of word ids:
#   title -> sentence2words -> converter.word2id -> tensor.
# torch.tensor(..., dtype=torch.long) builds the integer tensor directly.
# The previous torch.Tensor(ids).long() went through float32 first, which
# silently rounds ids above 2**24 and treats a bare int as a size.
X = [
    torch.tensor(converter.word2id(sentence2words(title)), dtype=torch.long)
    for title in train_df.title
]
len(X)

10672

In [309]:
class RNN(torch.nn.Module):
    def __init__(self, vocab_size: int, input_size: int, hidden_size: int, output_size: int):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = torch.nn.Embedding(vocab_size, input_size)
        self.rnn = torch.nn.RNN(input_size, hidden_size)
        self.linear = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x: torch.Tensor, hidden: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        x = self.embedding(x)
        x, _ = self.rnn(x, hidden)
        x = self.linear(x[:, -1])
        return torch.log_softmax(x, dim=1)

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [310]:
dw = 300
dh = 50
n_class = 4
rnn = RNN(vocab_size=n_words, input_size=dw, hidden_size=dh, output_size=n_class)

In [321]:
n_train_size = 100
X_train = X[:n_train_size]
X_pad = torch.nn.utils.rnn.pad_sequence(X_train, batch_first=True)
h_0 = torch.zeros(1*X_pad.shape[1]*dh).reshape(1, X_pad.shape[1], dh)

print(X_pad.shape)
y_pred = rnn(X_pad, h_0)

torch.Size([100, 18])


In [322]:
# Predicted class per row: index of the largest log-probability along dim 1.
torch.argmax(y_pred, dim=1)

tensor([2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2])