In [1]:
import pandas as pd
import pickle

import torchtext
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split

torch.manual_seed(1)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7fcd03bfcb10>

In [2]:
with open('/root/share/data/train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)
with open('/root/share/data/train_label.pkl', 'rb') as f:
    train_label = pickle.load(f)
with open('/root/share/data/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)
with open('/root/share/data/test_data.pkl', 'rb') as f:
    test = pickle.load(f)

In [3]:
train, valid = random_split([[train_data[i], train_label[i]] for i in range(len(train_label))], [len(train_label)-2000,2000])

In [4]:
class LSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, target_size)
        self.dropout = nn.Dropout(0.3)
    
    def forward(self, input):
        embeds = self.word_embeddings(input)
        _, output = self.lstm(embeds)
        output = self.dropout(output[0])
        output = self.linear(output)
        output = F.log_softmax(output, dim=1).view(input.size()[0], -1)
        return output

In [52]:
# model hyperparameters
EMBEDDING_DIM = 200
HIDDEN_DIM = 100
VOCAB_SIZE = 42000
TARGET_SIZE = 3

# train hyperparameters
EPOCHS = 10
BATCH_SIZE = 1024

In [53]:
model = LSTM(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TARGET_SIZE)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

In [54]:
train_dataloader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(valid, batch_size=BATCH_SIZE, shuffle=False)

In [61]:
batch_num = len(train_dataloader)
best_accuracy = 0.0
trigger_times = 0
patience = 5
for epoch in range(3):
    running_loss = 0.0
    correct = 0
    for i, data in enumerate(train_dataloader, 1):
        inputs, labels = data

        optimizer.zero_grad()

        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        correct += (predicted==labels).sum().item()
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f'{epoch + 1} epoch loss: {running_loss / 297:.6f}, accuracy : {100*correct/len(train):.2f}%')
    running_loss = 0.0

    valid_correct = 0
    valid_total = 0
    with torch.no_grad():
        for data in test_dataloader:
            inputs, labels = data
        outputs = model(inputs)

        _, predicted = torch.max(outputs.data, 1)
        valid_total += labels.size(0)
        valid_correct += (predicted == labels).sum().item()
    print(f'Valid Accuracy : {100*valid_correct/valid_total:.2f}%')
    # early stopping
    current_accuracy = valid_correct/valid_total
    if current_accuracy < best_accuracy:
        trigger_times += 1
        if trigger_times >= patience :
            break
    else:
        torch.save(model.state_dict(), '/root/share/model/weight.pt')
        trigger_times = 0
        best_accuracy = current_accuracy

1 epoch loss: 0.789525, accuracy : 86.97%
Valid Accuracy : 84.53%
2 epoch loss: 0.786458, accuracy : 87.77%
Valid Accuracy : 85.76%
3 epoch loss: 0.785891, accuracy : 87.92%
Valid Accuracy : 84.73%


In [62]:
test_model = LSTM(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TARGET_SIZE)
test_model.load_state_dict(torch.load('/root/share/model/weight.pt'))
test_model.eval()

LSTM(
  (word_embeddings): Embedding(42000, 200)
  (lstm): LSTM(200, 100, batch_first=True)
  (linear): Linear(in_features=100, out_features=3, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [63]:
infer_dataloader = DataLoader(test, batch_size = BATCH_SIZE, shuffle=False)

In [64]:
res = torch.Tensor([])
with torch.no_grad():
    for data in infer_dataloader:
        outputs = test_model(data)

        _, predicted = torch.max(outputs.data, 1)
        res = torch.cat([res, predicted])

In [65]:
res = res.tolist()
res = list(map(int, res))
submission_data = {'index':[i for i in range(5000)], 'category' : res}
submission_df = pd.DataFrame(submission_data)
submission_df.to_csv('/root/share/data/lstm_submission.csv', index=False)