In [1]:
# data downloading
import data_util
import pandas as pd
from collections import defaultdict
import os
import sys

# Switch the working directory to the project checkout and echo it, so the
# location the notebook is running from is visible in the cell output.
PROJECT_ROOT = "/home/yxjiang/source/ml_playground"
os.chdir(PROJECT_ROOT)
print(os.getcwd())

from util import config

/home/yxjiang/source/ml_playground


In [2]:
# Source archive of the IMDB sentiment dataset and the local directory it is
# fetched into; the reviews are read from the "aclImdb" folder beneath it.
dataset_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dest_dir = "/tmp/data"
data_util.download_data(url=dataset_url, dest_dir=dest_dir)
dataset_folder_path = os.path.join(dest_dir, "aclImdb")

# Build the vocabulary: a word -> id mapping plus the matching word list.
vocabulary = data_util.get_vocabulary(
    folder_path=dataset_folder_path,
    file_suffix="vocab",
)
word_to_id, word_list = vocabulary
print("There size of vocabulary is :", len(word_to_id))

Destination folder [/tmp/data] exists.
Target file [aclImdb_v1.tar.gz] exists, skip downloading.
Start to extract [/tmp/data/aclImdb_v1.tar.gz] to [/tmp/data]...
File extracted
Processing vocabulary from [/tmp/data/aclImdb].
There size of vocabulary is : 89527


In [9]:
# model
import torch.nn as nn

class RNN(nn.Module):
    """Recurrent sentence classifier: embedding -> RNN/LSTM/GRU -> MLP head.

    The final hidden state of the last recurrent layer (both directions
    concatenated when bidirectional) is fed through a 64-unit ReLU layer,
    dropout, and a linear layer producing ``config.num_classes`` logits.

    Args:
        config: object exposing ``rnn_type`` (one of ``nn.RNN``, ``nn.LSTM``,
            ``nn.GRU``), ``word_embedding_length``, ``num_layers`` (None -> 1),
            ``num_directions`` (None -> 1; must be 1 or 2), ``dropout`` and
            ``num_classes``.
        vocabulary_size: number of rows in the embedding table.

    Raises:
        ValueError: if ``config.rnn_type`` is not a supported recurrent class
            or ``num_directions`` is not 1 or 2.

    NOTE(review): the batches appear to be padded upstream (pad_collate), so
    the final hidden state may be computed after trailing padding steps;
    packing the sequences would avoid that -- confirm against the collate
    function before changing.
    """

    def __init__(self, config, vocabulary_size):
        super().__init__()
        self.config = config
        self.rnn_type = config.rnn_type
        self.embed = nn.Embedding(vocabulary_size, config.word_embedding_length)
        self.num_layers = config.num_layers if config.num_layers is not None else 1
        self.directions = config.num_directions if config.num_directions is not None else 1
        self.hidden_size = 128

        if self.rnn_type not in (nn.RNN, nn.LSTM, nn.GRU):
            raise ValueError(
                "Unsupported rnn_type %r; expected nn.RNN, nn.LSTM or nn.GRU." % (self.rnn_type,)
            )
        if self.directions not in (1, 2):
            raise ValueError("num_directions must be 1 or 2, got %r." % (self.directions,))
        if self.rnn_type is nn.LSTM:
            # Kept for backward compatibility with code that reads this attribute.
            self.cell_size = 128

        # All three variants share the same constructor signature. Every one of
        # them must be batch-first because forward() feeds (batch, seq, embed)
        # tensors; previously only the LSTM was, so RNN/GRU misread the axes.
        self.rnn = self.rnn_type(
            input_size=config.word_embedding_length,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=self.directions == 2,
        )
        self.fc1 = nn.Linear(in_features=self.hidden_size * self.directions, out_features=64)
        self.dropout = nn.Dropout(config.dropout)
        self.fc2 = nn.Linear(in_features=64, out_features=config.num_classes)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, shape (batch, sentence_length).

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalised logits.
        """
        # The indices have to live on the embedding's device *before* lookup.
        x = x.to(self.embed.weight.device)
        batch = x.shape[0]
        x = self.embed(x)  # (batch, sentence_length, embedding_dim)

        # Initial hidden (and cell) states default to zeros when omitted, which
        # is exactly what the explicit h0/c0 tensors used to provide.
        _, state = self.rnn(x)
        # LSTM returns (h_n, c_n); RNN and GRU return h_n directly.
        ht = state[0] if self.rnn_type is nn.LSTM else state

        # h_n is laid out layer-major as (num_layers * directions, batch, hidden),
        # so the last `directions` rows belong to the top layer.
        ht = ht[-self.directions:]  # (directions, batch, hidden)
        ht = ht.permute(1, 0, 2).reshape(batch, -1)  # (batch, directions * hidden)

        x = nn.functional.relu(self.fc1(ht))
        x = self.dropout(x)
        return self.fc2(x)  # (batch, num_classes)

In [10]:
# put everything together
import time
from data_util import *
from models import *
from trainer import classification_trainer
from util import config

import torch.optim as optim
import torch.nn.functional as F


def run(rnn_type, **kwargs):
    """Train a recurrent classifier on the movie-review data.

    Builds a Config with the hyper-parameters fixed below, constructs the
    train and test datasets/loaders from the folders under
    ``dataset_folder_path``, instantiates ``RNN`` and hands everything to
    ``classification_trainer.train``.

    Args:
        rnn_type: recurrent layer class forwarded to the config (e.g. nn.LSTM).
        **kwargs: extra Config fields, forwarded unchanged
            (e.g. num_layers, num_directions).
    """
    cfg = config.Config(
        rnn_type=rnn_type,
        criteria=nn.CrossEntropyLoss,
        optimizer=optim.Adam,
        lr=0.00002,
        epochs=200,
        batch_size=1024,
        num_classes=2,
        sentence_max_length=200,
        word_embedding_length=128,
        activation=F.relu,
        dropout=0.1,
        **kwargs
    )

    def build_dataset(split):
        # One dataset per split; positive and negative reviews live in sibling folders.
        positive_dir = os.path.join(dataset_folder_path, split + "/pos")
        negative_dir = os.path.join(dataset_folder_path, split + "/neg")
        pipeline = transforms.Compose([
            TruncateTransform(cfg),
            WordsToIdsTransform(cfg, word_to_id),
            # PadTransform(cfg)
        ])
        return MovieReviewDataset(cfg, positive_dir, negative_dir, word_to_id, transform=pipeline)

    train_dataset = build_dataset("train")
    test_dataset = build_dataset("test")

    # Padding is left to the collate function; only the training loader shuffles.
    train_dataloader = DataLoader(
        dataset=train_dataset,
        batch_size=cfg.batch_size,
        shuffle=True,
        collate_fn=data_util.pad_collate,
    )
    test_dataloader = DataLoader(
        dataset=test_dataset,
        batch_size=cfg.batch_size,
        collate_fn=data_util.pad_collate,
    )

    # model = TextCNN(cfg, len(word_to_id)).to(device)
    vocabulary_size = len(word_list)
    model = RNN(cfg, vocabulary_size).to(cfg.device)

    classification_trainer.train(
        model=model,
        config=cfg,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        check_interval=10,
    )

# Other configurations (currently disabled):
# run(rnn_type=nn.GRU, num_layers=3, num_directions=2)
# run(rnn_type=nn.GRU, num_layers=1, num_directions=1)
# run(rnn_type=nn.LSTM, num_layers=3, num_directions=2)
# Active run: single-layer, unidirectional LSTM.
run(rnn_type=nn.LSTM, num_layers=1, num_directions=1)

[6 seconds](epoch: 0/200)[25000 samples] loss: 0.693.
eval loss: 0.693, accuracy: 49.420% [12355/25000]
[70 seconds](epoch: 10/200)[275000 samples] loss: 0.692.
eval loss: 0.693, accuracy: 49.648% [12412/25000]
[135 seconds](epoch: 20/200)[525000 samples] loss: 0.693.
eval loss: 0.693, accuracy: 49.816% [12454/25000]
[201 seconds](epoch: 30/200)[775000 samples] loss: 0.693.
eval loss: 0.693, accuracy: 49.796% [12449/25000]
[268 seconds](epoch: 40/200)[1025000 samples] loss: 0.693.
eval loss: 0.693, accuracy: 50.000% [12500/25000]
[334 seconds](epoch: 50/200)[1275000 samples] loss: 0.689.
eval loss: 0.693, accuracy: 50.300% [12575/25000]
[400 seconds](epoch: 60/200)[1525000 samples] loss: 0.690.
eval loss: 0.693, accuracy: 51.252% [12813/25000]
[466 seconds](epoch: 70/200)[1775000 samples] loss: 0.691.
eval loss: 0.693, accuracy: 51.180% [12795/25000]
[530 seconds](epoch: 80/200)[2025000 samples] loss: 0.685.
eval loss: 0.692, accuracy: 52.100% [13025/25000]
[596 seconds](epoch: 90/200)

KeyboardInterrupt: 

In [None]:
# tiny example
import torch

# Three 1-D float sequences of different lengths (3, 6 and 5 elements).
a = [
    torch.Tensor(values)
    for values in ([1, 2, 3], [1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5])
]
print('shapes of a:', [seq.shape for seq in a])

# Zero-pad to the longest sequence; the default layout is (max_len, batch).
pad = torch.nn.utils.rnn.pad_sequence(a)
print('pad:', pad)

# Pack the padded batch using the true lengths; enforce_sorted=False lets the
# sequences stay in their original (unsorted) order.
lengths = [seq.shape[0] for seq in a]
packed_pad = torch.nn.utils.rnn.pack_padded_sequence(
    pad,
    lengths=lengths,
    enforce_sorted=False,
)
print('packed_pad:', packed_pad)
