In [1]:
# data downloading
import data_util
import pandas as pd
from collections import defaultdict
import os
import sys

# Switch to the project root before the `from config import *` below runs.
# The location can be overridden with the ML_PLAYGROUND_ROOT environment variable
# so the notebook is not tied to a single user's home directory; the default is
# the path this notebook was originally executed from.
PROJECT_ROOT = os.environ.get("ML_PLAYGROUND_ROOT") or "/home/yxjiang/source/ml_playground"
os.chdir(PROJECT_ROOT)
print(os.getcwd())

from config import *

/home/yxjiang/source/ml_playground


In [2]:
# Where the archive is stored and unpacked; the extracted corpus lives in "aclImdb".
dest_dir = "/tmp/data"
dataset_folder_path = os.path.join(dest_dir, "aclImdb")

# Fetch and extract the Stanford aclImdb sentiment archive into dest_dir.
dataset_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
data_util.download_data(dest_dir=dest_dir, url=dataset_url)

# Build the word -> id mapping (and the companion word list) from the dataset's vocab file.
word_to_id, word_list = data_util.get_vocabulary(
    file_suffix="vocab",
    folder_path=dataset_folder_path,
)
print("There size of vocabulary is :", len(word_to_id))

Destination folder [/tmp/data] exists.
Target file [aclImdb_v1.tar.gz] exists, skip downloading.
Start to extract [/tmp/data/aclImdb_v1.tar.gz] to [/tmp/data]...
File extracted
Processing vocabulary from [/tmp/data/aclImdb].
There size of vocabulary is : 89527


In [3]:
# model
class RNN(nn.Module):
    """Text classifier: embedding -> recurrent layer(s) -> two fully connected layers.

    The recurrent layer class is taken from ``config.rnn_type`` and must be one of
    ``nn.RNN``, ``nn.LSTM`` or ``nn.GRU``. The hidden size is fixed at 128 and the
    network is unidirectional (``directions == 1``).

    Args:
        config: object providing ``rnn_type``, ``word_embedding_length``,
            ``dropout`` and ``num_classes``. An optional integer ``num_layers``
            selects the number of stacked recurrent layers (defaults to 1 when
            the attribute is missing or falsy).
        vocabulary_size: number of rows in the embedding table.

    Raises:
        ValueError: if ``config.rnn_type`` is not a supported recurrent class.
    """

    _SUPPORTED_RNN_TYPES = (nn.RNN, nn.LSTM, nn.GRU)

    def __init__(self, config, vocabulary_size):
        super().__init__()
        # Fail at construction time instead of leaving `self.rnn` undefined and
        # crashing later inside forward().
        if not any(config.rnn_type is kind for kind in self._SUPPORTED_RNN_TYPES):
            raise ValueError(
                "Unsupported rnn_type {!r}; expected nn.RNN, nn.LSTM or nn.GRU.".format(config.rnn_type)
            )

        self.config = config
        self.rnn_type = config.rnn_type
        self.embed = nn.Embedding(vocabulary_size, config.word_embedding_length)
        # Honour the layer count carried by the config rather than silently
        # ignoring it; fall back to a single layer when it is not provided.
        self.num_layers = getattr(config, "num_layers", None) or 1
        self.directions = 1
        self.hidden_size = 128
        if self.rnn_type is nn.LSTM:
            # The LSTM cell state has the same width as the hidden state.
            self.cell_size = self.hidden_size
        # nn.RNN, nn.LSTM and nn.GRU accept the same constructor arguments here.
        self.rnn = self.rnn_type(
            input_size=config.word_embedding_length,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
        )
        self.fc1 = nn.Linear(in_features=self.hidden_size * self.directions, out_features=64)
        self.dropout = nn.Dropout(config.dropout)
        self.fc2 = nn.Linear(in_features=64, out_features=config.num_classes)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, shape (batch, sentence_length).

        Returns:
            Tensor of shape (batch, num_classes) with unnormalised scores.
        """
        batch = x.shape[0]
        x = self.embed(x)  # (batch, sentence_length, embedding_dim)
        x = x.permute(1, 0, 2).contiguous()  # (sentence_length, batch, embedding_dim)

        # Allocate the zero initial state with the input's device and dtype, so the
        # module works wherever it has been moved, independent of config.device.
        state_shape = (self.num_layers * self.directions, batch, self.hidden_size)
        h0 = x.new_zeros(state_shape)
        if self.rnn_type is nn.LSTM:
            c0 = x.new_zeros(state_shape)
            _, (ht, _) = self.rnn(x, (h0, c0))
        else:  # nn.RNN or nn.GRU (validated in __init__)
            _, ht = self.rnn(x, h0)

        # Decompose layers and directions, then keep only the last layer.
        ht = ht.view(self.num_layers, self.directions, batch, self.hidden_size)
        ht = ht[-1]  # (directions, batch, hidden_size)
        # Flatten the directions into the feature axis: (batch, directions * hidden_size).
        ht = ht.permute(1, 0, 2).contiguous().view(batch, self.directions * self.hidden_size)

        x = F.relu(self.fc1(ht))
        x = self.dropout(x)
        x = self.fc2(x)  # (batch, num_classes)
        return x

In [4]:
# put everything together
import time
from data_util import *
from models import *
from trainer import classification_trainer

import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

# config = TextCNNConfig(
#             criteria=nn.CrossEntropyLoss, optimizer=optim.Adam, lr=0.00003, epochs=1000, 
#             batch_size=1024, num_classes=2, sentence_max_length=30, word_embedding_length=128, 
#             activation=F.relu, dropout=0.1, conv_layer_sizes=[3,4,5,6,7]
#         )

def run(rnn_type, num_layers, **kwargs):
    """Train and periodically evaluate one RNN classifier on the review dataset.

    Builds an ``RNNConfig`` from the fixed hyper-parameters below, wraps the
    train and test review folders in datasets and data loaders, instantiates
    ``RNN`` and passes everything to ``classification_trainer.train``.

    Args:
        rnn_type: recurrent layer class forwarded to ``RNNConfig`` (this notebook
            passes ``nn.GRU`` and ``nn.LSTM``).
        num_layers: forwarded to ``RNNConfig`` as ``num_layers``.
        **kwargs: additional keyword arguments forwarded unchanged to
            ``RNNConfig`` (e.g. ``weight_decay``).

    Depends on the notebook globals ``dataset_folder_path``, ``word_to_id`` and
    ``word_list`` created in an earlier cell.
    """
    config = RNNConfig(
        rnn_type=rnn_type,
        criteria=nn.CrossEntropyLoss,
        optimizer=optim.Adam,
        lr=0.00001,
        epochs=500,
        batch_size=1024,
        num_classes=2,
        sentence_max_length=40,
        word_embedding_length=128,
        num_layers=num_layers,
        activation=F.relu,
        dropout=0.1,
        **kwargs
    )

    def build_dataset(pos_subdir, neg_subdir):
        # One dataset per split: positive folder first, negative folder second,
        # each sample cut/padded and then mapped from words to ids.
        pos_folder = os.path.join(dataset_folder_path, pos_subdir)
        neg_folder = os.path.join(dataset_folder_path, neg_subdir)
        pipeline = transforms.Compose([
            CutOrPadTransform(config),
            WordsToIdsTransform(config, word_to_id),
        ])
        return MovieReviewDataset(config, pos_folder, neg_folder, word_to_id, transform=pipeline)

    train_dataset = build_dataset("train/pos", "train/neg")
    test_dataset = build_dataset("test/pos", "test/neg")

    # Only the training loader shuffles; evaluation keeps the dataset order.
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=config.batch_size, shuffle=True)
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=config.batch_size)

    # NOTE(review): the embedding table is sized from `word_list` while the
    # transforms index with `word_to_id` -- presumably the same length; confirm.
    # NOTE(review): `from models import *` earlier in this cell may rebind `RNN`;
    # verify which definition is actually used here.
    model = RNN(config, len(word_list)).to(config.device)

    classification_trainer.train(
        model=model,
        config=config,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        check_interval=50,
    )


# Experiments, executed in order: plain GRU, GRU with weight decay, then LSTM.
for experiment in (
    dict(rnn_type=nn.GRU, num_layers=1),
    dict(rnn_type=nn.GRU, num_layers=1, weight_decay=0.1),
    dict(rnn_type=nn.LSTM, num_layers=1),
):
    run(**experiment)

[1 seconds](epoch: 0/500)[25000 samples] loss: 0.689.
eval loss: 0.696, accuracy: 50.052% [12513/25000]
[95 seconds](epoch: 50/500)[1275000 samples] loss: 0.688.
eval loss: 0.691, accuracy: 52.648% [13162/25000]
[190 seconds](epoch: 100/500)[2525000 samples] loss: 0.666.
eval loss: 0.687, accuracy: 54.688% [13672/25000]
[285 seconds](epoch: 150/500)[3775000 samples] loss: 0.595.
eval loss: 0.639, accuracy: 63.496% [15874/25000]
[381 seconds](epoch: 200/500)[5025000 samples] loss: 0.571.
eval loss: 0.613, accuracy: 66.236% [16559/25000]
[476 seconds](epoch: 250/500)[6275000 samples] loss: 0.512.
eval loss: 0.598, accuracy: 67.904% [16976/25000]
[572 seconds](epoch: 300/500)[7525000 samples] loss: 0.472.
eval loss: 0.593, accuracy: 69.020% [17255/25000]
[667 seconds](epoch: 350/500)[8775000 samples] loss: 0.456.
eval loss: 0.599, accuracy: 69.472% [17368/25000]
[761 seconds](epoch: 400/500)[10025000 samples] loss: 0.425.
eval loss: 0.608, accuracy: 69.768% [17442/25000]
[856 seconds](epo