In [None]:
# data downloading
import data_util
import pandas as pd
from collections import defaultdict
import os
import sys

# Switch the process working directory to the project checkout and echo it.
# This runs ahead of the `from util import config` import that follows,
# presumably so the project-local `util` package resolves from the project
# root (TODO confirm how `util` is located).
# NOTE(review): absolute, user-specific path -- os.chdir raises if this
# directory does not exist, so the notebook only runs on the author's machine.
os.chdir("/home/yxjiang/source/ml_playground")
print(os.getcwd())  # sanity check: show the directory we ended up in

from util import config

In [None]:
# Where the archive comes from and where it is placed locally.
dest_dir = "/tmp/data"
dataset_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Fetch the archive into dest_dir via the project helper.
data_util.download_data(dest_dir=dest_dir, url=dataset_url)

# Root folder of the dataset; later cells read the train/test splits from here.
dataset_folder_path = os.path.join(dest_dir, "aclImdb")

# Build the vocabulary lookups (word -> id mapping plus the word list).
word_to_id, word_list = data_util.get_vocabulary(
    file_suffix="vocab",
    folder_path=dataset_folder_path,
)
print("There size of vocabulary is :", len(word_to_id))

In [None]:
# put everything together
import time
from data_util import *
from models import *
from trainer import classification_trainer
from util import config

import torch.optim as optim
import torch.nn.functional as F


def _build_review_dataset(cfg, split):
    """Build the MovieReviewDataset for one split ("train" or "test").

    Reads the `<split>/pos` and `<split>/neg` folders under the module-level
    `dataset_folder_path` and applies the truncate -> words-to-ids transform
    chain, using the module-level `word_to_id` mapping.
    """
    pos_data_folder = os.path.join(dataset_folder_path, split + "/pos")
    neg_data_folder = os.path.join(dataset_folder_path, split + "/neg")
    return MovieReviewDataset(
        cfg, pos_data_folder, neg_data_folder, word_to_id,
        transform=transforms.Compose([
            TruncateTransform(cfg),
            WordsToIdsTransform(cfg, word_to_id),
        ]),
    )


def run(rnn_type, checkpoint_prefix, **kwargs):
    """Configure and train the RNN classifier on the movie-review dataset.

    Relies on the module-level `dataset_folder_path`, `word_to_id` and
    `word_list` produced by the data-preparation cell.

    Args:
        rnn_type: recurrent layer class stored in the config (the notebook
            passes nn.LSTM / nn.GRU).
        checkpoint_prefix: forwarded to `classification_trainer.train` as
            `checkpoint_prefix`.
        **kwargs: additional `config.Config` fields (e.g. num_layers,
            num_directions, existing_checkpoint_filepath). A key that matches
            one of the default hyperparameters below overrides that default
            instead of raising a duplicate-keyword TypeError. The same dict is
            also handed to the trainer.
    """
    # Default hyperparameters; any of them can be overridden through kwargs.
    hyperparams = dict(
        criteria=nn.CrossEntropyLoss, optimizer=optim.Adam, lr=0.00002, epochs=200,
        batch_size=128, num_classes=2, sentence_max_length=20, word_embedding_length=128,
        activation=F.relu, dropout=0.1,
    )
    hyperparams.update(kwargs)
    cfg = config.Config(rnn_type=rnn_type, **hyperparams)

    train_dataset = _build_review_dataset(cfg, "train")
    test_dataset = _build_review_dataset(cfg, "test")

    # Only the training loader shuffles; both use the project's pad_collate.
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=cfg.batch_size, shuffle=True, collate_fn=data_util.pad_collate)
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=cfg.batch_size, collate_fn=data_util.pad_collate)

    # Alternative model kept for reference:
    # model = TextCNN(cfg, len(word_to_id)).to(device)
    # NOTE(review): vocabulary size is taken from word_list while the transforms
    # use word_to_id -- presumably the same length; confirm in data_util.
    model = RNN(cfg, len(word_list)).to(cfg.device)

    # NOTE(review): the extra options are passed as ONE keyword argument named
    # `kwargs` (a dict), not unpacked -- confirm this matches train()'s signature.
    classification_trainer.train(
        model=model,
        config=cfg,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        checkpoint_prefix=checkpoint_prefix,
        check_interval=10,
        kwargs=kwargs,
    )

# Earlier experiment configurations, kept for reference.
# NOTE: these calls omit the `checkpoint_prefix` argument that run() requires,
# so one has to be added before any of them is re-enabled.
# run(rnn_type=nn.GRU, num_layers=3, num_directions=2)
# run(rnn_type=nn.GRU, num_layers=1, num_directions=1)
# run(nn.LSTM, num_layers=3, num_directions=2)

# Current experiment: single-layer, single-direction LSTM. The checkpoint path
# is passed through to the config/trainer (presumably to resume training --
# confirm in classification_trainer).
run(
    rnn_type=nn.LSTM,
    checkpoint_prefix='lstm',
    num_layers=1,
    num_directions=1,
    existing_checkpoint_filepath='/tmp/model/lstm_30.ckpt',
)