In [1]:
# env setup
import os
import sys

# Resolve the project root from the ML_PLAYGROUND_ROOT environment variable so
# the notebook is not tied to a single machine's directory layout. When the
# variable is unset, the original hard-coded location is used unchanged.
project_root = os.environ.get("ML_PLAYGROUND_ROOT", "/home/yxjiang/source/ml_playground")

# Switch the working directory to the project root. Presumably this is what
# lets later cells import project-local modules (data_util, models, trainer)
# -- confirm against the repository layout.
os.chdir(project_root)

# Echo the effective working directory so the cell output records where the
# notebook actually ran.
print(os.getcwd())

/home/yxjiang/source/ml_playground


In [2]:
# data downloading
import data_util
import pandas as pd
from collections import defaultdict

# Alternative dataset archive (DBpedia CSV), currently disabled:
# dataset_url="https://s3.amazonaws.com/fast-ai-nlp/dbpedia_csv.tgz"
# Active dataset archive: aclImdb_v1 sentiment data hosted at ai.stanford.edu.
dataset_url="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Directory handed to the downloader; the extracted dataset is then looked up
# under <dest_dir>/aclImdb.
dest_dir = "/tmp/data"
dataset_folder_path = os.path.join(dest_dir, "aclImdb")
# data_util is a project-local module. Judging by the recorded cell output it
# skips the download when the archive already exists and then extracts it into
# dest_dir -- confirm in data_util.
data_util.download_data(url=dataset_url, dest_dir=dest_dir)

# generate word to id mapping
# get_vocabulary returns a pair; presumably word_to_id maps each word to an
# integer id and word_list holds the words themselves -- TODO confirm in
# data_util. Both names are reused by later cells.
word_to_id, word_list = data_util.get_vocabulary(folder_path=dataset_folder_path, file_suffix="vocab")
print("There size of vocabulary is :", len(word_to_id))


Destination folder [/tmp/data] exists.
Target file [aclImdb_v1.tar.gz] exists, skip downloading.
Start to extract [/tmp/data/aclImdb_v1.tar.gz] to [/tmp/data]...
File extracted
Processing vocabulary from [/tmp/data/aclImdb].
There size of vocabulary is : 89527


In [3]:
# configs
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

class Config:
    """Bundle of hyperparameters and runtime settings shared by the notebook.

    Every value is fixed in ``__init__``; ``str(config)`` yields a compact tag
    that encodes the main hyperparameters.
    """

    def __init__(self):
        # --- runtime -------------------------------------------------------
        # Use the GPU when torch reports one, otherwise fall back to the CPU.
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)

        # --- data / model shape --------------------------------------------
        self.num_classes = 2
        self.sentence_max_length = 40
        self.word_embedding_length = 128

        # --- training components -------------------------------------------
        # `criteria` and `optimizer` hold the classes themselves, not
        # instances; presumably the trainer instantiates them -- confirm.
        self.activation = F.relu
        self.criteria = nn.CrossEntropyLoss
        self.optimizer = optim.Adam

        # --- optimisation schedule -----------------------------------------
        self.lr = 0.00003
        self.epochs = 1000
        self.batch_size = 1024
        self.dropout = 0.2

        # Presumably the convolution window sizes used by the CNN model --
        # TODO confirm against the models module.
        self.conv_layer_sizes = [3, 4, 5, 6, 7, 8]

    def __str__(self):
        """Return a tag string built from the key hyperparameters."""
        conv_tag = "|".join(map(str, self.conv_layer_sizes))
        fields = (
            self.sentence_max_length,
            self.word_embedding_length,
            self.lr,
            self.batch_size,
            self.dropout,
            conv_tag,
        )
        return "sentence_max_len_%d-embedding-%d-lr-%.8f-batch_size-%d-dropout-%.2f-conv_layers-%s" % fields

In [4]:
# put everything together
import time
from data_util import *
from models import *
from trainer import classification_trainer

def _make_review_dataset(config, pos_folder, neg_folder):
    """Build a MovieReviewDataset over one positive/negative folder pair.

    A fresh transform pipeline is created per call: CutOrPadTransform followed
    by WordsToIdsTransform, both configured from `config`. The vocabulary is
    read from the notebook-level `word_to_id`.
    """
    pipeline = transforms.Compose([
        CutOrPadTransform(config),
        WordsToIdsTransform(config, word_to_id),
    ])
    return MovieReviewDataset(config, pos_folder, neg_folder, word_to_id, transform=pipeline)


config = Config()

# Training split: positive and negative review folders under the dataset root.
pos_train_data_folder = os.path.join(dataset_folder_path, "train/pos")
neg_train_data_folder = os.path.join(dataset_folder_path, "train/neg")
train_dataset = _make_review_dataset(config, pos_train_data_folder, neg_train_data_folder)

# Test split: same layout, built with an identical transform pipeline.
pos_test_data_folder = os.path.join(dataset_folder_path, "test/pos")
neg_test_data_folder = os.path.join(dataset_folder_path, "test/neg")
test_dataset = _make_review_dataset(config, pos_test_data_folder, neg_test_data_folder)

# Only the training loader shuffles; the test loader keeps the loader's
# default ordering.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=config.batch_size,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=config.batch_size,
)

# Disabled alternative model, kept for reference:
# model = TextCNN(config, len(word_to_id)).to(device)
vocabulary_size = len(word_list)
model = RNN(config, vocabulary_size).to(config.device)

classification_trainer.train(model, config, train_dataloader, test_dataloader)

ModuleNotFoundError: No module named 'trainer'