In [1]:
# env setup
import os
import sys

# Switch the working directory to the project checkout.
# NOTE(review): presumably this is what lets project-local imports such as
# `from util import data_util` resolve in later cells -- confirm.
# The location can be overridden with the ML_PLAYGROUND_ROOT environment
# variable so the notebook is not tied to a single machine; the original
# path is kept as the default.
PROJECT_ROOT = os.environ.get("ML_PLAYGROUND_ROOT", "/home/yxjiang/source/ml_playground")
os.chdir(PROJECT_ROOT)

print(os.getcwd())

/home/yxjiang/source/ml_playground


In [2]:
# data downloading
from util import data_util
import pandas as pd
from collections import defaultdict

# Dataset: IMDB large movie review corpus.
# (An alternative that was tried earlier: https://s3.amazonaws.com/fast-ai-nlp/dbpedia_csv.tgz)
dataset_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dest_dir = "/tmp/data"

# Fetch the archive into dest_dir (per the cell output, the helper skips the
# download when the archive is already present and then extracts it).
data_util.download_data(url=dataset_url, dest_dir=dest_dir)

# Folder the rest of the notebook reads the reviews and vocabulary from.
dataset_folder_path = os.path.join(dest_dir, "aclImdb")

# Build the word -> id mapping (and the word list) from the dataset's vocab file.
word_to_id, word_list = data_util.get_vocabulary(
    folder_path=dataset_folder_path,
    file_suffix="vocab",
)
print("There size of vocabulary is :", len(word_to_id))


Destination folder [/tmp/data] exists.
Target file [aclImdb_v1.tar.gz] exists, skip downloading.
Start to extract [/tmp/data/aclImdb_v1.tar.gz] to [/tmp/data]...
File extracted
Processing vocabulary from [/tmp/data/aclImdb].
There size of vocabulary is : 89527


In [3]:
# transform, dataset and dataloader
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms


class CutOrPadTransform:
    """
    Shape all sentences to the equal length.

    Samples are dicts carrying a ``"words"`` list. Lists longer than
    ``config.sentence_max_length`` are truncated; shorter ones are padded at
    the end with the single-space token ``" "``.
    """
    def __init__(self, config):
        """Keep a reference to the config providing ``sentence_max_length``."""
        self.config = config

    def __call__(self, input):
        """
        Cut or pad ``input["words"]`` to exactly ``sentence_max_length`` items.

        Args:
            input: dict with a ``"words"`` list; other keys are left untouched.

        Returns:
            The same dict, with ``"words"`` replaced by the fixed-length list.
        """
        # Use the config this transform was built with rather than whatever
        # global named `config` happens to exist in the notebook.
        max_length = self.config.sentence_max_length
        words = input["words"]
        if len(words) >= max_length:
            input["words"] = words[:max_length]
        else:
            # Build a new list instead of extending in place so the list the
            # caller passed in is never mutated in either branch.
            input["words"] = list(words) + [" "] * (max_length - len(words))
        return input


class WordsToIdsTransform:
    """
    Map each word of a sample to its integer vocabulary id.

    The ids are stored under ``"word_ids"`` as a ``torch.long`` tensor; the
    original ``"words"`` entry is kept in the sample.
    """
    def __init__(self, config, word_to_id):
        self.word_to_id = word_to_id
        self.config = config

    def __call__(self, input):
        # Words are lower-cased before the lookup in the vocabulary mapping.
        ids = []
        for token in input["words"]:
            ids.append(self.word_to_id[token.lower()])
        input["word_ids"] = torch.tensor(ids, dtype=torch.long)
        return input


class MovieReviewDataset(Dataset):
    """
    In-memory dataset of labelled movie reviews.

    Every ``*.txt`` file in ``pos_data_folder`` becomes a sample with label 1
    and every ``*.txt`` file in ``neg_data_folder`` a sample with label 0.
    Only the first line of each file is read.
    """
    def __init__(self, config, pos_data_folder, neg_data_folder, word_to_id, transform):
        """
        Args:
            config: configuration object (stored, not read here).
            pos_data_folder: folder holding the positive reviews.
            neg_data_folder: folder holding the negative reviews.
            word_to_id: vocabulary mapping (stored, not read here).
            transform: callable applied to ``{"words": ..., "label": ...}``;
                its result must provide "words", "word_ids" and "label".
        """
        self.config = config
        self.word_to_id = word_to_id
        self.transform = transform
        # read all data into memory: positives first, then negatives
        self.data = []
        self.data.extend(self._read_reviews(pos_data_folder, 1))
        self.data.extend(self._read_reviews(neg_data_folder, 0))

    @staticmethod
    def _read_reviews(folder, label):
        """Return ``(first_line, label)`` for every ``.txt`` file in ``folder``."""
        reviews = []
        # os.listdir order is arbitrary; sorting keeps sample indices stable
        # across runs and machines.
        for filename in sorted(os.listdir(folder)):
            if filename.endswith(".txt"):
                # Explicit encoding so reading does not depend on the
                # platform's default locale.
                with open(os.path.join(folder, filename), "r", encoding="utf-8") as f:
                    reviews.append((f.readline(), label))
        return reviews

    def __getitem__(self, idx):
        """Tokenise sample ``idx`` on single spaces and run the transform.

        Returns:
            Tuple ``(words, word_ids, label)`` taken from the transformed sample.
        """
        text, label = self.data[idx]
        words = [w.strip() for w in text.strip().split(" ")]
        input = self.transform({"words": words, "label": label})
        return input["words"], input["word_ids"], input["label"]

    def __len__(self):
        """Number of reviews loaded (positive + negative)."""
        return len(self.data)

In [4]:
# model
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNN(nn.Module):
    """
    Single-layer, unidirectional RNN classifier over word ids.

    Pipeline: embedding -> nn.RNN -> final hidden state -> fc1 (ReLU) -> fc2.
    The output is raw (unnormalised) class scores.
    """
    def __init__(self, config, vocabulary_size):
        """
        Args:
            config: provides ``word_embedding_length`` and ``num_classes``.
            vocabulary_size: number of rows in the embedding table.
        """
        super().__init__()
        self.config = config
        self.embed = nn.Embedding(vocabulary_size, config.word_embedding_length)
        self.hidden_size = 128
        self.num_layers = 1
        self.directions = 1
        self.rnn = nn.RNN(input_size=config.word_embedding_length, hidden_size=self.hidden_size, num_layers=self.num_layers)
        self.fc1 = nn.Linear(in_features=self.hidden_size, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=config.num_classes)

    def forward(self, x):
        """
        Args:
            x: integer tensor of word ids, indexed as (batch, sentence_length).

        Returns:
            Tensor of shape (batch, num_layers * directions * num_classes).
        """
        batch = x.shape[0]
        x = self.embed(x)  # (batch, sentence_length, embedding_dim)
        x = x.permute(1, 0, 2).contiguous()  # (sentence_length, batch, embedding_dim)

        # Allocate the initial state next to the input instead of reading a
        # notebook-global `device`, so the module works wherever it is moved.
        h0 = torch.zeros(
            (self.num_layers * self.directions, batch, self.hidden_size),
            device=x.device,
            dtype=x.dtype,
        )
        _, ht = self.rnn(x, h0)
        ht = ht.permute(1, 0, 2)  # (batch, num_layers * directions, hidden_size)
        ht = ht.contiguous().view(batch, self.num_layers * self.directions, self.hidden_size)
        x = F.relu(self.fc1(ht))
        # No activation on the last layer: the scores are fed to a
        # cross-entropy loss, which expects unclamped logits.
        x = self.fc2(x)
        x = x.view(batch, -1)
        return x



class TextCNN(nn.Module):
    """
    Convolutional text classifier with one conv + max-pool branch per kernel size.

    For every entry in ``config.conv_layer_sizes`` a single-channel ``Conv2d``
    spanning the full embedding width is registered as ``conv<i>``, together
    with a ``MaxPool2d`` (``pool<i>``) that collapses the remaining positions
    into one value. The pooled values are concatenated, passed through dropout
    and a linear layer to produce the class scores.
    """
    def __init__(self, config, vocabulary_size):
        """
        Args:
            config: provides ``word_embedding_length``, ``conv_layer_sizes``,
                ``sentence_max_length``, ``dropout``, ``num_classes`` and
                ``activation``.
            vocabulary_size: number of rows in the embedding table.
        """
        super().__init__()
        self.config = config
        self.embed = nn.Embedding(vocabulary_size, config.word_embedding_length)
        self.conv_layer_sizes = config.conv_layer_sizes

        # Sub-modules are registered without an explicit `.to(device)`: they
        # follow the parent when the caller moves the whole model, and the
        # constructor no longer depends on a notebook-global `device`.
        for i, size in enumerate(self.conv_layer_sizes):
            self.add_module("conv" + str(i), nn.Conv2d(1, 1, kernel_size=(size, config.word_embedding_length)))
            self.add_module("pool" + str(i), nn.MaxPool2d((config.sentence_max_length - size + 1, 1)))

        self.dropout = nn.Dropout(config.dropout)
        self.fc = nn.Linear(len(self.conv_layer_sizes), config.num_classes)

    def forward(self, x):
        """
        Args:
            x: integer tensor of word ids, indexed as (batch, sentence_length);
                the pooling sizes assume sentence_length equals
                ``config.sentence_max_length``.

        Returns:
            Tensor reshaped to (batch, -1) holding the class scores.
        """
        batch = x.shape[0]
        x = torch.unsqueeze(self.embed(x), 1)  # [NCHW], add channel to dimension 1
        # one activated conv followed by max-over-time pooling per kernel size
        xs = []
        for i in range(len(self.conv_layer_sizes)):
            conv = getattr(self, "conv" + str(i))
            pool = getattr(self, "pool" + str(i))
            xs.append(pool(self.config.activation(conv(x))))

        x = torch.cat(xs, dim=-1)
        x = self.dropout(x)
        x = self.fc(x)
        x = x.view(batch, -1)

        return x

In [5]:
# trainer
from torch.utils.tensorboard import SummaryWriter

def _evaluate(model, criteria, dataloader, device):
    """Run one pass over ``dataloader`` in eval mode and summarise it.

    The model is switched back to training mode before returning.

    Returns:
        Tuple ``(mean_loss, accuracy_percent, correct, total_samples)``.
        ``mean_loss`` / ``accuracy_percent`` are NaN when the dataloader
        yields no batches (instead of raising ZeroDivisionError).
    """
    acc_loss = 0.0
    batches = 0
    correct = 0
    total_samples = 0
    model.eval()
    try:
        with torch.no_grad():
            for _, word_ids, labels in dataloader:
                output = model(word_ids.to(device))
                acc_loss += criteria(output, labels.to(device)).item()
                batches += 1
                # Compare on CPU against the (un-moved) labels; keep a plain int.
                correct += (torch.argmax(output.cpu(), dim=1) == labels).sum().item()
                total_samples += word_ids.shape[0]
    finally:
        model.train()
    mean_loss = acc_loss / batches if batches else float("nan")
    accuracy = 100.0 * correct / total_samples if total_samples else float("nan")
    return mean_loss, accuracy, correct, total_samples


def train(model, config, train_dataloader, test_dataloader, device, check_interval=5000):
    """Train ``model`` and periodically report train/eval metrics.

    Args:
        model: module called as ``model(word_ids)``.
        config: provides ``criteria`` and ``optimizer`` (factories), ``lr``
            and ``epochs``; ``str(config)`` is used as the TensorBoard
            filename suffix.
        train_dataloader: yields ``(words, word_ids, labels)`` batches.
        test_dataloader: same batch layout, used for evaluation.
        device: device the inputs and labels are moved to.
        check_interval: controls how often metrics are printed and logged
            (see the note on the check condition below).

    Side effects:
        Enables ``torch.backends.cudnn.benchmark``, prints progress and writes
        scalars through a ``SummaryWriter`` which is always closed on exit,
        including when training is interrupted.
    """
    # Local import: the notebook only imports `time` in a later cell.
    import time

    criteria = config.criteria()
    optimizer = config.optimizer(model.parameters(), config.lr)
    start = time.time()
    counts = 0
    writer = SummaryWriter(filename_suffix=str(config))
    torch.backends.cudnn.benchmark = True
    try:
        for epoch in range(config.epochs):
            for i, (_, word_ids, labels) in enumerate(train_dataloader):
                counts += labels.shape[0]
                optimizer.zero_grad()
                output = model(word_ids.to(device))
                loss = criteria(output, labels.to(device))
                loss.backward()
                optimizer.step()
                # NOTE(review): this fires on the first batch of every epoch
                # (i == 0) and whenever (epoch + 1) * i is a multiple of
                # check_interval; kept as-is to preserve the reporting cadence,
                # but a global step counter may have been intended -- confirm.
                if ((epoch + 1) * i) % check_interval == 0:
                    train_loss = loss.mean().item()
                    print("[%d seconds](epoch: %d/%d)[%d samples] loss: %.3f." % (time.time() - start, epoch + 1, config.epochs, counts, train_loss))
                    # eval on test dataset
                    eval_loss, accuracy, correct, total_samples = _evaluate(model, criteria, test_dataloader, device)
                    print('eval loss: %.3f, accuracy: %.3f%% [%d/%d]' % (eval_loss, accuracy, correct, total_samples))
                    writer.add_scalar('Loss/eval', eval_loss, epoch + 1)
                    writer.add_scalar('Eval accuracy', accuracy, epoch + 1)
                    writer.add_scalar('Loss/train', train_loss, epoch + 1)
    finally:
        # Flush pending events even on KeyboardInterrupt or errors.
        writer.close()

In [17]:
# configs
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

class Config:
    """Hyper-parameters shared by the transforms, the models and the trainer."""

    def __init__(self):
        # data shape
        self.sentence_max_length = 40
        self.num_classes = 2

        # model
        self.word_embedding_length = 128
        self.conv_layer_sizes = [3, 4, 5, 6, 7, 8]
        self.activation = F.relu
        self.dropout = 0.1

        # optimisation (criteria / optimizer are stored as factories)
        self.criteria = nn.CrossEntropyLoss
        self.optimizer = optim.Adam
        self.lr = 0.0001
        self.batch_size = 1024
        self.epochs = 500

    def __str__(self):
        """Compact, dash-separated tag describing the main hyper-parameters."""
        conv_sizes = "|".join(map(str, self.conv_layer_sizes))
        parts = [
            "sentence_max_len_%d" % self.sentence_max_length,
            "embedding-%d" % self.word_embedding_length,
            "lr-%.8f" % self.lr,
            "batch_size-%d" % self.batch_size,
            "dropout-%.2f" % self.dropout,
            "conv_layers-%s" % conv_sizes,
        ]
        return "-".join(parts)

In [16]:
# put everything together
import time
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = Config()


def _make_transform():
    """Fresh cut/pad + word-to-id pipeline, built once per dataset."""
    steps = [CutOrPadTransform(config), WordsToIdsTransform(config, word_to_id)]
    return transforms.Compose(steps)


# Training split.
pos_train_data_folder = os.path.join(dataset_folder_path, "train/pos")
neg_train_data_folder = os.path.join(dataset_folder_path, "train/neg")
train_dataset = MovieReviewDataset(
    config,
    pos_train_data_folder,
    neg_train_data_folder,
    word_to_id,
    transform=_make_transform(),
)

# Test split.
pos_test_data_folder = os.path.join(dataset_folder_path, "test/pos")
neg_test_data_folder = os.path.join(dataset_folder_path, "test/neg")
test_dataset = MovieReviewDataset(
    config,
    pos_test_data_folder,
    neg_test_data_folder,
    word_to_id,
    transform=_make_transform(),
)

# Only the training batches are shuffled.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=config.batch_size, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=config.batch_size)

# Alternative model: RNN(config, len(word_list)).to(device)
model = TextCNN(config, len(word_to_id)).to(device)

train(model, config, train_dataloader, test_dataloader, device)

 0.682.
eval loss: 0.689, accuracy: 53.560% [13390/25000]
[35 seconds](epoch: 11/500)[251024 samples] loss: 0.679.
eval loss: 0.688, accuracy: 53.980% [13495/25000]
[39 seconds](epoch: 12/500)[276024 samples] loss: 0.682.
eval loss: 0.686, accuracy: 54.268% [13567/25000]
[42 seconds](epoch: 13/500)[301024 samples] loss: 0.676.
eval loss: 0.685, accuracy: 54.716% [13679/25000]
[46 seconds](epoch: 14/500)[326024 samples] loss: 0.676.
eval loss: 0.683, accuracy: 55.028% [13757/25000]
[49 seconds](epoch: 15/500)[351024 samples] loss: 0.670.
eval loss: 0.682, accuracy: 55.500% [13875/25000]
[53 seconds](epoch: 16/500)[376024 samples] loss: 0.678.
eval loss: 0.680, accuracy: 55.684% [13921/25000]
[56 seconds](epoch: 17/500)[401024 samples] loss: 0.664.
eval loss: 0.679, accuracy: 55.928% [13982/25000]
[60 seconds](epoch: 18/500)[426024 samples] loss: 0.673.
eval loss: 0.678, accuracy: 56.216% [14054/25000]
[63 seconds](epoch: 19/500)[451024 samples] loss: 0.667.
eval loss: 0.676, accuracy: 5

KeyboardInterrupt: 