In [1]:
# env setup
import os
import sys

os.chdir("/home/yxjiang/source/ml_playground")

print(os.getcwd())

/home/yxjiang/source/ml_playground


In [2]:
# data downloading
from util import data_util
import pandas as pd
from collections import defaultdict

# dataset_url="https://s3.amazonaws.com/fast-ai-nlp/dbpedia_csv.tgz"
dataset_url="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dest_dir = "/tmp/data"
dataset_folder_path = os.path.join(dest_dir, "aclImdb")
data_util.download_data(url=dataset_url, dest_dir=dest_dir)

# generate word to id mapping
word_to_id, word_list = data_util.get_vocabulary(folder_path=dataset_folder_path, file_suffix="vocab")
print("There size of vocabulary is :", len(word_to_id))

# generate class id to name mapping
# class_to_name = defaultdict(str)
# with open(os.path.join(dataset_folder_path, "classes.txt"), "r") as f:
#     for i, class_name in enumerate(f):
#         class_to_name[i] = class_name.strip()
# print("There class mapping:", class_to_name.items())

Destination folder [/tmp/data] exists.
Target file [aclImdb_v1.tar.gz] exists, skip downloading.
Start to extract [/tmp/data/aclImdb_v1.tar.gz] to [/tmp/data]...
File extracted
Processing vocabulary from [/tmp/data/aclImdb].
There size of vocabulary is : 89527


In [4]:
# transform, dataset and dataloader
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms


class CutOrPadTransform:
    """
    Shape all sentences to the equal length.
    """
    def __init__(self, config):
        self.config = config
    
    def __call__(self, input):
        if len(input["words"]) >= config.sentence_max_length:
            input["words"] = input["words"][:config.sentence_max_length]
        else:
            input["words"].extend([" "] * (config.sentence_max_length - len(input["words"])))
        return input


class WordsToIdsTransform:
    """
    Convert the list of words to embeddings.
    """
    def __init__(self, config, word_to_id):
        self.config = config
        self.word_to_id = word_to_id
    
    def __call__(self, input):
        input["word_ids"] = torch.tensor([self.word_to_id[w.lower()] for w in input["words"]], dtype=torch.long)
        # del input['words']
        return input


class MovieReviewDataset(Dataset):
    def __init__(self, config, pos_data_folder, neg_data_folder, word_to_id, transform):
        self.config = config
        self.word_to_id = word_to_id
        self.data = []
        # read all data into memory
        for filename in os.listdir(pos_data_folder):
            if filename.endswith(".txt"):
                with open(os.path.join(pos_data_folder, filename), "r") as f:
                    self.data.append((f.readline(), 1))

        for filename in os.listdir(neg_data_folder):
            if filename.endswith(".txt"):
                with open(os.path.join(neg_data_folder, filename), "r") as f:
                    self.data.append((f.readline(), 0))

        self.transform = transform
    
    def __getitem__(self, idx):
        words = [w.strip() for w in self.data[idx][0].strip().split(" ")]
        label = self.data[idx][1]
        input = self.transform({"words": words, "label": label})
        # print(input["words"], "\n", input["word_ids"], "\n", input["label"])
        return input["words"], input["word_ids"], input["label"]
        

    def __len__(self):
        return len(self.data)

In [5]:
# model
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNN(nn.Module):
    def __init__(self, config, vocabulary_size):
        super().__init__()
        self.config = config
        self.embed = nn.Embedding(vocabulary_size, config.word_embedding_length)
        self.hidden_size = 128
        self.num_layers = 1
        self.directions = 1
        self.rnn = nn.RNN(input_size=config.word_embedding_length, hidden_size=self.hidden_size, num_layers=self.num_layers)
        self.fc1 = nn.Linear(in_features=self.hidden_size, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=config.num_classes)

    def forward(self, x):
        batch = x.shape[0]
        x = self.embed(x)  # (batch, sentence_length, embedding_dim)
        x = x.permute(1, 0, 2).contiguous()  # (sentence_length, batch, embedding_dim)

        h0 = torch.zeros((self.num_layers * self.directions, batch, self.hidden_size)).to(device)
        output, ht = self.rnn(x, h0)
        ht = ht.permute(1, 0, 2)  # (batch, num_layer * directions, embedding_dim)
        ht = ht.contiguous().view(batch, self.num_layers * self.directions, self.hidden_size)
        x = F.relu(self.fc1(ht))
        x = F.relu(self.fc2(x))
        x = x.view(batch, -1)
        return x



class TextCNN(nn.Module):
    def __init__(self, config, vocabulary_size):
        super().__init__()
        self.config = config
        self.embed = nn.Embedding(vocabulary_size, config.word_embedding_length)

        self.conv3 = nn.Conv2d(1, 1, kernel_size=(3, config.word_embedding_length))
        self.conv4 = nn.Conv2d(1, 1, kernel_size=(4, config.word_embedding_length))
        self.conv5 = nn.Conv2d(1, 1, kernel_size=(5, config.word_embedding_length))
        self.conv6 = nn.Conv2d(1, 1, kernel_size=(6, config.word_embedding_length))
        self.conv7 = nn.Conv2d(1, 1, kernel_size=(7, config.word_embedding_length))

        self.max_over_time_pool3 = nn.MaxPool2d((config.sentence_max_length - 2, 1))
        self.max_over_time_pool4 = nn.MaxPool2d((config.sentence_max_length - 3, 1))
        self.max_over_time_pool5 = nn.MaxPool2d((config.sentence_max_length - 4, 1))
        self.max_over_time_pool6 = nn.MaxPool2d((config.sentence_max_length - 5, 1))
        self.max_over_time_pool7 = nn.MaxPool2d((config.sentence_max_length - 6, 1))

        self.dropout = nn.Dropout(config.dropout)
        self.fc = nn.Linear(5, config.num_classes)


    def forward(self, x):
        batch = x.shape[0]
        x = torch.unsqueeze(self.embed(x), 1)  # [NCHW], add channel to dimension 1
        c = self.conv3(x)
        # convs
        x1 = self.config.activation(self.conv3(x))
        x2 = self.config.activation(self.conv4(x))
        x3 = self.config.activation(self.conv5(x))
        x4 = self.config.activation(self.conv6(x))
        x5 = self.config.activation(self.conv7(x))

        # max over time pooling
        x1 = self.max_over_time_pool3(x1)
        x2 = self.max_over_time_pool4(x2)
        x3 = self.max_over_time_pool5(x3)
        x4 = self.max_over_time_pool6(x4)
        x5 = self.max_over_time_pool7(x5)

        x = torch.cat((x1, x2, x3, x4, x5), dim=-1)
        x = self.dropout(x)
        x = self.fc(x)
        x = x.view(batch, -1)
        
        return x

In [22]:
# trainer
from torch.utils.tensorboard import SummaryWriter

def train(model, config, train_dataloader, test_dataloader, device, check_interval=5000):
    criteria = config.criteria()
    optimizer = config.optimizer(model.parameters(), config.lr)
    start = time.time()
    counts = 0
    writer = SummaryWriter()
    # writer.add_graph(model)
    torch.backends.cudnn.benchmark = True
    for epoch in range(config.epochs):
        for i, (words, word_ids, labels) in enumerate(train_dataloader):
            counts += labels.shape[0]
            optimizer.zero_grad()
            output = model(word_ids.to(device))
            loss = criteria(output, labels.to(device))
            loss.backward()
            optimizer.step()
            if ((epoch + 1) * i) % check_interval == 0:
                print("[%d seconds](epoch: %d/%d)[%d samples] loss: %.3f." % (time.time() - start, epoch + 1, config.epochs, counts, loss.mean().item()))
                # eval on test dataset
                model.eval()
                with torch.no_grad():
                    eval_loss = 0.0
                    batches = 0
                    for j, (words, word_ids, label) in enumerate(test_dataloader):
                        output = model(word_ids.to(device))
                        loss = criteria(output, label.to(device))
                        eval_loss += loss.item()
                        batches += 1
                    print('eval loss: ', eval_loss / batches)
                model.train()
                writer.add_scalar('Loss/train', loss.mean().item(), (epoch + 1) * i)

In [23]:
# configs
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

class Config:
    def __init__(self):
        self.num_classes = 2
        self.sentence_max_length = 30
        self.word_embedding_length = 32
        self.activation = F.relu
        self.criteria = nn.CrossEntropyLoss
        self.optimizer = optim.Adam
        self.lr = 0.05
        self.epochs = 200
        self.batch_size = 1024
        self.dropout = 0.2

In [24]:
# put everything together
import time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = Config()

pos_train_data_folder = os.path.join(dataset_folder_path, "train/pos")
neg_train_data_folder = os.path.join(dataset_folder_path, "train/neg")
train_dataset = MovieReviewDataset(config, pos_train_data_folder, neg_train_data_folder, word_to_id, 
                            transform=transforms.Compose([
                                CutOrPadTransform(config), WordsToIdsTransform(config, word_to_id)
                            ]))

pos_test_data_folder = os.path.join(dataset_folder_path, "test/pos")
neg_test_data_folder = os.path.join(dataset_folder_path, "test/neg")
test_dataset = MovieReviewDataset(config, pos_test_data_folder, neg_test_data_folder, word_to_id, 
                            transform=transforms.Compose([
                                CutOrPadTransform(config), WordsToIdsTransform(config, word_to_id)
                            ]))

train_dataloader = DataLoader(dataset=train_dataset, batch_size=config.batch_size, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=config.batch_size)

# model = TextCNN(config, len(word_to_id)).to(device)
model = RNN(config, len(word_list)).to(device)

train(model, config, train_dataloader, test_dataloader, device)

[0 seconds](epoch: 1/200)[1024 samples] loss: 0.693.
eval loss:  0.8306064879894257
[2 seconds](epoch: 2/200)[26024 samples] loss: 0.693.
eval loss:  0.6931475830078125
[4 seconds](epoch: 3/200)[51024 samples] loss: 0.693.
eval loss:  0.6931475830078125
[7 seconds](epoch: 4/200)[76024 samples] loss: 0.693.
eval loss:  0.6931475830078125
[10 seconds](epoch: 5/200)[101024 samples] loss: 0.693.
eval loss:  0.6931475830078125
[12 seconds](epoch: 6/200)[126024 samples] loss: 0.693.
eval loss:  0.6931475830078125
[15 seconds](epoch: 7/200)[151024 samples] loss: 0.693.
eval loss:  0.6931475830078125
[17 seconds](epoch: 8/200)[176024 samples] loss: 0.693.
eval loss:  0.6931475830078125
[20 seconds](epoch: 9/200)[201024 samples] loss: 0.693.
eval loss:  0.6931475830078125
[22 seconds](epoch: 10/200)[226024 samples] loss: 0.693.


KeyboardInterrupt: 

In [1]:
import torch
import torch.nn as nn

sentence_length = 5
batch_size = 3
input_dim = 6
num_direction = 1
layer = 1
hidden_size = 7

rnn = nn.RNN(input_dim, hidden_size, layer)
x = torch.randn(sentence_length, batch_size, input_dim)
h0 = torch.randn(num_direction * layer, batch_size, hidden_size)

output, hn = rnn(x, h0)
print(output.shape, hn.shape)

print(hn)
print(output)

torch.Size([5, 3, 7]) torch.Size([1, 3, 7])
tensor([[[-0.6205,  0.8133, -0.2291,  0.7881, -0.5221, -0.5650,  0.3169],
         [-0.0421, -0.1698,  0.4719,  0.4474, -0.1437, -0.0701,  0.5606],
         [ 0.3918, -0.0828, -0.3640,  0.2610,  0.4248,  0.8207, -0.5132]]],
       grad_fn=<StackBackward>)
tensor([[[ 0.6937,  0.4441, -0.3771,  0.4462, -0.5602, -0.0196, -0.2659],
         [-0.3431,  0.5107, -0.7487,  0.7184,  0.7843, -0.0246, -0.1306],
         [ 0.2353, -0.8312,  0.1603, -0.3758,  0.0087, -0.7784, -0.9561]],

        [[-0.8158,  0.6252, -0.7030,  0.5025, -0.2198, -0.7257, -0.5358],
         [-0.6847, -0.3237,  0.5203,  0.5507, -0.6944, -0.3100, -0.2738],
         [ 0.4111, -0.8770,  0.5370, -0.2157, -0.1740,  0.2111,  0.4982]],

        [[ 0.0787, -0.0141,  0.6000,  0.6017, -0.4545,  0.0529,  0.2032],
         [ 0.1149,  0.3421, -0.0415,  0.4033, -0.2639, -0.4638,  0.1652],
         [-0.9416,  0.7988, -0.5959, -0.2201, -0.4038, -0.0768,  0.6462]],

        [[-0.3865,  0.8236, 