In [1]:
# env setup
import os
import sys

os.chdir("/home/yxjiang/source/ml_playground")

print(os.getcwd())

/home/yxjiang/source/ml_playground


In [2]:
# data downloading
from util import data_util
import pandas as pd
from collections import defaultdict

# dataset_url="https://s3.amazonaws.com/fast-ai-nlp/dbpedia_csv.tgz"
dataset_url="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dest_dir = "/tmp/data"
dataset_folder_path = os.path.join(dest_dir, "aclImdb")
data_util.download_data(url=dataset_url, dest_dir=dest_dir)

# generate word to id mapping
word_to_id = data_util.get_vocabulary(folder_path=dataset_folder_path, file_suffix="vocab")
print("There size of vocabulary is :", len(word_to_id))

# generate class id to name mapping
# class_to_name = defaultdict(str)
# with open(os.path.join(dataset_folder_path, "classes.txt"), "r") as f:
#     for i, class_name in enumerate(f):
#         class_to_name[i] = class_name.strip()
# print("There class mapping:", class_to_name.items())

Destination folder [/tmp/data] exists.
Target file [aclImdb_v1.tar.gz] exists, skip downloading.
Start to extract [/tmp/data/aclImdb_v1.tar.gz] to [/tmp/data]...
File extracted
Processing vocabulary from [/tmp/data/aclImdb].
There size of vocabulary is : 89527


In [28]:
# configs
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

class Config:
    def __init__(self):
        self.num_classes = 2
        self.sentence_max_length = 30
        self.word_embedding_length = 32
        self.activation = F.relu
        self.criteria = nn.CrossEntropyLoss
        self.optimizer = optim.Adam
        self.lr = 0.001
        self.epochs = 5000
        self.batch_size = 128

In [29]:
# transform, dataset and dataloader
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms


class CutOrPadTransform:
    """
    Shape all sentences to the equal length.
    """
    def __init__(self, config):
        self.config = config
    
    def __call__(self, input):
        if len(input['words']) >= config.sentence_max_length:
            input['words'] = input['words'][:config.sentence_max_length]
        else:
            input['words'].extend([' '] * (config.sentence_max_length - len(input['words'])))
        return input


class WordsToIdsTransform:
    """
    Convert the list of words to embeddings.
    """
    def __init__(self, config, word_to_id):
        self.config = config
        self.word_to_id = word_to_id
    
    def __call__(self, input):
        input['word_ids'] = torch.tensor([word_to_id[w] for w in input['words']], dtype=torch.long)
        # del input['words']
        return input


class MovieReviewDataset(Dataset):
    def __init__(self, config, pos_data_folder, neg_data_folder, word_to_id, transform):
        self.config = config
        self.word_to_id = word_to_id
        self.data = []
        # read all data into memory
        for filename in os.listdir(pos_data_folder):
            if filename.endswith(".txt"):
                with open(os.path.join(pos_data_folder, filename), "r") as f:
                    self.data.append((f.readline(), 1))

        for filename in os.listdir(pos_data_folder):
            if filename.endswith(".txt"):
                with open(os.path.join(pos_data_folder, filename), "r") as f:
                    self.data.append((f.readline(), 0))

        self.transform = transform
    
    def __getitem__(self, idx):
        words = [w.strip() for w in self.data[idx][0].strip().split(" ")]
        label = self.data[idx][1]
        input = self.transform({'words': words, 'label': label})
        return input['words'], input['word_ids'], input['label']
        

    def __len__(self):
        return len(self.data)

In [30]:
# model
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCNN(nn.Module):
    def __init__(self, config, vocabulary_size):
        super().__init__()
        self.config = config
        self.embed = nn.Embedding(vocabulary_size, config.word_embedding_length)

        self.conv3 = nn.Conv2d(1, 1, kernel_size=(3, config.word_embedding_length))
        self.conv4 = nn.Conv2d(1, 1, kernel_size=(4, config.word_embedding_length))
        self.conv5 = nn.Conv2d(1, 1, kernel_size=(5, config.word_embedding_length))
        self.conv6 = nn.Conv2d(1, 1, kernel_size=(6, config.word_embedding_length))
        self.conv7 = nn.Conv2d(1, 1, kernel_size=(7, config.word_embedding_length))

        self.max_over_time_pool3 = nn.MaxPool2d((config.sentence_max_length - 2, 1))
        self.max_over_time_pool4 = nn.MaxPool2d((config.sentence_max_length - 3, 1))
        self.max_over_time_pool5 = nn.MaxPool2d((config.sentence_max_length - 4, 1))
        self.max_over_time_pool6 = nn.MaxPool2d((config.sentence_max_length - 5, 1))
        self.max_over_time_pool7 = nn.MaxPool2d((config.sentence_max_length - 6, 1))

        self.fc = nn.Linear(5, config.num_classes)


    def forward(self, x):
        batch = x.shape[0]
        x = torch.unsqueeze(self.embed(x), 1)  # [NCHW]
        c = self.conv3(x)
        # convs
        x1 = self.config.activation(self.conv3(x))
        x2 = self.config.activation(self.conv4(x))
        x3 = self.config.activation(self.conv5(x))
        x4 = self.config.activation(self.conv6(x))
        x5 = self.config.activation(self.conv7(x))

        # max over time pooling
        x1 = self.max_over_time_pool3(x1)
        x2 = self.max_over_time_pool4(x2)
        x3 = self.max_over_time_pool5(x3)
        x4 = self.max_over_time_pool6(x4)
        x5 = self.max_over_time_pool7(x5)

        x = torch.cat((x1, x2, x3, x4, x5), dim=-1)
        x = self.fc(x)
        x = x.view(batch, -1)
        
        return x

In [31]:
# trainer
def train(model, config, train_dataloader, device, check_interval=1000):
    criteria = config.criteria()
    optimizer = config.optimizer(model.parameters(), config.lr)
    start = time.time()
    counts = 0
    for epoch in range(config.epochs):
        for i, (words, word_ids, labels) in enumerate(train_dataloader):
            counts += labels.shape[0]
            output = model(word_ids.to(device))
            loss = criteria(output, labels.to(device))
            optimizer.zero_grad()
            optimizer.step()
            if ((epoch + 1) * i) % check_interval == 0:
                print("[%d seconds](epoch: %d/%d)[%d samples] loss: %.3f." % (time.time() - start, epoch + 1, config.epochs, counts, loss.mean().item()))

In [32]:
# put everything together
import time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = Config()

pos_data_folder = os.path.join(dataset_folder_path, "train/pos")
neg_data_folder = os.path.join(dataset_folder_path, "train/neg")
train_dataset = MovieReviewDataset(config, pos_data_folder, neg_data_folder, word_to_id, 
                            transform=transforms.Compose([
                                CutOrPadTransform(config), WordsToIdsTransform(config, word_to_id)
                            ]))
dataloader = DataLoader(dataset=train_dataset, batch_size=config.batch_size, shuffle=True)

model = TextCNN(config, len(word_to_id)).to(device)

train(model, config, dataloader, device)

loss: 0.690.
[58 seconds](epoch: 49/5000)[1200128 samples] loss: 0.696.
[59 seconds](epoch: 50/5000)[1225128 samples] loss: 0.699.
[59 seconds](epoch: 50/5000)[1227688 samples] loss: 0.689.
[59 seconds](epoch: 50/5000)[1230248 samples] loss: 0.696.
[59 seconds](epoch: 50/5000)[1232808 samples] loss: 0.701.
[59 seconds](epoch: 50/5000)[1235368 samples] loss: 0.695.
[59 seconds](epoch: 50/5000)[1237928 samples] loss: 0.697.
[60 seconds](epoch: 50/5000)[1240488 samples] loss: 0.688.
[60 seconds](epoch: 50/5000)[1243048 samples] loss: 0.690.
[60 seconds](epoch: 50/5000)[1245608 samples] loss: 0.695.
[60 seconds](epoch: 50/5000)[1248168 samples] loss: 0.695.
[60 seconds](epoch: 51/5000)[1250128 samples] loss: 0.702.
[61 seconds](epoch: 52/5000)[1275128 samples] loss: 0.688.
[63 seconds](epoch: 53/5000)[1300128 samples] loss: 0.704.
[64 seconds](epoch: 54/5000)[1325128 samples] loss: 0.698.
[65 seconds](epoch: 55/5000)[1350128 samples] loss: 0.695.
[66 seconds](epoch: 56/5000)[1375128 sample

KeyboardInterrupt: 

In [145]:
dataset = pd.read_csv(os.path.join(dataset_folder_path, "train.csv"))
print("dataset shape:", dataset.shape)
print(dataset.iloc[0])
# print(dataset.iloc[0][0])
# print(dataset.iloc[0][1])
# print(dataset.iloc[0][2])

dataset shape: (559999, 3)
1                                                                                                                                                                                                                                                                                                           1
E. D. Abbott Ltd                                                                                                                                                                                                                                                                               Schwan-Stabilo
 Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.     Schwan-STABILO is a German maker of pens for ...
Name: 0, dtype: object
