In [2]:
# env setup
import os
import sys

os.chdir("/home/yxjiang/source/ml_playground")

print(os.getcwd())

/home/yxjiang/source/ml_playground


In [95]:
# data downloading
from util import data_util
import pandas as pd
from collections import defaultdict

dbpedia_url="https://s3.amazonaws.com/fast-ai-nlp/dbpedia_csv.tgz"
dest_dir = "/tmp/data"
dataset_folder_path = os.path.join(dest_dir, "dbpedia_csv")
data_util.download_data(url=dbpedia_url, dest_dir=dest_dir)

# get vocabulary
def get_vocabulary(folder_path, file_suffix, check_interval=50000):
    """
    Get the word to id from the vocabulary of the corpus.
    """
    print("Processing vocabulary from [%s]." % (folder_path))
    vocab = set()
    for filename in os.listdir(folder_path):
        if not filename.endswith(file_suffix):
            continue
        sub_vocab = set()
        if file_suffix == ".csv" or file_suffix == "csv":
            data_frame = pd.read_csv(os.path.join(folder_path, filename))
            for i, row in data_frame.iterrows():
                if i % check_interval == 0:
                    print("Processed %d rows" % (i))
                sub_vocab |= set(row[2].strip().split(" "))
        else:
            raise Exception("Suffix [%s] not supported for calculating the vocabulary." % (file_suffix))
        vocab |= sub_vocab
    word_to_id = defaultdict(int)
    for i, w in enumerate(vocab, 1):
        word_to_id[w] = i
    return word_to_id

# generate word to id mapping
word_to_id = get_vocabulary(folder_path=dataset_folder_path, file_suffix=".csv")
print("There size of vocabulary is :", len(word_to_id))

# generate class id to name mapping
class_to_name = defaultdict(str)
with open(os.path.join(dataset_folder_path, "classes.txt"), "r") as f:
    for i, class_name in enumerate(f):
        class_to_name[i] = class_name.strip()
print("There class mapping:", class_to_name.items())

Destination folder exists.
Target file exists, skip downloading.
Data extracted, skip extracting.
Processing vocabulary from [/tmp/data/dbpedia_csv].
Processed 0 rows
Processed 50000 rows
Processed 100000 rows
Processed 150000 rows
Processed 200000 rows
Processed 250000 rows
Processed 300000 rows
Processed 350000 rows
Processed 400000 rows
Processed 450000 rows
Processed 500000 rows
Processed 550000 rows
Processed 0 rows
Processed 50000 rows
There size of vocabulary is : 1299678
There class mapping: dict_items([(0, 'Company'), (1, 'EducationalInstitution'), (2, 'Artist'), (3, 'Athlete'), (4, 'OfficeHolder'), (5, 'MeanOfTransportation'), (6, 'Building'), (7, 'NaturalPlace'), (8, 'Village'), (9, 'Animal'), (10, 'Plant'), (11, 'Album'), (12, 'Film'), (13, 'WrittenWork')])


In [146]:
# configs
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

class Config:
    def __init__(self):
        self.sentence_max_length = 30
        self.word_embedding_length = 32
        self.activation = F.tanh
        self.criteria = nn.CrossEntropyLoss
        self.optimizer = optim.Adam
        self.lr = 0.05
        self.epochs = 5
        self.batch_size = 2

In [147]:
# transform, dataset and dataloader
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class CutOrPadTransform:
    """
    Shape all sentences to the equal length
    """
    def __init__(self, config):
        self.config = config
    
    def __call__(self, input):
        if len(input['words']) >= config.sentence_max_length:
            input['words'] = input['words'][:config.sentence_max_length]
        else:
            input['words'].extend([' '] * (config.sentence_max_length - len(input['words'])))
        return input


class WordsToIdsTransform:
    """
    Convert the list of words to embeddings.
    """
    def __init__(self, config, word_to_id):
        self.config = config
        self.word_to_id = word_to_id
    
    def __call__(self, input):
        input['word_ids'] = torch.tensor([word_to_id[w] for w in input['words']], dtype=torch.long)
        # del input['words']
        return input


class TextDataset(Dataset):
    def __init__(self, config, csv_file, word_to_id, class_to_name, transform):
        self.config = config
        self.word_to_id = word_to_id
        self.class_to_name = class_to_name
        self.data_frame = pd.read_csv(csv_file)
        self.transform = transform
    
    def __getitem__(self, idx):
        words = [w.strip() for w in self.data_frame.iloc[idx, 2].strip().split(" ")]
        label = self.data_frame.iloc[idx, 0] - 1
        input = self.transform({'words': words, 'label': label})
        return input['words'], input['word_ids'], input['label']
        

    def __len__(self):
        return len(self.data_frame)

In [151]:
# model
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCNN(nn.Module):
    def __init__(self, config, vocabulary_size, num_labels):
        super().__init__()
        self.config = config
        self.embed = nn.Embedding(vocabulary_size, config.word_embedding_length)

        self.conv3 = nn.Conv2d(1, 1, kernel_size=(3, config.word_embedding_length))
        self.conv4 = nn.Conv2d(1, 1, kernel_size=(4, config.word_embedding_length))
        self.conv5 = nn.Conv2d(1, 1, kernel_size=(5, config.word_embedding_length))
        self.conv6 = nn.Conv2d(1, 1, kernel_size=(6, config.word_embedding_length))
        self.conv7 = nn.Conv2d(1, 1, kernel_size=(7, config.word_embedding_length))

        self.max_over_time_pool3 = nn.MaxPool2d((config.sentence_max_length - 2, 1))
        self.max_over_time_pool4 = nn.MaxPool2d((config.sentence_max_length - 3, 1))
        self.max_over_time_pool5 = nn.MaxPool2d((config.sentence_max_length - 4, 1))
        self.max_over_time_pool6 = nn.MaxPool2d((config.sentence_max_length - 5, 1))
        self.max_over_time_pool7 = nn.MaxPool2d((config.sentence_max_length - 6, 1))

        self.fc = nn.Linear(5, num_labels)


    def forward(self, x):
        batch = x.shape[0]
        x = torch.unsqueeze(self.embed(x), 1)  # [NCHW]
        c = self.conv3(x)
        # convs
        x1 = self.config.activation(self.conv3(x))
        x2 = self.config.activation(self.conv4(x))
        x3 = self.config.activation(self.conv5(x))
        x4 = self.config.activation(self.conv6(x))
        x5 = self.config.activation(self.conv7(x))

        # max over time pooling
        x1 = self.max_over_time_pool3(x1)
        x2 = self.max_over_time_pool4(x2)
        x3 = self.max_over_time_pool5(x3)
        x4 = self.max_over_time_pool6(x4)
        x5 = self.max_over_time_pool7(x5)

        x = torch.cat((x1, x2, x3, x4, x5), dim=-1)
        x = self.fc(x)
        x = x.view(batch, -1)
        
        return x

In [152]:
# trainer
def train(model, config, train_dataloader, device, check_interval=1000):
    criteria = config.criteria()
    optimizer = config.optimizer(model.parameters(), config.lr)
    start = time.time()
    counts = 0
    for epoch in range(config.epochs):
        for i, (words, word_ids, labels) in enumerate(train_dataloader):
            counts += labels.shape[0]
            output = model(word_ids.to(device))
            loss = criteria(output, labels.to(device))
            optimizer.zero_grad()
            optimizer.step()
            if ((epoch + 1) * i) % check_interval == 0:
                print("[%d seconds](epoch: %d/%d)[%d samples] loss: %.3f." % (time.time() - start, epoch + 1, config.epochs, counts, loss.mean().item()))

In [153]:
# put everything together
import time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = Config()

train_csv_file = os.path.join(dataset_folder_path, "train.csv")
train_dataset = TextDataset(config, train_csv_file, word_to_id, class_to_name, 
                            transform=transforms.Compose([
                                CutOrPadTransform(config), WordsToIdsTransform(config, word_to_id)
                            ]))
dataloader = DataLoader(dataset=train_dataset, batch_size=config.batch_size, shuffle=True)

model = TextCNN(config, len(word_to_id), len(class_to_name)).to(device)

train(model, config, dataloader, device)

[0 seconds](epoch: 1/5)[2 samples] loss: 3.128.
[1 seconds](epoch: 1/5)[2002 samples] loss: 3.044.
[2 seconds](epoch: 1/5)[4002 samples] loss: 2.801.
[3 seconds](epoch: 1/5)[6002 samples] loss: 2.779.
[4 seconds](epoch: 1/5)[8002 samples] loss: 2.422.
[5 seconds](epoch: 1/5)[10002 samples] loss: 2.657.
[6 seconds](epoch: 1/5)[12002 samples] loss: 2.748.
[7 seconds](epoch: 1/5)[14002 samples] loss: 2.457.
[9 seconds](epoch: 1/5)[16002 samples] loss: 2.274.
[10 seconds](epoch: 1/5)[18002 samples] loss: 3.041.
[11 seconds](epoch: 1/5)[20002 samples] loss: 2.926.
[12 seconds](epoch: 1/5)[22002 samples] loss: 3.006.
[13 seconds](epoch: 1/5)[24002 samples] loss: 2.442.
[14 seconds](epoch: 1/5)[26002 samples] loss: 2.351.
[15 seconds](epoch: 1/5)[28002 samples] loss: 2.796.
[16 seconds](epoch: 1/5)[30002 samples] loss: 2.802.
[18 seconds](epoch: 1/5)[32002 samples] loss: 2.588.
[19 seconds](epoch: 1/5)[34002 samples] loss: 2.467.
[20 seconds](epoch: 1/5)[36002 samples] loss: 2.853.
[21 second

KeyboardInterrupt: 

In [145]:
dataset = pd.read_csv(os.path.join(dataset_folder_path, "train.csv"))
print("dataset shape:", dataset.shape)
print(dataset.iloc[0])
# print(dataset.iloc[0][0])
# print(dataset.iloc[0][1])
# print(dataset.iloc[0][2])

dataset shape: (559999, 3)
1                                                                                                                                                                                                                                                                                                           1
E. D. Abbott Ltd                                                                                                                                                                                                                                                                               Schwan-Stabilo
 Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.     Schwan-STABILO is a German maker of pens for ...
Name: 0, dtype: object
