# CS769 Project
## Implementation of Existing Works
### Character-level CNN on Grouped Emotions
We first applied CharCNN to the fine-grained dataset, and its performance was poor. In the GoEmotions paper, the authors also divide the emotions into coarser groups, so in this notebook we run the same experiment on that grouped taxonomy. The 28 emotion labels are divided into:
- ambiguous
- negative
- neutral
- positive

### Data Preprocessing

In [1]:
from google.colab import drive
import numpy as np
import pandas as pd

drive.mount('/content/drive')

# Read the data that need preprocessing: the raw train / dev / test splits.
# The files are tab-separated and are read without a header row, so pandas
# labels the columns 0, 1, 2, ...
training_data, dev_data, test_data = [
    pd.read_csv(split_path, header=None, sep='\t')
    for split_path in ('Project/Data/train.tsv',
                       'Project/Data/dev.tsv',
                       'Project/Data/test.tsv')
]

Mounted at /content/drive


In [27]:
# Get the list of emotions: one emotion name per line. The position of a name
# in this list is used as its label id by generate_group_dataset.
# The file is read line by line instead of with pd.read_csv(..., sep='\n'),
# because recent pandas versions refuse '\n' as a separator.
with open('Project/Data/emotions.txt', encoding='utf-8') as emotions_file:
    emotion_list = [line.strip() for line in emotions_file if line.strip()]

In [25]:
# Read the emotion mapping files. Each JSON file maps a group name to the
# collection of emotion names that belong to that group.
import json

# `with` closes each file even if json.load raises on malformed content.
with open('Project/Data/sentiment_mapping.json', encoding='utf-8') as f:
    sentiment_group = json.load(f)
with open('Project/Data/ekman_mapping.json', encoding='utf-8') as g:
    ekman_group = json.load(g)

In [28]:
# Create datasets based on different grouping methods.
def generate_group_dataset(input_data, labels, json_file):
  '''
  Collapse the fine-grained labels of a dataset into coarser groups.

  input:
    input_data: a pandas dataframe, e.g the dataset in our github repo.
      Column 1 holds the label ids, either as a comma-separated string such
      as "3,10" or as a single integer; only the first id of a row is used.
      Column 0 is carried over unchanged (presumably the text).
    labels: the list of classes (in our case, emotions); labels[i] is the
      name of label id i.
    json_file: the grouping method, a dict mapping each group name to the
      collection of emotion names in that group.
  output:
    output_data: the grouped dataset, a new dataframe with columns [1, 0]
      where column 1 now holds the group id. input_data is not modified.
      A row whose emotion belongs to no group keeps its original label value.
    output_labels: the label assignment, a dict mapping group name to its
      integer id. 'neutral' is 0 and the groups of json_file are numbered
      from 1 in iteration order.
  '''
  output_labels = {'neutral': 0}
  for group_id, group in enumerate(json_file, start=1):
    output_labels[group] = group_id

  # emotion name -> group id. 'neutral' always maps to 0, and an emotion that
  # is listed in several groups is assigned to the first group that lists it.
  emotion_to_id = {'neutral': 0}
  for group in json_file:
    for emotion in json_file[group]:
      emotion_to_id.setdefault(emotion, output_labels[group])

  def to_group_id(raw_label):
    # str() lets a purely numeric label column (parsed as int) work too.
    emotion = labels[int(str(raw_label).split(',')[0])]
    return emotion_to_id.get(emotion, raw_label)

  output_data = input_data.reindex(columns=[1, 0])
  # Replace the whole column in one assignment. Writing element by element
  # through output_data[1][l] is chained assignment, which pandas does not
  # guarantee to write back, and it also assumes a 0..n-1 row index.
  output_data[1] = output_data[1].map(to_group_id)
  return output_data, output_labels

In [30]:
# Build the sentiment-grouped version of every split. The returned label
# assignment depends only on the mapping, so it is kept once (label_view)
# and discarded for the dev and test calls.
training_data_emotions, label_view = generate_group_dataset(
    input_data=training_data, labels=emotion_list, json_file=sentiment_group)
dev_data_emotions, _ = generate_group_dataset(
    input_data=dev_data, labels=emotion_list, json_file=sentiment_group)
test_data_emotions, _ = generate_group_dataset(
    input_data=test_data, labels=emotion_list, json_file=sentiment_group)

In [None]:
# Persist the grouped splits as (group id, text) rows without header or index.
# to_csv is called without `sep`, so the files are comma-separated even though
# they carry a .tsv extension; MyDataset reads them back with csv.reader.
training_data_emotions.to_csv(path_or_buf='Project/Data/train_emotion_group.tsv', index=False, header=False)
dev_data_emotions.to_csv(path_or_buf='Project/Data/dev_emotion_group.tsv', index=False, header=False)
test_data_emotions.to_csv(path_or_buf='Project/Data/test_emotion_group.tsv', index=False, header=False)

### Model Architecture

The model follows the architecture described in the original character-level CNN paper.

In [3]:
import torch.nn as nn
'''
Credit: we follow the implementation of https://github.com/uvipen/Character-level-cnn-pytorch
'''

class CharacterLevelCNN(nn.Module):
    '''
    Character-level CNN: six 1D convolutional layers followed by three fully
    connected layers. forward() returns unnormalized class scores.
    '''

    def __init__(self, n_classes=28, input_length=1014, input_dim=68,
                 n_conv_filters=256,
                 n_fc_neurons=1024):
        '''
        As indicated in their paper, the authors use 6 convolutional layers (all 1D)
        The kernel sizes are the same as the original paper.

        n_classes: size of the output layer.
        input_length: number of character positions in every input sample.
        input_dim: size of the one-hot character encoding (alphabet size).
        n_conv_filters: number of channels of every convolutional layer.
        n_fc_neurons: width of the two hidden fully connected layers.

        Raises ValueError when input_length is too short to survive the
        convolution / pooling stack.
        '''
        super(CharacterLevelCNN, self).__init__()
        self.conv1 = nn.Sequential(nn.Conv1d(input_dim, n_conv_filters, kernel_size=7, padding=0), nn.ReLU(),
                                   nn.MaxPool1d(3))
        self.conv2 = nn.Sequential(nn.Conv1d(n_conv_filters, n_conv_filters, kernel_size=7, padding=0), nn.ReLU(),
                                   nn.MaxPool1d(3))
        self.conv3 = nn.Sequential(nn.Conv1d(n_conv_filters, n_conv_filters, kernel_size=3, padding=0), nn.ReLU())
        self.conv4 = nn.Sequential(nn.Conv1d(n_conv_filters, n_conv_filters, kernel_size=3, padding=0), nn.ReLU())
        self.conv5 = nn.Sequential(nn.Conv1d(n_conv_filters, n_conv_filters, kernel_size=3, padding=0), nn.ReLU())
        self.conv6 = nn.Sequential(nn.Conv1d(n_conv_filters, n_conv_filters, kernel_size=3, padding=0), nn.ReLU(),
                                   nn.MaxPool1d(3))

        # Size of the flattened conv6 output. The closed form
        # (input_length - 96) / 27 is only exact when input_length - 96 is a
        # multiple of 27 (e.g. the default 1014), so the length is traced
        # through the layers with integer arithmetic instead.
        flat_length = self._conv_output_length(input_length)
        if flat_length < 1:
            raise ValueError(
                "input_length={} is too short for the convolutional stack".format(input_length))
        dimension = flat_length * n_conv_filters
        # NOTE(review): fc1 / fc2 apply dropout directly after the linear layer,
        # with no non-linearity in between - confirm this is intended.
        self.fc1 = nn.Sequential(nn.Linear(dimension, n_fc_neurons), nn.Dropout(0.5))
        self.fc2 = nn.Sequential(nn.Linear(n_fc_neurons, n_fc_neurons), nn.Dropout(0.5))
        self.fc3 = nn.Linear(n_fc_neurons, n_classes)
        '''
        Different weighting strategy for small / large scale model,
        we use the small model
        '''
        # Any other filter / neuron combination keeps PyTorch's default init.
        if n_conv_filters == 256 and n_fc_neurons == 1024:
            self._create_weights(mean=0.0, std=0.05)
        elif n_conv_filters == 1024 and n_fc_neurons == 2048:
            self._create_weights(mean=0.0, std=0.02)

    @staticmethod
    def _conv_output_length(input_length):
        '''
        Sequence length left after conv1..conv6 for a given input length.
        '''
        length = input_length
        # (kernel size, pooling size) of conv1..conv6; pooling size 1 = no pooling.
        for kernel_size, pool_size in ((7, 3), (7, 3), (3, 1), (3, 1), (3, 1), (3, 3)):
            length = (length - kernel_size + 1) // pool_size
        return length

    def _create_weights(self, mean=0.0, std=0.05):
        '''
        Weight Initializer: draw every Conv1d / Linear weight from N(mean, std).
        Biases are left untouched.
        '''
        for module in self.modules():
            if isinstance(module, nn.Conv1d) or isinstance(module, nn.Linear):
                module.weight.data.normal_(mean, std)

    def forward(self, input):
        '''
        input: tensor whose dimensions 1 and 2 are swapped before the
          convolutions, i.e. laid out as (batch, input_length, input_dim).
        Returns the output of fc3, one row of n_classes scores per sample.
        '''
        input = input.transpose(1, 2)
        output = self.conv1(input)
        output = self.conv2(output)
        output = self.conv3(output)
        output = self.conv4(output)
        output = self.conv5(output)
        output = self.conv6(output)

        # Flatten (batch, channels, length) to (batch, channels * length).
        output = output.view(output.size(0), -1)
        output = self.fc1(output)
        output = self.fc2(output)
        output = self.fc3(output)

        return output

### Custom Dataloader in PyTorch

In [4]:
import numpy as np
import sys
import csv
from torch.utils.data import Dataset
csv.field_size_limit(sys.maxsize)


class MyDataset(Dataset):
    '''
    Character-level dataset read from a comma-separated file.

    We assume that the data file has a specific format:
      The first column: label, integer.
      The remaining column(s): text data; they are joined, each followed by
      a space, to form the text of the sample.
    '''

    def __init__(self, data_path, max_length=1014):
        '''
        data_path: path of the file to read (parsed with csv.reader).
        max_length: number of character rows of every encoded sample.
        '''
        self.data_path = data_path
        # The authors only consider the following characters.
        # NOTE: the text is not lower-cased, so upper-case letters, spaces and
        # every other character outside this list are skipped when encoding.
        self.vocabulary = list("""abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}""")
        self.identity_mat = np.identity(len(self.vocabulary))
        # character -> one-hot column, for O(1) lookups in __getitem__.
        # setdefault keeps the first position of a character, like list.index.
        self._char_to_index = {}
        for position, char in enumerate(self.vocabulary):
            self._char_to_index.setdefault(char, position)

        texts, labels = [], []
        # newline='' is what the csv module requires for correct handling of
        # quoted fields; utf-8 matches what pandas.to_csv writes by default.
        with open(data_path, encoding='utf-8', newline='') as csv_file:
            for line in csv.reader(csv_file):
                if not line:
                    # csv.reader yields [] for a blank line (e.g. at end of file).
                    continue
                labels.append(int(line[0]))
                texts.append("".join(tx + " " for tx in line[1:]))
        self.texts = texts
        self.labels = labels
        self.max_length = max_length
        self.length = len(self.labels)
        # Number of distinct labels that occur in this file.
        self.num_classes = len(set(self.labels))

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        '''
        Return (data, label) for one sample. data is a float32 array of shape
        (max_length, len(vocabulary)): one one-hot row per known character,
        truncated to max_length and padded with all-zero rows.
        '''
        raw_text = self.texts[index]
        # Encode the characters as described in the paper.
        columns = [self._char_to_index[char] for char in raw_text if char in self._char_to_index]
        columns = columns[:self.max_length]
        data = np.zeros((self.max_length, len(self.vocabulary)), dtype=np.float32)
        if columns:
            data[np.arange(len(columns)), columns] = 1.0
        label = self.labels[index]
        return data, label

In [5]:
from sklearn import metrics

def get_evaluation(y_true, y_prob, list_metrics):
    '''
    Compute the requested classification metrics.

    y_true: true class ids, one per sample.
    y_prob: per-class scores, one row per sample; the predicted class is the
      argmax over the last axis.
    list_metrics: any of 'accuracy', 'loss', 'confusion_matrix'.

    Returns a dict with, depending on list_metrics:
      'accuracy'; 'loss' (-1 if log_loss raises ValueError);
      'confusion_matrix' (string form), 'confusion_mat_original' (array)
      and 'f1scores' (macro-averaged F1).
    '''
    y_pred = np.argmax(y_prob, -1)
    output = {}
    if 'accuracy' in list_metrics:
        output['accuracy'] = metrics.accuracy_score(y_true, y_pred)
    if 'loss' in list_metrics:
        # NOTE(review): log_loss expects probabilities, while the callers in
        # this notebook pass the raw model outputs - confirm before relying
        # on this value.
        try:
            output['loss'] = metrics.log_loss(y_true, y_prob)
        except ValueError:
            # Deliberate best-effort: -1 marks "loss not available".
            output['loss'] = -1
    if 'confusion_matrix' in list_metrics:
        # Computed once and stored both as an array and in printable form.
        confusion = metrics.confusion_matrix(y_true, y_pred)
        output['confusion_matrix'] = str(confusion)
        output['confusion_mat_original'] = confusion
        output['f1scores'] = metrics.f1_score(y_true, y_pred, average='macro')
    return output

### Training

In [14]:
import os
import sys
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import shutil

def train():
    '''
    Train the character-level CNN on the grouped emotion data.

    Reads the train / dev files produced by the preprocessing cells, trains
    for opt_num_epochs epochs, logs per-iteration training metrics and
    per-epoch dev metrics to tensorboard, and saves the whole model every time
    the dev loss improves by more than opt_es_min_delta. All settings are the
    opt_* locals below.
    '''
    # Imported locally: the notebook only imports SummaryWriter in the later
    # evaluation cell, so on a fresh kernel this function would otherwise
    # fail with a NameError.
    from tensorboardX import SummaryWriter

    # torch.manual_seed seeds the CPU generator as well as the CUDA ones.
    # Seeding only CUDA left weight init and batch shuffling unseeded on GPU runs.
    torch.manual_seed(123)
    use_cuda = torch.cuda.is_available()

    opt_log_path = "Project/logs/charCNN"
    opt_dataset = "GoEmotions_Group"
    opt_optimizer = "adam" # ['adam', 'sgd']
    opt_num_epochs =15
    opt_es_min_delta = 0.01 # improvement margin of loss
    opt_es_patience = 0 # Early stopping measure, wait for _ epochs when no improvement; 0 disables early stopping
    opt_lr = 0.0008 # 0.001 as a starting point for adam, 0.01 for sgd
    opt_batch_size = 128
    opt_max_length = 1014 # maximum length of the text data
    opt_feature = 'small' # Small: 256 filters, Large: 1024 filters
    opt_alphabet = """abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"""
    opt_input, opt_output = "Project/Data/", "Project/Output/"

    training_params = {"batch_size": opt_batch_size,
                       "shuffle": True,
                       "num_workers": 0}
    test_params = {"batch_size": opt_batch_size,
                   "shuffle": False,
                   "num_workers": 0}
    training_set = MyDataset(opt_input + "train" + "_emotion_group.tsv", opt_max_length)
    # The dev split is used for the per-epoch evaluation.
    test_set = MyDataset(opt_input + "dev" + "_emotion_group.tsv", opt_max_length)
    training_generator = DataLoader(training_set, **training_params)
    test_generator = DataLoader(test_set, **test_params)

    if opt_feature == "small":
        model = CharacterLevelCNN(input_length=opt_max_length, n_classes=training_set.num_classes,
                                  input_dim=len(opt_alphabet),
                                  n_conv_filters=256, n_fc_neurons=1024)

    elif opt_feature == "large":
        model = CharacterLevelCNN(input_length=opt_max_length, n_classes=training_set.num_classes,
                                  input_dim=len(opt_alphabet),
                                  n_conv_filters=1024, n_fc_neurons=2048)
    else:
        sys.exit("Invalid feature mode!")

    # Start every run with an empty log directory.
    log_path = "{}_{}_{}".format(opt_log_path, opt_feature, opt_dataset)
    if os.path.isdir(log_path):
        shutil.rmtree(log_path)
    os.makedirs(log_path)
    writer = SummaryWriter(log_path)

    # torch.save does not create missing directories.
    os.makedirs(opt_output, exist_ok=True)
    checkpoint_path = os.path.join(opt_output, "char-cnn_{}_{}".format(opt_dataset, opt_feature))

    if use_cuda:
        model.cuda()

    criterion = nn.CrossEntropyLoss()
    if opt_optimizer == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=opt_lr)
    elif opt_optimizer == "sgd":
        optimizer = torch.optim.SGD(model.parameters(), lr=opt_lr, momentum=0.9)
    else:
        sys.exit("Invalid optimizer!")
    best_loss = 1e5
    best_epoch = 0
    model.train()
    num_iter_per_epoch = len(training_generator)

    try:
        for epoch in range(opt_num_epochs):
            for batch_idx, batch in enumerate(training_generator):
                feature, label = batch
                if use_cuda:
                    feature = feature.cuda()
                    label = label.cuda()
                optimizer.zero_grad()
                predictions = model(feature)
                loss = criterion(predictions, label)
                loss.backward()
                optimizer.step()

                training_metrics = get_evaluation(label.cpu().numpy(), predictions.cpu().detach().numpy(),
                                                  list_metrics=["accuracy"])
                train_loss = loss.item()
                global_step = epoch * num_iter_per_epoch + batch_idx
                print("Epoch: {}/{}, Iteration: {}/{}, Lr: {}, Loss: {}, Accuracy: {}".format(
                    epoch + 1,
                    opt_num_epochs,
                    batch_idx + 1,
                    num_iter_per_epoch,
                    optimizer.param_groups[0]['lr'],
                    train_loss, training_metrics["accuracy"]))
                writer.add_scalar('Train/Loss', train_loss, global_step)
                writer.add_scalar('Train/Accuracy', training_metrics["accuracy"], global_step)

            # Evaluate on the dev split; eval() switches dropout off.
            model.eval()
            total_loss = 0.0
            te_label_ls = []
            te_pred_ls = []
            for te_feature, te_label in test_generator:
                num_sample = len(te_label)
                if use_cuda:
                    te_feature = te_feature.cuda()
                    te_label = te_label.cuda()
                with torch.no_grad():
                    te_predictions = model(te_feature)
                    te_loss = criterion(te_predictions, te_label)
                # Weight by batch size so the last, smaller batch is averaged correctly.
                total_loss += te_loss.item() * num_sample
                te_label_ls.append(te_label.cpu())
                te_pred_ls.append(te_predictions.cpu())

            te_loss = total_loss / len(test_set)
            te_pred = torch.cat(te_pred_ls, 0)
            te_label = torch.cat(te_label_ls, 0).numpy()
            test_metrics = get_evaluation(te_label, te_pred.numpy(), list_metrics=["accuracy", "confusion_matrix"])
            print("Epoch: {}/{}, Lr: {}, Loss: {}, Accuracy: {}".format(
                epoch + 1,
                opt_num_epochs,
                optimizer.param_groups[0]['lr'],
                te_loss, test_metrics["accuracy"]))
            writer.add_scalar('Test/Loss', te_loss, epoch)
            writer.add_scalar('Test/Accuracy', test_metrics["accuracy"], epoch)
            model.train()
            if te_loss + opt_es_min_delta < best_loss:
                best_loss = te_loss
                best_epoch = epoch
                torch.save(model, checkpoint_path)
            # Early stopping (only active when opt_es_patience > 0)
            if epoch - best_epoch > opt_es_patience > 0:
                # Report best_loss: te_loss is the loss of the current, non-best epoch.
                print("Stop training at epoch {}. The lowest loss achieved is {} at epoch {}".format(epoch, best_loss, best_epoch))
                break
            # For sgd, halve the learning rate every third epoch.
            if opt_optimizer == "sgd" and epoch % 3 == 0 and epoch > 0:
                current_lr = optimizer.state_dict()['param_groups'][0]['lr']
                current_lr /= 2
                for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr
    finally:
        # Flush pending tensorboard events even if training is interrupted.
        writer.close()


if __name__ == "__main__":
    train()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: 1/15, Iteration: 116/340, Lr: 0.0008, Loss: 1.3263682126998901, Accuracy: 0.375
Epoch: 1/15, Iteration: 117/340, Lr: 0.0008, Loss: 1.3328170776367188, Accuracy: 0.3515625
Epoch: 1/15, Iteration: 118/340, Lr: 0.0008, Loss: 1.2594108581542969, Accuracy: 0.4375
Epoch: 1/15, Iteration: 119/340, Lr: 0.0008, Loss: 1.2665585279464722, Accuracy: 0.40625
Epoch: 1/15, Iteration: 120/340, Lr: 0.0008, Loss: 1.227908968925476, Accuracy: 0.4453125
Epoch: 1/15, Iteration: 121/340, Lr: 0.0008, Loss: 1.3229091167449951, Accuracy: 0.375
Epoch: 1/15, Iteration: 122/340, Lr: 0.0008, Loss: 1.3289718627929688, Accuracy: 0.359375
Epoch: 1/15, Iteration: 123/340, Lr: 0.0008, Loss: 1.3197991847991943, Accuracy: 0.421875
Epoch: 1/15, Iteration: 124/340, Lr: 0.0008, Loss: 1.3553417921066284, Accuracy: 0.2890625
Epoch: 1/15, Iteration: 125/340, Lr: 0.0008, Loss: 1.2872015237808228, Accuracy: 0.4296875
Epoch: 1/15, Iteration: 126/340, Lr: 0.00

### Evaluation

In [9]:
# Load best model, compute the F-1 scores
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
import shutil
# use cpu because maximum quota reached ... 
# The checkpoint is read from the directory train() saves to (Project/Output/).
# NOTE(review): torch.load unpickles a whole model object, which can execute
# arbitrary code - only load checkpoints you created yourself. Recent PyTorch
# releases may need weights_only=False for a whole-model checkpoint like this
# one - confirm against the installed version.
best_model = torch.load('Project/Output/char-cnn_GoEmotions_Group_small',map_location=torch.device('cpu'))
best_model.eval()
# Run the batches on whatever device the model lives on. Moving them to CUDA
# whenever a GPU is visible would clash with the CPU-mapped model.
device = next(best_model.parameters()).device
opt_batch_size = 128
opt_max_length = 1014
opt_input = "Project/Data/"
test_params = {"batch_size": opt_batch_size,
               "shuffle": False,
               "num_workers": 0}
criterion = nn.CrossEntropyLoss()
loss_ls = []
# The dev split is evaluated here.
test_set = MyDataset(opt_input + "dev" + "_emotion_group.tsv", opt_max_length)
test_generator = DataLoader(test_set, **test_params)
te_label_ls = []
te_pred_ls = []
for batch in test_generator:
  te_feature, te_label = batch
  num_sample = len(te_label)
  te_feature = te_feature.to(device)
  te_label = te_label.to(device)
  with torch.no_grad():
    te_predictions = best_model(te_feature)
    te_loss = criterion(te_predictions, te_label)
  # Weight by batch size so the last, smaller batch is averaged correctly.
  loss_ls.append(te_loss.item() * num_sample)
  te_label_ls.append(te_label.cpu())
  te_pred_ls.append(te_predictions.cpu())

te_loss = sum(loss_ls) / len(test_set)
te_pred = torch.cat(te_pred_ls, 0)
te_label = torch.cat(te_label_ls, 0).numpy()
test_metrics = get_evaluation(te_label, te_pred.numpy(), list_metrics=["accuracy", "confusion_matrix"])
print("Loss: {}, Accuracy: {}".format(
            te_loss, test_metrics["accuracy"]))

Loss: 1.0757064819335938, Accuracy: 0.5431256911168448


In [10]:
test_metrics['f1scores']

0.4428739297553563

In [16]:
test_metrics['confusion_mat_original']

array([[1135,  371,   20,   66],
       [ 562, 1554,   16,   37],
       [ 730,  288,   99,   26],
       [ 266,   88,    9,  159]])

In [19]:
def get_f1scores(confusion_matrix):
  '''
  Compute the f1 scores, precision, recall for each class.

  input:
    confusion_matrix: square array-like; entry [i, j] counts the samples of
      actual class i that were predicted as class j.
  output:
    precision_list, recall_list, f1_list: one float per class. A score whose
      denominator is zero (class never predicted, class never present, or
      precision + recall == 0) is reported as 0.0 instead of nan.
  '''
  counts = np.asarray(confusion_matrix, dtype=np.float64)
  if counts.size == 0:
    return [], [], []

  true_positive = np.diag(counts)
  predicted_positive = counts.sum(axis=0)  # column sums
  actual_positive = counts.sum(axis=1)     # row sums

  def safe_ratio(numerator, denominator):
    # Element-wise division that leaves 0.0 wherever the denominator is 0.
    return np.divide(numerator, denominator,
                     out=np.zeros_like(numerator, dtype=np.float64),
                     where=denominator > 0)

  precision = safe_ratio(true_positive, predicted_positive)
  recall = safe_ratio(true_positive, actual_positive)
  f1 = safe_ratio(2 * precision * recall, precision + recall)
  return precision.tolist(), recall.tolist(), f1.tolist()
    


In [20]:
# Per-class precision / recall / F1 derived from the confusion matrix of the
# evaluation above.
precision_trial, recall_trial, f1_trial = get_f1scores(
    confusion_matrix=test_metrics['confusion_mat_original'])