In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import csv

import pandas as pd

In [None]:
# Lookup table from numeric label id to emotion name (tab-separated file)
emotion_labels = pd.read_csv(
    "mapping.txt",
    sep="\t",
    names=["label", "emotion"],
)

In [None]:
# Training set
# QUOTE_NONE keeps every physical line as exactly one row: with the default
# quoting, a text that starts with a double quote can swallow the following
# lines, which would shift the texts against their labels.
train_text = pd.read_csv("train_text.txt", sep="\t", names=['text'], quoting=csv.QUOTE_NONE)
train_label = pd.read_csv("train_labels.txt", sep="\t", names=['label'])
# concat(axis=1) pairs rows purely by position, so both files must have the same length
assert len(train_text) == len(train_label), "train_text.txt and train_labels.txt have different row counts"
train_df = pd.concat([train_text, train_label], axis=1)

In [None]:
# Validation set
# QUOTE_NONE keeps every physical line as exactly one row: with the default
# quoting, a text that starts with a double quote can swallow the following
# lines, which would shift the texts against their labels.
val_text = pd.read_csv("val_text.txt", sep="\t", names=['text'], quoting=csv.QUOTE_NONE)
val_label = pd.read_csv("val_labels.txt", sep="\t", names=['label'])
# concat(axis=1) pairs rows purely by position, so both files must have the same length
assert len(val_text) == len(val_label), "val_text.txt and val_labels.txt have different row counts"
valid_df = pd.concat([val_text, val_label], axis=1)

In [None]:
# Test set
# QUOTE_NONE keeps every physical line as exactly one row: with the default
# quoting, a text that starts with a double quote can swallow the following
# lines, which would shift the texts against their labels.
test_text = pd.read_csv("test_text.txt", sep="\t", names=['text'], quoting=csv.QUOTE_NONE)
test_label = pd.read_csv("test_labels.txt", sep="\t", names=['label'])
# concat(axis=1) pairs rows purely by position, so both files must have the same length
assert len(test_text) == len(test_label), "test_text.txt and test_labels.txt have different row counts"
test_df = pd.concat([test_text, test_label], axis=1)

**Pick 3 emotion classes to build two 2-class datasets**

In [None]:
# Randomly pick 3 distinct emotions: emotion[0], emotion[1], emotion[2]
# 1st dataframe contains emotion[0], emotion[1]
# 2nd dataframe contains emotion[1], emotion[2]
# A fixed random_state makes the draw (and therefore every result below)
# repeatable after a kernel restart; change the seed to draw another triple.
SELECTION_SEED = 42
selected_emotions = emotion_labels.sample(n=3, random_state=SELECTION_SEED)["label"].tolist()

In [None]:
# Label ids of the two binary tasks; they share selected_emotions[1]
first_pair = selected_emotions[:2]
second_pair = selected_emotions[1:]

# .copy() gives each subset its own data: new columns are assigned to these
# frames later, and assigning to a boolean-mask slice of train_df/valid_df/
# test_df is chained assignment (SettingWithCopyWarning, undefined write target).
selected_train_df1 = train_df[train_df['label'].isin(first_pair)].copy()
selected_valid_df1 = valid_df[valid_df['label'].isin(first_pair)].copy()
selected_test_df1 = test_df[test_df['label'].isin(first_pair)].copy()

selected_train_df2 = train_df[train_df['label'].isin(second_pair)].copy()
selected_valid_df2 = valid_df[valid_df['label'].isin(second_pair)].copy()
selected_test_df2 = test_df[test_df['label'].isin(second_pair)].copy()

In [None]:
import nltk
from nltk.corpus import stopwords
import string

nltk.download('stopwords')

# Built once instead of on every call: both values are the same for every text,
# so there is no need to re-read the stop-word list for each row.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS = frozenset(stopwords.words('english'))

def preprocess(texts):
  """Turn one raw text into a list of cleaned word tokens.

  Steps: lowercase, split on whitespace, strip ASCII punctuation from each
  word, then keep only words that are purely alphabetic, longer than one
  character and not an English stop word.

  Args:
    texts: a single text string. Anything that is not a string (for example
      NaN from a missing row) yields an empty token list.

  Returns:
    List of token strings in their original order.
  """
  # Missing values arrive as float NaN and have no .lower(); treat them as empty
  if not isinstance(texts, str):
    return []
  tokens = []
  for raw_word in texts.lower().split():
    word = raw_word.translate(_PUNCT_TABLE)
    if len(word) > 1 and word.isalpha() and word not in _STOP_WORDS:
      tokens.append(word)
  return tokens

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Add the token-list column to every split of both datasets
for frame in (
    selected_train_df1, selected_valid_df1, selected_test_df1,
    selected_train_df2, selected_valid_df2, selected_test_df2,
):
  frame["preprocess_text"] = frame["text"].apply(preprocess)

**Work with 1st Dataset**

In [None]:
from collections import Counter
from keras.preprocessing.text import Tokenizer
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Build Vocabulary
# Flatten the per-row token lists of the 1st training split into one list
all_train_tokens = [
  token
  for row in selected_train_df1["preprocess_text"].tolist()
  for token in row
]

# Token -> number of occurrences in the training split
vocab = Counter(all_train_tokens)

In [None]:
def convert_text_to_sequence(tokens, max_length):
  """Encode one tokenised text as a fixed-length vector of word ids.

  Uses the module-level ``tokenizer`` to look up the ids.

  Args:
    tokens: list of word strings belonging to a single text.
    max_length: length of the returned vector.

  Returns:
    1-D int64 numpy array with exactly ``max_length`` entries. Texts with
    more ids than ``max_length`` are truncated (the original padding step
    raised on them); shorter texts are right-padded with 0.
  """
  # Every token is handed over as its own "text", so this is one id-list per token
  per_token_ids = tokenizer.texts_to_sequences(tokens)
  # Flatten explicitly so the result does not rely on each token yielding exactly one id
  flat_ids = [idx for ids in per_token_ids for idx in ids]
  # Start from all-padding and overwrite the leading positions with the real ids
  output = np.zeros(max_length, dtype=np.int64)
  kept = flat_ids[:max_length]
  output[:len(kept)] = kept
  return output

# Word-level tokenizer; words it has not seen are mapped to the "<OOV>" entry
tokenizer = Tokenizer(oov_token="<OOV>")
# Learn the word index from the tokens of the training split
tokenizer.fit_on_texts(all_train_tokens)
# Every encoded text is padded to this many ids
max_length = 500
# Attach the fixed-length id vectors to each split of the 1st dataset
for frame in (selected_train_df1, selected_valid_df1, selected_test_df1):
  frame["encode_text"] = frame["preprocess_text"].apply(convert_text_to_sequence, args=(max_length,))

# Id 0 is padding, id 1 is the <OOV> token

In [None]:
class CNN(nn.Module):
  """1-D convolutional binary text classifier.

  Pipeline: embedding -> dropout -> Conv1d -> ReLU -> global max-pool ->
  Linear -> dropout -> ReLU -> Linear -> sigmoid.

  Args:
    vocab_size: number of rows in the embedding table (largest token id + 1).
    embedding_dim: size of each word vector.
    max_length: accepted so existing callers keep working; the layers do not use it.
    filters: number of convolution output channels.
    kernel_size: width of the convolution window, in tokens.
    dropout_prob: drop probability shared by both dropout applications.
    hidden_dim: width of the hidden fully connected layer.
  """
  def __init__(self, vocab_size, embedding_dim, max_length, filters, kernel_size, dropout_prob, hidden_dim):
    super().__init__()
    # Embedding layer. Id 0 is the padding id, so padding_idx=0 keeps its vector
    # at zero and excludes it from gradient updates instead of learning a
    # "word" for padding.
    self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
    # Dropout
    self.dropout = nn.Dropout(dropout_prob)
    # Convolution layer
    self.conv1d = nn.Conv1d(in_channels=embedding_dim, out_channels=filters, kernel_size=kernel_size)
    # ReLu
    self.relu = nn.ReLU()
    # Global Max-Pooling
    self.pool = nn.AdaptiveMaxPool1d(1)
    # Fully connected layers
    self.fc1 = nn.Linear(filters, hidden_dim)
    self.fc2 = nn.Linear(hidden_dim, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    """Return one probability per input row.

    Args:
      x: integer tensor of token ids, shape (batch, seq_len); seq_len must
        be at least ``kernel_size`` for the convolution to produce output.

    Returns:
      Float tensor of shape (batch, 1) with values in [0, 1].
    """
    embedded = self.embeddings(x)
    dropout = self.dropout(embedded)
    dropout = dropout.permute(0, 2, 1)  # Conv1d expects (batch, channels, seq_len)
    conv1d = self.conv1d(dropout)
    relu = self.relu(conv1d)
    # Max over the sequence axis, then drop that axis: (batch, filters)
    pool = self.pool(relu).squeeze(-1)
    fc1 = self.relu(self.dropout(self.fc1(pool)))
    fc2 = self.fc2(fc1)
    output = self.sigmoid(fc2)
    return output

In [None]:
# Size of the embedding table: one row per entry of the tokenizer's word index,
# plus one extra row for id 0, which is used for padding.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Training Data
train_text_data = list(selected_train_df1["encode_text"])

train_label_data = list(selected_train_df1["label"])
# sorted() pins the mapping (smaller original label -> 0, larger -> 1);
# enumerating a bare set relies on set iteration order, which is not guaranteed.
selected_label = sorted(set(selected_train_df1["label"]))
# BCELoss needs both classes; a missing class would silently train on one target
assert len(selected_label) == 2, f"expected 2 classes in the training split, found {selected_label}"
# Map the label to {0,1}, because we will do Binary-cross-entropy later, only accept label=[0,1]
# dict = {Label1 : 0, Label2: 1}
mapping_dict = {value: index for index, value in enumerate(selected_label)}
train_label_data = [mapping_dict[value] for value in train_label_data]

# Stack into a single 2-D array first: building a tensor straight from a list
# of separate numpy arrays is much slower.
Xtrain = torch.tensor(np.stack(train_text_data), dtype=torch.long)
Ytrain = torch.Tensor(train_label_data)

In [None]:
# Valid Data
# Encoded id vectors of the validation split, one array per text
valid_text_data = selected_valid_df1["encode_text"].tolist()

# Original label ids translated to the {0, 1} targets fixed on the training split
valid_label_data = [mapping_dict[label] for label in selected_valid_df1["label"]]

# Ids as integer tensor (embedding lookup), targets as float tensor (BCE loss)
Xvalid = torch.LongTensor(valid_text_data)
Yvalid = torch.Tensor(valid_label_data)

In [None]:
# Testing Data
# Encoded id vectors of the test split, one array per text
test_text_data = selected_test_df1["encode_text"].tolist()

# Original label ids translated to the {0, 1} targets fixed on the training split
test_label_data = [mapping_dict[label] for label in selected_test_df1["label"]]

# Ids as integer tensor (embedding lookup), targets as float tensor (BCE loss)
Xtest = torch.LongTensor(test_text_data)
Ytest = torch.Tensor(test_label_data)

In [None]:
def train_evaluate_model(Xtrain, Ytrain, Xvalid, Yvalid, Xtest, Ytest, vocab_size, batch_sizes, epochs, learning_rate):
  """Train a fresh CNN, print per-epoch losses, and score it on the test split.

  Reads the module-level ``max_length`` when constructing the model.

  Args:
    Xtrain, Xvalid, Xtest: integer tensors of token ids, one row per text.
    Ytrain, Yvalid, Ytest: float tensors of 0/1 targets, one entry per text.
    vocab_size: number of rows for the embedding table.
    batch_sizes: mini-batch size used for training (a single int).
    epochs: number of passes over the training data.
    learning_rate: Adam learning rate.

  Returns:
    Accuracy on the test split, as a fraction in [0, 1].
  """
  print(f'Train & Evaluate model with batch_sizes={batch_sizes}; epochs={epochs}; learning_rate={learning_rate}')
  # Hyperparameter that we fixed:
  embedding_dim = 50
  filters = 250
  kernel_size = 3
  hidden_dim = 250
  dropout_prob = 0.2

  train_dataset = TensorDataset(Xtrain, Ytrain)
  train_dataloader = DataLoader(train_dataset, batch_size=batch_sizes, shuffle=True)

  # Initialize model and optimizer
  model = CNN(vocab_size, embedding_dim, max_length, filters, kernel_size, dropout_prob, hidden_dim)
  criterion = nn.BCELoss()
  optimizer = optim.Adam(model.parameters(), lr=learning_rate)

  for epoch in range(epochs):
    running_loss = 0.0
    model.train()
    for X, Y in train_dataloader:
      optimizer.zero_grad()
      output = model(X).squeeze(-1)
      loss = criterion(output, Y)
      loss.backward()
      optimizer.step()
      # criterion returns the batch mean; weight it by the batch size so the
      # epoch figure is a per-sample mean, independent of the number of batches
      running_loss += loss.item() * X.size(0)
    # Per-sample mean, directly comparable with the validation loss below
    # (the previous sum over batches grew with the number of batches)
    total_loss = running_loss / max(len(train_dataset), 1)

    # Validation loop
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
      v_output = model(Xvalid).squeeze(-1)
      validation_loss = criterion(v_output, Yvalid).item()

    print(f'[Train] Epoch [{epoch + 1}/{epochs}], Training Loss: {total_loss:.4f}, Validation Loss: {validation_loss:.4f}')

  # Evaluate with Testing set
  model.eval()
  with torch.no_grad():
    test_outputs = model(Xtest).squeeze(-1)
    # Threshold the probabilities at 0.5 to get hard 0/1 predictions
    test_outputs = (test_outputs > 0.5).float()
  y_true = Ytest.cpu().numpy()
  y_pred = test_outputs.cpu().numpy()
  accuracy = accuracy_score(y_true, y_pred)
  f1score = f1_score(y_true, y_pred, average='macro')
  print(f'[Test] Accuracy: {accuracy * 100:.4f}%, F1-score: {f1score * 100:.4f}%\n')

  return accuracy

In [None]:
# Candidate values explored by the grid search
batch_sizes = [30, 50, 70]
epochs = [3, 4, 5]
learning_rate = [0.001, 0.002]

# Start from the first candidate of every list
best_batch, best_epochs, best_lr = batch_sizes[0], epochs[0], learning_rate[0]
best_acc = 0

# NOTE(review): acc is the test-set accuracy returned by train_evaluate_model,
# so the winning configuration is selected on test data - confirm this is intended.
grid = [(batch, epoch, lr) for batch in batch_sizes for epoch in epochs for lr in learning_rate]
for batch, epoch, lr in grid:
  acc = train_evaluate_model(Xtrain, Ytrain, Xvalid, Yvalid, Xtest, Ytest, vocab_size, batch, epoch, lr)
  # ">=" means that on a tie the configuration tried later wins
  if acc >= best_acc:
    best_acc, best_batch, best_epochs, best_lr = acc, batch, epoch, lr

Train & Evaluate model with batch_sizes=30; epochs=3; learning_rate=0.001
[Train] Epoch [1/3], Training Loss: 22.1502, Validation Loss: 0.5526
[Train] Epoch [2/3], Training Loss: 21.9226, Validation Loss: 0.5458
[Train] Epoch [3/3], Training Loss: 19.7119, Validation Loss: 0.5562
[Test] Accuracy: 77.4257%, F1-score: 62.1210%

Train & Evaluate model with batch_sizes=30; epochs=3; learning_rate=0.002
[Train] Epoch [1/3], Training Loss: 23.3457, Validation Loss: 0.5654
[Train] Epoch [2/3], Training Loss: 20.8139, Validation Loss: 0.5384
[Train] Epoch [3/3], Training Loss: 16.7957, Validation Loss: 0.6700
[Test] Accuracy: 76.0396%, F1-score: 46.2336%

Train & Evaluate model with batch_sizes=30; epochs=4; learning_rate=0.001
[Train] Epoch [1/4], Training Loss: 23.1334, Validation Loss: 0.5739
[Train] Epoch [2/4], Training Loss: 21.7531, Validation Loss: 0.5557
[Train] Epoch [3/4], Training Loss: 20.3962, Validation Loss: 0.5339
[Train] Epoch [4/4], Training Loss: 17.6042, Validation Loss: 0

In [None]:
# Report the winning grid-search configuration, one setting per line
print(
  'Best Hyperparams Combination:',
  f'batch_sizes = {best_batch}',
  f'epochs = {best_epochs}',
  f'learning_rate = {best_lr}',
  sep='\n',
)

Best Hyperparams Combination:
batch_sizes = 70
epochs = 5
learning_rate = 0.001


**Work with 2nd Dataset**

In [None]:
# Build Vocabulary
# Flatten the per-row token lists of the 2nd training split into one list
all_train_tokens = [
  token
  for row in selected_train_df2["preprocess_text"].tolist()
  for token in row
]

# Token -> number of occurrences in the training split
vocab = Counter(all_train_tokens)

In [None]:
# Fresh word-level tokenizer for the 2nd dataset; unseen words map to "<OOV>"
tokenizer = Tokenizer(oov_token="<OOV>")
# Learn the word index from the tokens of the 2nd training split
tokenizer.fit_on_texts(all_train_tokens)
# Every encoded text is padded to this many ids
max_length = 500
# Attach the fixed-length id vectors to each split of the 2nd dataset
for frame in (selected_train_df2, selected_valid_df2, selected_test_df2):
  frame["encode_text"] = frame["preprocess_text"].apply(convert_text_to_sequence, args=(max_length,))

# Id 0 is padding, id 1 is the <OOV> token

In [None]:
# Size of the embedding table for the 2nd dataset: one row per entry of the
# tokenizer's word index, plus one extra row for id 0, which is used for padding.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Training Data
train_text_data = list(selected_train_df2["encode_text"])

train_label_data = list(selected_train_df2["label"])
# sorted() pins the mapping (smaller original label -> 0, larger -> 1);
# enumerating a bare set relies on set iteration order, which is not guaranteed.
selected_label = sorted(set(selected_train_df2["label"]))
# BCELoss needs both classes; a missing class would silently train on one target
assert len(selected_label) == 2, f"expected 2 classes in the training split, found {selected_label}"
# Map the label to {0,1}, because we will do Binary-cross-entropy later, only accept label=[0,1]
# dict = {Label1 : 0, Label2: 1}
mapping_dict = {value: index for index, value in enumerate(selected_label)}
train_label_data = [mapping_dict[value] for value in train_label_data]

# Stack into a single 2-D array first: building a tensor straight from a list
# of separate numpy arrays is much slower.
Xtrain = torch.tensor(np.stack(train_text_data), dtype=torch.long)
Ytrain = torch.Tensor(train_label_data)

In [None]:
# Valid Data
# Encoded id vectors of the validation split, one array per text
valid_text_data = selected_valid_df2["encode_text"].tolist()

# Original label ids translated to the {0, 1} targets fixed on the training split
valid_label_data = [mapping_dict[label] for label in selected_valid_df2["label"]]

# Ids as integer tensor (embedding lookup), targets as float tensor (BCE loss)
Xvalid = torch.LongTensor(valid_text_data)
Yvalid = torch.Tensor(valid_label_data)

In [None]:
# Testing Data
# Encoded id vectors of the test split, one array per text
test_text_data = selected_test_df2["encode_text"].tolist()

# Original label ids translated to the {0, 1} targets fixed on the training split
test_label_data = [mapping_dict[label] for label in selected_test_df2["label"]]

# Ids as integer tensor (embedding lookup), targets as float tensor (BCE loss)
Xtest = torch.LongTensor(test_text_data)
Ytest = torch.Tensor(test_label_data)

In [None]:
# Train and score on the 2nd dataset, reusing the best batch size, epoch count
# and learning rate selected by the grid search on the 1st dataset.
acc = train_evaluate_model(Xtrain, Ytrain, Xvalid, Yvalid, Xtest, Ytest, vocab_size, best_batch, best_epochs, best_lr)

Train & Evaluate model with batch_sizes=70; epochs=5; learning_rate=0.001
[Train] Epoch [1/5], Training Loss: 9.2783, Validation Loss: 0.6084
[Train] Epoch [2/5], Training Loss: 9.0108, Validation Loss: 0.6164
[Train] Epoch [3/5], Training Loss: 8.7736, Validation Loss: 0.5697
[Train] Epoch [4/5], Training Loss: 8.7118, Validation Loss: 0.5651
[Train] Epoch [5/5], Training Loss: 7.9673, Validation Loss: 0.5799
[Test] Accuracy: 69.8545%, F1-score: 60.0812%

