In [1]:
!pip install pytorch-lightning==1.6.5 spacy==2.2.4
!pip install pandas

You should consider upgrading via the '/Users/vitalii.mishchenko/Documents/experiments/2302-nlp-course/venv/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/vitalii.mishchenko/Documents/experiments/2302-nlp-course/venv/bin/python -m pip install --upgrade pip' command.[0m


In [26]:
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, random_split
from collections import Counter
import en_core_web_md
import numpy as np
import pytorch_lightning as pl
import spacy
import torch
import torch.nn.functional as F
import torchmetrics
import pandas as pd

In [27]:
loaded_spacy_model = en_core_web_md.load()

In [3]:
# Fix the random seed so that we get consistent results
torch.manual_seed(0)
np.random.seed(0)

In [4]:
import tarfile
import os
import csv

DIRECTORY_NAME="classification"
TRAIN_FILE="classification/empatheticdialogues/train.csv"
VALIDATION_FILE="classification/empatheticdialogues/valid.csv"
TEST_FILE="classification/empatheticdialogues/test.csv"


def download_dataset():
  """
  Download the dialog dataset. The tarball contains three files: train.csv, valid.csv, test.csv
  """

  # if running locally, install wget before by running: "brew install wget"
  # "!pip install wget" won't help as it installs python module not cli
  !wget 'https://dl.fbaipublicfiles.com/parlai/empatheticdialogues/empatheticdialogues.tar.gz'
  # !wget help
  if not os.path.isdir(DIRECTORY_NAME):
    !mkdir classification
  tar = tarfile.open('empatheticdialogues.tar.gz')
  tar.extractall(DIRECTORY_NAME)
  tar.close()

# Expensive operation so we should just do this once
download_dataset()

--2023-03-03 13:38:23--  https://dl.fbaipublicfiles.com/parlai/empatheticdialogues/empatheticdialogues.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28022709 (27M) [application/gzip]
Saving to: ‘empatheticdialogues.tar.gz’


2023-03-03 13:38:26 (16.0 MB/s) - ‘empatheticdialogues.tar.gz’ saved [28022709/28022709]



In [13]:
import glob
glob.glob(f"{DIRECTORY_NAME}/**/*.csv", recursive=True)

['classification/empatheticdialogues/valid.csv',
 'classification/empatheticdialogues/test.csv',
 'classification/empatheticdialogues/train.csv']

['classification/empatheticdialogues/valid.csv',
 'classification/empatheticdialogues/test.csv',
 'classification/empatheticdialogues/train.csv']

['classification/empatheticdialogues/valid.csv',
 'classification/empatheticdialogues/test.csv',
 'classification/empatheticdialogues/train.csv']

In [14]:
# See the parse_dataset function below for short explanation.
df = pd.read_csv(TRAIN_FILE, sep='\n', header=None)
df = df[0].str.split(',', expand=True)
new_header = df.iloc[0]
df = df[1:]
df.columns = new_header
df.head(5)

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
1,hit:0_conv:1,1,sentimental,I remember going to the fireworks with my best...,1,I remember going to see the fireworks with my ...,5|5|5_2|2|5,
2,hit:0_conv:1,2,sentimental,I remember going to the fireworks with my best...,0,Was this a friend you were in love with_comma_...,5|5|5_2|2|5,
3,hit:0_conv:1,3,sentimental,I remember going to the fireworks with my best...,1,This was a best friend. I miss her.,5|5|5_2|2|5,
4,hit:0_conv:1,4,sentimental,I remember going to the fireworks with my best...,0,Where has she gone?,5|5|5_2|2|5,
5,hit:0_conv:1,5,sentimental,I remember going to the fireworks with my best...,1,We no longer talk.,5|5|5_2|2|5,


In [15]:
# "sentimental" to "<id>"
label_to_integer = dict()
# "<id>" to "sentimental"
integer_to_label = dict()

for ix, label in enumerate(df["context"].unique()):
  label_to_integer[label] = ix
  integer_to_label[ix] = label

In [16]:
def parse_dataset(file_path, sample=5000):
  # Our dataset file is a csv with varying input lengths, therefore we load the
  # file at once, we have to split it up into separate steps:
  # 1. Read each row as a single column row
  df = pd.read_csv(file_path, sep = '\n', header = None)
  # 2. Split up each row into separate columns
  df = df[0].str.split(',', expand = True)
  # 3. Set the header by using the first row
  new_header = df.iloc[0]
  df = df[1:]
  df.columns = new_header

  # Machine learning cannot work with categorical labels like "surprised" or
  # "excited". Therefore, we convert these tokens into a number.
  df["target"] = df["context"].apply(lambda x: label_to_integer[x])
  df["feature"] = df["prompt"] + " " + df["utterance"]

  # We only need the column "**feature**" created from column
  # "**prompt**" + "**utterance**" and the column "**target**".
  return df[["target", "feature"]].sample(n = sample, random_state = 0).values

In [18]:
training_data = parse_dataset(TRAIN_FILE, sample = 40000)
validation_data = parse_dataset(VALIDATION_FILE, sample = 4000)
test_data = parse_dataset(TEST_FILE, sample = 4000)

print('Shape of training dataset: ({rows}, {cols})'.format(rows=len(training_data), cols=len(training_data[0])))
print('Shape of validation dataset: ({rows}, {cols})'.format(rows=len(validation_data), cols=len(validation_data[0])))
print('Shape of test dataset: ({rows}, {cols})'.format(rows=len(test_data), cols=len(test_data[0])))

Shape of training dataset: (40000, 2)
Shape of validation dataset: (4000, 2)
Shape of test dataset: (4000, 2)


In [19]:
class ClassificationDataset(Dataset):
  """Creates an pytorch dataset to consume our pre-loaded csv data

  Reference: https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
  """
  def __init__(self, data, vectorizer):
    self.dataset = data
    # Vectorizer needs to implement a vectorize function that returns vector and tokens
    # 🌟🌟🌟 Pay extra attention here since you'll have to work on this in the models 🌟🌟🌟
    self.vectorizer = vectorizer

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx):
    (label, sentence) = self.dataset[idx]
    sentence_vector, sentence_tokens = self.vectorizer.vectorize(sentence)
    return {
      "vectors": sentence_vector,
      "label": label,
      "tokens": sentence_tokens, # for debugging only
      "sentence": sentence # for debugging only
    }

In [20]:
class ClassificationDataModule(pl.LightningDataModule):
  """LightningDataModule: Wrapper class for the dataset to be used in training
  """
  def __init__(self, vectorizer, params):
    super().__init__()
    self.params = params
    self.classification_train = ClassificationDataset(training_data, vectorizer)
    self.classification_val = ClassificationDataset(validation_data, vectorizer)
    self.classification_test = ClassificationDataset(test_data, vectorizer)

  # Function to convert the input raw data from the dataset into model input.
  # 🌟🌟🌟 Pay extra attention here since you'll have to work on this in the models 🌟🌟🌟
  def collate_fn(self, batch):
    # Embedding layers need the inputs to be integer, so we need to add this special case here.
    if self.params.integer_input:
      word_vector = [torch.LongTensor(item["vectors"]) for item in batch]
      sentence_vector = pad_sequence(word_vector, batch_first=True, padding_value=0)
    else:
      sentence_vector = torch.stack([torch.Tensor(item["vectors"]) for item in batch])
    labels = torch.LongTensor([item["label"] for item in batch])
    return {"vectors": sentence_vector, "labels": labels, "sentences": [item["sentence"] for item in batch]}

  # Training dataloader .. will reset itself each epoch
  def train_dataloader(self):
    return DataLoader(self.classification_train, batch_size=self.params.batch_size, collate_fn=self.collate_fn)

  # Validation dataloader .. will reset itself each epoch
  def val_dataloader(self):
    return DataLoader(self.classification_val, batch_size=self.params.batch_size, collate_fn=self.collate_fn)

  # Test dataloader .. will reset itself each epoch
  def test_dataloader(self):
    return DataLoader(self.classification_test, batch_size=self.params.batch_size, collate_fn=self.collate_fn)

In [22]:
class EmotionClassifier(pl.LightningModule):
  def __init__(self, model, params):
    super().__init__()
    self.model = model
    self.params = params
    self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=params.num_classes)

  # will be called automatically by "training_step", "validation_step", etc.
  def forward(self, x):
    return self.model(x)

  def training_step(self, batch, batch_idx):
    x = batch["vectors"]
    y = batch["labels"]
    y_hat = self(x)
    loss = F.cross_entropy(y_hat, y, reduction='mean')
    self.log_dict(
      {'train_loss': loss},
      batch_size=self.params.batch_size,
      prog_bar=True
    )
    return loss

  def validation_step(self, batch, batch_nb):
    x = batch["vectors"]
    y = batch["labels"]
    y_hat = self(x)
    val_loss = F.cross_entropy(y_hat, y, reduction='mean')
    predictions = torch.argmax(y_hat, dim=1)
    self.log_dict(
      {
        'val_loss': val_loss,
        'val_accuracy': self.accuracy(predictions, y)
      },
      batch_size=self.params.batch_size,
      prog_bar=True
    )
    return val_loss

  def test_step(self, batch, batch_nb):
    x = batch["vectors"]
    y = batch["labels"]
    y_hat = self(x)
    test_loss = F.cross_entropy(y_hat, y, reduction='mean')
    predictions = torch.argmax(y_hat, dim=1)
    self.log_dict(
      {
        'test_loss': test_loss,
        'test_accuracy': self.accuracy(predictions, y)
      },
      batch_size=self.params.batch_size,
      prog_bar=True
    )
    return test_loss

  def predict_step(self, batch, batch_idx):
    y_hat = self.model(batch["vectors"])
    predictions = torch.argmax(y_hat, dim=1)
    return {'logits':y_hat, 'predictions': predictions, 'labels': batch["labels"], 'sentences': batch['sentences']}

  def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=self.params.learning_rate)
    return optimizer

In [23]:
def trainer(model, params, vectorizer):
  # Create a pytorch trainer
  trainer = pl.Trainer(max_epochs=params.max_epochs, check_val_every_n_epoch=1)

  # Initialize our data loader with the passed vectorizer
  data_module = ClassificationDataModule(vectorizer, params)

  # Instantiate a new model
  emotionClassifier = EmotionClassifier(model, params)

  # Train and validate the model
  trainer.fit(emotionClassifier, data_module.train_dataloader(), val_dataloaders=data_module.val_dataloader())

  # Test the model
  trainer.test(emotionClassifier, data_module.test_dataloader())

  # Predict on the same test set to show some output
  output = trainer.predict(emotionClassifier, data_module.test_dataloader())

  for i in range(2):
    print("-----------")
    print("Sentence: ", output[1]['sentences'][i])
    print("Predicted Emotion: ", integer_to_label[output[1]['predictions'][i].item()])
    print("Actual Label: ", integer_to_label[output[1]['labels'][i].item()])

In [24]:
class WordVectorClassificationModel(torch.nn.Module):
  def __init__(self, word_vec_dimension, num_classes):
    super().__init__()
    self.classes = num_classes
    self.linear_layer = torch.nn.Linear(word_vec_dimension, num_classes)

  # 🌟🌟🌟 Pay extra attention here since you'll have to work on this in the models 🌟🌟🌟
  def forward(self, batch):
    """Projection from word_vec_dim to n_classes

    Batch is of shape (batch_size, max_seq_len, word_vector_dim)
    """
    return self.linear_layer(batch)

In [28]:
class SpacyVectorizer:
  def vectorize(self, sentence):
    """
    Given a sentence, tokenize it and reference pre-trained word vector for each token.

    Returns a tuple of sentence_vector and list of text tokens
    """
    sentence_vector = []
    sentence_tokens = []
    # https://spacy.io/api/language#attributes
    spacy_doc = loaded_spacy_model.make_doc(sentence) ## I am Sourabh
    word_vector = [token.vector for token in spacy_doc] ## [ [Embedding of I], [Embedding of am], [Embedding of UNK]]
    sentence_tokens = list([token.text for token in spacy_doc]) # [[I], [am], [Sourabh]]
    sentence_vector = np.mean(np.array(word_vector), axis=0)
    return sentence_vector, sentence_tokens

In [29]:
class HParams:
  batch_size: int = 32
  integer_input: bool = False
  # cannot change, because en_core_web_md creates word vectors with 300 dimensions
  word_vec_dimension: int = 300
  num_classes: int = 32
  learning_rate: float = 0.001
  max_epochs: int = 4

trainer(
  model=WordVectorClassificationModel(HParams.word_vec_dimension,
                                      HParams.num_classes),
  params=HParams,
  vectorizer=SpacyVectorizer())

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /Users/vitalii.mishchenko/Documents/experiments/2302-nlp-course/src/week3/lightning_logs

  | Name     | Type                          | Params
-----------------------------------------------------------
0 | model    | WordVectorClassificationModel | 9.6 K 
1 | accuracy | MulticlassAccuracy            | 0     
-----------------------------------------------------------
9.6 K     Trainable params
0         Non-trainable params
9.6 K     Total params
0.039     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]



Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_accuracy         0.3267500102519989
        test_loss            2.456451892852783
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────




Predicting: 1250it [00:00, ?it/s]

-----------
Sentence:  When I got falsely accused of eating my roommate's ice cream last night_comma_ I was completely outraged! I don't even eat ice cream because I'm lactose intolerant_comma_ and she knows that! I wonder who actually ate it then.
Predicted Emotion:  guilty
Actual Label:  furious
-----------
Sentence:  I found out that my childhood cat passed away yesterday  Thats sad. I love cats
Predicted Emotion:  sad
Actual Label:  sad
