**Yong Zhu Cheng A0275768H**

The objective of this project is to train a model that generates an empathetic response to a given prompt, leveraging the large EmpatheticDialogues dataset developed by Rashkin et al. (2018). The dataset consists of crowd-sourced one-on-one dialogues covering a range of emotions, with a human-annotated emotion label as a key feature of each dialogue.

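The models used are a BERT-based encoder-decoder (Model 1) and BART (`facebook/bart-base`, Model 2), each fine-tuned with the Hugging Face Transformers library.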

## 1. Init

In [1]:
import pandas as pd
import numpy as np
import math
import os
import tarfile
from tqdm.auto import tqdm
import torch
from transformers import BertTokenizer, BertGenerationEncoder, BertGenerationDecoder, EncoderDecoderModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import Seq2SeqTrainingArguments, Trainer
from datasets import Dataset
import evaluate
import warnings
import gc

# Project root on the author's machine. The working directory is switched to it
# so that the relative 'dataset\...' and 'models' paths used below resolve there.
main_path = r'C:\Users\yongz\NUS\DSA5202_Project'
os.chdir(main_path)
# 'cuda' when PyTorch can see a GPU, otherwise 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## 2. Data Preprocessing



The following cells, up to the indicated checkpoint, are only run once:

In [None]:
# [RUN ONCE] Extract the tar.gz archive
# data_path = r'dataset'
# file = tarfile.open('empatheticdialogues.tar.gz')
# if os.path.isdir(data_path) is False:
#     os.mkdir(data_path)
#     file.extractall(data_path)

In [None]:
# Load the raw train/test splits. Lines the CSV parser cannot read are skipped
# (on_bad_lines='skip') rather than raising.
# NOTE(review): skipped lines presumably leave some conversations with missing
# utterances, which is what process_df filters out afterwards - confirm.
train_df = pd.read_csv(r'dataset\empatheticdialogues\train.csv',on_bad_lines='skip')
test_df = pd.read_csv(r'dataset\empatheticdialogues\test.csv',on_bad_lines='skip')
# Summary statistics of the numeric columns, shown as the cell output.
train_df.describe()

In [None]:
# Preview the first rows of the raw training split.
train_df.head()

In [None]:
def get_prev_utterance(row, df):
    """Return the utterance that precedes ``row`` within its conversation.

    Args:
        row: Record exposing ``conv_id`` and ``utterance_idx`` attributes.
        df: Frame indexed by ``'<conv_id>_<utterance_idx>'`` labels that holds
            an ``utterance`` column.

    Returns:
        ``pd.NA`` for the opening utterance (``utterance_idx == 1``), which has
        no predecessor; otherwise the ``utterance`` value stored under the
        label of the previous turn.

    Raises:
        KeyError: If the previous turn is not present in ``df``.
    """
    # Guard clause: the first turn of a conversation has nothing before it.
    if row.utterance_idx == 1:
        return pd.NA

    previous_turn = int(row.utterance_idx) - 1
    previous_label = row.conv_id + '_' + str(previous_turn)
    return df.at[previous_label, 'utterance']

def process_df(df0, save_dir):
    """Turn raw dialogue rows into (previous utterance, reply) pairs on disk.

    Conversations whose ``utterance_idx`` values are not exactly ``1..n`` are
    discarded, since a missing turn would pair a reply with the wrong
    utterance. Every remaining utterance except the first of each conversation
    is then paired with the utterance that immediately precedes it.

    Args:
        df0: Raw frame with ``conv_id``, ``utterance_idx``, ``context``,
            ``prompt`` and ``utterance`` columns. It is not modified.
        save_dir: Path of the CSV file to write (written without the index).

    Returns:
        None. The CSV holds the columns ``context``, ``prompt``,
        ``utterance0`` (previous utterance) and ``utterance1`` (reply), in the
        row order of ``df0``.
    """
    df = df0.copy()

    # Completeness is decided per exact conv_id. Grouping avoids substring
    # matching on a concatenated key ('hit:0_conv:1' is also a prefix of
    # 'hit:0_conv:10_1') and replaces a full index scan per conversation.
    turns = df.groupby('conv_id')['utterance_idx']
    n_turns = turns.transform('count')
    is_complete = (
        (turns.transform('min') == 1)
        & (turns.transform('max') == n_turns)
        & (turns.transform('nunique') == n_turns)
    )
    df = df[is_complete]

    # Shift every utterance one turn forward so that joining on
    # (conv_id, utterance_idx) attaches it to the row that replies to it.
    previous = df[['conv_id', 'utterance_idx', 'utterance']].rename(
        columns={'utterance': 'utterance0'}
    )
    previous['utterance_idx'] = previous['utterance_idx'] + 1
    # A left merge keeps the original row order; the keys are unique after the
    # completeness filter, which validate= asserts.
    df = df.merge(
        previous,
        on=['conv_id', 'utterance_idx'],
        how='left',
        validate='one_to_one',
    )

    df = df[['context', 'prompt', 'utterance0', 'utterance']]
    df = df.rename(columns={'utterance': 'utterance1'})
    # Opening utterances have no predecessor and therefore form no pair.
    df = df.dropna(subset=['utterance0'])
    df.to_csv(save_dir, index=False)
    
# Write the processed utterance pairs for both splits next to the raw CSVs;
# these *_v1.csv files are what the "Resume from here" cell reads back.
process_df(train_df,r'dataset\empatheticdialogues\train_v1.csv')
process_df(test_df,r'dataset\empatheticdialogues\test_v1.csv')

Resume from here:

In [3]:
# Reload the processed utterance pairs saved as train_v1.csv / test_v1.csv.
train_df1 = pd.read_csv(r'dataset\empatheticdialogues\train_v1.csv')
test_df1 = pd.read_csv(r'dataset\empatheticdialogues\test_v1.csv')
# Names of the globals that the model cells delete before loading a new model.
obj_del = ['enc','dec','model','tokenizer']
# 'text' / 'label' are the column names the tokenize functions below read.
# NOTE(review): only train_df1 is renamed; test_df1 keeps utterance0/utterance1
# and would need the same rename before being passed to tokenize.
train_df1 = train_df1.rename(columns={'utterance0':'text','utterance1':'label'})
train_df1.head()

Unnamed: 0,context,prompt,text,label
0,sentimental,I remember going to the fireworks with my best...,I remember going to see the fireworks with my ...,Was this a friend you were in love with_comma_...
1,sentimental,I remember going to the fireworks with my best...,Was this a friend you were in love with_comma_...,This was a best friend. I miss her.
2,sentimental,I remember going to the fireworks with my best...,This was a best friend. I miss her.,Where has she gone?
3,sentimental,I remember going to the fireworks with my best...,Where has she gone?,We no longer talk.
4,sentimental,I remember going to the fireworks with my best...,We no longer talk.,Oh was this something that happened because of...


Model 1: BERT

In [None]:
# Model 1: encoder-decoder assembled from two copies of a pretrained BERT.
# BertTokenizer / BertGenerationEncoder / BertGenerationDecoder need a BERT
# checkpoint (WordPiece vocabulary and BERT weights); pointing them at a BART
# checkpoint leaves the model without usable pretrained weights.
model_checkpoint = 'bert-base-uncased'
# Drop objects left over from a previously loaded model.
for obj in obj_del:
    if obj in globals(): del globals()[obj]
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    tokenizer = BertTokenizer.from_pretrained(model_checkpoint,do_lower_case=True)
def tokenize(sample):
    """Tokenize a batch: 'text' becomes the encoder input, 'label' the target ids.

    Both sides are truncated/padded to exactly 200 tokens.
    """
    return tokenizer(sample['text'],text_target=sample['label'],max_length=200,padding='max_length',truncation=True)

core_ds = Dataset.from_pandas(train_df1[['text','label']])
# Replace the raw string columns with the tokenizer outputs.
core_ds = core_ds.map(tokenize,batched=True,remove_columns=core_ds.column_names)
# Fixed seed so the 80/20 split is reproducible.
core_ds = core_ds.train_test_split(test_size=0.2,seed=10)
train_ds = core_ds['train'].select(range(2000)) # for smaller subset
val_ds =  core_ds['test'].select(range(1000))
enc = BertGenerationEncoder.from_pretrained(model_checkpoint)
# 101 / 102 are the [CLS] / [SEP] ids of the bert-base-uncased vocabulary; they
# act as the begin/end-of-sequence markers for generation.
dec = BertGenerationDecoder.from_pretrained(model_checkpoint, add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)
model = EncoderDecoderModel(encoder=enc,decoder=dec)

Model 2: BART

In [5]:
# Model 2: the pretrained BART sequence-to-sequence checkpoint, used as is.
model_checkpoint = 'facebook/bart-base'

# Clear whatever a previously run model cell left in the notebook namespace.
for obj in obj_del:
    globals().pop(obj, None)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        do_lower_case=True,
    )


def tokenize(sample):
    """Encode a batch of rows for seq2seq training.

    ``sample['text']`` is tokenized as the model input and ``sample['label']``
    as the target; both are truncated or padded to exactly 200 tokens.
    """
    encoded = tokenizer(
        sample['text'],
        text_target=sample['label'],
        max_length=200,
        padding='max_length',
        truncation=True,
    )
    return encoded


core_ds = Dataset.from_pandas(train_df1[['text', 'label']])
# Swap the raw string columns for the tokenizer outputs.
core_ds = core_ds.map(
    tokenize,
    batched=True,
    remove_columns=core_ds.column_names,
)
# Seeded 80/20 split, then small fixed-size subsets to keep runs short.
core_ds = core_ds.train_test_split(test_size=0.2, seed=10)
train_ds = core_ds['train'].select(range(2000))
val_ds = core_ds['test'].select(range(1000))
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Map:   0%|          | 0/58770 [00:00<?, ? examples/s]

In [None]:
# Reset cell: discard the current model objects and reload fresh pretrained
# weights, reusing the already tokenized core_ds and the current
# model_checkpoint.
# Every name listed in obj_del is removed, and only `model` is recreated here.
for obj in obj_del:
    globals().pop(obj, None)

# Same small fixed-size subsets as in the model cell.
train_ds = core_ds['train'].select(range(2000))
val_ds = core_ds['test'].select(range(1000))
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [6]:
# Training setup shared by the model cells above: builds the training
# arguments, fixes up the special-token ids generation needs, and creates the
# `trainer` used by the evaluate/train cells below.
from transformers import Seq2SeqTrainer  # plain Trainer ignores predict_with_generate

model_dir = r'models'
os.makedirs(model_dir, exist_ok=True)
training_args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    disable_tqdm=False,
    num_train_epochs=5,
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; requesting it on CPU raises.
    fp16=torch.cuda.is_available(),
#     label_names=['utterance1']
) # finetune hp
training_args.set_logging(report_to=['tensorboard'])
training_args.set_dataloader(train_batch_size=2,eval_batch_size=2)

# The model reset cell removes `tokenizer`; it is needed again here to decode
# generated ids, so reload it from the active checkpoint when it is missing.
if 'tokenizer' not in globals():
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Start decoding from the BOS token. A config with no bos_token_id would leave
# this as None, so fall back to the tokenizer's [CLS] token in that case.
start_token_id = model.config.bos_token_id
if start_token_id is None:
    start_token_id = tokenizer.cls_token_id
model.config.decoder_start_token_id = start_token_id
# -100 is only the ignore index for *labels* in the loss; it is not a valid
# token id, so the model's pad token must stay the tokenizer's real pad id.
# https://discuss.huggingface.co/t/expected-workflow-100-and-padding-in-labels-in-seq2seq/27692
model.config.pad_token_id = tokenizer.pad_token_id
# generate() reads these ids from generation_config when the model has one.
generation_config = getattr(model, 'generation_config', None)
if generation_config is not None:
    generation_config.decoder_start_token_id = start_token_id
    generation_config.pad_token_id = tokenizer.pad_token_id
# NOTE(review): the tokenize cells pad labels with the pad token id, so padded
# positions still contribute to the loss; masking them with -100 would have to
# be done where the labels are built.

metric=evaluate.load('bleu')
def compute_metrics(eval_pred):
    """Compute BLEU between generated replies and the reference replies.

    Args:
        eval_pred: ``(predictions, label_ids)`` pair of token-id arrays as
            produced by the trainer with ``predict_with_generate=True``.

    Returns:
        The dict returned by the BLEU metric.
    """
    pred, labels = eval_pred
    if isinstance(pred, tuple):
        pred = pred[0]
    # -100 marks ignored/padded positions and cannot be decoded; map it back
    # to the pad token, which skip_special_tokens then removes.
    pad_id = tokenizer.pad_token_id
    pred = np.where(pred != -100, pred, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    pred_text = tokenizer.batch_decode(pred, skip_special_tokens=True)
    label_text = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # BLEU works on text, with a list of references per prediction.
    return metric.compute(predictions=pred_text, references=[[ref] for ref in label_text])
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics
)

In [None]:
# Run a garbage-collection pass before the evaluation.
gc.collect()
# Evaluate on the first 200 rows of the held-out split only.
val_ds2 = core_ds['test'].select(range(200))
trainer.evaluate(val_ds2)

In [None]:
# Fine-tune the model using the trainer configured above.
trainer.train()

In [None]:
# Design notes for the project, kept as a bare string literal (it has no
# effect when the cell runs).
'''
Naive approach:
Fine-tune pre-trained model on successive sentences (y_t-1 -> y_t)
Generate empathetic reply to a response

Limitations:
No nuance in representing empathy, e.g. different words, etc.
No theoretical framework

Scope for improvement:
Variables:
Decoder-only
W/ or w/o cross attention
W/ other information encoded
Hyperparameters
'''
# Presumably placed last so the notebook does not echo the string above as the
# cell's output - confirm.
no_print=True

## References

https://github.com/facebookresearch/EmpatheticDialogues