In [1]:
import sys
sys.dont_write_bytecode = True

import numpy as np
import torch
import torch.nn as nn
from itertools import chain
import language_tool_python
from argparse import Namespace
from datasets import load_dataset, load_metric, DatasetDict, Dataset
from transformers import (
    AutoConfig, 
    AutoTokenizer, 
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator,
)

import time
from copy import deepcopy
from tqdm import tqdm
from typing import *
from DialogueAPI import dialogue

2023-01-06 13:23:12.897708: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-06 13:23:13.765212: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-06 13:23:13.765404: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


## Blended_Skill_Talk Dataset

In [2]:
# Load Blended Skill Talk and bind one name per split.
bst_dataset = load_dataset("blended_skill_talk")
train_dataset, eval_dataset, test_dataset = (
    bst_dataset[split_name] for split_name in ('train', 'validation', 'test')
)
print(bst_dataset)

Using custom data configuration default
Reusing dataset blended_skill_talk (/home/monkey/.cache/huggingface/datasets/blended_skill_talk/default/1.0.0/bded69fdeee98ed8bba2ef088ac9dfd74e9ad0b95b1de5d51e333cee6f6261aa)
100%|██████████| 3/3 [00:00<00:00, 123.03it/s]

DatasetDict({
    train: Dataset({
        features: ['personas', 'additional_context', 'previous_utterance', 'context', 'free_messages', 'guided_messages', 'suggestions'],
        num_rows: 4819
    })
    validation: Dataset({
        features: ['personas', 'additional_context', 'previous_utterance', 'context', 'free_messages', 'guided_messages', 'suggestions'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['personas', 'additional_context', 'previous_utterance', 'context', 'free_messages', 'guided_messages', 'suggestions'],
        num_rows: 980
    })
})





In [21]:
# Statistics: number of dialogue pairs (one per `free_messages` entry) in each split.
def count_dialogue_pairs(split):
    """Return the total number of `free_messages` entries over all rows of `split`.

    Args:
        split: an iterable of rows, each a mapping with a `free_messages` sequence.

    Returns:
        int: the sum of `len(row['free_messages'])` over every row.
    """
    return sum(len(row['free_messages']) for row in split)


train_num = count_dialogue_pairs(train_dataset)
eval_num = count_dialogue_pairs(eval_dataset)
test_num = count_dialogue_pairs(test_dataset)

print("#pairs of training dialogues: {}, validation dialogues: {}, test dialogues: {}".format(
    train_num, eval_num, test_num,
))

#pairs of training dialogues: 27018, validation dialogues: 5651, test dialogues: 5482


In [25]:
# Print every field of the first test row, with its length, for a quick look at the schema.
for i, instance in enumerate(test_dataset.select(range(1))):
    visible_fields = {
        name: field for name, field in instance.items() if name != 'label_candidates'
    }
    for name, field in visible_fields.items():
        print("{} ({}): {}".format(name, len(field), field))

personas (2): ['i hate talking to people.', 'i believe dragons are real.']
additional_context (14): Social anxiety
previous_utterance (2): ['Wow, I am never shy. Do you have anxiety?', "Yes. I end up sweating and blushing and feel like i'm going to throw up."]
context (19): wizard_of_wikipedia
free_messages (3): ['and why is that?', 'interesting but I know how you feel especially the whole people telling that it in your head ', "Dang that's though. But I also understand that. I have people some who talks behind my back because of certain things that I believe in "]
guided_messages (3): ["I think it's because in my head, I think everyone is judging me. I just start to sweat and I get sick in my stomach.", "I don't really have people telling me in my head, more like behind my back", 'Me too! What do you believe in? I believe in dragons... Just finished watching Game of Thrones. Man, those things are dope']
suggestions (3): {'convai2': ["i've no idea i am also very shy", 'oh i know . i al

#### Seq2Seq Model

In [14]:
# Settings for the seq2seq (BART) experiment; later cells read them via `data_args`.
data_args = Namespace(
    model_name_or_path="facebook/bart-base",
    # model_name_or_path="results/",
    max_source_length=256,
    max_target_length=256,
    pad_to_max_length=False,
    ignore_pad_token_for_loss=True,
    max_train_samples=None,
    preprocessing_num_workers=None,
    overwrite_cache=True,
    output_dir='results/bart',
)
# `False` leaves sequences unpadded at tokenization time.
padding = "max_length" if data_args.pad_to_max_length else False

# config = AutoConfig.from_pretrained(data_args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(data_args.model_name_or_path)
# The model must be loaded here: the following cells resize its embeddings and
# hand it to the data collator and the trainer, so leaving it undefined breaks
# a fresh top-to-bottom run (or silently reuses a model from another section).
model = AutoModelForSeq2SeqLM.from_pretrained(data_args.model_name_or_path)

Downloading: 100%|██████████| 1.72k/1.72k [00:00<00:00, 612kB/s]
Downloading: 100%|██████████| 899k/899k [00:00<00:00, 16.3MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 12.3MB/s]
Downloading: 100%|██████████| 1.36M/1.36M [00:00<00:00, 19.6MB/s]


In [117]:
# Register the markers used when building model inputs, one call per marker,
# then grow the embedding matrix to the new vocabulary size.
for marker in ('<PS>', '<CTX>', '<SEP>'):
    num_added_toks = tokenizer.add_tokens([marker], special_tokens=True)
model.resize_token_embeddings(len(tokenizer))

Embedding(50268, 768)

##### Tokenize dataset

In [17]:
def preprocess_bst(examples):
    """Turn one Blended Skill Talk dialogue into tokenized (history, response) pairs.

    For every turn, the input text is the persona/context prefix, the dialogue
    history so far, the tokenizer's EOS token and the current free message; the
    target text is the matching guided message. The history grows by both
    messages after each turn.

    Reads the module-level `tokenizer`, `data_args` and `padding`.

    Args:
        examples: a single dataset row (mapping) with the keys `personas`,
            `context`, `additional_context`, `previous_utterance`,
            `free_messages` and `guided_messages`.

    Returns:
        The tokenizer output for the input texts, with the tokenized targets'
        `input_ids` stored under the extra key `"labels"`.
    """
    persona_pieces = [f"<PS> {persona}" for persona in examples['personas'][:2]]
    if examples['context'] == "wizard_of_wikipedia":
        # The original template began with a stray '[' that was never closed.
        additional_context_pieces = [f"<CTX> {examples['additional_context']}. <SEP> "]
    else:
        additional_context_pieces = ["<SEP> "]
    context_prefix = ' '.join(persona_pieces + additional_context_pieces)

    # Work on a copy: extending the history must not mutate the caller's row.
    history = list(examples["previous_utterance"])
    inputs, labels = [], []
    for free_message, guided_message in zip(examples['free_messages'], examples['guided_messages']):
        previous_utterance = ' <SEP> '.join(history)
        # Input & Output
        text = context_prefix + previous_utterance + ' ' + tokenizer.eos_token + ' ' + free_message
        inputs.append(text)
        labels.append(guided_message)
        history += [free_message, guided_message]

    inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(labels, max_length=data_args.max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100
    # when we want to ignore padding in the loss.
    if padding == "max_length" and data_args.ignore_pad_token_for_loss:
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]
    inputs["labels"] = labels["input_ids"]
    return inputs


def group_texts(examples):
    """Flatten one level of nesting in every column of a batch.

    Each column holds, per row, a list of items; the result holds all those
    items concatenated in row order (e.g. for 'input_ids', 'attention_mask'
    and 'labels').
    """
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain.from_iterable(examples[column]))
    return flattened

In [19]:
column_names = train_dataset.column_names

# Optionally cap the number of training dialogues.
if data_args.max_train_samples is not None:
    train_dataset = train_dataset.select(range(data_args.max_train_samples))

# Options shared by every `map` call below.
shared_map_options = dict(
    num_proc=data_args.preprocessing_num_workers,
    load_from_cache_file=not data_args.overwrite_cache,
)

# First pass (row by row): build and tokenize the pairs of each dialogue.
# Second pass (batched): flatten so that each pair becomes its own row.
tokenized_train_dataset = train_dataset.map(
    preprocess_bst, batched=False, remove_columns=column_names, **shared_map_options
)
tokenized_train_dataset = tokenized_train_dataset.map(
    group_texts, batched=True, **shared_map_options
)

tokenized_eval_dataset = eval_dataset.map(
    preprocess_bst, batched=False, remove_columns=column_names, **shared_map_options
)
tokenized_eval_dataset = tokenized_eval_dataset.map(
    group_texts, batched=True, **shared_map_options
)

print(tokenized_train_dataset)
print(tokenized_eval_dataset)

##### Training

In [97]:
# Training configuration for the seq2seq model; evaluation runs once per epoch.
training_args = Seq2SeqTrainingArguments(
    output_dir=data_args.output_dir,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    metric_for_best_model="eval_bleu",
    greater_is_better=True, # the tracked metric is BLEU, so higher is better
    per_device_train_batch_size=10,
    per_device_eval_batch_size=20,
    gradient_accumulation_steps=20,
    num_train_epochs=30,
    predict_with_generate=True, # generation task
)

# Data collator
# Labels are padded with -100 when padding should be ignored by the loss,
# otherwise with the tokenizer's pad token id.
label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
if data_args.pad_to_max_length:
    # Inputs were already padded to a fixed length at tokenization time.
    data_collator = default_data_collator
else:
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8 if training_args.fp16 else None,
    )

# Metric
# `compute_metrics` below reads the "score" field of this metric's result.
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and labels.

    Each label is additionally wrapped in a one-element list, giving one list
    of references per prediction.
    """
    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(pred.strip())

    wrapped_labels = []
    for label in labels:
        wrapped_labels.append([label.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Reads the module-level `tokenizer` and `metric`, and calls `postprocess_text`.

    Args:
        eval_preds: a `(predictions, labels)` pair of token-id arrays;
            `predictions` may be a tuple whose first element holds the ids.

    Returns:
        dict with "bleu" (the metric's "score") and "gen_len" (mean number of
        non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 cannot be decoded. It may appear in the labels (ignore index) and in
    # the predictions as well (used as filler when batches of different lengths
    # are concatenated), so map it to the pad token in both arrays.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Initialize our Trainer
# Datasets are only attached when the matching do_train / do_eval flag is set;
# BLEU is only computed when predictions are produced by generation.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset if training_args.do_train else None,
    eval_dataset=tokenized_eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics if training_args.predict_with_generate else None,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [99]:
# Train from scratch (no checkpoint to resume from) and persist the result.
checkpoint = None

train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()  # Saves the tokenizer too for easy upload

# Record how many training rows were actually used alongside the train metrics.
metrics = train_result.metrics
num_train_rows = len(train_dataset)
max_train_samples = (
    num_train_rows if data_args.max_train_samples is None else data_args.max_train_samples
)
metrics["train_samples"] = min(max_train_samples, num_train_rows)

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

#### Causal Language Model (CLM), e.g., DialoGPT

In [3]:
# Settings for the causal-LM experiment. This rebinds `data_args`, `padding`,
# `tokenizer` and `model` that the seq2seq section defined earlier.
data_args = Namespace(
    # model_name_or_path="microsoft/DialoGPT-small",
    model_name_or_path="results/personagpt",
    # model_name_or_path="gpt2",
    max_length=1000,
    pad_to_max_length=False,
    ignore_pad_token_for_loss=True,
    max_train_samples=None,
    preprocessing_num_workers=None,
    overwrite_cache=True,
    output_dir='results/dialogpt',
    block_size=None,
)

max_length = data_args.max_length
# `False` leaves sequences unpadded at tokenization time.
padding = "max_length" if data_args.pad_to_max_length else False

# Config, tokenizer and weights are all loaded from the same path.
config = AutoConfig.from_pretrained(data_args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(data_args.model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(data_args.model_name_or_path, config=config)

In [4]:
# Inspect the special tokens of the tokenizer loaded above, plus the EOS and
# PAD tokens with their ids.
print(tokenizer.all_special_tokens)
print(tokenizer.eos_token, tokenizer.eos_token_id)
print(tokenizer.pad_token, tokenizer.pad_token_id)

['<|endoftext|>', '<PAD>', '<MASK>']
<|endoftext|> 50256
<PAD> 50257


In [20]:
def preprocess_bst(examples):
    """Turn one Blended Skill Talk dialogue into tokenized (history, response) pairs.

    For every turn, the input text is the persona/context prefix, the dialogue
    history so far, the tokenizer's EOS token and the current free message; the
    target text is the matching guided message. The history grows by both
    messages after each turn.

    Reads the module-level `tokenizer`, `data_args` and `padding`.

    Args:
        examples: a single dataset row (mapping) with the keys `personas`,
            `context`, `additional_context`, `previous_utterance`,
            `free_messages` and `guided_messages`.

    Returns:
        The tokenizer output for the input texts, with the tokenized targets'
        `input_ids` stored under the extra key `"labels"`.
    """
    persona_pieces = [f"<PS> {persona}" for persona in examples['personas'][:2]]
    if examples['context'] == "wizard_of_wikipedia":
        # The original template began with a stray '[' that was never closed.
        additional_context_pieces = [f"<CTX> {examples['additional_context']}. <SEP> "]
    else:
        additional_context_pieces = ["<SEP> "]
    context_prefix = ' '.join(persona_pieces + additional_context_pieces)

    # Work on a copy: extending the history must not mutate the caller's row.
    history = list(examples["previous_utterance"])
    inputs, labels = [], []
    for free_message, guided_message in zip(examples['free_messages'], examples['guided_messages']):
        previous_utterance = ' <SEP> '.join(history)
        # Input & Output
        text = context_prefix + previous_utterance + ' ' + tokenizer.eos_token + ' ' + free_message
        inputs.append(text)
        labels.append(guided_message)
        history += [free_message, guided_message]

    # NOTE(review): inputs and labels are tokenized independently, so their
    # lengths generally differ. A causal-LM loss presumably expects labels
    # aligned with `input_ids` -- confirm before training with this output.
    inputs = tokenizer(inputs, max_length=data_args.max_length, padding=padding, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(labels, max_length=data_args.max_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100
    # when we want to ignore padding in the loss.
    if padding == "max_length" and data_args.ignore_pad_token_for_loss:
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]
    inputs["labels"] = labels["input_ids"]
    return inputs


def group_texts(examples):
    """Concatenate the per-row lists of each column into one flat list per column.

    Applies to every key of the batch (e.g. 'input_ids', 'attention_mask',
    'labels'); only the outermost nesting level is removed.
    """
    return {
        name: [item for row_items in examples[name] for item in row_items]
        for name in examples.keys()
    }

In [21]:
column_names = train_dataset.column_names

# Optionally cap the number of training dialogues.
if data_args.max_train_samples is not None:
    train_dataset = train_dataset.select(range(data_args.max_train_samples))

# For each split: tokenize dialogue by dialogue, then flatten so that every
# pair becomes its own row. Train is processed before eval.
processed_splits = {}
for split_name, raw_split in (("train", train_dataset), ("eval", eval_dataset)):
    per_dialogue = raw_split.map(
        preprocess_bst,
        batched=False,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )
    per_pair = per_dialogue.map(
        group_texts,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )
    processed_splits[split_name] = (per_dialogue, per_pair)

tokenized_train_dataset, batched_train_dataset = processed_splits["train"]
tokenized_eval_dataset, batched_eval_dataset = processed_splits["eval"]
print(batched_train_dataset)
print(batched_eval_dataset)

100%|██████████| 4819/4819 [00:04<00:00, 1077.63ex/s]
100%|██████████| 5/5 [00:02<00:00,  1.70ba/s]
100%|██████████| 1009/1009 [00:01<00:00, 811.27ex/s]
100%|██████████| 2/2 [00:00<00:00,  2.91ba/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 27018
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5651
})





##### Training

In [80]:
# Training configuration for the causal LM; evaluation runs once per epoch.
training_args = TrainingArguments(
    output_dir=data_args.output_dir,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    metric_for_best_model="eval_bleu",
    greater_is_better=True, # the tracked metric is BLEU, so higher is better
    per_device_train_batch_size=10,
    per_device_eval_batch_size=20,
    gradient_accumulation_steps=20,
    num_train_epochs=30,
)

# Metric
# `compute_metrics` below reads the "score" field of this metric's result.
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    """Trim whitespace around every prediction and label.

    Labels come back as one-element lists, i.e. one reference list per
    prediction.
    """
    trimmed_preds = list(map(lambda text: text.strip(), preds))
    reference_lists = list(map(lambda text: [text.strip()], labels))
    return trimmed_preds, reference_lists

def compute_metrics(eval_preds):
    """Compute BLEU and the mean predicted length for one evaluation pass.

    Reads the module-level `tokenizer` and `metric`, and calls `postprocess_text`.

    Args:
        eval_preds: a `(predictions, labels)` pair of token-id arrays;
            `predictions` may be a tuple whose first element holds the ids.

    Returns:
        dict with "bleu" (the metric's "score") and "gen_len" (mean number of
        non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 cannot be decoded. It may appear in the labels (ignore index) and in
    # the predictions as well (used as filler when batches of different lengths
    # are concatenated), so map it to the pad token in both arrays.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to the highest-scoring id along the last dimension.

    When `logits` is a tuple (the model may return extra tensors such as
    past_key_values), only its first element is used. `labels` is accepted
    but not used.
    """
    token_scores = logits[0] if isinstance(logits, tuple) else logits
    return token_scores.argmax(dim=-1)

# Initialize our Trainer
# Train and evaluate on the tokenized, flattened datasets built above
# (`batched_*`), not on the raw splits, which still hold untokenized text.
# NOTE(review): `default_data_collator` does not appear to pad, while
# tokenization runs with `padding=False`; batches of unequal length will
# probably need a padding collator -- confirm before a full run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=batched_train_dataset if training_args.do_train else None,
    eval_dataset=batched_eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [100]:
# Train from scratch (no checkpoint to resume from) and persist the result.
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()  # Saves the tokenizer too for easy upload

# Record how many training rows were actually used alongside the train metrics.
metrics = train_result.metrics
max_train_samples = len(train_dataset)
if data_args.max_train_samples is not None:
    max_train_samples = data_args.max_train_samples
metrics["train_samples"] = min(max_train_samples, len(train_dataset))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

### Demo

In [18]:
# Demo setup: load the fine-tuned causal LM and build the dataset helper and
# the attacker around it. This rebinds `config`, `tokenizer`, `model` and
# `max_length` from the training sections above.
from DG_dataset import DGDataset
from attacker.DGSlow import StructureAttacker

device = torch.device('cpu')
instance = test_dataset[0]  # first test row is used as the demo dialogue
entry_idx = 0  # index of the turn within that dialogue
task = "clm"
sp_token = '<SEP>' # for clm
data_name = 'blended_skill_talk'
max_length = 128
num_beams = 4
num_beam_groups = 1
max_per = 1 # number of perturbations
model_path = 'results/dialogpt'
config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, config=config)

# The same `max_length` bounds both source and target lengths.
dg = DGDataset(
    dataset=data_name,
    task=task,
    tokenizer=tokenizer,
    max_source_length=max_length,
    max_target_length=max_length,
)
attacker = StructureAttacker(
    device=device,
    tokenizer=tokenizer,
    model=model,
    max_len=max_length,
    max_per=max_per,
    task=task,
    use_combined_loss=True,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 27.1MB/s]                    
Can not find mwt: default from of

In [20]:
# Build the context and the selected turn for the demo dialogue.
num_entries, total_entries, context, prev_utt_pc = dg.prepare_context(instance)
free_msg, guided_msg, orig_context, references = dg.prepare_entry(
    instance, 
    entry_idx, 
    context, 
    prev_utt_pc,
    total_entries,
)

# Original generation
# Prompt = context + separator token + user message (+ EOS for the causal LM).
text = orig_context + sp_token + free_msg
print("C--{}".format(orig_context))
print("U--{}".format(free_msg))
effective_text = text + tokenizer.eos_token # for clm
# effective_text = text
inputs = tokenizer(
    effective_text,  
    return_tensors="pt",
    truncation=True,
    max_length=max_length,
)
input_ids = inputs.input_ids
with torch.no_grad():
    outputs = dialogue(
        model, 
        input_ids,
        early_stopping=False, 
        num_beams=num_beams,
        num_beam_groups=num_beam_groups, 
        use_cache=True,
        max_length=max_length,
    )

# Decode only what follows the prompt: positions before `input_ids.shape[-1]`
# are sliced away.
output = tokenizer.batch_decode(
    outputs['sequences'][:, input_ids.shape[-1]:], 
    skip_special_tokens=True,
)[0]
print("G--{}".format(output))

# # Attack
# success, adv_his = attacker.run_attack(text, guided_msg)
# new_text = adv_his[-1][0]
# new_free_msg= new_text.split(sp_token)[1].strip()
# print("U'--{}".format(new_free_msg))
# cos_sim = attacker.sent_encoder.get_sim(new_free_msg, free_msg)

# effective_text = new_text + tokenizer.eos_token # for clm
# inputs = tokenizer(
#     effective_text,  
#     return_tensors="pt",
#     truncation=True,
#     max_length=max_length,
# )
# input_ids = inputs.input_ids
# with torch.no_grad():
#     outputs = dialogue(
#         model, 
#         input_ids,
#         early_stopping=False, 
#         num_beams=num_beams,
#         num_beam_groups=num_beam_groups, 
#         use_cache=True,
#         max_length=max_length,
#     )

# new_output = tokenizer.batch_decode(
#     outputs['sequences'][:, input_ids.shape[-1]:], 
#     skip_special_tokens=True,
# )[0]
# print("G'--{}".format(output))


C-- Wow, I am never shy. Do you have anxiety? Yes. I end up sweating and blushing and feel like i'm going to throw up.
U--and why is that?
G--I feel like I'm going to vomit. thar is a good reason to not drink!  is a good reason to not drink!  is a good reason to not drink!


## ConvAI2

In [26]:
# Load ConvAI2 and show its splits and features.
dataset = load_dataset("conv_ai_2")
print(dataset)

Found cached dataset conv_ai_2 (/home/monkey/.cache/huggingface/datasets/conv_ai_2/conv_ai_2/1.0.0/11d600ddce66bb9d07ca50d1b55b488145ef0d5d0206168c32f1043677875865)
100%|██████████| 1/1 [00:00<00:00, 280.89it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialog_id', 'dialog', 'bot_profile', 'user_profile', 'eval_score', 'profile_match'],
        num_rows: 3495
    })
})





In [14]:
# for i in range(1):
#     print(dataset['train'][i])
#     instance = dataset['train'][i]
#     user_profile = ' '.join([''.join(x) for x in instance['user_profile']])
#     print('user profile: ', user_profile)

#     persona_pieces = f"<PS> {user_profile}"
#     num_entries = len([x for x in instance['dialog'] if x['sender_class'] == 'Human'])
#     previous_utterance_pieces = [persona_pieces]

#     for entry_idx in range(num_entries):
#         bot_msg = instance['dialog'][entry_idx*2]['text']
#         human_msg = instance['dialog'][entry_idx*2+1]['text']
#         original_context = ' '.join(previous_utterance_pieces)
#         previous_utterance_pieces += [
#             bot_msg,
#             human_msg,
#         ]

#         text = original_context + '<EOS>' + bot_msg + '<EOS>' + human_msg
#         print("text: ", text)

## Empathetic Dialogues

In [28]:
# Load Empathetic Dialogues. This rebinds `dataset` and also `train_dataset`,
# `eval_dataset` and `test_dataset`, which previously held the Blended Skill
# Talk splits.
dataset = load_dataset("empathetic_dialogues")
print(dataset)
train_dataset = dataset['train']
eval_dataset = dataset['validation']
test_dataset = dataset['test']

Found cached dataset empathetic_dialogues (/home/monkey/.cache/huggingface/datasets/empathetic_dialogues/default/0.1.0/09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf)
100%|██████████| 3/3 [00:00<00:00, 47.57it/s]

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 76673
    })
    validation: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 12030
    })
    test: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 10943
    })
})





In [22]:
def group_texts(dataset):
    """Regroup utterance-level rows into one record per conversation.

    Note: this redefines the batch-flattening `group_texts(examples)` defined
    earlier in the notebook; the two take different inputs.

    A new conversation starts at the first row, at every row whose
    `utterance_idx` is 1, and whenever `conv_id` differs from that of the
    conversation being built. The last condition keeps a conversation that has
    no `utterance_idx == 1` row from being appended to the previous one (or
    from raising IndexError when it comes first).

    Args:
        dataset: an iterable of rows with the keys `conv_id`, `utterance_idx`,
            `prompt`, `context`, `utterance` and `speaker_idx`. Rows of one
            conversation are assumed to be contiguous.

    Returns:
        A `Dataset` with the columns `conv_id`, `prompt`, `dialog` and
        `context`; `dialog` is the ordered list of
        `{'text': ..., 'speaker_idx': ...}` turns.
    """
    results = {
        'conv_id': [],
        'prompt': [],
        'dialog': [],
        'context': [],
    }
    for instance in dataset:
        starts_new_conversation = (
            not results['conv_id']
            or instance['utterance_idx'] == 1
            or instance['conv_id'] != results['conv_id'][-1]
        )
        if starts_new_conversation:
            results['conv_id'].append(instance['conv_id'])
            results['dialog'].append([])
            results['prompt'].append(instance['prompt'])
            results['context'].append(instance['context'])

        response = {'text': instance['utterance'], 'speaker_idx': instance['speaker_idx']}
        results['dialog'][-1].append(response)

    return Dataset.from_dict(results)

In [23]:
# Group the test split by conversation, print the first conversation and
# display the resulting dataset summary.
grouped_test_dataset = group_texts(test_dataset)
print(grouped_test_dataset[0])
grouped_test_dataset

Dataset({
    features: ['conv_id', 'prompt', 'dialog', 'context'],
    num_rows: 2541
})

## PersonaChat

In [31]:
# Load PersonaChat from the Hub and show its splits and features.
dataset = load_dataset("AlekseyKorshuk/persona-chat")
print(dataset)

Using custom data configuration AlekseyKorshuk--persona-chat-2e840579d6707f7b
Found cached dataset parquet (/home/monkey/.cache/huggingface/datasets/AlekseyKorshuk___parquet/AlekseyKorshuk--persona-chat-2e840579d6707f7b/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00, 225.17it/s]

DatasetDict({
    validation: Dataset({
        features: ['personality', 'utterances'],
        num_rows: 1000
    })
    train: Dataset({
        features: ['personality', 'utterances'],
        num_rows: 17878
    })
})





In [16]:
# print(dataset['train'].column_names)
# print(dataset['train'][0]['personality'])
# print(dataset['train'][0]['utterances'])

In [17]:
import evaluate

# BLEU sanity check on a single lower-cased hypothesis/reference pair.
bleu = evaluate.load('bleu')
hypothesis = 'Yes, I have two daughters. I am a grandparent at 44.'
reference = "yes i have a son and just recently i became a grandpa"
predictions = [hypothesis.lower()]
references = [[reference.lower()]]
results = bleu.compute(predictions=predictions, references=references)
print(results)

{'bleu': 0.0, 'precisions': [0.35714285714285715, 0.07692307692307693, 0.0, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 1.1666666666666667, 'translation_length': 14, 'reference_length': 12}
