In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset

# Load the 'FRAMES' configuration of Salesforce/dialogstudio and keep only the
# 'log' field of its 'train' split. Each element of `dataset` is one dialogue:
# a sequence of turns read via the 'user utterance' / 'system response' keys.
dataset = load_dataset('Salesforce/dialogstudio', 'FRAMES')['train']['log']

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def parse_dialog_studio_sample(raw_sample):
    """Flatten one DialogStudio dialogue log into parallel utterance/speaker lists.

    Every turn of ``raw_sample`` is read through its ``'user utterance'`` and
    ``'system response'`` keys, in that order. A field equal to the empty
    string is skipped; any other value is emitted as-is.

    Args:
        raw_sample: iterable of turn mappings that contain both keys above.

    Returns:
        A ``(utterances, speakers)`` tuple of equal-length lists. The speaker
        id is 0 for a user utterance and 1 for a system response.

    Raises:
        KeyError: if a turn lacks one of the two expected keys.
    """
    # (field name, speaker id), listed in the order fields are emitted per turn.
    roles = (('user utterance', 0), ('system response', 1))

    utterances, speakers = [], []
    for turn in raw_sample:
        for field, speaker_id in roles:
            text = turn[field]
            if text != '':
                utterances.append(text)
                speakers.append(speaker_id)
    return utterances, speakers

# Parse the first dialogue of the dataset and echo the result as the cell output.
utterances, speakers = parse_dialog_studio_sample(dataset[0])
utterances, speakers

(["I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
  'Hi...I checked a few options for you, and unfortunately, we do not currently have any trips that meet this criteria.  Would you like to book an alternate travel option?',
  'Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900.',
  'I checked the availability for this date and there were no trips available.  Would you like to select some alternate dates?',
  'I have no flexibility for dates... but I can leave from Atlantis rather than Caprica. How about that?',
  'I checked the availability for that date and there were no trips available.  Would you like to select some alternate dates?',
  "I suppose I'll speak with my husband to see if we can choose other dates, and then I'll come back to you. Thanks for your help"],
 [0, 1, 0, 1, 0, 1, 0])

In [4]:
import os
import sys

# Directory added to sys.path so that the project-local `mylib` package can be
# imported. The original machine-specific location is kept as the default; set
# the DIALOGUE_AUGMENTATION_ROOT environment variable to run the notebook from
# a different checkout without editing this cell.
path_to_root = os.environ.get(
    'DIALOGUE_AUGMENTATION_ROOT',
    '/home/alekseev_ilya/dialogue-augmentation',
)

# Re-running this cell must not stack duplicate entries on sys.path.
if path_to_root not in sys.path:
    sys.path.insert(1, path_to_root)

from mylib.utils.data.dialogue_data_type import Dialogue

# Wrap the parsed utterances/speakers in the project's Dialogue container.
# NOTE(review): idx=7 looks like an arbitrary demo value (the sample is
# dataset[0], idx_within_source=0) — confirm it is intentional.
dia = Dialogue(
    idx=7,
    utterances=utterances,
    speakers=speakers,
    source_dataset_name='FRAMES',
    idx_within_source=0
)

In [5]:
# Show the Dialogue object's representation as the cell output.
dia

{ 'content': [ { 'speaker': 0,
                 'utterance': "I'd like to book a trip to Atlantis from "
                              'Caprica on Saturday, August 13, 2016 for 8 '
                              'adults. I have a tight budget of 1700.'},
               { 'speaker': 1,
                 'utterance': 'Hi...I checked a few options for you, and '
                              'unfortunately, we do not currently have any '
                              'trips that meet this criteria.  Would you like '
                              'to book an alternate travel option?'},
               { 'speaker': 0,
                 'utterance': 'Yes, how about going to Neverland from Caprica '
                              'on August 13, 2016 for 5 adults. For this trip, '
                              'my budget would be 1900.'},
               { 'speaker': 1,
                 'utterance': 'I checked the availability for this date and '
                              'there were no trips av

In [7]:
from transformers import AutoTokenizer


# Load the pretrained 'roberta-base' tokenizer and list its special tokens.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
tokenizer.all_special_tokens

['<s>', '</s>', '<unk>', '<pad>', '<mask>']

In [8]:
# Load the pretrained 'microsoft/mpnet-base' tokenizer and list its special
# tokens, for comparison with the special tokens of `tokenizer`.
tokenizer2 = AutoTokenizer.from_pretrained('microsoft/mpnet-base')
tokenizer2.all_special_tokens

['<s>', '</s>', '[UNK]', '<pad>', '<mask>']

In [9]:
# Encode a sample sentence with `tokenizer` to inspect its input_ids / attention_mask.
# NOTE(review): 'hellp' looks like a typo for 'hello'; left untouched because the
# recorded token ids depend on the exact string.
tokenizer('hellp world i will leave forever and die in your heart though')

{'input_ids': [0, 20030, 642, 232, 939, 40, 989, 6000, 8, 1597, 11, 110, 1144, 600, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
# Encode the same sample sentence with `tokenizer2` to compare its ids with `tokenizer`'s.
# NOTE(review): 'hellp' looks like a typo for 'hello'; left untouched because the
# recorded token ids depend on the exact string.
tokenizer2('hellp world i will leave forever and die in your heart though')

{'input_ids': [0, 3113, 2365, 2092, 1049, 2101, 2685, 5095, 2002, 3284, 2003, 2119, 2544, 2299, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}