## save my precious data

In [4]:
from datasets import load_from_disk

back_translate = load_from_disk('data/augmented/back-translate-collected')
insert = load_from_disk('data/augmented/insert-collected')
replace = load_from_disk('data/augmented/replace-collected')

In [7]:
from datasets import DatasetDict

dataset = DatasetDict({
    'back_translate': back_translate,
    'insert': insert,
    'replace': replace
})

In [8]:
dataset.push_to_hub('voorhs/augmented')

Creating parquet from Arrow format: 100%|██████████| 451/451 [00:01<00:00, 286.60ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:23<00:00, 23.95s/it]
Creating parquet from Arrow format: 100%|██████████| 451/451 [00:01<00:00, 247.73ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:28<00:00, 28.70s/it]
Creating parquet from Arrow format: 100%|██████████| 451/451 [00:01<00:00, 253.06ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:26<00:00, 26.56s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/voorhs/augmented/commit/133f760cc905ecd231b2e606bb556289541fead6', commit_message='Upload dataset', commit_description='', oid='133f760cc905ecd231b2e606bb556289541fead6', pr_url=None, pr_revision=None, pr_num=None)

## testing multiwoz-1-domain

In [1]:
from mylib.datasets import MultiWOZServiceClfDataset

dataset = MultiWOZServiceClfDataset(path='data/multiwoz', split='train')
len(dataset)

8437

In [3]:
dataset = MultiWOZServiceClfDataset(path='data/multiwoz', split='validation')
len(dataset)

1000

In [2]:
dataset = MultiWOZServiceClfDataset(path='data/multiwoz-1-domain', split='train')
len(dataset)

3250

In [5]:
dataset = MultiWOZServiceClfDataset(path='data/multiwoz-1-domain', split='validation')
len(dataset)

209

## backbone list

In [1]:
hf_models = [
    'google-bert/bert-base-uncased',
    'Shitao/RetroMAE',
    'FacebookAI/roberta-base',
    'WhereIsAI/UAE-Large-V1'
]

In [3]:
from transformers import AutoTokenizer

for m in hf_models:
    tokenizer = AutoTokenizer.from_pretrained(m)
    print(m, tokenizer.all_special_tokens)


google-bert/bert-base-uncased ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
Shitao/RetroMAE ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
FacebookAI/roberta-base ['<s>', '</s>', '<unk>', '<pad>', '<mask>']
WhereIsAI/UAE-Large-V1 ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']


In [4]:
t = AutoTokenizer.from_pretrained('FacebookAI/roberta-base')
t.cls_token

'<s>'

## prune demo

In [4]:
import json

dias = json.load(open('data/augmented/prune-test/00000.json', 'r'))

In [7]:
dias[2]

{'content': [{'speaker': 0, 'utterance': 'Hello.'},
  {'speaker': 0,
   'utterance': 'I am looking for a flight from JFK to MSP, can you help me in booking?'},
  {'speaker': 0, 'utterance': 'Myself Katherine Jones.'},
  {'speaker': 0,
   'utterance': 'I would like to return my journey for the morning flight.'},
  {'speaker': 0,
   'utterance': "That's ok, no problem. Thank you for the information."}],
 'source_dataset_name': 'AirDialogue',
 'idx_within_source': 268933,
 'id': 443619}

## domain datasets

In [4]:
from datasets import load_from_disk

dataset = load_from_disk('data/benchmarks/SGD')
dataset

DatasetDict({
    train: Dataset({
        features: ['content', 'source_dataset_name', 'idx_within_source', 'services'],
        num_rows: 5403
    })
    validation: Dataset({
        features: ['content', 'source_dataset_name', 'idx_within_source', 'services'],
        num_rows: 836
    })
    test: Dataset({
        features: ['content', 'source_dataset_name', 'idx_within_source', 'services'],
        num_rows: 1806
    })
})

In [5]:
# from mylib.datasets import DomainDataset

# dataset = DomainDataset('data/benchmarks-one-domain-filtered/sgd', 'train')
# len(dataset)

In [6]:
dataset['train'][10]

{'content': [{'speaker': 0,
   'utterance': "I'm starving! Can you help me find a restaurant that serves Breakfast in Fairfield?"},
  {'speaker': 1,
   'utterance': "There is 1 restaurant that meets your needs. Mimi's Cafe in Fairfield serves breakfast."},
  {'speaker': 0, 'utterance': 'Is there a band playing there?'},
  {'speaker': 1,
   'utterance': "No, unfortunately, they don't offer live music."},
  {'speaker': 0, 'utterance': "That's okay. It will work anyway."},
  {'speaker': 1, 'utterance': 'Do you wish to make a reservation?'},
  {'speaker': 0,
   'utterance': "Yes, I'll need to make a reservation for a table for 2."},
  {'speaker': 1, 'utterance': 'What time will you be visiting?'},
  {'speaker': 0,
   'utterance': 'Please make it for half past 5 in the evening.'},
  {'speaker': 1,
   'utterance': "You'd like a table for 2 today at Mimi's Cafe in Fairfield at 5:30 pm. Is this correct?"},
  {'speaker': 0,
   'utterance': "I'm sorry, I need to change that. I'd like it on the 9

## fixing problems :(

In [1]:
from datasets import load_from_disk

dataset = load_from_disk('data/augmented/prune-collected')
dataset

Dataset({
    features: ['content', 'source_dataset_name', 'idx_within_source', 'id'],
    num_rows: 450750
})

In [5]:
import json

json.loads('[null, -Infinity]')

[None, -inf]

In [3]:
dataset['content']

['[{"speaker": 0, "utterance": "Hi. I am Dorothy Anderson."}, {"speaker": 1, "utterance": "Hello, how can I assist you today?"}, {"speaker": 0, "utterance": "I want to cancel my reservation as my lab exam got cancelled. Can you help me with cancellation?"}, {"speaker": 1, "utterance": "Sure, please wait for a moment."}, {"speaker": 0, "utterance": "Ok."}, {"speaker": 1, "utterance": "Sorry, no reservation found with your name."}]',
 '[{"speaker": 0, "utterance": "hi . can you please buy me movie tickets at camera 7 ?"}, {"speaker": 0, "utterance": "miss hokusai"}, {"speaker": 0, "utterance": "base_date_plus_0 . any time is ok ."}, {"speaker": 0, "utterance": "what is the weather like in san francisco ?"}, {"speaker": 0, "utterance": "4"}, {"speaker": 0, "utterance": "yes"}, {"speaker": 0, "utterance": "yes , that \'s right"}, {"speaker": 1, "utterance": "I just bought you 4 tickets to see Miss Hokusai at Camera 7 at 12:00 pm base_date_plus_0. Is there anything else I can do for you?"},

## filtering by source

In [1]:
from datasets import load_from_disk

path_in = 'data/train-unfair/trivial'
in_dataset = load_from_disk(path_in)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
in_dataset[0]

{'source_dataset_name': 'AirDialogue',
 'idx_within_source': 185688,
 'id': 360925,
 'pos': [{'augmentation': 'back-translate',
   'content': [{'speaker': 0, 'utterance': 'Hey.'},
    {'speaker': 1, 'utterance': 'Hey, what can I do for you?'},
    {'speaker': 0,
     'utterance': "I'm very happy to visit Ellenville Faw Ice Pavees in New York."},
    {'speaker': 1,
     'utterance': 'Of course, can I know the dates of the trip?'},
    {'speaker': 0, 'utterance': 'My trip to the 28th and 30th of April.'},
    {'speaker': 1, 'utterance': 'Do you have any contact restrictions?'},
    {'speaker': 0, 'utterance': "I'm fine with maximum communications."},
    {'speaker': 1, 'utterance': 'May I ask your name?'},
    {'speaker': 0, 'utterance': 'My name is Gary Evans.'},
    {'speaker': 1, 'utterance': 'Wait a minute, please.'},
    {'speaker': 0, 'utterance': 'Please make sure I want to leave tonight.'},
    {'speaker': 1,
     'utterance': 'Sorry, no flights with the dates you choose.'},
    

## torch max

In [1]:
import torch

a = torch.tensor([[1,2],[3,4]])

torch.max(a, dim=0)[0]

tensor([3, 4])

## testing datasets

In [27]:
from datasets import load_from_disk

dataset = load_from_disk('data/train-retromae/trivial')
len(dataset)

433340

In [28]:
dataset.unique('source_dataset_name')

['AirDialogue',
 'SimJointGEN',
 'Disambiguation',
 'ABCD',
 'MetaLWOZ',
 'Taskmaster1',
 'MS-DC',
 'KETOD',
 'FRAMES']

In [3]:
dataset[0]

([{'speaker': 0, 'utterance': 'Hello.'},
  {'speaker': 1, 'utterance': 'Hello, how may I help you?'},
  {'speaker': 0,
   'utterance': 'Can you please help me in reserving a flight ticket to ATL from JFK?'},
  {'speaker': 1, 'utterance': 'Sure, help me with the trip dates?'},
  {'speaker': 0, 'utterance': 'My trip dates are from Aug 25 to Aug 27.'},
  {'speaker': 1, 'utterance': 'Do you have any preferences?'},
  {'speaker': 0,
   'utterance': 'I need a business class ticket. I would like to travel in connecting flight.'},
  {'speaker': 1, 'utterance': 'May I know your name please?'},
  {'speaker': 0, 'utterance': 'I am George Turner.'},
  {'speaker': 1,
   'utterance': 'Sorry, we did not find any flights running in your desired routes.'},
  {'speaker': 0, 'utterance': 'Ok, no problem. Thank you.'},
  {'speaker': 1, 'utterance': 'Thank you for reaching us.'}],
 [{'speaker': 0, 'utterance': 'Hello.'},
  {'speaker': 1, 'utterance': 'Hello, how may I help you?'},
  {'speaker': 0,
   'utte

In [6]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    origs =  [o for o, p in batch]
    pos =  [p for o, p in batch]
    return origs, pos
    
loader = DataLoader(
    dataset=dataset,
    batch_size=3,
    shuffle=False,
    num_workers=0,
    collate_fn=collate_fn,
    drop_last=False
)

In [9]:
origs, pos = next(iter(loader))
len(origs), len(pos)

(3, 3)

In [10]:
origs[0]

[{'speaker': 0, 'utterance': 'Hello.'},
 {'speaker': 1, 'utterance': 'Hello, how may I help you?'},
 {'speaker': 0,
  'utterance': 'Can you please help me in reserving a flight ticket to ATL from JFK?'},
 {'speaker': 1, 'utterance': 'Sure, help me with the trip dates?'},
 {'speaker': 0, 'utterance': 'My trip dates are from Aug 25 to Aug 27.'},
 {'speaker': 1, 'utterance': 'Do you have any preferences?'},
 {'speaker': 0,
  'utterance': 'I need a business class ticket. I would like to travel in connecting flight.'},
 {'speaker': 1, 'utterance': 'May I know your name please?'},
 {'speaker': 0, 'utterance': 'I am George Turner.'},
 {'speaker': 1,
  'utterance': 'Sorry, we did not find any flights running in your desired routes.'},
 {'speaker': 0, 'utterance': 'Ok, no problem. Thank you.'},
 {'speaker': 1, 'utterance': 'Thank you for reaching us.'}]

In [11]:
pos[0]

[{'speaker': 0, 'utterance': 'Q Hello.'},
 {'speaker': 1, 'utterance': 'Hello, how may haps I help you?'},
 {'speaker': 0,
  'utterance': 'Can you please help me in reserving a flight ticket to ATL X from JFK?'},
 {'speaker': 1, 'utterance': 'Sure, help me with the trip dates?'},
 {'speaker': 0, 'utterance': 'My trip dates are from Aug 25 to Aug 27.'},
 {'speaker': 1, 'utterance': 'Do you have any preferences?'},
 {'speaker': 0,
  'utterance': 'I need a business class ticket. I would like to travel in connecting flight.'},
 {'speaker': 1, 'utterance': 'May may I know your name please?'},
 {'speaker': 0, 'utterance': 'Hello I am George Turner.'},
 {'speaker': 1,
  'utterance': 'Sorry, we did not find any flights running in your desired routes.'},
 {'speaker': 0, 'utterance': 'Q Ok, no problem. Â Thank you.'},
 {'speaker': 1, 'utterance': 'Â Thank you for reaching us.'}]

## dataset statistics

Описание одного датасета:
- число семплов
- сколько каких аугментаций
- распределение числа токенов в диалоге
- распределение числа утерансов в диалоге
- сколько отфильтровано для bert, roberta, retromae

In [30]:
from datasets import load_from_disk
from transformers import AutoTokenizer
from mylib.modeling.dialogue import BaselineDialogueEncoder


def get_aug_indicators(pos, target):
    augs = [aug['augmentation'] for aug in pos]
    return {t: int(t in augs) for t in target}


def n_tokens(pos, tokenizers):
    res = {}
    for name, tok in tokenizers.items():
        for aug in pos:
            input_ids = BaselineDialogueEncoder._tokenize(tokenizer, [aug['content']])['input_ids']
            res[f'n-toks-{name}-{aug["augmentation"]}'] = input_ids.shape[1]
    return res


def n_utterances(pos):
    res = {}
    for aug in pos:
        res[f'n-uts-{aug["augmentation"]}'] = len(aug['content'])
    return res


def make_report(pack):
    dataset = load_from_disk(f'data/train/{pack}').select(range(10))

    if pack == 'trivial':
        target = ['back-translate', 'insert', 'replace']
    elif pack == 'advanced':
        target = ['back-translate', 'insert', 'replace', 'prune', 'shuffle']
    elif pack == 'crazy':
        target = ['back-translate', 'insert', 'replace', 'prune', 'shuffle', 'back-translate-prune', 'prune-insert', 'prune-replace', 'shuffle-insert', 'shuffle-replace']
    else:
        raise ValueError('unknown pack')
    
    dataset = dataset.map(
        function=get_aug_indicators,
        fn_kwargs=dict(target=target),
        input_columns='pos'
    )
    
    dataset = dataset.map(
        function=n_tokens,
        fn_kwargs=dict(
            tokenizers={
                'bert': AutoTokenizer.from_pretrained('google-bert/bert-base-uncased'),
                'roberta': AutoTokenizer.from_pretrained('FacebookAI/roberta-base'),
                'retromae': AutoTokenizer.from_pretrained('Shitao/RetroMAE')
            }
        ),
        input_columns='pos'
    )

    dataset = dataset.map(function=n_utterances, input_columns='pos')
    dataset = dataset.remove_columns(['source_dataset_name', 'idx_within_source', 'id', 'pos', 'orig'])
    dataset.save_to_disk(f'data/reports/{pack}')


In [31]:
make_report('trivial')

Map: 100%|██████████| 10/10 [00:00<00:00, 94.81 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 499.20 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 10/10 [00:00<00:00, 846.62 examples/s]


In [32]:
dataset = load_from_disk('data/reports/trivial')
dataset

Dataset({
    features: ['back-translate', 'insert', 'replace', 'n-toks-bert-back-translate', 'n-toks-bert-insert', 'n-toks-bert-replace', 'n-toks-roberta-back-translate', 'n-toks-roberta-insert', 'n-toks-roberta-replace', 'n-toks-retromae-back-translate', 'n-toks-retromae-insert', 'n-toks-retromae-replace', 'n-uts-back-translate', 'n-uts-insert', 'n-uts-replace'],
    num_rows: 10
})

In [33]:
dataset[0]

{'back-translate': 1,
 'insert': 1,
 'replace': 1,
 'n-toks-bert-back-translate': 149,
 'n-toks-bert-insert': 167,
 'n-toks-bert-replace': 162,
 'n-toks-roberta-back-translate': 149,
 'n-toks-roberta-insert': 167,
 'n-toks-roberta-replace': 162,
 'n-toks-retromae-back-translate': 149,
 'n-toks-retromae-insert': 167,
 'n-toks-retromae-replace': 162,
 'n-uts-back-translate': 14,
 'n-uts-insert': 14,
 'n-uts-replace': 14}