Main NLP tasks
- [ ] Pre-Training and Representation Learning 
- [ ] Question Answering 
- [x] Summarization 
- [ ] Token Classification - (NER, POS etc)
- [ ] Text Classification - (Sentiment Analysis, Topic Modelling)
- [ ] Text Generation
- [ ] Conversational 
- [ ] Speech-to-text and text-to-speech
- [ ] Document Search and Indexing, Faiss, Annoy
- [ ] Object Detection / Segmentation
- [ ] Vision Transformers and Visual QA
- [ ] Biomedical Use Cases -> Molecule / Sequence

----  




- NLP -> TensorFlow -> AlphaFold -> GQL -> Twitter -> XAI


In [None]:
import random 

def recursive_inspect(obj, o=0, n=3):
  """Print a compact, tab-indented summary of a nested container.

  Dicts, lists and tuples are described by type and size, then at most ``n``
  randomly chosen entries are inspected recursively. Any other object is
  printed as ``str(obj)`` truncated to 1000 characters.

  Args:
    obj: Object to inspect.
    o: Current nesting depth; used as the number of leading tabs.
    n: Maximum number of keys/elements to show per container.
  """
  indent = '\t' * o
  print(indent + 'type:', type(obj))

  if isinstance(obj, dict):
    print(indent + 'number_of_keys:', len(obj))
    all_keys = list(obj)
    # random.sample draws without replacement, so no key is shown twice.
    keys = random.sample(all_keys, n) if len(all_keys) > n else all_keys
    print(indent + 'few_keys:', keys)
    for i, k in enumerate(keys):
      print(indent + f'{i}. key:', k)
      recursive_inspect(obj[k], o=o + 1, n=n)

  elif isinstance(obj, (list, tuple)):
    # Same as above: sample distinct positions rather than repeating elements.
    few_vals = random.sample(list(obj), n) if len(obj) > n else obj
    print(indent + 'number_of_elements:', len(obj))
    for i, v in enumerate(few_vals):
      print('\t' * (o + 1) + f'{i}. ')
      recursive_inspect(v, o=o + 1, n=n)

  ## TODO: for all iterable, apply this

  else:
    print(indent + 'truncated(1000) val:', str(obj)[:1000])

In [None]:
import google.colab.drive as drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder on the shared Google Drive; reachable once the drive is mounted at /content/drive.
drive_path = '/content/drive/Shareddrives/PERSONAL_DRIVE_ONE/'

# Local scratch directory for temporary files.
!mkdir -p /content/temp
temp_path = '/content/temp/'

In [None]:
from tqdm.notebook import tqdm 

## EXERCISE 1: Summarization
Let's start by training/fine-tuning a summarization model.

In [None]:
!mkdir -p {drive_path}summarization/data/downloaded_and_processed
!mkdir -p {drive_path}summarization/codes/notebooks

In [None]:
downloaded_and_processed_loc = f'{drive_path}summarization/data/downloaded_and_processed/'

In [None]:
! pip install transformers datasets --quiet
! pip install --upgrade nltk --quiet
! pip install rouge-score sentencepiece --quiet

[K     |████████████████████████████████| 2.8 MB 12.1 MB/s 
[K     |████████████████████████████████| 270 kB 47.1 MB/s 
[K     |████████████████████████████████| 52 kB 1.4 MB/s 
[K     |████████████████████████████████| 636 kB 50.6 MB/s 
[K     |████████████████████████████████| 3.3 MB 31.6 MB/s 
[K     |████████████████████████████████| 895 kB 17.2 MB/s 
[K     |████████████████████████████████| 243 kB 48.5 MB/s 
[K     |████████████████████████████████| 119 kB 50.7 MB/s 
[K     |████████████████████████████████| 1.3 MB 39.6 MB/s 
[K     |████████████████████████████████| 294 kB 50.8 MB/s 
[K     |████████████████████████████████| 142 kB 46.9 MB/s 
[K     |████████████████████████████████| 1.5 MB 12.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 18.6 MB/s 
[?25h

### Datasets

In [None]:
### Datasets 
import json 
import pandas as pd 
from datasets import load_dataset 

def read_json(path):
  """Load and return the JSON document stored at ``path``.

  The file is decoded as UTF-8 explicitly so the result does not depend on
  the platform's default text encoding.
  """
  with open(path, 'r', encoding='utf-8') as f:
    return json.load(f)

def convert_train_into_df(data):
  """Build a pandas DataFrame from a dataset's ``train`` split.

  ``data`` is either a container holding a ``'train'`` entry or the split
  itself. The split must expose ``column_names`` and support lookup of a
  whole column by name.
  """
  if 'train' in data:
    split = data['train']
  else:
    split = data

  columns = {}
  for name in split.column_names:
    columns[name] = split[name]
  return pd.DataFrame(columns)

def show_data(*data_dfs):
  """Print the shape and display up to five random rows of each DataFrame.

  Side effect: sets the pandas option ``display.max_colwidth`` to 200 so
  long text columns stay readable.

  Args:
    *data_dfs: DataFrames to preview.
  """
  pd.set_option('display.max_colwidth', 200)
  for df in data_dfs:
    print('shape of data:', df.shape)
    # DataFrame.sample raises when asked for more rows than exist, so cap
    # the preview size at the number of available rows.
    display(df.sample(min(5, len(df))))
    print()

# billsum = load_dataset('billsum', split='train')
# amazon = load_dataset('amazon_reviews_multi', split='train')
# xsum = load_dataset('xsum', split='train')

In [None]:
# xsum_df = convert_train_into_df(xsum)
# amz_df = convert_train_into_df(amazon)
# bsum_df = convert_train_into_df(billsum)

# show_data(bsum_df, amz_df, xsum_df)

In [None]:
# # saving these dfs 
# xsum_df.to_csv(downloaded_and_processed_loc+'xsum.csv', index=False)
# bsum_df.to_csv(downloaded_and_processed_loc+'billsum.csv', index=False)
# amz_df.to_csv(downloaded_and_processed_loc+'amazon_reviews.csv', index=False)

# xsum_df = pd.read_csv(downloaded_and_processed_loc+'xsum.csv')
# bsum_df = pd.read_csv(downloaded_and_processed_loc+'billsum.csv')
# amz_df = pd.read_csv(downloaded_and_processed_loc+'amazon_reviews.csv')

# show_data(bsum_df, amz_df, xsum_df)

In [None]:
# only_useful_cols

from hashlib import sha1 as hash_fn
def hash_it(x):
  """Return the first 10 hex characters of the SHA-1 digest of ``repr(x)``."""
  digest = hash_fn(repr(x).encode('UTF-8')).hexdigest()
  return digest[:10]


def hash_dfrow_fn(row):
  """Hash a row: its values are stringified, sorted and space-joined first."""
  parts = sorted(str(v) for v in row.values)
  return hash_it(' '.join(parts))

# print(hash_it('😋 Get Emoji — All Emojis to ✂️ Copy and 📋 Paste 👌') == "1e8f81135a")

def bill_sum_proc(bsum_df):
  """Reduce the BillSum frame to ``_id`` / ``_text`` / ``_summary`` columns.

  Columns are read by position as (text, summary, title). ``_text`` is the
  title and the text joined by a blank line, and ``_id`` is the row hash
  from ``hash_dfrow_fn``. The input frame is not modified.

  Args:
    bsum_df: DataFrame with exactly three columns in the order above.

  Returns:
    A new DataFrame with rows containing missing values dropped.
  """
  tqdm.pandas()
  # Keep the ids in their own Series instead of writing a new column into
  # the caller's DataFrame.
  row_ids = bsum_df.progress_apply(hash_dfrow_fn, axis=1)
  df_contents = [
      dict(_id=_id, _text=title + '\n\n' + text, _summary=summary)
      for (text, summary, title), _id in zip(bsum_df.values, row_ids)
  ]
  return pd.DataFrame(df_contents).dropna()

def amz_sum_proc(amz_df):
  """Reduce English Amazon reviews to ``_id`` / ``_text`` / ``_summary``.

  Only rows whose ``language`` column equals ``'en'`` are kept. Columns are
  read by position: the 5th is used as the text and the 6th as the summary.
  ``_id`` is the row hash from ``hash_dfrow_fn``. The input frame is not
  modified.

  Args:
    amz_df: DataFrame with exactly eight columns, including ``language``.

  Returns:
    A new DataFrame with rows containing missing values dropped.
  """
  amz_df_en = amz_df[amz_df.language == 'en']
  tqdm.pandas()
  # Keep the ids in their own Series: assigning a column on the filtered
  # slice triggers pandas' SettingWithCopyWarning.
  row_ids = amz_df_en.progress_apply(hash_dfrow_fn, axis=1)
  df_contents = []
  rows = zip(amz_df_en.values, row_ids)
  # zip has no length, so pass the total for the progress bar explicitly.
  for (_, _, _, _, text, summary, _, _), _id in tqdm(rows, total=len(amz_df_en)):
    df_contents.append(dict(_id=_id, _text=text, _summary=summary))
  return pd.DataFrame(df_contents).dropna()

def xsum_proc(xsum_df):
  """Reduce the XSum frame to ``_id`` / ``_text`` / ``_summary`` columns.

  Columns are read by position as (text, summary, <ignored>). ``_id`` is the
  row hash from ``hash_dfrow_fn``. The input frame is not modified.

  Args:
    xsum_df: DataFrame with exactly three columns in the order above.

  Returns:
    A new DataFrame with rows containing missing values dropped.
  """
  tqdm.pandas()
  # Keep the ids in their own Series instead of writing a new column into
  # the caller's DataFrame.
  row_ids = xsum_df.progress_apply(hash_dfrow_fn, axis=1)
  df_contents = []
  rows = zip(xsum_df.values, row_ids)
  # zip has no length, so pass the total for the progress bar explicitly.
  for (text, summary, _), _id in tqdm(rows, total=len(xsum_df)):
    df_contents.append(dict(_id=_id, _text=text, _summary=summary))
  return pd.DataFrame(df_contents).dropna()

In [None]:
# bsum_df = bill_sum_proc(bsum_df)
# amz_df = amz_sum_proc(amz_df)
# xsum_df = xsum_proc(xsum_df)


# !mkdir -p {downloaded_and_processed_loc}only_useful/
only_useful_path = downloaded_and_processed_loc + 'only_useful/'

# xsum_df.to_csv(only_useful_path+'xsum.csv', index=False)
# bsum_df.to_csv(only_useful_path+'billsum.csv', index=False)
# amz_df.to_csv(only_useful_path+'amazon_reviews.csv', index=False)

xsum_df = pd.read_csv(only_useful_path+'xsum.csv')
bsum_df = pd.read_csv(only_useful_path+'billsum.csv')
amz_df = pd.read_csv(only_useful_path+'amazon_reviews.csv')

In [None]:
show_data(xsum_df, bsum_df, amz_df)

shape of data: (204017, 3)


Unnamed: 0,_id,_text,_summary
76024,08df396fcd,The operation follows months of investigation into the Artemis brothel.\nBrothel managers are accused of evading some €17.5m (£14m; $19.7m) in social security payments since 2006.\nProsecutors all...,"Nine hundred police officers have taken part in a raid on one of Germany's biggest brothels in Berlin, making six arrests over alleged human trafficking and tax fraud."
158563,f05decaecf,"Historians said documents, released by the US National Archives, supported the suspicion that the US did not want to anger its wartime ally, Joseph Stalin.\nThey showed the US was sent coded messa...",New evidence appears to back the idea that the Roosevelt administration helped cover up Soviet guilt for the 1940 Katyn massacre of Polish soldiers.
123194,1bd4f072a0,"Gerard Freyne, who was in his 50s, was discovered outside an apartment block on Lord Edward Street on Wednesday evening and was taken to hospital.\nHe died on Friday.\nThe 32-year-old man was arre...",A 32-year-old man has been arrested by police following the death of a man who was found in Limerick city with head injuries earlier this week.
82462,6d2c417f99,"Defender Liam Cooper and winger Hadi Sacko may well feature, but full-back Charlie Taylor (Achilles) remains out.\nKyle Bartley, Liam Bridcutt and Chris Wood are among those who could be rested by...",Championship side Leeds United are likely to make changes to the side that beat Nottingham Forest on Wednesday.
104340,086f957a37,"Farman, 27, who joined from Gateshead in 2012, made 232 appearances in two spells but was out of contract following the end of the season.\nHe kept 22 clean sheets during Lincoln's National League...",Lincoln City goalkeeper Paul Farman has signed a two-year deal with the club.



shape of data: (18949, 3)


Unnamed: 0,_id,_text,_summary
11974,cb4a1c0db8,Pine Forest Range Recreation Enhancement Act of 2013\n\nSECTION 1. SHORT TITLE.\n\n This Act may be cited as the ``Pine Forest Range Recreation \nEnhancement Act of 2013''.\n\nSEC. 2. DEFINITIO...,"Pine Forest Range Recreation Enhancement Act of 2013 - Designates specified federal land managed by the Bureau of Land Management (BLM) in Humboldt County, Nevada, and to be known as the Pine Fore..."
5197,9f14f3f78e,"To amend the Federal Election Campaign Act of 1971 to reduce the amount that a nonparty multicandidate political committee may contribute to a candidate in a congressional election, and for other ...",Amends the Federal Election Campaign Act of 1971 to reduce the contribution that a multicandidate political committee may make to a congressional candidate. \nProhibits: (1) a congressional candi...
1527,edf064fa1f,To promote preventive health care for Americans.\n\nSECTION 1. SHORT TITLE.\n\n This Act may be cited as the ``Wellness and Prevention Act of \n2007''.\n\nSEC. 2. FINDINGS.\n\n The Congress ...,Wellness and Prevention Act of 2007 - Authorizes the Secretary of Health and Human Services to pay up to a specified amount of the medical education loans incurred by any physician who receives bo...
18921,3fbc7485dc,A bill to amend the Workforce Investment Act of 1998 to authorize the Secretary of Labor to provide for 5-year pilot projects to establish a system of industry-validated national certifications of...,Workforce Investment for Next-Generation Technologies Act - WING Act - Amends the Workforce Investment Act of 1998 to direct the Secretary of Labor to carry out up to twenty pilot projects to esta...
7797,87d2a207db,"A bill to amend the Energy Policy and Conservation Act to establish efficiency standards for bottle-type water dispensers, commercial hot food holding cabinets, and portable electric spas.\n\nSECT...","Amends the Energy Policy and Conservation Act to include bottle-type water dispensers and compartment bottle-type water dispensers, commercial hot food holding cabinets, and portable electric spas..."



shape of data: (199977, 3)


Unnamed: 0,_id,_text,_summary
124319,42c077d373,I really like the retro styled gown . Only thing is it’s having only net/mesh over shoulder . Which doesn’t suit me so I returned this otherwise the fall of material & dosing is really beautiful .,Beautiful
2489,e612d88941,I didn't work at all!,One Star
179605,92584bc8dd,"Great patio heater, very easy to follow instructions on how to put together and looks great.",Satisfied!
45212,9e2e2bbe81,"The safe look nice but it is cheaply made, i was expecting something much more heavier, it is super light. Also the safe arrived scratch and damaged on the corners",The safe look nice but it is cheaply made
127674,ab1d15a288,"Brutal , creepy and well scored. Art The Clown brings the pain .",Awesome and Brutal





In [None]:
# checking for collision 
# all_ids = [j for i in [xsum_df, bsum_df, amz_df] for j in i['_id'].values.tolist()] 
# unique_ids = set(list(all_ids))
# len(all_ids), len(unique_ids)

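### Metrics
- ROUGE-1, ROUGE-2, ROUGE-L {ROUGE-n scores n-gram overlap as precision, recall and F1; ROUGE-L uses the longest common subsequence}
- METEOR - ROUGE-like unigram matching plus a penalty for misaligned (fragmented) matches
https://en.wikipedia.org/wiki/METEOR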

In [None]:
# Load the two evaluation metrics used by compute_metrics and by the
# training config's `metric_fns`.
from datasets import load_metric
meteor_metric = load_metric("meteor")
# NOTE: "rogue" is a misspelling of ROUGE; the name is kept because the
# training config refers to `rogue_metric`.
rogue_metric = load_metric('rouge')

Downloading:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Downloading:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [None]:
# text1 = 'Hello mr. Dj, I am in a very bad mood.'.lower()
# text2 = 'The Girl is saying hello to Mr. DJ and telling the guy that she is in a very bad mood'.lower()
# text3 = 'so full of shit.'.lower()
# text4 = 'what a load of crap'.lower()

def compute_metrics(t1, t2):
  """Display METEOR and ROUGE scores for predictions ``t1`` vs references ``t2``.

  Args:
    t1: Prediction(s); a single value or a list/tuple of values.
    t2: Reference(s); a single value or a list/tuple of values.
  """
  # A list or tuple is treated as a batch; anything else is wrapped as a
  # one-element batch.
  t1 = list(t1) if isinstance(t1, (list, tuple)) else [t1]
  t2 = list(t2) if isinstance(t2, (list, tuple)) else [t2]
  display(meteor_metric.compute(predictions=t1, references=t2))
  display(rogue_metric.compute(predictions=t1, references=t2))
# cm = compute_metrics
# cm(text1, text2)
# cm(text2, text3)
# cm(text3, text4)

### Model

In [None]:
# TODO: Train using TPU
import torch 
device = 'cuda' if torch.cuda.is_available() else 'cpu'

from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
# Checkpoint name used for both the model weights and the tokenizer.
model_name = 't5-small'

# The model is not moved to `device` in this cell (the .to(device) call is commented out).
model = T5ForConditionalGeneration.from_pretrained(model_name) #.to(device)
tokenizer = T5Tokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
class CustomDataset(Dataset):
  """Torch dataset that tokenizes (text, summary) pairs on access.

  Args:
    df: DataFrame with ``_id``, ``_text`` and ``_summary`` columns.
    tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
      padding='max_length', truncation=True, return_tensors='pt')``. It must
      return a mapping with ``input_ids`` and ``attention_mask`` tensors
      that carry a leading batch dimension of size 1.
    config: Optional dict. ``input_maxlen`` (default 512) and
      ``output_maxlen`` (default 128) set the padded lengths of the text and
      summary encodings.
  """

  def __init__(self, df, tokenizer, config=None):
    self.data = df
    self._id = df._id.values
    self._summary = df._summary.values
    self._text = df._text.values
    self.tokenizer = tokenizer
    self.config = config or {}
    self.input_maxlen = self.config.get('input_maxlen', 512)
    self.output_maxlen = self.config.get('output_maxlen', 128)

  def __len__(self):
    return len(self._text)

  def _encode(self, text, max_length):
    """Tokenize ``text`` and return (input_ids, attention_mask) as 1-D tensors."""
    encoding = self.tokenizer(text,
                              max_length=max_length,
                              padding='max_length',
                              truncation=True,
                              return_tensors='pt')
    # squeeze(0) drops only the batch dimension; a bare squeeze() would also
    # collapse the sequence dimension when max_length == 1.
    return encoding['input_ids'].squeeze(0), encoding['attention_mask'].squeeze(0)

  def __getitem__(self, idx):
    """Return the encoder/decoder ids and attention masks for row ``idx``."""
    enc_input_ids, enc_attn_mask = self._encode(self._text[idx], self.input_maxlen)
    dec_input_ids, dec_attn_mask = self._encode(self._summary[idx], self.output_maxlen)

    return dict(
        enc_input_ids=enc_input_ids,
        enc_attn_mask=enc_attn_mask,
        dec_input_ids=dec_input_ids,
        dec_attn_mask=dec_attn_mask,
    )



In [None]:
# for sample  
df1 = xsum_df.sample(30)

cd1 = CustomDataset(df1, tokenizer)
len(cd1)

30

In [None]:
cd1[3]

{'dec_attn_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'dec_input_ids': tensor([  37,  502,   13,    3,    9, 7213,  120,    3, 1092, 1158,  113, 3977,
           13, 1874,  441,  477,   13,  284,  119,   43, 2471,    3,    9, 6419,
           13,   70, 1362,   31,  336, 4413,  544,    5,    1,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,  

### Training and Validation Functions

In [None]:
def train(epoch, model, tokenizer, dataloader, optimizer, train_config=None):
  """Run one training epoch of a seq2seq model over ``dataloader``.

  Args:
    epoch: Epoch index; only used in the log line.
    model: Seq2seq model called with ``input_ids``, ``attention_mask`` and
      ``labels``; its first output must be the loss.
    tokenizer: Tokenizer providing ``pad_token_id``.
    dataloader: Yields dicts with ``enc_input_ids``, ``enc_attn_mask`` and
      ``dec_input_ids`` tensors.
    optimizer: Optimizer stepped once per batch.
    train_config: Optional dict; reads ``device`` (default ``'cpu'``) and
      ``train_log_step`` (default 50).
  """
  model.train()
  # Read settings from the argument, not from a notebook-global `config`.
  train_config = train_config or {}
  device = train_config.get('device', 'cpu')
  log_step = train_config.get('train_log_step', 50)

  for i, data_batch in enumerate(tqdm(dataloader), 0):
    encoder_in = data_batch['enc_input_ids'].to(device)
    encoder_attn_mask = data_batch['enc_attn_mask'].to(device)

    # Labels are the full target sequence; padding is set to -100 so the
    # loss ignores it.
    labels = data_batch['dec_input_ids'].to(device).clone()
    labels[labels == tokenizer.pad_token_id] = -100

    # decoder_input_ids are deliberately not passed. Hugging Face seq2seq
    # models such as T5 derive them from `labels` by shifting right and
    # prepending the decoder start token. Slicing the targets by hand gave
    # the decoder no start token and never trained the first target token.
    output = model(input_ids=encoder_in, attention_mask=encoder_attn_mask,
                   labels=labels)

    loss = output[0]
    if i % log_step == 0:
      print(f'step:{i}, epoch:{epoch}, loss:{loss.item():.3f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  
def validate(epoch, model, tokenizer, dataloader, valid_config=None):
  """Generate summaries for every batch and collect them with the references.

  Args:
    epoch: Epoch index (currently unused).
    model: Model exposing ``generate``.
    tokenizer: Tokenizer used to decode generated and reference token ids.
    dataloader: Yields dicts with ``enc_input_ids``, ``enc_attn_mask`` and
      ``dec_input_ids`` tensors.
    valid_config: Optional dict; reads ``device``, ``metric_fns`` (name ->
      metric object with ``compute``), ``val_log_step`` and the generation
      settings ``max_gen_length``, ``num_beams``, ``length_penalty``,
      ``repetition_penalty`` and ``early_stopping``.

  Returns:
    Tuple ``(predictions, actuals)`` of decoded strings in matching order.
  """
  model.eval()
  # Read settings from the argument, not from a notebook-global `config`.
  valid_config = valid_config or {}
  device = valid_config.get('device', 'cpu')
  metric_fns = valid_config.get('metric_fns') or {}
  log_step = valid_config.get('val_log_step', 50)
  predictions, actuals = [], []

  def decode_all(sequences):
    return [tokenizer.decode(seq, clean_up_tokenization_spaces=True, skip_special_tokens=True)
            for seq in sequences]

  with torch.no_grad():
    for i, data_batch in enumerate(tqdm(dataloader), 0):
      enc_ids = data_batch['enc_input_ids'].to(device)
      enc_mask = data_batch['enc_attn_mask'].to(device)

      output = model.generate(input_ids=enc_ids, attention_mask=enc_mask,
                              max_length=valid_config.get('max_gen_length', 128),
                              num_beams=valid_config.get('num_beams', 2),
                              length_penalty=valid_config.get('length_penalty', 1.0),
                              repetition_penalty=valid_config.get('repetition_penalty', 2.5),
                              early_stopping=valid_config.get('early_stopping', True))

      predictions.extend(decode_all(output))
      # References are only decoded, so they can stay on the CPU.
      actuals.extend(decode_all(data_batch['dec_input_ids']))

      # Running metrics over everything decoded so far.
      if metric_fns and i % log_step == 0:
        metrics = {mf: metric_fns[mf].compute(predictions=predictions, references=actuals) for mf in metric_fns}
        print(metrics)

  return predictions, actuals


In [None]:
def get_dataloaders(data_df, tokenizer, sample=True, config=None):
  """Split ``data_df`` into train/validation sets and wrap them in DataLoaders.

  Args:
    data_df: DataFrame with ``_id``, ``_text`` and ``_summary`` columns.
    tokenizer: Tokenizer handed to ``CustomDataset``.
    sample: When true, work on a random subset of at most
      ``train_sample_size`` rows (default 2000) drawn with ``random_seed``
      (default 8). When false, rows keep their existing order.
    config: Optional dict; also reads ``train_fraction`` (default 0.8),
      ``train_batch_size`` (default 16) and ``valid_batch_size``
      (default 32), and is passed on to ``CustomDataset``.

  Returns:
    Tuple ``(train_dataloader, val_dataloader)``.
  """
  config = config or {}
  if sample:
    # DataFrame.sample raises when asked for more rows than exist, so cap
    # the sample size at the size of the frame.
    sample_size = min(config.get('train_sample_size', 2000), len(data_df))
    data_df = data_df.sample(sample_size,
                             random_state=config.get('random_seed', 8)).copy(deep=True)

  train_size = int(len(data_df) * config.get('train_fraction', 0.8))
  train_df = data_df.iloc[:train_size, :]
  val_df = data_df.iloc[train_size:, :]

  train_set = CustomDataset(train_df, tokenizer=tokenizer, config=config)
  val_set = CustomDataset(val_df, tokenizer=tokenizer, config=config)

  train_dataloader = DataLoader(train_set, batch_size=config.get('train_batch_size', 16), shuffle=True, num_workers=0)
  # Validation does not need shuffling; a fixed order keeps runs comparable.
  val_dataloader = DataLoader(val_set, batch_size=config.get('valid_batch_size', 32), shuffle=False, num_workers=0)

  return train_dataloader, val_dataloader  #, dict(train_set=train_set, val_set=val_set)

### Main

In [None]:
# Single source of settings for data loading, training, validation and generation.
config = dict(
    train_batch_size=16,
    valid_batch_size=32,
    learning_rate=3e-05,

    input_maxlen=512,
    output_maxlen=128,

    device=device,

    metric_fns=dict(
        rogue_metric=rogue_metric,
        meteor_metric=meteor_metric
    ),
    max_gen_length=128,
    num_beams=2,

    train_sample_size=10000,
    random_seed=8,
    epochs=10,
    train_log_step=20,
    val_log_step=5,
)

train_dataloader, val_dataloader = get_dataloaders(xsum_df, tokenizer, sample=True, config=config)

# The batches are moved to config['device'], so the model must live there
# too. Move it before the optimizer is built from its parameters.
model = model.to(config.get('device', 'cpu'))

# Adam's weight_decay is an L2 penalty coefficient. The previous default of
# 0.99 shrinks the weights far too aggressively; default to no decay unless
# the config sets one.
optimizer = torch.optim.Adam(params=model.parameters(),
                             lr=config.get('learning_rate', 3e-04),
                             weight_decay=config.get('weight_decay', 0.0))

epoch_wise_metrics = []
metric_fns = config.get('metric_fns', None) or {}

for epoch in range(config.get('epochs', 10)):
  train(epoch, model, tokenizer, train_dataloader, optimizer, train_config=config)
  predictions, actuals = validate(epoch, model, tokenizer, val_dataloader, valid_config=config)

  # Final metrics for the epoch over the whole validation set.
  metrics = {mf: metric_fns[mf].compute(predictions=predictions, references=actuals) for mf in metric_fns}
  epoch_wise_metrics.append(metrics)

  print(f'epoch={epoch}')
  print(metrics)
  

In [None]:
# Save the model and the tokenizer into the same folder under the Drive path.
trained_model_path = f'{drive_path}summarization/models/t5-small-xsum/'
!mkdir -p {trained_model_path}
model.save_pretrained(trained_model_path)
tokenizer.save_pretrained(trained_model_path)

# model1 = transformers.T5ForConditionalGeneration.from_pretrained(trained_model_path)
# tokenizer1 = T5Tokenizer.from_pretrained(trained_model_path)

In [None]:
# model.state_dict()

In [None]:
# saving and loading model

# torch.save(model.state_dict(), '/content/temp/summarization_model1.pt')
# model1 = T5ForConditionalGeneration.from_pretrained('t5-small')
# model1.load_state_dict(torch.load('/content/temp/summarization_model1.pt'))

In [None]:
# model.save_pretrained('/content/temp/t5-small-xsum')
# tokenizer.save_pretrained('/content/temp/t5-small-xsum')
# model1 = transformers.T5ForConditionalGeneration.from_pretrained('/content/temp/t5-small-xsum')
# tokenizer1 = T5Tokenizer.from_pretrained('/content/temp/t5-small-xsum')

### Inferencing Function

In [None]:
from transformers import pipeline

# Build the summarization pipeline from the in-memory model and tokenizer.
# Its definition was commented out, so `summarizer` was undefined on a fresh
# run. device=0 selects the first GPU, -1 the CPU.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework='pt',
                      device=0 if torch.cuda.is_available() else -1)

example_texts = xsum_df._text.values[:3].tolist()

# NOTE(review): min_length=100 / max_length=500 are well beyond the 128-token
# targets used in training; consider aligning them with config['max_gen_length'].
summarizer(example_texts, min_length=100, max_length=500)

[{'summary_text': "- 'e a '' ' . ', , 's 'n' , in 'a , the 'i 't' sa s' es s -s ad 'd's, sts d''e' -d'ea ds ed ss t sy dd te d ."},
 {'summary_text': "s . 'es ' ''s , 's' e' a 'i 'a e 't ', ., , on s'' , and 'c' - 'en' d' .' sy . it's on 'd 'n's, sa's in s. .s ad's and sns -'s the , to ."},
 {'summary_text': "to the 's ' ''' - a 'd' . 'e' sy , 'a' a's'', s' es d's, ad 'i' d s sa ds as et 't st dt -s . it's in se ss res ta .s n '"}]

## EXERCISE 2: Conversational

### Exploring Datasets

In [None]:
! pip install transformers datasets --quiet

[K     |████████████████████████████████| 2.8 MB 4.3 MB/s 
[K     |████████████████████████████████| 270 kB 49.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 34.9 MB/s 
[K     |████████████████████████████████| 895 kB 41.9 MB/s 
[K     |████████████████████████████████| 636 kB 43.0 MB/s 
[K     |████████████████████████████████| 52 kB 1.4 MB/s 
[K     |████████████████████████████████| 243 kB 43.8 MB/s 
[K     |████████████████████████████████| 1.3 MB 35.9 MB/s 
[K     |████████████████████████████████| 123 kB 39.8 MB/s 
[K     |████████████████████████████████| 142 kB 40.7 MB/s 
[K     |████████████████████████████████| 294 kB 50.5 MB/s 
[?25h

### Datasets description
PEC - Persona-based Empathetic Conversation dataset (2020)

In [None]:
import pandas as pd 
from datasets import load_dataset 
pd.set_option('display.max_colwidth', 200)
from tqdm.notebook import tqdm 

def hf_dataset2df(hfds):
  """Convert a Hugging Face dataset, or a dict of splits, into one DataFrame.

  Args:
    hfds: Either a single split exposing ``to_pandas()``, or a dict-like
      mapping of split name -> split (e.g. a ``DatasetDict``).

  Returns:
    For a single split, the result of ``to_pandas()``. For a mapping, the
    splits' frames concatenated in mapping order with an added ``split``
    column holding the split name; each split keeps its own row index.
  """
  # Test the container type rather than `'train' in hfds`: on a single split
  # that membership test iterates over every row, and a mapping without a
  # 'train' split would wrongly be treated as a single split.
  if isinstance(hfds, dict):
    dfs = []
    for split, split_ds in hfds.items():
      df = split_ds.to_pandas()
      df['split'] = split
      dfs.append(df)
    return pd.concat(dfs)

  return hfds.to_pandas()
  

#### pec

In [None]:


pec = load_dataset('pec', 'all')
# taskmaster = load_dataset('taskmaster3')
# cques = load_dataset('conv_questions')
# craiglist = load_dataset('craigslist_bargains')

In [None]:
pec_df = hf_dataset2df(pec['test'])
pec_df.head()

In [None]:
pec_df[pec_df.context_speakers.apply(lambda x: len(x)>1)].head()

Unnamed: 0,personas,context,context_speakers,response,response_speaker
11,[i had a delicious filet mignon steak the othe...,[i did it ! ! i placed second in the national ...,"[CycloneCowgirl, ichigoli, CycloneCowgirl]",probably could have gotten first if you got th...,haiduz
14,"[i look about 15yrs younger ., i have never ex...",[i did it ! ! i placed second in the national ...,"[CycloneCowgirl, loudtoys, CycloneCowgirl]",indeed it is .,loudtoys
25,"[i have five myself !, i ’ll have to get help ...",[i did it ! ! i placed second in the national ...,"[CycloneCowgirl, ichigoli, CycloneCowgirl]",wait so are you saying that the second tornado...,exotics
28,[i 'm the one who posted it in the first place...,[i did it ! ! i placed second in the national ...,"[CycloneCowgirl, Rianne764, Tito1337]",no escape from ~~that fucking tornado if you d...,red498cp_
29,[i did n't see the man at first and thought th...,[i did it ! ! i placed second in the national ...,"[CycloneCowgirl, Rianne764]",caught in a landslide,Tito1337


In [None]:
pec_df.personas.apply(lambda x: ' '.join(x)).unique().shape, pec_df.response_speaker.unique().shape

((29504,), (29504,))

In [None]:
## pseudo data model
# data = [dict(contexts=[dict(text=text, speaker=speaker), ..., ], response=response, speaker=r_speaker) for i in data]
# persona = [dict(speaker=speaker, comments=comments)]

#### cques

In [None]:
# taskmaster = load_dataset('taskmaster3')
cques = load_dataset('conv_questions')
# craiglist = load_dataset('craigslist_bargains')

Downloading:   0%|          | 0.00/2.94k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset conv_questions/default (download: 3.12 MiB, generated: 5.73 MiB, post-processed: Unknown size, total: 8.85 MiB) to /root/.cache/huggingface/datasets/conv_questions/default/1.0.0/1c125aaecfbe2337113d4793c8b06fbf369635a61400d2c236e5c035ed418fba...


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/594k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/542k [00:00<?, ?B/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset conv_questions downloaded and prepared to /root/.cache/huggingface/datasets/conv_questions/default/1.0.0/1c125aaecfbe2337113d4793c8b06fbf369635a61400d2c236e5c035ed418fba. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
cques

DatasetDict({
    train: Dataset({
        features: ['domain', 'seed_entity', 'seed_entity_text', 'questions', 'answers', 'answer_texts'],
        num_rows: 6720
    })
    validation: Dataset({
        features: ['domain', 'seed_entity', 'seed_entity_text', 'questions', 'answers', 'answer_texts'],
        num_rows: 2240
    })
    test: Dataset({
        features: ['domain', 'seed_entity', 'seed_entity_text', 'questions', 'answers', 'answer_texts'],
        num_rows: 2240
    })
})

In [None]:
cques_df = hf_dataset2df(cques)
cques_df.sample(5)

Unnamed: 0,domain,seed_entity,seed_entity_text,questions,answers,answer_texts,split
120,books,https://www.wikidata.org/wiki/Q8337,Harry Potter,"[Who is the author of Harry Potter?, How many parts?, 6th part?, Start of series US?, Birth place rowling?]","[[https://www.wikidata.org/wiki/Q34660], [7], [https://www.wikidata.org/wiki/Q46887], [1 September 1998], [https://www.wikidata.org/wiki/Q1025095]]","[Joanne K. Rowling, 7, Harry Potter and the Half-Blood Prince, 1 September 1998, Yate]",validation
1496,music,https://www.wikidata.org/wiki/Q184827,Tool,"[Who is the lead vocalist in the rock band Tool?, What's the drummer's name?, What was the year that they began performing together?, In was city was the band founded?, Which member of the band wa...","[[https://www.wikidata.org/wiki/Q311621], [https://www.wikidata.org/wiki/Q447698], [1990], [https://www.wikidata.org/wiki/Q65], [https://www.wikidata.org/wiki/Q526330]]","[Maynard James Keenan, Danny Carey, 1990, Los Angeles, Justin Chancellor]",validation
311,music,https://www.wikidata.org/wiki/Q125603,The Clash,"[What is the name of the lead signer of The Clash?, And where was he born?, What was his real name?, What was their first album titled?, How many people were originally in the band?]","[[https://www.wikidata.org/wiki/Q310052], [https://www.wikidata.org/wiki/Q3640], [https://www.wikidata.org/wiki/Q310052], [https://www.wikidata.org/wiki/Q775208], [4]]","[Joe Strummer, Ankara, Turkey, John Graham Mellor, The Clash, 4]",test
6668,movies,https://www.wikidata.org/wiki/Q193066,Breakfast at Tiffany's,"[In which movie do Audrey Hepburn and George Peppard appear in?, Who directed the film?, Is the film based on a novel?, What is the name of the author that wrote the book?, Was he involved in writ...","[[https://www.wikidata.org/wiki/Q193066], [https://www.wikidata.org/wiki/Q56093], [Yes], [https://www.wikidata.org/wiki/Q134180], [Yes]]","[Breakfast at Tiffany's, Blake Edwards, Yes, Truman Capote, Yes]",train
2038,music,https://www.wikidata.org/wiki/Q203871,Rush,"[Where was Rush originally from?, Who played guitar in the band?, What is his birthday?, Has he won any awards?, Who played bass with him?]","[[https://www.wikidata.org/wiki/Q203871], [https://www.wikidata.org/wiki/Q348177], [27 August 1953], [https://www.wikidata.org/wiki/Q15278116], [https://www.wikidata.org/wiki/Q320895]]","[Rush was originally from Canada., Alex Lifeson, 27 August 1953, He has received the Officer of the Order of Canada., Geddy Lee was the bass player.]",validation


In [None]:
cques_df[['questions', 'answer_texts']].sample(5)

Unnamed: 0,questions,answer_texts
151,"[Who was the author of Fight Club?, Where was this movie filmed?, How long is movie?, Who played Tyler Durden?, Date of publication?]","[Chuck Palahniuk, Los Angeles, 139 minute, Brad Pitt, 11 November 1999]"
282,"[What's the name of the league that Manchester United is a member of?, where do they play their home games?, who was the coach in 2017?, what song do their fans sing?, when was their first FA cup ...","[Premier League, Old Trafford, José Mourinho, Glory Glory, 1909]"
918,"[When was the author Stephen King born?, What book did Stephen King write about a killer clown?, What is the title of the first book in The Dark Tower series?, What short story of him was turned i...","[21 September 1947, IT, The Dark Tower: The Gunslinger, The Running Man, Charlie the Choo-Choo: From the World of The Dark Tower]"
3972,"[What novel has the character named Nick Carraway?, Who wrote the book?, Was this his first novel?, What was the name of the author's wife?, What was the name of his child?]","[The Great Gatsby, F. Scott Fitzgerald, No, Zelda Fitzgerald, Frances Scott Fitzgerald]"
3152,"[What is the official abbreviation of the league name of Major League Soccer?, Major League Soccer has teams in what two countries?, What year was Major League Soccer created?, Who won league's fi...","[MLS, United States;Canada, 1993, D.C. United, Foxboro Stadium]"


#### craigslist_bargains

In [None]:
craiglist = load_dataset('craigslist_bargains')
display(craiglist)

cgl_df = hf_dataset2df(craiglist)

cgl_df.sample(5)

Downloading:   0%|          | 0.00/2.86k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset craigslist_bargains/default (download: 24.20 MiB, generated: 10.36 MiB, post-processed: Unknown size, total: 34.55 MiB) to /root/.cache/huggingface/datasets/craigslist_bargains/default/1.1.0/9e4e64d13ea36e435a8f56f9305d3c771115909b7fe83269bf7cd8dd40302338...


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading: 0.00B [00:00, ?B/s]

Downloading: 0.00B [00:00, ?B/s]

Downloading: 0.00B [00:00, ?B/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset craigslist_bargains downloaded and prepared to /root/.cache/huggingface/datasets/craigslist_bargains/default/1.1.0/9e4e64d13ea36e435a8f56f9305d3c771115909b7fe83269bf7cd8dd40302338. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['agent_info', 'agent_turn', 'dialogue_acts', 'utterance', 'items'],
        num_rows: 5247
    })
    test: Dataset({
        features: ['agent_info', 'agent_turn', 'dialogue_acts', 'utterance', 'items'],
        num_rows: 838
    })
    validation: Dataset({
        features: ['agent_info', 'agent_turn', 'dialogue_acts', 'utterance', 'items'],
        num_rows: 597
    })
})

Unnamed: 0,agent_info,agent_turn,dialogue_acts,utterance,items,split
4027,"{'Bottomline': ['None', 'None'], 'Role': ['buyer', 'seller'], 'Target': [180.0, 200.0]}","[0, 1, 1]","{'intent': ['', '', ''], 'price': [-1.0, -1.0, -1.0]}","[, Hi, I see you are interested in my lights. , ]","{'Category': ['electronics', 'electronics'], 'Images': ['electronics/6128917197_0.jpg', 'electronics/6128917197_0.jpg'], 'Price': [200.0, 200.0], 'Description': ['25pcs 18W G13 120cm 4ft 48"" 6500K...",train
553,"{'Bottomline': ['None', 'None'], 'Role': ['buyer', 'seller'], 'Target': [7.0, 13.0]}","[0, 1, 0, 1, 0, 1, 0, 0, 1]","{'intent': ['intro', 'unknown', 'inquiry', 'init-price', 'counter-price', 'counter-price', 'counter-price', 'offer', 'accept'], 'price': [-1.0, -1.0, -1.0, 13.0, 7.0, 13.0, -1.0, 13.0, -1.0]}","[hello, I am interested in buying your router. I am wondering, how old is it?, less than a year, Ok, how fast does it run?, 14x faster speed. It is a steal at $13, Ok, I was thinking of offering $...","{'Category': ['electronics', 'electronics'], 'Images': ['', ''], 'Price': [13.0, 13.0], 'Description': ['Excellent working condition,Gigabit Wireless Router', 'Excellent working condition,Gigabit ...",train
1448,"{'Bottomline': ['None', 'None'], 'Role': ['buyer', 'seller'], 'Target': [1976.0, 2600.0]}","[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1]","{'intent': ['intro', 'unknown', 'inquiry', 'disagree', 'inquiry', 'init-price', 'inquiry', 'inform', 'counter-price', 'counter-price', 'counter-price', 'inquiry', 'inform', 'offer', 'accept'], 'pr...","[Hello is this item still for sale?, yes hi , it's a perfect bike, Can you tell me a little about it?, it's a light bike, easy to handle, fairly new, pro-bike, it's like no other bike, you should ...","{'Category': ['bike', 'bike'], 'Images': ['bike/6148069748_0.jpg', 'bike/6148069748_0.jpg'], 'Price': [2600.0, 2600.0], 'Description': ['29 in. Carbon wheels, carbon cranks, XTR rear derailleur, ...",train
4344,"{'Bottomline': ['None', 'None'], 'Role': ['buyer', 'seller'], 'Target': [540.0, 600.0]}","[1, 0, 1, 0, 1, 0, 0]","{'intent': ['intro', 'unknown', 'init-price', 'counter-price', 'counter-price', 'disagree', 'quit'], 'price': [-1.0, -1.0, 600.0, 300.0, 550.0, -1.0, -1.0]}","[Hi there how are you?, Hi, I like the bike How old is it? , Its only 3 months old I'm moving and can't take it with me at 600 it's an absolute steal!, Ok, I'll offer you $300 since your in a pin...","{'Category': ['bike', 'bike'], 'Images': ['bike/6141799628_0.jpg', 'bike/6141799628_0.jpg'], 'Price': [600.0, 600.0], 'Description': ['Former MercuryViatel pro team bike,Custom Reynolds steel fram...",train
1297,"{'Bottomline': ['None', 'None'], 'Role': ['buyer', 'seller'], 'Target': [1212.0, 1595.0]}","[1, 0, 1, 0, 1, 0, 1]","{'intent': ['intro', 'init-price', 'counter-price', 'counter-price', 'insist', 'offer', 'accept'], 'price': [-1.0, 1100.0, 1350.0, 1250.0, 1350.0, 1350.0, -1.0]}","[Hello, Hello. I'd like to offer you 1100. I have to move to the area and have moving expenses. , I'm willing to work with you, but how about $1,350. It's a great place for a single person., That'...","{'Category': ['housing', 'housing'], 'Images': ['housing/6102501051_0.jpg', 'housing/6102501051_0.jpg'], 'Price': [1595.0, 1595.0], 'Description': ['The 532 Bancroft Apartments is a controlled acc...",train


In [None]:
# Show one reproducibly sampled row (fixed random_state) transposed, so every
# field is listed on its own line.
cgl_df.sample(1, random_state=414).T

Unnamed: 0,2945
agent_info,"{'Bottomline': ['None', 'None'], 'Role': ['buyer', 'seller'], 'Target': [2232.0, 2480.0]}"
agent_turn,"[1, 0, 1, 0, 1, 1, 0, 0]"
dialogue_acts,"{'intent': ['intro', 'unknown', 'init-price', 'counter-price', 'counter-price', 'offer', 'counter-price', 'reject'], 'price': [-1.0, -1.0, 2480.0, 2232.0, 2300.0, 2300.0, 2250.0, -1.0]}"
utterance,"[Hey there!, Hello. Im interested in buying the home. How old is the home?, It's extremely new, everything in the house is brand new too. The yard is fantastic as well. Its really an excellent dea..."
items,"{'Category': ['housing', 'housing'], 'Images': ['housing/6127623686_0.jpg', 'housing/6127623686_0.jpg'], 'Price': [2480.0, 2480.0], 'Description': ['Everything is Brand New Home With Nice Yard !!,..."
split,train


In [None]:
# For the same sampled row (same random_state as above), pair each entry of
# `agent_turn` with the utterance at the same position.
list(zip(*cgl_df.sample(1, random_state=414)[[ 'agent_turn', 'utterance']].values[0].tolist()))

[(1, 'Hey there!'),
 (0, 'Hello. Im interested in buying the home. How old is the home?'),
 (1,
  "It's extremely new, everything in the house is brand new too. The yard is fantastic as well. Its really an excellent deal at $2480"),
 (0, "I wouldn't be able to spend that much. Would you accept $2232?"),
 (1,
  "It's already an excellent price as is, but we can go to $2300 if you can afford that. We assure you that you will love this place and all the new appliances."),
 (1, ''),
 (0, 'I would like to offer you $2250'),
 (0, '')]

#### taskmaster

In [None]:
# Download the `taskmaster3` dataset and show its split layout
# (the recorded output lists a single `train` split).
taskmaster = load_dataset('taskmaster3')
display(taskmaster)

# Convert the loaded split with the notebook's `hf_dataset2df` helper (defined in
# an earlier cell); the recorded output shows a table with an extra `split` column.
tsm_df = hf_dataset2df(taskmaster)

# Eyeball five random rows.
tsm_df.sample(5)

Downloading:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset taskmaster3/default (download: 298.88 MiB, generated: 136.96 MiB, post-processed: Unknown size, total: 435.84 MiB) to /root/.cache/huggingface/datasets/taskmaster3/default/1.0.0/8a769c3c26e6836c1568c1f9132e3249637636f18feaab706de214d8cc8bd601...


0 examples [00:00, ? examples/s]

Dataset taskmaster3 downloaded and prepared to /root/.cache/huggingface/datasets/taskmaster3/default/1.0.0/8a769c3c26e6836c1568c1f9132e3249637636f18feaab706de214d8cc8bd601. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['conversation_id', 'vertical', 'instructions', 'scenario', 'utterances'],
        num_rows: 23757
    })
})

Unnamed: 0,conversation_id,vertical,instructions,scenario,utterances,split
23520,dlg-ecd6209a-4d04-4abc-a727-f2b1873fceb0,Movie Tickets,,Recommendation,"[{'index': 0, 'speaker': 'user', 'text': 'Assistant, I would like to see a movie with a strong female lead.', 'apis': [{'name': 'find_movies', 'index': 0, 'args': array([{'arg_name': 'location', '...",train
7223,dlg-4aa630b9-067c-45a5-a7b0-08ccd9ac5c43,Movie Tickets,"SCENARIO: Pretend you’re *speaking to your digital assistant*. {{LIST You would like to *purchase tickets for a movie* currently showing in theaters., You *have already decided the movie* you wan...",Deferred Purchase,"[{'index': 0, 'speaker': 'user', 'text': 'Hi. I would like to buy movie tickets.', 'apis': [], 'segments': []}, {'index': 1, 'speaker': 'assistant', 'text': 'Okay. Which film are you interested in...",train
13920,dlg-2b0a4135-045f-4000-b3e9-1000fbe53a6c,Movie Tickets,,Successful Purchase with fake KB table,"[{'index': 0, 'speaker': 'user', 'text': 'Yea I need some help about movie titles do you mind?', 'apis': [], 'segments': []}, {'index': 1, 'speaker': 'assistant', 'text': 'Sure how can I help you?...",train
16450,dlg-79f2b2a4-02fa-45b6-8e1b-bed4975dc294,Movie Tickets,"SCENARIO: Pretend you’re *speaking to your digital assistant*. {{LIST You would like to *purchase tickets for a movie* currently showing in theaters., You *have already decided the movie* you wan...",Deferred Purchase,"[{'index': 0, 'speaker': 'user', 'text': 'Hi I need help ordering movie tickets', 'apis': [], 'segments': []}, {'index': 1, 'speaker': 'assistant', 'text': 'Hi. I am happy to help with that!', 'ap...",train
8289,dlg-bb0b44a3-e408-4555-9536-975917d6a940,Movie Tickets,"SCENARIO: Pretend you are *speaking to a digital assistant*. {{LIST You would like to *purchase tickets for a movie* currently showing in theatres., You *have already decided the movie* you want ...",Tickets Change,"[{'index': 0, 'speaker': 'user', 'text': 'My wife wants to see a movie tonight. What's playing near us?', 'apis': [], 'segments': [{'start_index': 29, 'end_index': 36, 'text': 'tonight', 'annotati...",train


In [None]:
# Inspect the raw `utterances` value of one reproducibly sampled conversation.
tsm_df.sample(1, random_state=414)[ 'utterances'].values[0]

array([{'index': 0, 'speaker': 'user', 'text': 'Hi, I want you to get me a movie ticket.', 'apis': array([], dtype=object), 'segments': array([], dtype=object)},
       {'index': 1, 'speaker': 'assistant', 'text': 'For what movie?', 'apis': array([], dtype=object), 'segments': array([], dtype=object)},
       {'index': 2, 'speaker': 'user', 'text': 'The Photograph', 'apis': array([], dtype=object), 'segments': array([{'start_index': 0, 'end_index': 14, 'text': 'The Photograph', 'annotations': array([{'name': 'name.movie'}], dtype=object)}],
      dtype=object)},
       {'index': 3, 'speaker': 'assistant', 'text': 'Where do you want to see it?', 'apis': array([], dtype=object), 'segments': array([], dtype=object)},
       {'index': 4, 'speaker': 'user', 'text': 'Cinemark Riverside, Reno', 'apis': array([], dtype=object), 'segments': array([{'start_index': 0, 'end_index': 18, 'text': 'Cinemark Riverside', 'annotations': array([{'name': 'name.theater'}], dtype=object)},
       {'start_ind

In [None]:
# Print the `instructions` text of the same sampled conversation
# (same random_state as the cell above).
print(tsm_df.sample(1, random_state=414)[ 'instructions'].values[0])

SCENARIO: Pretend you’re *speaking to a digital assistant*. {{LIST You would like to *purchase tickets for a movie currently showing in theaters.*, You *have already decided the movie* you want to see., In this conversation you *change your mind about the showtime and pick a different one*., The conversation ends with *a successful purchase*.}}

DETAILS: Click the link to get the details you’ll need — {{LINK movies_in_theaters_near_me https://www.google.com/search?q=movies+in+theaters+near+me}} — and include these details in the conversation: {{LIST Movie name, Theater name, Showtime, Number of tickets, Day }}

TASK: *Write the transcript* of your spoken conversation. Make sure: {{LIST it follows *all points in the specific scenario mentioned above*, it includes *at least 10 exchanges* (an exchange is one user turn and one assistant turn), both the assistant and user *confirm the ticket details* before booking, the assistant *confirms the tickets were in fact purchased*}}

SPEAKING STY

In [None]:
# Top-10 frequencies of `vertical` and of `scenario`. The expression is a tuple,
# so the '\n' separator is displayed as a literal tuple element, not a line break.
tsm_df.vertical.value_counts()[:10], '\n', tsm_df.scenario.value_counts()[:10]

(Movie Tickets    23757
 Name: vertical, dtype: int64,
 '\n',
 Successful Purchase with fake KB table        2947
 Recommendation                                1807
 Successful Purchase with fake KB table - 1    1707
 Auto Template 13 movie name resolution        1052
 Successful Purchase                           1040
 Screening Change                               974
 Tickets Change                                 870
 Theater Change                                 841
 Misunderstanding                               817
 Deferred Purchase                              737
 Name: scenario, dtype: int64)

### Inferencing with DialoGPT

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Checkpoint to load; the tokenizer and the causal-LM weights come from the same name.
model_name = 'microsoft/DialoGPT-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Reference chat loop, kept commented out for comparison; the interactive
# version used in this notebook is `dialogue_system` in a later cell.
# # Let's chat for 5 lines
# for step in range(5):
#     # encode the new user input, add the eos_token and return a tensor in PyTorch
#     new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

#     # append the new user input tokens to the chat history
#     bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

#     # generate a response while limiting the total chat history to 1000 tokens
#     chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

#     # pretty print the last output tokens from the bot
#     print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.75G [00:00<?, ?B/s]

In [None]:
# Prefer the GPU when one is attached, but fall back to the CPU so this cell
# still runs on a CPU-only runtime instead of failing on `.to('cuda')`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
def dialogue_system(model, tokenizer):
  """Run an interactive multi-turn chat loop on the console.

  Each turn reads a line from the user, appends it (terminated by the
  tokenizer's EOS token) to the running token history, asks the model to
  continue the history, and prints only the newly generated tokens.

  Args:
    model: object exposing `generate(...)` and a `device` attribute
      (as Hugging Face `PreTrainedModel` instances do).
    tokenizer: object exposing `encode`, `decode`, `eos_token` and
      `eos_token_id`.

  Returns:
    None. The loop ends when the user interrupts it (KeyboardInterrupt) or
    when standard input is exhausted (EOFError).
  """
  # Put the inputs wherever the model lives instead of relying on a notebook
  # global, so the function works no matter which cell ran before it.
  target_device = model.device
  chat_history_ids = None
  try:
    while True:
      user_input = input('user: ').strip()
      user_input_ids = tokenizer.encode(
          user_input + tokenizer.eos_token, return_tensors='pt').to(target_device)

      # First turn: there is no history yet, so the prompt is the user input alone.
      if chat_history_ids is None:
        bot_input = user_input_ids
      else:
        bot_input = torch.cat([chat_history_ids, user_input_ids], dim=-1)

      # NOTE(review): the history grows every turn while max_length stays at
      # 1000 tokens, so a very long chat eventually leaves no room for a reply.
      chat_history_ids = model.generate(bot_input, max_length=1000,
                                        pad_token_id=tokenizer.eos_token_id,
                                        do_sample=True, num_beams=3,
                                        top_p=0.9, repetition_penalty=2.5)

      # generate() returns prompt + continuation; slice off the prompt part.
      new_tokens = chat_history_ids[:, bot_input.shape[-1]:][0]
      model_output = tokenizer.decode(new_tokens, skip_special_tokens=True)
      print(f'dialogue system: {model_output}')

  except (KeyboardInterrupt, EOFError):
    # EOFError covers a closed/non-interactive stdin, which would otherwise
    # surface as an unhandled traceback.
    print('Interrupted. Finishing!')
    return


In [None]:
# Start the interactive chat; the loop runs until it is interrupted from the keyboard.
dialogue_system(model, tokenizer)

In [None]:
# do it for datasets 
def dialogue_system_v2(model, tokenizer, dialogues):
  """Replay a recorded dialogue through the model and print its replies.

  `dialogues` is read as alternating turns: items at even positions are fed
  to the model as user input, and the item that follows each one is printed
  as the reference reply next to the model's own answer. A trailing item
  without a partner is ignored.

  Args:
    model: object exposing `generate(...)` and a `device` attribute
      (as Hugging Face `PreTrainedModel` instances do).
    tokenizer: object exposing `encode`, `decode`, `eos_token` and
      `eos_token_id`.
    dialogues: sliceable sequence of utterance strings in turn order.

  Returns:
    None. A KeyboardInterrupt stops the replay early.
  """
  # Put the inputs wherever the model lives instead of relying on a notebook
  # global, so the function works no matter which cell ran before it.
  target_device = model.device
  chat_history_ids = None
  try:
    for user_input, user_output in zip(dialogues[::2], dialogues[1::2]):
      # Use the recorded utterance itself; prompting with input() here would
      # discard the dataset turn this function is meant to replay.
      user_input = user_input.strip()
      print(f'user: {user_input}')
      user_input_ids = tokenizer.encode(
          user_input + tokenizer.eos_token, return_tensors='pt').to(target_device)

      # First turn: there is no history yet, so the prompt is the user input alone.
      if chat_history_ids is None:
        bot_input = user_input_ids
      else:
        bot_input = torch.cat([chat_history_ids, user_input_ids], dim=-1)

      chat_history_ids = model.generate(bot_input, max_length=1000,
                                        pad_token_id=tokenizer.eos_token_id,
                                        do_sample=True, num_beams=3,
                                        top_p=0.9, repetition_penalty=2.5)

      # generate() returns prompt + continuation; slice off the prompt part.
      new_tokens = chat_history_ids[:, bot_input.shape[-1]:][0]
      model_output = tokenizer.decode(new_tokens, skip_special_tokens=True)
      print(f'dialogue system: {model_output}')
      print('user_output:', user_output)

  except KeyboardInterrupt:
    print('Interrupted. Finishing!')
    return


### BlenderBot and BlenderBot2
Inferencing