<a href="https://colab.research.google.com/github/tgeral68/TP-1-chatbot/blob/main/2_dialogue_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install datasets



In [2]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from matplotlib import pyplot as plt
from collections import Counter

In [3]:
from datasets import load_dataset
dataset = load_dataset("multi_woz_v22", trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'})
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
model.resize_token_embeddings(len(tokenizer))



Embedding(50257, 768)

In [5]:
tokenizer

GPT2TokenizerFast(name_or_path='openai-community/gpt2', vocab_size=50257, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Implement the dataset module

Create an object having as parent `torch.utils.data.dataset` implementing that return previous turn and answer of the dataset.

In [6]:
from torch.utils.data import Dataset

class WoZWindowedGenerationDataset:
    def __init__(self, dataset, window_size=3):
        self.dataset = dataset
        self.window_size = window_size
        self.index = []
        for i, dial in enumerate(dataset):
            for j, speaker in enumerate(dial['turns']['speaker']):
                if speaker == 1:
                    self.index.append((i,j))
    def __len__(self):
        return len(self.index)

    def __getitem__(self, index):
        i, j = self.index[index]
        dial = self.dataset[i]['turns']['utterance']

        turns = dial[j-1] if(j!= 0) else ''
        answer = dial[j]
        return {'turns': turns,
                'answer': answer}

class WoZHistoryWindowedGenerationDataset:
  def __init__(self, dataset, window_size=3):
      self.dataset = dataset
      self.window_size = window_size
      self.index = []
      for i, dial in enumerate(dataset):
          for j, speaker in enumerate(dial['turns']['speaker']):
              if speaker == 1:
                  self.index.append((i,j))
  def __len__(self):
      return len(self.index)

  def __getitem__(self, index):
      i, j = self.index[index]
      dial = self.dataset[i]['turns']['utterance']
      spea = self.dataset[i]['turns']['speaker']

      utterance = dial[max(j-self.window_size, 0):j]
      speaker = spea[max(j-self.window_size, 0):j]

      answer = dial[j]
      return {'utterance': utterance,
              'speaker': speaker,
              'answer': answer}


In [17]:
class DialogueCollator(Dataset):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    def __call__(self, data):
        input_tokens = self.tokenizer(['[USER]' + d['turns'] + "[BOT]" + d['answer'] for d in data],
                                 return_tensors='pt', return_length=True, padding=True)
        return {
            'input_ids': input_tokens.input_ids,
            'attention_mask': input_tokens.attention_mask
        }
class DialogueHistoryCollator(Dataset):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    def __call__(self, data):
        text = [''.join([('[USER]' if(speaker == 0) else '[BOT]') + turn for speaker, turn in zip(d['speaker'],d['utterance'])]) + "[BOT]" + d['answer'] for d in data]
        input_tokens = self.tokenizer(text,
                                 return_tensors='pt', return_length=True, padding=True)
        return {
            'input_ids': input_tokens.input_ids,
            'attention_mask': input_tokens.attention_mask
        }

In [26]:
from tqdm.notebook import trange, tqdm
from torch import optim
from torch import nn


class Trainer():
    def __init__(self, model, padding_idx=100):
        self.model = model
        self.optimizer = None

    def at_training_start(self, learning_rate = 1e-3):
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        self.criterion = nn.CrossEntropyLoss(ignore_index=50257)

    def validation_step(self, data):
        pass

    def training_step(self, data):
        y_pred = self.model(**data)
        y_truth = data["input_ids"][:, 1:].flatten()

        loss_reconstruction = self.criterion(y_pred.logits[:,:-1].reshape(y_truth.shape[0], -1), y_truth)
        (loss_reconstruction).backward()
        return loss_reconstruction.item()

    def on_validation_end(self, resp):
        pass

    def validation(self, validation_dl):
        pass

    def fit(self,
            training_dl,
            validation_dl,
            learning_rate = 1e-3,
            validation_frequency = 8,
            max_iter = 10000,
            use_gpu=False,

        ):
        if(use_gpu):
          self.model = self.model.cuda()
        self.at_training_start(learning_rate)

        iter_count = 0
        loss_buffer = []
        pbar = trange(max_iter)

        while(iter_count < max_iter):
            for data in training_dl:
                if use_gpu:
                    data = {k:v.cuda() for k, v in data.items()}
                self.optimizer.zero_grad()
                loss_buffer += [self.training_step(data)]
                self.optimizer.step()

                if(iter_count  % validation_frequency == 0):
                    print("Loss at iteration %s is %s"%(iter_count, np.mean(loss_buffer)))
                    self.validation(validation_dl)
                    loss_buffer = []
                iter_count += 1
                pbar.update(1)
                if(iter_count < max_iter):
                  break

        self.model = self.model.cpu()

In [19]:
training_set = WoZHistoryWindowedGenerationDataset(dataset['train'])

In [20]:
from torch.utils.data import DataLoader

collator = DialogueHistoryCollator(tokenizer)
training_dl = DataLoader(training_set, batch_size=16, shuffle=True, collate_fn=collator, num_workers=2)

In [21]:
training_set[2]

{'utterance': ['Any sort of food would be fine, as long as it is a bit expensive. Could I get the phone number for your recommendation?',
  'There is an Afrian place named Bedouin in the centre. How does that sound?',
  'Sounds good, could I get that phone number? Also, could you recommend me an expensive hotel?'],
 'speaker': [0, 1, 0],
 'answer': "Bedouin's phone is 01223367660. As far as hotels go, I recommend the University Arms Hotel in the center of town."}

In [27]:
my_trainer = Trainer(model)
my_trainer.fit(training_dl, None, validation_frequency=250, use_gpu=True, max_iter=2000)

  0%|          | 0/2000 [00:00<?, ?it/s]

Loss at iteration 0 is 1.0251469612121582
Loss at iteration 250 is 1.2526871712207794
Loss at iteration 500 is 1.1345678579807281
Loss at iteration 750 is 1.0691179563999176
Loss at iteration 1000 is 1.0412804293632507


KeyboardInterrupt: 

In [39]:
class Chatbot(object):
  def __init__(self):
    pass

  def answer(self, current_input):
    return "Not Implemented"

  def start(self):
    current_answer = "Start dialogue"
    current_input = ""
    while(current_input != 'exit'):
      current_input = input("Bot: "+current_answer + " \nUser: ")
      current_answer = self.answer(current_input)

class ChitChat(Chatbot):
  def __init__(self, model, tokenizer, collator, history_len = 1):
    self.model = model
    self.tokenizer = tokenizer
    self.utterance = []
    self.hlen = history_len

  def answer(self, current_input):
    self.utterance.append('[USER]'+current_input)
    # print(''.join(self.utterance[max(0, len(self.utterance) - self.hlen): ]))
    tokenized_text = self.tokenizer(''.join(self.utterance[max(0, len(self.utterance) - self.hlen): ]), return_tensors='pt')
    generated_token_ids = self.model.generate(**tokenized_text, do_sample=True, max_length=200, pad_token_id=model.config.eos_token_id)[0]
    answer = self.tokenizer.decode(generated_token_ids).split('[BOT]')[-1][:-len('<|endoftext|>')]
    self.utterance.append('[BOT]'+answer)
    return answer


In [40]:
cb = ChitChat(model.cpu(), tokenizer, collator, history_len=3)

In [41]:
cb.start()

Bot: Start dialogue 
User: Hello, I'am looking for a train from paris to london
Bot: What is your travel time? 
User: It will the first of january around 8pm
Bot: What day would you like to travel? 
User: The first january
Bot: TR8310 leaves Cambridge on Thursday and arrives in the city at 8pm. Would you like me to book it? 
User: I would prefer from paris
Bot: Booking was successful, the total fee is 17.4 GBP payable at the station. Your reference number is 8NISNX. The total fee is 70.6 GBP payable at the station.Reference number is : 2MZ0C09K4O. anything else I can help you with? 
User: No thanks 
Bot: Thank you for calling. Have a great day. 
User: bye
Bot: Thanks for using our service. 
User: ok I get it


KeyboardInterrupt: Interrupted by user