In [1]:
!pip install -U transformers
!pip install datasets
!pip install -U accelerate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [2]:
import numpy as np
import tempfile
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer

In [3]:
# Load the DailyDialog dataset
dataset = load_dataset('daily_dialog')

# Concatenate all utterances within a dialogue and map to 'dialog' key
def concatenate_utterances(example):
    example['dialog'] = " ".join(example['dialog'])
    return example

# Apply the function to all examples in the dataset
dataset = dataset.map(concatenate_utterances)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.85k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

The repository for daily_dialog contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/daily_dialog.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/4.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11118 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [4]:
# length of the dataset
print("Length of dataset:", len(dataset['train']))

# print the first example
print(dataset['train'][0])


Length of dataset: 11118
{'dialog': "Say , Jim , how about going for a few beers after dinner ?   You know that is tempting but is really not good for our fitness .   What do you mean ? It will help us to relax .   Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ?   I guess you are right.But what shall we do ? I don't feel like sitting at home .   I suggest a walk over to the gym where we can play singsong and meet some of our friends .   That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them .   Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too .   Good.Let ' s go now .   All right . ", 'act': [3, 4, 2, 2, 2, 3, 4, 1, 3, 4], 'emotion': [0, 0, 0, 0, 0, 0, 4, 4, 4, 4]}


In [None]:
# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-medium')

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Encode the dataset
def encode(examples):
    encoded = tokenizer(examples['dialog'], truncation=True, padding='max_length', max_length=128)
    encoded['labels'] = encoded['input_ids'][:]
    return encoded

encoded_dataset = dataset.map(encode, batched=True)

Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=tempfile.mkdtemp(),   # output directory
    num_train_epochs=10,             # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=None,                # directory for storing logs
    fp16=True                        # use floating point 16 bit precision for training
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation']
)

In [None]:
# Evaluate before fine-tuning
pre_eval_results = trainer.evaluate(encoded_dataset['validation'])

# Get predictions for validation set before fine tuning for 10 samples
pre_val_predictions = trainer.predict(encoded_dataset['validation'].select(range(10)))

In [8]:
# Fine-tune the model
trainer.train()

Step,Training Loss
500,2.2199
1000,1.7255
1500,1.6183
2000,1.4961
2500,1.387
3000,1.3099
3500,1.2427
4000,1.1528
4500,1.1018
5000,1.06


Step,Training Loss
500,2.2199
1000,1.7255
1500,1.6183
2000,1.4961
2500,1.387
3000,1.3099
3500,1.2427
4000,1.1528
4500,1.1018
5000,1.06


TrainOutput(global_step=6950, training_loss=1.3051073823558341, metrics={'train_runtime': 5247.323, 'train_samples_per_second': 21.188, 'train_steps_per_second': 1.324, 'total_flos': 2.581323580440576e+16, 'train_loss': 1.3051073823558341, 'epoch': 10.0})

In [9]:
# Get predictions for validation set before fine tuning for 10 samples
pre_val_predictions = trainer.predict(encoded_dataset['validation'].select(range(10)))
# Evaluate after fine-tuning
post_eval_results = trainer.evaluate(encoded_dataset['validation'])

# Print the evaluation losses before and after fine-tuning
print('Evaluation Results before fine-tuning :', pre_eval_results['eval_loss'])
print('Evaluation Results after fine-tuning  :', post_eval_results['eval_loss'])

# Get predictions for validation set before fine tuning for 10 samples
post_val_predictions = trainer.predict(encoded_dataset['validation'].select(range(10)))

# Zip the pre and post tuning predictions
predictions = zip(pre_val_predictions.predictions, post_val_predictions.predictions)

Evaluation Results before fine-tuning : 4.766558647155762
Evaluation Results after fine-tuning  : 1.745113492012024


In [21]:
for idx, (pre, post) in enumerate(predictions):
    pre_pred = tokenizer.decode(np.argmax(pre, axis=-1), skip_special_tokens=True)
    post_pred = tokenizer.decode(np.argmax(post, axis=-1), skip_special_tokens=True)
    ground_truth = encoded_dataset['validation'][idx]["dialog"]
    print('Ground truth \n' + ground_truth + '\n')
    print('Pre-prediction \n' + pre_pred + '\n')
    print('Post-prediction \n'+ post_pred + '\n')
    print('----------------')

In [25]:
# prompt: use model as chatbot

# Define a function to chat with the model
# Define a function to chat with the model
def chat(model, tokenizer, user_input):
  # Encode the user input
  input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt').to('cuda') # Move input_ids to GPU

  # Generate the model's response
  output = model.generate(input_ids, max_length=50, num_return_sequences=1)

  # Decode the model's response
  response = tokenizer.decode(output[0], skip_special_tokens=True)

  # Print the user input and the model's response
  print("User:", user_input)
  print("Model:", response)

# Start the chat loop
while True:
  # Get the user input
  user_input = input("You: ")

  # Check if the user wants to exit
  if user_input.lower() == "exit":
    break

  # Chat with the model
  chat(model, tokenizer, user_input)


You: hello how are you


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


User: hello how are you
Model: hello how are youGood, how are you?   I'm fine, thanks. How are you?   I'm fine too. How are you?   I'm fine too. How are you?   I'm fine
You: exit


In [33]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [34]:
!cp -r /content/DialoGPT_Model /content/drive/My\ Drive/
!cp -r /content/DialoGPT_Tokenizer /content/drive/My\ Drive/


In [None]:
# prompt: use model from files
import os
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the model and tokenizer from the specified directory
model_dir = '/content/drive/My Drive/DialoGPT_Model'
tokenizer_dir = '/content/drive/My Drive/DialoGPT_Tokenizer'

model2 = GPT2LMHeadModel.from_pretrained(model_dir)
tokenizer2 = GPT2Tokenizer.from_pretrained(tokenizer_dir)

# Move the model to the CPU or GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model2.to(device)


In [80]:
import nltk
nltk.download('punkt')  # Ensure you have punkt tokenizer downloaded


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [88]:
for step in range(100):  # Adjust the range as needed
    # Encode the new user input
    user_input = input(">> User: ")
    if user_input.lower() in ['exit', 'quit', 'bye']:
        print("DialoGPT: Goodbye!")
        break

    new_user_input_ids = tokenizer2.encode(user_input + tokenizer2.eos_token, return_tensors='pt').to(device)

    # Generate a response without chat history
    chat_history_ids = model2.generate(new_user_input_ids, max_length=1000, pad_token_id=tokenizer2.eos_token_id)

    # Decode and print the bot's response
    bot_response = tokenizer2.decode(chat_history_ids[:, new_user_input_ids.shape[-1]:][0], skip_special_tokens=True)

    # Use nltk for better sentence splitting
    sentences = nltk.sent_tokenize(bot_response)
    answer = sentences[0]  # Take the first sentence

    print("DialoGPT: {}".format(answer))  # Print only the first response


>> User: hello
DialoGPT: Hello, I'm calling to confirm my reservation.
>> User: my name is vakkas
DialoGPT: Hey, I'm calling to thank you for the interview with Mary.
>> User: Can we make it later
DialoGPT: No, I'm sorry.
>> User: I don't want interview
DialoGPT: Why not?
>> User: Because it's stressfull
DialoGPT: But it's fun.
>> User: no
DialoGPT: thanks, I've got a no response.
>> User: Okey, what is your name
DialoGPT: My name is Zhang Yong.
>> User: What is you
DialoGPT: My name is John.
>> User: You are model
DialoGPT: You are too kind.
>> User: I made you
DialoGPT: Thank you.
>> User: I love you
DialoGPT: I love you, too.
>> User: Ehehehe thanks
DialoGPT: You're welcome.
>> User: goodbye
DialoGPT: Goodbye, sir.
>> User: bye
DialoGPT: Goodbye!
