##  Transfer learnt model built on a pretrained LLM such as GPT-2 for chatbot

### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import transformers
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import load_metric, Dataset, DatasetDict
import lime
import lime.lime_text

2024-08-07 02:41:13.885734: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-07 02:41:13.885870: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-07 02:41:14.006927: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Loading Dataset

In [2]:
file_path = '/kaggle/input/chat-data/chat_data.csv'
data = pd.read_csv(file_path)

In [3]:
data.head()

Unnamed: 0,conversations,id
0,"[{'from': 'human', 'value': ""I've been feeling...",identity_0
1,"[{'from': 'human', 'value': ""Hi, I'm feeling r...",identity_1
2,"[{'from': 'human', 'value': ""Hey, I hope you'r...",identity_2
3,"[{'from': 'human', 'value': ""I'm feeling reall...",identity_3
4,"[{'from': 'human', 'value': ""I'm feeling reall...",identity_4


### Preprocessing

In [4]:
import ast
def rems(row):
    # Remove the surrounding quotes and newlines
    data_str_cleaned = row.replace("\n", ",")

    # Safely evaluate the string as a Python literal
    data = ast.literal_eval(data_str_cleaned)

    return data

In [5]:
data['conversations'] = data['conversations'].apply(rems)

In [19]:
context_window = 2  # Number of previous turns to include as context

input_texts = []
output_texts = []

for conversation in data['conversations'][:5]:
    for i in range(context_window, len(conversation)):
        if conversation[i]["from"] == "gpt":  # Only consider gpt responses as output
            input_seq = []
            for j in range(context_window):
                input_seq.append(conversation[i - context_window + j]["value"])
            input_texts.append(" ".join(input_seq))
            output_texts.append(conversation[i]["value"])

# Now input_texts and output_texts contain the processed data from all conversations
print(f"Number of input-output pairs: {len(input_texts)}")

Number of input-output pairs: 27


In [20]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='left')
tokenizer.pad_token = tokenizer.eos_token  # Ensure padding token is set

In [21]:
inputs =tokenizer(input_texts, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
targets = tokenizer(output_texts, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

In [22]:
dataset = Dataset.from_dict({
    'input_ids': [ids.squeeze().tolist() for ids in inputs['input_ids']],
    'attention_mask': [mask.squeeze().tolist() for mask in inputs['attention_mask']],
    'labels': [label.squeeze().tolist() for label in targets['input_ids']]
})

In [23]:
train_dataset, val_dataset = dataset.train_test_split(test_size=0.3).values()

### Loading model

In [24]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [25]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1,
    evaluation_strategy="epoch"
)



In [26]:
pip install bert_score

  pid, fd = os.forkpty()


Note: you may need to restart the kernel to use updated packages.


In [27]:
pip install rouge_score

Note: you may need to restart the kernel to use updated packages.


In [28]:
from bert_score import score
rouge_metric = load_metric('rouge')
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE-L Score
    rouge_results = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
    bert_precision, bert_recall, bert_f1 = score(decoded_preds, decoded_labels, lang="en", verbose=False)
    bert_precision = bert_precision.mean().item()
    bert_recall = bert_recall.mean().item()
    bert_f1 = bert_f1.mean().item()
    # Return the ROUGE-L Score
    result = {
        "rougeL": rouge_results["rougeL"].mid.fmeasure,
        "bert_precision": bert_precision,
        "bert_recall": bert_recall,
        "bert_f1": bert_f1
    }


    return result

In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [30]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

In [31]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rougel,Bert Precision,Bert Recall,Bert F1
1,1.0823,0.953206,0.145454,0.770979,0.811288,0.790548


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TrainOutput(global_step=9, training_loss=2.5031369196044073, metrics={'train_runtime': 13.6785, 'train_samples_per_second': 1.316, 'train_steps_per_second': 0.658, 'total_flos': 4703256576000.0, 'train_loss': 2.5031369196044073, 'epoch': 1.0})

In [32]:
results = trainer.evaluate()
print("Evaluation Results:", results)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Results: {'eval_loss': 0.953205943107605, 'eval_rougeL': 0.14545445228712617, 'eval_bert_precision': 0.7709794044494629, 'eval_bert_recall': 0.8112878799438477, 'eval_bert_f1': 0.7905476093292236, 'eval_runtime': 3.1669, 'eval_samples_per_second': 2.842, 'eval_steps_per_second': 1.579, 'epoch': 1.0}


### Bert and RougeL calculation

In [33]:
print(f"ROUGE-L Score: {results['eval_rougeL']}")
print(f"BERT F1 Score: {results['eval_bert_f1']}")

ROUGE-L Score: 0.14545445228712617
BERT F1 Score: 0.7905476093292236


In [35]:
input_ids = inputs['input_ids'].to('cuda')
if 'attention_mask' in inputs:
    attention_mask = inputs['attention_mask'].to('cuda')
else:
    attention_mask = None

# Generate outputs with the model
generated_outputs = model.generate(
    input_ids=input_ids,
    max_new_tokens=50,  # Adjust as needed
    pad_token_id=tokenizer.eos_token_id,
    attention_mask=attention_mask
)

# Decode the generated outputs
decoded_outputs = tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
print(decoded_outputs)

["Hey there, I'm here to listen and support you. It sounds like work has been really challenging lately. Can you tell me more about what's been going on? I recently got a promotion at work, which I thought would be exciting. But the added responsibilities and pressure have just taken a toll on my mental health. It's been a really moving experience for me. I'm really grateful for the support and support of my family and friends. I'm really grateful for the support of my friends and family. I'm really grateful for the support of my family and friends. I'm really grateful for the support of my", "I can understand how it can be overwhelming when we're faced with higher expectations. It's okay to acknowledge your emotions and allow yourself to feel sad in this situation. It's an important part of the healing process. What specific challenges have you been facing at work? Well, the workload has increased significantly, and I find it hard to maintain a work-life balance. I've been staying lat

In [36]:
chat_history = []

In [40]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [41]:
def generate_response(prompt, max_length=50, temperature=0.7, top_k=0):
    inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    attention_mask = tokenizer(prompt, return_tensors="pt").attention_mask.to(device)
    # Generate text with sampling
    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


### ChatBot

In [42]:
def chat():
    print("Welcome to the GPT-2 Chatbot. Type 'exit' to end the conversation.")
    while True:
        user_input = input("Human: ")
        if user_input.lower() == 'exit':
            break
        chat_history.append(f"Human: {user_input}")
        response = generate_response(user_input)
        chat_history.append(f"Bot: {response}")
        for line in chat_history[-10:]:  # Display the last 10 lines of conversation
            print(line)


In [43]:
chat()

Welcome to the GPT-2 Chatbot. Type 'exit' to end the conversation.


Human:  hi




Human: hi
Human: hi
Bot: hi, who is also a member of the National Council of the National Council of the National Council of the National Council of the National Council of the National Council of the National Council of the National Council of the National Council of the National Council of the National


Human:  how r u?


Human: hi
Human: hi
Bot: hi, who is also a member of the National Council of the National Council of the National Council of the National Council of the National Council of the National Council of the National Council of the National Council of the National Council of the National Council of the National
Human: how r u?
Bot: how r u?

I'm not sure if you're aware of the fact that the U.S. government has been using the word "r" to describe the U.S. government's use of the word "r" in the


Human:  exit


### Saving Model and Tokenizer

In [48]:
trainer.save_model('C:/Users/16479/Downloads')

In [49]:
tokenizer.save_pretrained('C:/Users/16479/Downloads')

('C:/Users/16479/Downloads/tokenizer_config.json',
 'C:/Users/16479/Downloads/special_tokens_map.json',
 'C:/Users/16479/Downloads/vocab.json',
 'C:/Users/16479/Downloads/merges.txt',
 'C:/Users/16479/Downloads/added_tokens.json')