<a href="https://colab.research.google.com/github/usanaphtal112/pretrained-LLM-ChatBot/blob/main/pretrained_gpt2_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Necessary package**

In [None]:
pip install transformers

In [None]:
import pandas as pd
import json
import torch
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# **Read and preprocess the data**

In [None]:
# Read the JSON file into a DataFrame
with open('result.json', encoding='utf-8') as f:
    data = json.load(f)

df = pd.DataFrame(data)

In [None]:
df

Unnamed: 0,timestamp,sender,message
0,"3/29/23, 1:19 AM",WhatsApp,Messages and calls are end-to-end encrypted. N...
1,"3/29/23, 3:14 PM",boaz,"Hello it is Boaz, that's my new number"
2,"3/29/23, 3:16 PM",Honorine,Ok dear
3,"3/29/23, 5:32 PM",boaz,Hello
4,"3/30/23, 9:42 PM",boaz,Missed voice call
...,...,...,...
2429,"3/25/23, 7:51 PM",Honorine,byee
2430,"3/25/23, 7:51 PM",Honorine,i'm feeling some how headache i don't want to ...
2431,"3/25/23, 7:52 PM",Boaz Keny,ooh get rest please
2432,"3/25/23, 10:55 PM",Honorine,Yeah


In [None]:
df.isna().sum()

timestamp    0
sender       0
message      0
dtype: int64

In [None]:
df.duplicated().sum()

18

In [None]:
df.drop_duplicates()
df.head()

Unnamed: 0,timestamp,sender,message
0,"3/29/23, 1:19 AM",WhatsApp,Messages and calls are end-to-end encrypted. N...
1,"3/29/23, 3:14 PM",boaz,"Hello it is Boaz, that's my new number"
2,"3/29/23, 3:16 PM",Honorine,Ok dear
3,"3/29/23, 5:32 PM",boaz,Hello
4,"3/30/23, 9:42 PM",boaz,Missed voice call


# **Train pretrained gpt2 with my chat data**

In [None]:
# Load the pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add a new padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Read the conversation data from the JSON file
data_path = 'result.json'
# data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'result.json')

if not os.path.exists(data_path):
    raise ValueError(f"Input file path {data_path} not found")

with open(data_path, 'r', encoding='utf-8') as file:
    conversation_data = json.load(file)

# Extract messages from conversation data
conversation = []
for message_data in conversation_data:
    sender = message_data['sender']
    message = message_data['message']
    conversation.append(sender + ": " + message)

# Tokenize the conversation
tokenized_conversation = tokenizer.batch_encode_plus(
    conversation,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

# Create a PyTorch dataset from the tokenized conversation
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=data_path,
    block_size=128,
    overwrite_cache=False,
    cache_dir=None,
)

# Set up the training arguments
training_args = TrainingArguments(
    output_dir='/content/',
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=4,
    save_steps=1000,
    save_total_limit=2,
)

# Set up the trainer
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Fine-tune the GPT-2 model
trainer.train()

# Save the fine-tuned model
output_dir = '/content/'
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)



Step,Training Loss
500,0.4812
1000,0.3838
1500,0.3399
2000,0.3165
2500,0.2905
3000,0.2719
3500,0.252
4000,0.237
4500,0.2242
5000,0.2145


('/content/tokenizer_config.json',
 '/content/special_tokens_map.json',
 '/content/vocab.json',
 '/content/merges.txt',
 '/content/added_tokens.json')

# **Test my chatbot**

In [None]:
# Load the fine-tuned GPT-2 model and tokenizer
model_path = '/content/'
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# Set the pad token ID to the EOS token ID
tokenizer.pad_token_id = tokenizer.eos_token_id

# Start the conversation
conversation = []

while True:
    # Get user input
    user_input = input("User: ")

    # Add user input to the conversation
    conversation.append("User: " + user_input)

    # Tokenize the conversation
    # inputs = tokenizer.encode("\n".join(conversation), return_tensors="pt")
    inputs = tokenizer.encode_plus("\n".join(conversation), return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Generate a response from the model
    # outputs = model.generate(inputs, max_length=500, num_return_sequences=1)
    outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=512, num_return_sequences=1)

    # Decode the generated response
    # response = tokenizer.decode(outputs[:, inputs.shape[-1]:][0], skip_special_tokens=True)
    response = tokenizer.decode(outputs[:, inputs['input_ids'].shape[-1]:][0], skip_special_tokens=True)

    # Add model's response to the conversation
    conversation.append("Model: " + response)

    # Print the model's response
    print("Model:", response)

User: Hi, how are you doing?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: ?"
              },
              {
                "timestamp": "2/9/23, 9:04 PM",
                "sender": "Boaz Keny",
                "message": "I'm still in cls"
               },
               {
                  "timestamp": "2/9/23, 9:06 PM",
                  "sender": "Boaz Keny",
                 "message": "I'm still in cls"
              },
              {
                 "timestamp": "2/9/23, 9:06 PM",
               "sender": "Boaz Keny",
             "message": "I'm in cls"
        },
           {

               "timestamp": "11/18/22, 4:12/22, 4:13 PM",
            "sender": "message": "Hi honorine": "Hi honorine"
            "message": "are you was with the class": "Hi honorine"
         },
          {
        
User: are you ok?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 512, but `max_length` is set to 512. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Model:  
User: what is this?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 512, but `max_length` is set to 512. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Model:  


KeyboardInterrupt: ignored