In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration, AdamW, BlenderbotSmallTokenizer

# Assuming you have a GPU available, set the device and the entire model in developed using torch and using cuda cores to acclerate the training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [4]:
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_id = 'facebook/blenderbot-400M-distill'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

#model = BlenderbotForConditionalGeneration.from_pretrained("facebook/Blenderbot-90M").to(device)

pipe = pipeline(
    "text2text-generation",
    model=model, 
    tokenizer=tokenizer, 
    max_length=100
)

local_llm = HuggingFacePipeline(pipeline=pipe)

In [5]:
print(local_llm('hi, How are You doing? '))

 I'm doing well, thank you. How are you doing? Do you have any hobbies?


In [6]:
print(local_llm('How was Your day? '))

 It was good. I got to spend time with my family and friends. How was yours?


In [6]:
from langchain.llms import GPT4All, OpenAI
from langchain.prompts import PromptTemplate, prompt
from langchain.chains import LLMChain
from langchain.chains.prompt_selector import ConditionalPromptSelector
from langchain.memory import ConversationBufferWindowMemory


In [7]:
import os
import json

os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'API Key' #paste huggingface api key , if you plan to use hugging face api

In [8]:
print(device)

cuda


In [6]:
#chat_history_path = ""  # Replace with your personal chat file path


In [9]:

# Load your chat data from Instagram in JSON format, P.s. use "/" ehile setting up path
with open("paste/your/instagram/whatsapp/fb or any other chat history, json or html format", "r", encoding="utf-8") as file:
    chat_data = json.load(file)


In [8]:
#converting to much better format.

In [24]:
import json
from datetime import datetime

# Load the JSON data
with open("", "r", encoding="utf-8") as file:
    chat_data = json.load(file)

# Extract relevant information and create a list of messages- and edit it out accordingly
cleaned_data = []
for message in chat_data["messages"]:
    cleaned_message = {
        "sender_name-message": message["sender_name"] + " : "+ message.get("content", ""),
        "timestamp": datetime.fromtimestamp(message["timestamp_ms"] / 1000.0).strftime('%Y-%m-%d %H:%M:%S'),
        "reactions": [{"reaction": reaction["reaction"], "actor": reaction["actor"]} for reaction in message.get("reactions", [])],
        "attachment_link": message.get("share", {}).get("link", ""),
        "attachment_text": message.get("share", {}).get("share_text", "")
    }
    cleaned_data.append(cleaned_message)


In [36]:
#print(cleaned_data[3])

In [26]:
cleaned_data = [message for message in cleaned_data if all(value is not None for value in message.values())]

In [27]:
# Replace missing content with an empty string
for message in cleaned_data:
    message["sender_name-message"] = message["sender_name-message"] if message["sender_name-message"] is not None else ""


In [None]:
#test the cleaned data

In [2]:
#print(cleaned_data[9])

In [11]:
# Tokenize the data using the Blenderbot tokenizer
tokenizer = BlenderbotTokenizer.from_pretrained("facebook/Blenderbot-400M-distill")
tokenized_data = tokenizer([item["content"] for item in cleaned_data], return_tensors="pt", padding=True,truncation=True)


In [12]:

# Create a PyTorch dataset
class ChatDataset(Dataset):
    def __init__(self, tokenized_data):
        self.tokenized_data = tokenized_data

    def __len__(self):
        return len(self.tokenized_data["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.tokenized_data["input_ids"][idx],
            "attention_mask": self.tokenized_data["attention_mask"][idx]
        }

dataset = ChatDataset(tokenized_data)


In [13]:

# Create a PyTorch data loader
batch_size = 16 # Adjust as needed based on the cuda cores and type of model to be used.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [14]:
# Instantiate the Blenderbot model
model = BlenderbotForConditionalGeneration.from_pretrained("facebook/Blenderbot-400M-distill").to(device)


In [15]:
# Setting up optimizer and learning rate scheduler, edit out after tweaking these hyper parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3)
num_epochs = 3  # Adjust as needed


In [23]:
#sometimes you can face the cuda cores memory exception

torch.cuda.empty_cache()


In [16]:

# Training loop
for epoch in range(num_epochs):
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)

        # Backward pass and optimization
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Print loss for monitoring
        print(f"Epoch {epoch + 1}, Batch loss: {loss.item()}")


Epoch 1, Batch loss: 15.866052627563477
Epoch 1, Batch loss: 13.479949951171875
Epoch 1, Batch loss: 11.486916542053223
Epoch 1, Batch loss: 4.176311016082764
Epoch 1, Batch loss: 1.9785226583480835
Epoch 1, Batch loss: 1.8192580938339233
Epoch 1, Batch loss: 2.043797492980957
Epoch 1, Batch loss: 2.2424068450927734
Epoch 1, Batch loss: 1.8981144428253174
Epoch 1, Batch loss: 2.4779646396636963
Epoch 1, Batch loss: 1.8849140405654907
Epoch 1, Batch loss: 2.7129769325256348
Epoch 1, Batch loss: 2.4681954383850098
Epoch 1, Batch loss: 2.7810184955596924
Epoch 1, Batch loss: 3.317988157272339
Epoch 1, Batch loss: 2.323437213897705
Epoch 1, Batch loss: 2.090315341949463
Epoch 1, Batch loss: 3.308663845062256
Epoch 1, Batch loss: 1.996038794517517
Epoch 1, Batch loss: 2.4007747173309326
Epoch 1, Batch loss: 1.9496232271194458
Epoch 1, Batch loss: 2.585291862487793
Epoch 1, Batch loss: 2.1342251300811768
Epoch 1, Batch loss: 2.088373899459839
Epoch 1, Batch loss: 2.160135507583618
Epoch 1, B

In [17]:
# Save the trained model so you can load the model again
model.save_pretrained("blender_11st_m")


In [None]:
# Save the trained model to the specified directory
#blenderbot_model.save_pretrained("blender_1st_m")
#blenderbot_tokenizer.save_pretrained("blender_1st_m")

In [44]:
from transformers import BlenderbotForConditionalGeneration, BlenderbotTokenizer

# Load the saved model from the directory
loaded_blenderbot_model = BlenderbotForConditionalGeneration.from_pretrained("blender_11st_m")
#loaded_blenderbot_tokenizer = BlenderbotTokenizer.from_pretrained("blender_1st_m")


In [45]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
loaded_blenderbot_model = loaded_blenderbot_model.to(device)
#input_data = input_data.to(device)  

In [49]:

# Generate chat output; after training my own chat data, i tries to give response based on the chat data i fed the model. 
user_input = " how was your day "
input_ids = tokenizer(user_input, return_tensors="pt")["input_ids"].to(device)
output_ids = loaded_blenderbot_model.generate(input_ids, max_length=40, num_beams=5, no_repeat_ngram_size=2, top_k=50)

chat_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(f"Model Output: {chat_output}")


Model Output:  Iâs a bit a lot to your message !!!!


In [None]:
#improvements to be made, the data should be cleaned again, and need train with other datasets, as my text data contained multilingual language, and if a largere blenderbot model or t5 model can be used for better text outcomes.