### Chat-Bot

In [2]:
import csv

import re
from datetime import datetime
conversations = []
def preprocess_chat_data(file_path):
    
    current_conversation = []
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                # Extract the timestamp, sender, and message
                timestamp_end = line.find(" - ")
                if timestamp_end != -1:
                    timestamp = line[:timestamp_end]
                    sender_message = line[timestamp_end + 3:]
                    
                    # Extract day, date, time, am/pm from the timestamp
                    timestamp_parts = timestamp.split(", ")
                    day_date = timestamp_parts[0]
                    time = timestamp_parts[1].strip()
                    am_pm = time[-2:].lower()
                    time = time[:-2].strip()
                    
                    # Parse the day, date, time into a datetime object
                    timestamp_obj = datetime.strptime(day_date + " " + time + " " + am_pm, "%d/%m/%y %I:%M %p")
                    
                    # Extract the sender and message
                    sender_end = sender_message.find(": ")
                    if sender_end != -1:
                        sender = sender_message[:sender_end]
                        message = sender_message[sender_end + 2:]
                        current_conversation.append((timestamp_obj, sender, message))
                elif current_conversation:
                    conversations.append(current_conversation)
                    current_conversation = []

        if current_conversation:
            conversations.append(current_conversation)

    except Exception as e:
        print("An error occurred during chat data processing:", e)

    return conversations


conversations = preprocess_chat_data("chat_stat_Harsh.txt")


In [3]:
import datetime
converted_data = []

for item in conversations:
    converted_item = [str(elem) for elem in item]
    converted_data.append(converted_item)

print(converted_data)

[["(datetime.datetime(2022, 8, 25, 14, 43), 'Harsh Cer', '<Media omitted>\\n')", "(datetime.datetime(2022, 9, 2, 0, 11), 'Utkarsh Kumar Sahu', '<Media omitted>\\n')", "(datetime.datetime(2022, 9, 2, 0, 11), 'Utkarsh Kumar Sahu', 'Animxplay\\n')", "(datetime.datetime(2022, 9, 5, 18, 57), 'Harsh Cer', 'I am Utkarsh Kumar Sahu, a third year undergrad at IIT BHU, Varanasi. I have a deep interest in the field of data science and am well versed with the classic machine learning algorithms, along with a strong first-hand understanding of SQL and Python.\\n')", "(datetime.datetime(2022, 9, 8, 13, 42), 'Utkarsh Kumar Sahu', 'https://edube.org/study\\n')", "(datetime.datetime(2022, 9, 11, 11, 59), 'Utkarsh Kumar Sahu', 'Bhai aaja room pe\\n')", "(datetime.datetime(2022, 9, 11, 13, 22), 'Utkarsh Kumar Sahu', '<Media omitted>\\n')", "(datetime.datetime(2022, 9, 17, 12, 20), 'Utkarsh Kumar Sahu', 'https://youtu.be/3JiqARgUZVE\\n')", "(datetime.datetime(2022, 9, 17, 19, 48), 'Utkarsh Kumar Sahu', 'B

In [4]:
import torch
from torch.utils.data import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

# Define the conversation dataset class
class ConversationDataset(Dataset):
    def __init__(self, conversations, tokenizer, max_length=512):
        self.conversations = conversations
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.conversations)
    
    def __getitem__(self, idx):
        conversation = self.conversations[idx]
        encoded_inputs = self.tokenizer.encode_plus(
            conversation,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length'
        )
        
        input_ids = encoded_inputs['input_ids']
        attention_mask = encoded_inputs['attention_mask']
        
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long)
        }

# Example conversation list
# conversations = [
#     "11/05/23, 7:40 pm - Utkarsh Kumar Sahu: Poore do ghante lag gaye banane me",
#     "12/05/23, 8:01 am - Harsh Cer: Noobs",
#     "12/05/23, 8:02 am - Utkarsh Kumar Sahu: Haha",
#      .....
# ]

# Load the pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add a padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Create the conversation dataset
dataset = ConversationDataset(converted_data, tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir='./chat_model',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
)

# Define a custom Trainer class
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs):
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        labels = input_ids.clone()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        return loss

# Create a CustomTrainer instance
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=lambda data: {'input_ids': torch.stack([item['input_ids'] for item in data]),
                               'attention_mask': torch.stack([item['attention_mask'] for item in data])}
)

# Train the model
trainer.train()

# Save the model
trainer.save_model('./chat_model')
tokenizer.save_pretrained("./chat_model")


***** Running training *****
  Num examples = 1
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1
  Number of trainable parameters = 124439808


  0%|          | 0/1 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./chat_model
Configuration saved in ./chat_model\config.json


{'train_runtime': 26.074, 'train_samples_per_second': 0.038, 'train_steps_per_second': 0.038, 'train_loss': 10.616629600524902, 'epoch': 1.0}


Model weights saved in ./chat_model\pytorch_model.bin
tokenizer config file saved in ./chat_model\tokenizer_config.json
Special tokens file saved in ./chat_model\special_tokens_map.json
added tokens file saved in ./chat_model\added_tokens.json


('./chat_model\\tokenizer_config.json',
 './chat_model\\special_tokens_map.json',
 './chat_model\\vocab.json',
 './chat_model\\merges.txt',
 './chat_model\\added_tokens.json')

In [5]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the trained model and tokenizer
model_path = "./chat_model"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)


# Set the device to use (e.g., "cuda" for GPU or "cpu" for CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Disable the pad_token_id and attention_mask warning
model.config.pad_token_id = model.config.eos_token_id
model.config.use_cache = False

# Function to generate chat responses
def generate_response(input_text, max_length=100):
    # Tokenize the input text
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Generate text based on the input
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1)

    # Decode the generated output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return generated_text

# Example usage
user_input = "Hello, how are you?"
response = generate_response(user_input)
print(response)


loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
Adding [PAD] to the vocabulary
loading configuration file ./chat_model\config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
   

Hello, how are you?

I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm


In [7]:
# Start the conversation
print("Chatbot: Hello! How can I assist you today?")

while True:
    try:
        user_input = input("User: ")
        if user_input == "exit":
            print("Exiting chat bot...")
            break

        # Process user input and generate a response
        response = generate_response(user_input)
        print("Chatbot:", response)

    except KeyboardInterrupt:
        print("\nExiting chat bot...")
        break

Chatbot: Hello! How can I assist you today?
Chatbot: I am fine how are you?"

"I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am fine. I am


IndexError: index -1 is out of bounds for dimension 1 with size 0