<a href="https://colab.research.google.com/github/waqarmm/Intelligent_chatbot_NLP/blob/main/NLP_MODULE_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
from transformers import T5ForConditionalGeneration, T5Tokenizer, TrainingArguments, Trainer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import torch

# Numeric ids (starting at 1) for the sentiment labels that generate_response
# looks up when it builds its prompt.
# NOTE(review): "Disguised" looks like a typo for "Disgusted", and "positive"
# is not capitalised like the other labels -- confirm against the dataset
# before changing them, since these strings are matched at runtime.
sentiment_mapping = {
    label: label_id
    for label_id, label in enumerate(
        (
            "positive",
            "Curious to dive deeper",
            "Disguised",
            "Fearful",
            "Happy",
            "Sad",
            "Surprised",
        ),
        start=1,
    )
}


# Load the raw conversations. JSON text is UTF-8, so the encoding is stated
# explicitly instead of depending on the platform's default locale.
with open('/content/drive/MyDrive/nlp/train-2-2.json', 'r', encoding='utf-8') as json_file:
    dataset = json.load(json_file)


# The file is a JSON object; only its values (one entry per conversation)
# are used from here on.
dataset_list = list(dataset.values())


# Pretrained checkpoint shared by the model and its tokenizer.
model_name = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)


# VADER analyzer used by generate_response to score the user's text.
analyzer = SentimentIntensityAnalyzer()

# Training configuration handed to the Trainer.
training_args = TrainingArguments(
    output_dir="./english_only_model",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=1000,
    save_total_limit=2,
    # The examples are plain dicts whose "decoder_input_ids" key is read by
    # the custom compute_loss, so no key may be dropped from the batches.
    remove_unused_columns=False,
    report_to="tensorboard",
)


class ChatbotDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding (conversation context, final reply) token pairs.

    Each element of ``data`` must be a dict whose ``'content'`` entry is a
    list of message dicts, each carrying a ``'message'`` value.  The last
    message of a conversation is the target; every earlier message forms the
    model input.

    Args:
        data: Indexable collection of conversation dicts.
        tokenizer: Object exposing a Hugging Face style ``encode`` method.
        max_seq_length: Length every sequence is truncated/padded to.
    """

    def __init__(self, data, tokenizer, max_seq_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __len__(self):
        """Return the number of conversations."""
        return len(self.data)

    def _encode(self, text, add_special_tokens):
        """Tokenise ``text`` to a fixed-length row (batch dimension removed)."""
        encoded = self.tokenizer.encode(text,
                                        add_special_tokens=add_special_tokens,
                                        max_length=self.max_seq_length,
                                        truncation=True,
                                        padding='max_length',
                                        return_tensors='pt')
        return encoded[0]

    def __getitem__(self, idx):
        """Return the encoded context and reply of conversation ``idx``.

        Returns:
            dict with ``"input_ids"`` (context) and ``"decoder_input_ids"``
            (reply), each of length ``max_seq_length``.

        Raises:
            IndexError: if ``idx`` is out of range or the conversation has no
                messages.
        """
        messages = self.data[idx]['content']

        # The reply is the last message of the conversation.
        target_text = messages[-1]['message']

        # Build the context from everything *before* the reply.  Including the
        # reply in the input (as was done previously) leaks the answer to the
        # encoder and lets the model learn to copy instead of respond.  A
        # single-message conversation therefore has an empty context.
        input_text = ' '.join(str(msg['message']) for msg in messages[:-1])

        return {
            "input_ids": self._encode(input_text, add_special_tokens=True),
            "decoder_input_ids": self._encode(target_text, add_special_tokens=False),
        }


# Sequential hold-out split: the leading share of the conversations is used
# for training and the remainder for validation.
split_ratio = 0.9  # share of examples assigned to the training split
split_index = int(split_ratio * len(dataset_list))
train_dataset, val_dataset = dataset_list[:split_index], dataset_list[split_index:]


def tokenize_with_language(dataset, tokenizer, src_lang, tgt_lang, max_seq_length=128):
    """Tokenise conversations into (dialogue text, sentiment tag) pairs.

    For every conversation the messages are joined with spaces to form the
    input, and the target is the string ``"[SENTIMENT: <label>]"`` built from
    the ``'sentiment'`` field of the last message.

    Args:
        dataset: Iterable of dicts whose ``'content'`` entry is a list of
            message dicts with ``'message'`` and ``'sentiment'`` keys.
        tokenizer: Object exposing a Hugging Face style ``encode`` method.
        src_lang: Unused; kept so existing call sites keep working.
        tgt_lang: Unused; kept so existing call sites keep working.
        max_seq_length: Length every sequence is truncated/padded to.

    Returns:
        A list with one dict per conversation holding ``"input_ids"`` and
        ``"decoder_input_ids"`` (batch dimension removed).

    Raises:
        IndexError: if a conversation has no messages.
    """
    def encode(text, add_special_tokens):
        # No ``tgt_lang`` keyword here: it is not an ``encode`` option of the
        # T5 tokenizer used in this file (the slow tokenizer only logs it as
        # unrecognised), and a tokenizer with a strict signature rejects it.
        encoded = tokenizer.encode(
            text,
            add_special_tokens=add_special_tokens,
            max_length=max_seq_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )
        return encoded[0]

    tokenized_data = []
    for item in dataset:
        messages = item['content']
        input_text = ' '.join(str(msg['message']) for msg in messages)
        sentiment = messages[-1]['sentiment']

        input_ids = encode(input_text, add_special_tokens=True)
        target_ids = encode(f"[SENTIMENT: {sentiment}]", add_special_tokens=False)

        tokenized_data.append({
            "input_ids": input_ids,
            "decoder_input_ids": target_ids,
        })

    return tokenized_data

# Tokenise the training split first, then the validation split, with the
# same settings.
tokenized_train_dataset, tokenized_val_dataset = (
    tokenize_with_language(split, tokenizer, src_lang="en", tgt_lang="en")
    for split in (train_dataset, val_dataset)
)


# Trainer over the pre-tokenised example lists; no custom collator is given.
trainer = Trainer(
    model,
    training_args,
    data_collator=None,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)


def compute_loss(model, inputs, return_outputs=False, **kwargs):
    """Compute the sequence-to-sequence training loss for one batch.

    Args:
        model: Callable model returning an object with a ``loss`` attribute.
        inputs: Batch dict with ``"input_ids"`` and ``"decoder_input_ids"``
            tensors; the latter holds the (padded) target token ids.
        return_outputs: When true, return ``(loss, outputs)`` instead of only
            the loss.
        **kwargs: Ignored.  Accepted so that extra keyword arguments passed
            by the caller (e.g. ``num_items_in_batch``) do not raise.

    Returns:
        The loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is set.
    """
    pad_id = tokenizer.pad_token_id
    target_ids = inputs["decoder_input_ids"]

    # Padding positions must not contribute to the loss: -100 is the index
    # ignored by the model's cross-entropy.  masked_fill returns a new
    # tensor, so the batch itself is left untouched.
    labels = target_ids.masked_fill(target_ids == pad_id, -100)

    # Only ``labels`` is passed.  Feeding the same tensor as
    # ``decoder_input_ids`` (as before) shows the decoder, at every step, the
    # very token it is asked to predict.  With ``decoder_input_ids`` omitted,
    # T5 derives them by shifting the labels one position to the right
    # (documented Transformers behaviour).
    outputs = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["input_ids"].ne(pad_id),
        labels=labels,
    )
    loss = outputs.loss
    return (loss, outputs) if return_outputs else loss

# Replace the trainer's loss computation with the module-level compute_loss.
# It is assigned on the instance, so it is called as compute_loss(model, inputs)
# without a bound ``self``.
trainer.compute_loss = compute_loss


# Run the fine-tuning loop configured by training_args.
trainer.train()


# Persist the fine-tuned model (no path given; presumably written to
# training_args.output_dir -- confirm), then move the model to the CPU.
trainer.save_model()
model.to('cpu')


def generate_response(input_text):
    """Generate a reply to ``input_text``, conditioned on its VADER sentiment.

    The compound VADER score selects a sentiment label, the label is mapped
    through ``sentiment_mapping`` (falling back to ``"Neutral"``), and the
    pair ``("User: <text>", "[SENTIMENT: <mapped>]")`` is encoded and passed
    to ``model.generate``.

    Args:
        input_text: The user's message.

    Returns:
        The decoded model output with special tokens removed.
    """
    compound_score = analyzer.polarity_scores(input_text)['compound']
    if compound_score >= 0.05:
        # Must match a sentiment_mapping key exactly.  The previous spelling
        # "happy" never matched "Happy", so positive inputs always fell back
        # to the "Neutral" default.
        sentiment_label = "Happy"
    elif compound_score <= -0.05:
        # NOTE(review): negative text is labelled "Curious to dive deeper";
        # kept as written -- confirm this is the intended label.
        sentiment_label = "Curious to dive deeper"
    else:
        sentiment_label = "neutral"  # not a mapping key -> "Neutral" default

    sentiment_label_mapped = sentiment_mapping.get(sentiment_label, "Neutral")

    user_input = f"User: {input_text}"
    sentiment_text = f"[SENTIMENT: {sentiment_label_mapped}]"
    # ``tgt_lang`` is not passed: it is not an option of the T5 tokenizer.
    input_ids = tokenizer.encode(user_input, sentiment_text,
                                 return_tensors="pt", max_length=128,
                                 truncation=True, padding=True)

    # ``tgt_lang`` is not passed here either: it is not a T5 generation
    # argument, and recent Transformers releases are expected to reject
    # unused keyword arguments in generate().
    # ``do_sample=True`` is required for top_k / top_p / temperature to have
    # any effect; without it decoding is greedy and they are ignored.
    response_ids = model.generate(
        input_ids=input_ids,
        max_length=128,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
    )

    return tokenizer.decode(response_ids[0], skip_special_tokens=True)

# Smoke test: generate a reply for one sample utterance.
# ``user_input`` is a module-level name.
user_input = "I'm feeling great today!"
bot_response = generate_response(user_input)

# Show the generated reply.
print("Bot Response:", bot_response)

In [None]:

# Move the model to the CPU before generating: generate_response below passes
# the tokenizer's output straight to model.generate without moving it to
# another device.
model.to('cpu')

# Function for generating responses
# Function for generating responses
def generate_response(input_text):
    """Generate a reply to ``input_text`` from the raw text alone.

    Sentiment conditioning is disabled in this variant: the prompt is the
    user's text as-is, with no "[SENTIMENT: ...]" suffix.  The VADER scoring
    that used to precede tokenisation fed nothing into the prompt and has
    been removed.

    Args:
        input_text: The user's message.

    Returns:
        The decoded model output with special tokens removed.
    """
    # Encode the function argument.  The previous code encoded the
    # module-level ``user_input`` instead, so the parameter was ignored and
    # the result depended on whatever that global held at call time.
    input_ids = tokenizer.encode(input_text,
                                 return_tensors="pt", max_length=128,
                                 truncation=True, padding=True)

    # ``do_sample=True`` is required for top_k / top_p / temperature to have
    # any effect; without it decoding is greedy and they are ignored.
    response_ids = model.generate(
        input_ids=input_ids,
        max_length=128,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
    )

    return tokenizer.decode(response_ids[0], skip_special_tokens=True)




# Sample prompts to try:
#   By the way, do you like Fish
#   Did you know that a seahorse is the only fish to have a neck
#   What about cats, do you like cats? I'm a dog fan myself.
#   Have a good day.
#   Did you know Bruce Lee was a cha cha dancer?
# NOTE: generate_response above reads the module-level ``user_input``, so this
# assignment determines the text that is actually tokenised.
user_input = "By the way, do you like Fish"
bot_response = generate_response(user_input)


# Show the generated reply.
print("Bot Response:", bot_response)