# Argumentative Chatbot using pre-trained model
We will use a model from the Hugging Face library.

# Set up our environment

### Setup python environment
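1. Download Anaconda from https://www.anaconda.com/download and install it.
2. Once Anaconda is installed, launch Jupyter Notebook and run the command `%pip install transformers==4.42.4 torch==2.3.1` in a notebook cell to install the libraries. Do not add `tkinter` to this command: it is part of the Python standard library (it ships with Anaconda) and cannot be installed with pip.
3. The required libraries are transformers (4.42.4), torch (2.3.1) and tkinter. The training cell uses `Trainer`, which additionally needs the `accelerate` package to be installed.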

In [2]:
# Import required libraries and Loading a Pre-trained Model

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers.utils import logging
import torch

# Only surface errors from the transformers logger so cell output stays readable
logging.get_logger("transformers").setLevel(logging.ERROR)

# Load pre-trained model and tokenizer
model_name = "gpt2"  # Must be a GPT-2 checkpoint because the GPT2* classes are used; NOTE(review): other sizes such as "gpt2-medium" should work, other architectures will not
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set the padding token to the end-of-sequence token
# (the dataset class in this notebook pads every example to max_length, which needs a pad token)
tokenizer.pad_token = tokenizer.eos_token

## Create a function to test-run the model

In [99]:

# function to generate response
def generate_response(user_input, max_length=250, temperature=0.7, top_p=0.9):
    """Sample a continuation of ``user_input`` from the global ``model``.

    Args:
        user_input: Prompt text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``
            (per the transformers docs this bounds prompt plus generated
            tokens together).
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

    Returns:
        The first generated sequence decoded with special tokens removed.
        The decoded text starts with the prompt itself. Returns an empty
        string when ``user_input`` is empty or only whitespace.
    """
    # An empty prompt encodes to zero tokens, which leaves nothing to
    # condition generation on; answer with an empty string instead.
    if not user_input or not user_input.strip():
        return ""

    # Calling the tokenizer (rather than .encode) also returns the attention
    # mask. It is passed explicitly because pad and EOS share one id here, so
    # the mask cannot be inferred reliably from the input ids.
    encoded = tokenizer(user_input, return_tensors='pt')
    output = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True
    )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Function to remove repetition from response
def remove_repetitions(text):
    """Drop repeated sentences from ``text``, keeping first occurrences in order.

    Sentences are delimited by ``'. '``. Two sentences count as the same when
    they match after stripping surrounding whitespace and trailing periods, so
    a repeated final sentence (which keeps its closing ``.`` after the split)
    is recognised as a duplicate too.

    Args:
        text: The text to clean.

    Returns:
        The text with later duplicates removed. If ``text`` ended with a
        period, the result does as well.
    """
    seen = set()
    unique_sentences = []
    for sentence in text.split('. '):
        # Compare on a normalised key; the original sentence text is kept.
        key = sentence.strip().rstrip('.')
        if key in seen:
            continue
        seen.add(key)
        unique_sentences.append(sentence)

    result = '. '.join(unique_sentences)
    # Dropping a duplicated last sentence also drops the closing period that
    # was attached to it; restore it so the text still ends properly.
    if text.rstrip().endswith('.') and not result.rstrip().endswith('.'):
        result += '.'
    return result

def generate_response_with_post_processing(user_input, max_length=150, temperature=0.7, top_p=0.9):
    """Generate a reply for ``user_input`` and remove repeated sentences from it.

    All arguments are forwarded positionally to ``generate_response``; its
    output is passed through ``remove_repetitions`` and returned.
    """
    return remove_repetitions(
        generate_response(user_input, max_length, temperature, top_p)
    )

# Test with this example. You can try a different question here.
user_input = "What drives the cost of transportation in Europe"
response = generate_response_with_post_processing(user_input)
print(response)


What drives the cost of transportation in Europe is a long-term commitment to reduce emissions, and it can only be done through reductions that are consistent with European commitments," he said.


 (CBC) "If we do not have an ambitious plan for reducing our carbon footprint at home as well... then why should people pay more?"


## Let's Enhance the Argumentative Capabilities of the Model

In [101]:

# Define a custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes raw strings for language-model fine-tuning.

    Every item is a dict with ``input_ids``, ``attention_mask`` and ``labels``,
    each a 1-D tensor of length ``max_length``.
    """

    def __init__(self, tokenizer, texts, max_length=512):
        """Store the tokenizer, the raw texts and the fixed sequence length.

        Args:
            tokenizer: Tokenizer exposing ``encode_plus``. It needs a pad
                token, because items are padded to ``max_length``.
            texts: Indexable collection of strings, one training example each.
            max_length: Length every example is truncated or padded to.
        """
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize ``self.texts[idx]`` and build the training tensors.

        Returns:
            Dict with ``input_ids``, ``attention_mask`` and ``labels``.
            ``labels`` is a copy of ``input_ids`` with padded positions set
            to -100.
        """
        # Encode the text, truncating or padding it to exactly max_length
        inputs = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        # squeeze(0) removes only the batch dimension; a bare squeeze() would
        # also collapse the sequence dimension when max_length == 1.
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)

        # Labels are an unshifted copy of the inputs.
        # NOTE(review): the one-position shift is expected to happen inside
        # the model's loss computation - confirm against the model docs.
        labels = input_ids.clone()
        # Exclude padding from the loss. Padding is found via the attention
        # mask, not the pad id: pad may share its id with EOS, and comparing
        # ids would also hide genuine EOS tokens.
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

# Example dataset. You can use different text here
# (each string is one training example)
texts = [
    "Renewable energy is beneficial because it reduces greenhouse gas emissions.",
    "It’s true that renewables can be intermittent, but energy storage solutions are improving.",
]

# Wrap the raw strings so they are tokenized on access
dataset = CustomDataset(tokenizer, texts)


In [105]:
# Train the custom model
from transformers import Trainer, TrainingArguments

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory for the run
    num_train_epochs=5,              # number of training epochs
    per_device_train_batch_size=4,   # batch size per device (the example dataset has only 2 items)
    save_steps=10_000,               # checkpoint interval in steps; NOTE(review): far above the 5 steps this run takes
    save_total_limit=2,              # cap on the number of checkpoints kept
)

# Initialize Trainer
trainer = Trainer(
    model=model,                     # the global model loaded earlier in the notebook
    args=training_args,
    train_dataset=dataset,           # no evaluation dataset is supplied
)

# Train the model
trainer.train()


{'train_runtime': 52.8917, 'train_samples_per_second': 0.189, 'train_steps_per_second': 0.095, 'train_loss': 1.8350400924682617, 'epoch': 5.0}


TrainOutput(global_step=5, training_loss=1.8350400924682617, metrics={'train_runtime': 52.8917, 'train_samples_per_second': 0.189, 'train_steps_per_second': 0.095, 'train_loss': 1.8350400924682617, 'epoch': 5.0})

In [107]:
# Save the fine-tuned model and tokenizer
# Both are written to the same relative directory, which the later
# from_pretrained("fine-tuned-trained-gpt2") calls read from.
model.save_pretrained("fine-tuned-trained-gpt2")
tokenizer.save_pretrained("fine-tuned-trained-gpt2")

('fine-tuned-trained-gpt2\\tokenizer_config.json',
 'fine-tuned-trained-gpt2\\special_tokens_map.json',
 'fine-tuned-trained-gpt2\\vocab.json',
 'fine-tuned-trained-gpt2\\merges.txt',
 'fine-tuned-trained-gpt2\\added_tokens.json')

## Let's test run the fine-tuned model 

In [111]:
# Load the fine-tuned-trained model and tokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Reload from the local "fine-tuned-trained-gpt2" directory; this rebinds the
# global `model` and `tokenizer` names that generate_response reads.
model = GPT2LMHeadModel.from_pretrained("fine-tuned-trained-gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("fine-tuned-trained-gpt2")

def generate_response(user_input, max_length=150, temperature=0.7, top_p=0.9):
    """Sample a continuation of ``user_input`` from the global fine-tuned ``model``.

    Args:
        user_input: Prompt text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``
            (per the transformers docs this bounds prompt plus generated
            tokens together).
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

    Returns:
        The first generated sequence decoded with special tokens removed.
        The decoded text starts with the prompt itself. Returns an empty
        string when ``user_input`` is empty or only whitespace.
    """
    # An empty prompt encodes to zero tokens, which leaves nothing to
    # condition generation on; answer with an empty string instead.
    if not user_input or not user_input.strip():
        return ""

    # Calling the tokenizer (rather than .encode) also returns the attention
    # mask. It is passed explicitly because the pad id handed to generate is
    # the EOS id, so the mask cannot be inferred reliably from the input ids.
    encoded = tokenizer(user_input, return_tensors='pt')
    output = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True
    )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Example usage (the printed response begins with the prompt itself)
user_input = "The rate of child birth has reduced across Europe?"
response = generate_response(user_input)
print(response)


The rate of child birth has reduced across Europe?
A recent report by the World Health Organization (WHO) found that only 6% of births are safe for children under five years old. This means there is little evidence to suggest a link between high levels in maternal health and low rates among young people living outside their countries – especially as they grow older, according - particularly if these girls have been exposed to diseases such cancer or HIV/AIDS at higher risk than those who remain unvaccinated around them. For more information about developing interventions aimed towards preventing childhood diarrhoea: http://www-childbirthsonlinehealthcareservicesv4n1.europa.-us.html#2


# It's time to create a user interface to interact with this model

In [2]:
import tkinter as tk
from tkinter import scrolledtext
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the fine-tuned model and tokenizer from the local "fine-tuned-trained-gpt2" directory.
# The directory must exist relative to the working directory (it is produced by
# the save_pretrained calls in this notebook); if it is missing, from_pretrained
# raises the OSError shown in this cell's output.
model = GPT2LMHeadModel.from_pretrained("fine-tuned-trained-gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("fine-tuned-trained-gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set pad token

# Function to generate a response from the model
def generate_response(user_input, max_length=250, temperature=0.7, top_p=0.9):
    """Sample a continuation of ``user_input`` from the global ``model``.

    Args:
        user_input: Prompt text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``
            (per the transformers docs this bounds prompt plus generated
            tokens together).
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

    Returns:
        The first generated sequence decoded with special tokens removed.
        The decoded text starts with the prompt itself. Returns an empty
        string when ``user_input`` is empty or only whitespace.
    """
    # An empty prompt encodes to zero tokens, which leaves nothing to
    # condition generation on; answer with an empty string instead.
    if not user_input or not user_input.strip():
        return ""

    # Calling the tokenizer (rather than .encode) also returns the attention
    # mask. It is passed explicitly because pad and EOS share one id here, so
    # the mask cannot be inferred reliably from the input ids.
    encoded = tokenizer(user_input, return_tensors='pt')
    output = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True
    )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

def remove_repetitions(text):
    """Drop repeated sentences from ``text``, keeping first occurrences in order.

    Sentences are delimited by ``'. '``. Two sentences count as the same when
    they match after stripping surrounding whitespace and trailing periods, so
    a repeated final sentence (which keeps its closing ``.`` after the split)
    is recognised as a duplicate too.

    Args:
        text: The text to clean.

    Returns:
        The text with later duplicates removed. If ``text`` ended with a
        period, the result does as well.
    """
    seen = set()
    unique_sentences = []
    for sentence in text.split('. '):
        # Compare on a normalised key; the original sentence text is kept.
        key = sentence.strip().rstrip('.')
        if key in seen:
            continue
        seen.add(key)
        unique_sentences.append(sentence)

    result = '. '.join(unique_sentences)
    # Dropping a duplicated last sentence also drops the closing period that
    # was attached to it; restore it so the text still ends properly.
    if text.rstrip().endswith('.') and not result.rstrip().endswith('.'):
        result += '.'
    return result

def generate_response_with_post_processing(user_input, max_length=150, temperature=0.7, top_p=0.9):
    """Produce a model reply for ``user_input`` with repeated sentences removed.

    The arguments are handed positionally to ``generate_response``, and the
    text it returns is filtered through ``remove_repetitions``.
    """
    generated_text = generate_response(user_input, max_length, temperature, top_p)
    return remove_repetitions(generated_text)


# Function to handle user input and display the response
def _append_to_chat(text):
    """Append ``text`` to the read-only chat display and scroll it into view."""
    # The display is kept disabled so the transcript cannot be edited by
    # hand; it has to be enabled briefly for the programmatic insert.
    chat_display.config(state=tk.NORMAL)
    chat_display.insert(tk.END, text)
    chat_display.config(state=tk.DISABLED)
    chat_display.see(tk.END)


def send_message():
    """Handle the Send button: show the user's text, then the chatbot's reply.

    Reads and clears the input box. Input that is empty or only whitespace is
    ignored. Otherwise the message is appended to the chat display, a reply
    is generated with ``generate_response_with_post_processing`` and appended
    as well.
    """
    user_input = user_input_entry.get("1.0", "end-1c")  # Get the text from the input box
    user_input_entry.delete("1.0", "end")  # Clear the input box
    if not user_input.strip():
        return

    _append_to_chat("User: " + user_input + "\n\n")
    # Generation runs synchronously inside this callback, so flush pending
    # redraws first; otherwise the user's message would only become visible
    # together with the reply.
    chat_display.update_idletasks()

    response = generate_response_with_post_processing(user_input)
    _append_to_chat("Chatbot: " + response + "\n\n")

# Create the main application window
root = tk.Tk()
root.title("Argumentative Chatbot")
root.geometry("670x480")

# Create a display area for the chat conversation
# (created disabled so the transcript is read-only for the user)
chat_display = scrolledtext.ScrolledText(root, height=20, width=80, state=tk.DISABLED, wrap=tk.WORD)
chat_display.grid(column=0, row=0, padx=10, pady=10, columnspan=2)
# NOTE(review): redundant - the widget is already created with state=tk.DISABLED
chat_display.configure(state='disabled')

# Label above the input box
entry_label = tk.Label(root, text="Ask your question:")
entry_label.grid(column=0, row=1, padx=10, pady=10)

# Multi-line input box that send_message reads from and clears
user_input_entry = tk.Text(root, height=2, width=70)
user_input_entry.grid(column=0, row=2, padx=10, pady=10)

# Send button wired to send_message
# NOTE(review): placed in row 3, one row below the input box (row 2) - confirm this layout is intended
send_button = tk.Button(root, text="Send", command=send_message)
send_button.grid(column=1, row=3, padx=10, pady=10)

# Hand control to the Tk event loop
root.mainloop()


OSError: fine-tuned-trained-gpt2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`