In [63]:
import json
import pandas as pd

# Load the dataset
with open('nlas-multi2.json', 'r', encoding='latin-1') as file:
    data = json.load(file)

# Convert to a DataFrame for easier manipulation
df = pd.DataFrame(data['eng']).T

# Display the DataFrame to inspect it
df.head()


Unnamed: 0,topic,stance,argumentation scheme,argument,label
0,Euthanasia,in favor,position to know,"{\n ""major premise"": ""Medical professionals a...",yes
1,Mandatory vaccination in pandemic,against,expert opinion,"{\n ""major premise"": ""Dr. John Smith is an ex...",yes
2,Physical appearance for personal success,against,expert opinion,"{\n ""major premise"": ""Dr. John Smith is an ex...",yes
3,Intermittent fasting,in favor,expert opinion,"{\n ""major premise"": ""Dr. Jack Kevorkian is a...",yes
4,Capital punishment,against,expert opinion,"{\n ""major premise"": ""Dr. John Smith is an ex...",yes


In [79]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification

# Load GPT-2 tokenizer and set padding token to the eos_token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Load GPT-2 model for sequence classification
model = GPT2ForSequenceClassification.from_pretrained("gpt2")

# Make sure the model knows the padding token ID
model.config.pad_token_id = tokenizer.pad_token_id


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [83]:
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
import json

# Custom Dataset Class
class CustomDataset(Dataset):
    def __init__(self, json_file, tokenizer, max_length=512):
        with open(json_file, 'r', encoding='latin-1') as f:
            self.data = json.load(f)
        
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []

        for key in self.data['eng']:
            item = self.data['eng'][key]
            topic = item['topic']
            stance = item['stance']
            scheme = item['argumentation scheme']
            argument = item['argument']
            label = item['label']

            text = f"Topic: {topic}\nStance: {stance}\nScheme: {scheme}\nArgument: {argument}\n"
            self.samples.append((text, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        text, label = self.samples[idx]

        # Tokenize the input text with padding
        inputs = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',  # Ensure padding to max_length
            truncation=True,
            return_tensors='pt'
        )

        label = torch.tensor(1 if label == "yes" else 0, dtype=torch.long)

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': label
        }

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Load the dataset
dataset = CustomDataset('nlas-multi2.json', tokenizer)

# Split the dataset into training and evaluation sets
train_size = int(0.8 * len(dataset))
eval_size = len(dataset) - train_size
train_dataset, eval_dataset = random_split(dataset, [train_size, eval_size])

# Load GPT-2 model for sequence classification
model = GPT2ForSequenceClassification.from_pretrained("gpt2")
model.config.pad_token_id = tokenizer.pad_token_id


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [67]:
from transformers import EarlyStoppingCallback

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,  # Set a high number of epochs; early stopping will determine when to stop
    per_device_train_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="steps",  # Evaluate at intervals
    eval_steps=100,               # Evaluate every 100 steps
    save_total_limit=1,           # Save only the best model
    load_best_model_at_end=True   # Load the best model when training ends
)

# Initialize the Trainer with early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Stop after 3 non-improving evals
)

# Optionally, save the trained model
model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')


('./trained_model\\tokenizer_config.json',
 './trained_model\\special_tokens_map.json',
 './trained_model\\vocab.json',
 './trained_model\\merges.txt',
 './trained_model\\added_tokens.json')

In [69]:
import torch

def generate_response(model, tokenizer, text, max_length=512):
    # Tokenize the input text
    inputs = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Ensure the model is in evaluation mode
    model.eval()

    # Move tensors to the correct device (CPU or GPU)
    input_ids = inputs['input_ids'].to(model.device)
    attention_mask = inputs['attention_mask'].to(model.device)

    # Generate predictions
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    # Get the predicted label (0 or 1)
    predicted_label = torch.argmax(logits, dim=-1).item()

    # Determine the type of input (statement or question)
    if text.strip().endswith('?'):
        # It's a question
        if "why" in text.lower():
            response = (
                f"The reason behind this is multifaceted. Generally, when discussing topics like '{text.strip('?').lower()}', "
                f"several factors come into play. It could be influenced by various circumstances, including historical context, current trends, or technological developments."
            )
        elif "how" in text.lower():
            response = (
                f"The process or impact of '{text.strip('?').lower()}' can vary depending on several factors. "
                f"It involves understanding the mechanisms behind it, which may include policy implications, societal influences, and technological advancements."
            )
        else:
            response = (
                f"That's an interesting question. When we consider '{text.strip('?').lower()}', "
                f"it opens up a discussion on multiple dimensions, including social, economic, and environmental aspects."
            )
    else:
        # It's a statement (argument)
        if predicted_label == 1:
            response = (
                f"The argument you've presented seems valid. Given the context, it's evident that '{text}' is a point of discussion that holds merit. "
                f"However, it's important to consider all facets of the topic to fully understand its implications."
            )
        else:
            response = (
                f"While your argument touches on an important issue, it might need more context. Although '{text}', "
                f"there are several factors that suggest the impact or relevance might not be as significant as it appears. "
                f"Further analysis or evidence may be required to fully substantiate the argument."
            )

    return response

# Example usage:
# Load the trained model and tokenizer
model = GPT2ForSequenceClassification.from_pretrained('./trained_model')
tokenizer = GPT2Tokenizer.from_pretrained('./trained_model')

# Example input texts
input_text_1 = "Artificial intelligence is transforming industries across the globe."
input_text_2 = "Why is climate change a pressing issue?"
input_text_3 = "How do vaccines work?"

# Generate responses
response_1 = generate_response(model, tokenizer, input_text_1)
response_2 = generate_response(model, tokenizer, input_text_2)
response_3 = generate_response(model, tokenizer, input_text_3)

print(response_1)
print("-------------------------------------------------------------------------")
print(response_2)
print("-------------------------------------------------------------------------")
print(response_3)


The argument you've presented seems valid. Given the context, it's evident that 'Artificial intelligence is transforming industries across the globe.' is a point of discussion that holds merit. However, it's important to consider all facets of the topic to fully understand its implications.
-------------------------------------------------------------------------
The reason behind this is multifaceted. Generally, when discussing topics like 'why is climate change a pressing issue', several factors come into play. It could be influenced by various circumstances, including historical context, current trends, or technological developments.
-------------------------------------------------------------------------
The process or impact of 'how do vaccines work' can vary depending on several factors. It involves understanding the mechanisms behind it, which may include policy implications, societal influences, and technological advancements.


# Example 2

In [77]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load a pretrained NER model
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained("bert-base-cased")

# Create an NER pipeline
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

def extract_entities(text):
    ner_results = nlp(text)
    entities = {"topic": [], "instance": []}

    # Iterate through the NER results
    for entity in ner_results:
        # Check for the key 'entity' instead of 'entity_group'
        entity_type = entity['entity']
        
        # Example entity labels for topic and instance (these might vary based on your model)
        if "TOPIC" in entity_type.upper():
            entities['topic'].append(entity['word'])
        elif "INSTANCE" in entity_type.upper():
            entities['instance'].append(entity['word'])
            
    return entities

# Test the function with an example text
text = "What are the arguments related to climate change?"
entities = extract_entities(text)
print(entities)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'topic': [], 'instance': []}


In [103]:
import json
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, random_split
import torch

# Load the JSON data
with open('nlas-multi2.json', 'r', encoding='latin-1') as f:
    data = json.load(f)

class ArgumentationDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.inputs.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def preprocess_data(data):
    texts = []
    labels = []
    for key, item in data.items():
        argument = item['argument']
        label = item['label']
        
        # Ensure that the argument is a string
        if isinstance(argument, str):
            texts.append(argument)
            labels.append(1 if label == 'yes' else 0)

    # Tokenize all texts at once
    inputs = tokenizer(texts, padding='max_length', truncation=True, return_tensors="pt")
    
    return inputs, labels

# Assuming `data` is already loaded from the JSON file
inputs, labels = preprocess_data(data['eng'])

# Create the datasets
dataset = ArgumentationDataset(inputs, labels)
# eval_dataset = ArgumentationDataset(inputs, labels)  # In a real case, you should split the data

# Split the dataset into training and evaluation sets
train_size = int(0.8 * len(dataset))
eval_size = len(dataset) - train_size
train_dataset, eval_dataset = random_split(dataset, [train_size, eval_size])


In [105]:

from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Initialize TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="steps",
    # load_best_model_at_end = True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Optional callbacks
)

# Start fine-tuning
trainer.train()


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [93]:
# Optionally, save the trained model
model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')

('./trained_model\\tokenizer_config.json',
 './trained_model\\special_tokens_map.json',
 './trained_model\\vocab.txt',
 './trained_model\\added_tokens.json')

In [2]:
from transformers import BertTokenizer
import random 

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

def preprocess_user_input(user_input):
    # Use a larger max_length if possible, or dynamic padding
    inputs = tokenizer(user_input, padding='max_length', max_length=128, truncation=True, return_tensors="pt")
    return inputs


def predict_argument(model, inputs):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=-1).item()
    return predicted_class_id

def get_argument_from_label(predicted_class_id, dataset):
    arguments = []
    for key, item in dataset.items():
        label = item['label']
        if (label == 'yes' and predicted_class_id == 1) or (label == 'no' and predicted_class_id == 0):
            arguments.append(item['argument'])
    
    if arguments:
        # Randomly select an argument to add variety
        return random.choice(arguments)
    else:
        return "I couldn't find a specific argument for that question."


def chatbot_response(user_input, model, dataset):
    # Preprocess the user's input
    inputs = preprocess_user_input(user_input)
    
    # Get the predicted argument ID
    predicted_class_id = predict_argument(model, inputs)
    
    # Get the corresponding argument from the dataset
    response = get_argument_from_label(predicted_class_id, dataset)
    
    return response

# Example questions to test different topics
user_questions = [
    "Should we implement stricter gun control laws?",
    "Is climate change a real threat?",
    "Should taxes increased be on the wealthy?",
    "Do video games cause violence?"
]

for question in user_questions:
    response = chatbot_response(question, model, data['eng'])
    print(f"User: {question}")
    print(f"Chatbot: {response}\n")



NameError: name 'model' is not defined

# Alternative 3

In [70]:
from transformers import pipeline
import json

# Load the NER pipeline from Hugging Face
# ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
# ner_pipeline = pipeline("ner", model="dslim/bert-base-NER")
# ner_pipeline = pipeline("ner", model="Davlan/xlm-roberta-large-ner-english")
ner_pipeline = pipeline("ner", model="bert-base-multilingual-cased")

# Load NLAS data
with open('nlas-multi2.json', 'r') as file:
    nlas_data = json.load(file)

# Extract all topics from NLAS for keyword matching    
nlas_topics = [item['topic'].lower().strip() for item in nlas_data["eng"].values()]

def extract_entities(text):
    """Extract entities from user input using NER and keyword matching."""
    entities = ner_pipeline(text)
    print("NER Output:", entities)  # Debugging line
    
    # Extract topics from NER output
    topics = [entity['word'].lower().strip() for entity in entities if entity['entity'] == 'I-MISC' or entity['entity'] == 'I-ORG']

    # If NER fails, fall back to keyword matching
    if not topics:
        topics = [topic for topic in nlas_topics if topic in text.lower()]
    
    return topics

def find_argument(topic, stance=None):
    """Find an argument in the NLAS data based on topic and optional stance."""
    for item in nlas_data["eng"].values():
        print(f"Checking topic: {item['topic'].lower().strip()} against {topic}")  # Debugging line
        if item['topic'].lower().strip() == topic:
            if stance is None or item['stance'].lower().strip() == stance.lower().strip():
                return item['argument']
    return None

def get_argument_for_user_input(user_input):
    # Step 1: Extract topic from user input using NER and keyword matching
    topics = extract_entities(user_input)
    if not topics:
        return "I'm sorry, I couldn't find any relevant topics in your query."

    # Assuming the first detected topic is the most relevant
    topic = topics[0]
    
    # Optional: Infer stance based on keywords
    stance = None
    if any(keyword in user_input.lower() for keyword in ['support', 'favor', 'pro', 'agree']):
        stance = 'in favor'
    elif any(keyword in user_input.lower() for keyword in ['against', 'oppose', 'con', 'disagree']):
        stance = 'against'

    # Step 2: Retrieve argument from NLAS database
    argument = find_argument(topic, stance)

    if argument:
        return argument
    else:
        return f"Sorry, I couldn't find an argument on the topic '{topic}' in the database."

def chatbot_response(user_input):
    argument = get_argument_for_user_input(user_input)
    return f"Here's what I found: {argument}"

# Example user interaction
user_input = "Is climate change a real threat?"
response = chatbot_response(user_input)
print(response)


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NER Output: []
Checking topic: euthanasia against climate change
Checking topic: mandatory vaccination in pandemic against climate change
Checking topic: physical appearance for personal success against climate change
Checking topic: intermittent fasting against climate change
Checking topic: capital punishment against climate change
Checking topic: animal testing against climate change
Checking topic: climate change against climate change
Here's what I found: {
  "major premise": "Dr. John Smith is an expert in medical ethics containing proposition that climate change undermines the value of human life.",
  "minor premise": "Dr. Smith asserts that climate change undermines the value of human life.",
  "conclusion": "Climate change undermines the value of human life."
}


In [74]:
import tkinter as tk
from tkinter import scrolledtext
import torch

# Function to handle user input and display the response
def send_message():
    user_input = user_input_entry.get("1.0", "end-1c")  # Get the text from the input box
    user_input_entry.delete("1.0", "end")  # Clear the input box
    if user_input.strip():
        chat_display.config(state=tk.NORMAL)
        chat_display.insert(tk.END, "User: " + user_input + "\n\n")
        chat_display.config(state=tk.DISABLED)

        response = response = response = chatbot_response(user_input)
        chat_display.config(state=tk.NORMAL)
        chat_display.insert(tk.END, "Chatbot: " + response + "\n\n")
        chat_display.config(state=tk.DISABLED)

# Create the main application window
root = tk.Tk()
root.title("Argumentative Chatbot")
root.geometry("670x480")

# Create a display area for the chat conversation
chat_display = scrolledtext.ScrolledText(root, height=20, width=80, state=tk.DISABLED, wrap=tk.WORD)
chat_display.grid(column=0, row=0, padx=10, pady=10, columnspan=2)
chat_display.configure(state='disabled')

entry_label = tk.Label(root, text="Ask your question:")
entry_label.grid(column=0, row=1, padx=10, pady=10)

user_input_entry = tk.Text(root, height=2, width=70)
user_input_entry.grid(column=0, row=2, padx=10, pady=10)

send_button = tk.Button(root, text="Send", command=send_message)
send_button.grid(column=1, row=3, padx=10, pady=10)

root.mainloop()


NER Output: []
Checking topic: euthanasia against climate change
Checking topic: mandatory vaccination in pandemic against climate change
Checking topic: physical appearance for personal success against climate change
Checking topic: intermittent fasting against climate change
Checking topic: capital punishment against climate change
Checking topic: animal testing against climate change
Checking topic: climate change against climate change
