### Named Entity Recognition (NER) is a Natural Language Processing (NLP) technique used to identify and extract named entities from text. Named entities are words or phrases that refer to specific entities such as people, organizations, locations, dates, times, and other types of entities that have a specific name or title.

### To use the NLAS-multi corpus for training a NER model for our project, here are the steps:

 1. We need to setup our environment and Download/load the NLAS-multi corpus data 
 2. We will Preprocess the data to a suit the format needed for training our model (Pre-processing).
 3. We then convert the text and labels into numerical format that can be fed into a machine learning model (Feature Extraction).
 4. Use spacy to build and train the NER model (Model Building and Training).
 5. We evaluate the trained model and demonstrate how to use it for NER tasks (Model Evaluation).
 6. Lastly, we develop a user-friendly interface to interact with the chatbot.

### setup our environment

In [16]:
import json
import re
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
import random

### Test spacy pre-trained model to see how NER woks

In [9]:
nlp = spacy.load("en_core_web_lg")
doc = nlp('A professional is medical field that comes from Europe')
doc.ents

(Europe,)

# Data Cleaning and Pre-processing
 We will Preprocess the data to suit the format needed for training our model.
 we will process this data to extract sentences and tags for training an NER model. We need to extract relevant parts (such as the topic, stance, argumentation scheme, and the parts of the argument) and convert them into a format suitable for NER training.

In [98]:
# Load the JSON file
with open('nlas-multi-doubled.json', 'r', encoding='latin-1') as file:
    data = json.load(file)
data

{'eng': {'0': {'topic': 'Euthanasia',
   'stance': 'in favor',
   'argumentation scheme': 'position to know',
   'argument': '{\n "major premise": "Medical professionals (such as doctors and nurses) are in position to know about the treatment options available for terminally ill patients.",\n "minor premise": "Many medical professionals argue that euthanasia is a humane option for terminally ill patients who are experiencing unbearable suffering and have little hope for recovery.",\n "conclusion": "Euthanasia can be a morally justifiable option for terminally ill patients who are experiencing unbearable suffering and have little hope for recovery."\n}',
   'label': 'yes'},
  '1893': {'topic': 'Chemtrail conspiracy theory',
   'stance': 'against',
   'argumentation scheme': 'analogy',
   'argument': {'similarity premise': 'Generally, the theory of Flat Earth is similar to the Chemtrail conspiracy theory',
    'base premise': 'The Flat Earth theory is scientifically unsubstantiated and d

In [100]:
# Extract arguments from both languages
arguments_eng = data.get('eng', {})
arguments_esp = data.get('esp', {}) 

In [102]:
# Combine arguments from both languages
arguments = {**arguments_eng, **arguments_esp}

# Feature Extraction

In [109]:
# Annotate data
def annotate_data(arguments):
    annotated_data = []

    for key, value in arguments.items():
        topic = value['topic']
        stance = value['stance']
        scheme = value['argumentation scheme']
        argument = str(value['argument'])
        
        tokens = argument.split()
        labels = ["O"] * len(tokens)
        
        # Annotate topic
        if topic:
            topic_tokens = topic.split()
            topic_pattern = re.compile(r'\b' + r'\s+'.join(re.escape(token) for token in topic_tokens) + r'\b')
            match = topic_pattern.search(argument)
            if match:
                start_idx = len(argument[:match.start()].split())
                for i in range(len(topic_tokens)):
                    labels[start_idx + i] = "B-TOPIC" if i == 0 else "I-TOPIC"
        
        # Annotate stance
        if stance:
            stance_tokens = stance.split()
            stance_pattern = re.compile(r'\b' + r'\s+'.join(re.escape(token) for token in stance_tokens) + r'\b')
            match = stance_pattern.search(argument)
            if match:
                start_idx = len(argument[:match.start()].split())
                for i in range(len(stance_tokens)):
                    labels[start_idx + i] = "B-STANCE" if i == 0 else "I-STANCE"
        
        # Annotate scheme
        if scheme:
            scheme_tokens = scheme.split()
            scheme_pattern = re.compile(r'\b' + r'\s+'.join(re.escape(token) for token in scheme_tokens) + r'\b')
            match = scheme_pattern.search(argument)
            if match:
                start_idx = len(argument[:match.start()].split())
                for i in range(len(scheme_tokens)):
                    labels[start_idx + i] = "B-SCHEME" if i == 0 else "I-SCHEME"
        
        annotated_sentence = list(zip(tokens, labels))
        annotated_data.append(annotated_sentence)
    
    return annotated_data

annotated_data = annotate_data(arguments)

In [118]:
# Convert to CoNLL format
def convert_to_conll(annotated_data):
    conll_format_data = []
    for sentence in annotated_data:
        for token, label in sentence:
            conll_format_data.append(f"{token} {label}\n")
        conll_format_data.append("\n")
    
    return conll_format_data

conll_data = convert_to_conll(annotated_data)

# Save to a file
with open('ner_training_data.conll', 'w', encoding='utf-8') as file:
    file.writelines(conll_data)

## Model Building and Training
### using Train a Custom Named Entity Recognition Model Using spaCy

In [115]:
# Load the conll data
def load_data(file_path):
    with open(file_path, 'r', encoding='latin-1') as file:
        lines = file.readlines()
    
    texts = []
    annotations = []
    tokens = []
    labels = []

    for line in lines:
        if line.strip():
            token, label = line.split()
            tokens.append(token)
            labels.append(label)
        else:
            if tokens and labels:
                texts.append(' '.join(tokens))
                entities = []
                start = 0
                for token, label in zip(tokens, labels):
                    if label != 'O':
                        end = start + len(token)
                        entities.append((start, end, label.split('-')[1]))
                    start += len(token) + 1
                annotations.append({'entities': entities})
                tokens = []
                labels = []
    return texts, annotations

texts, annotations = load_data('ner_training_data.conll')

# Initialize a blank model
nlp = spacy.blank('en')

# Create the NER pipeline
ner = nlp.add_pipe('ner', last=True)

# Add labels to the NER pipeline
for label in ['TOPIC', 'STANCE', 'SCHEME']:
    ner.add_label(label)

# Convert data to spaCy's Example objects
examples = []
for text, annotation in zip(texts, annotations):
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotation)
    examples.append(example)

# Training the model
optimizer = nlp.begin_training()

for epoch in range(10):  # Number of epochs
    random.shuffle(examples)
    for example in examples:
        nlp.update([example], sgd=optimizer)

# Save the model
nlp.to_disk('ner_model')


### Model Evaluation

In [120]:
# Load the trained model
nlp = spacy.load('ner_model')

# Evaluate on validation data
texts, annotations = load_data('ner_training_data.conll')
for doc in nlp.pipe(texts):
    print([(ent.text, ent.label_) for ent in doc.ents])

[('position', 'SCHEME'), ('to', 'SCHEME'), ('know', 'SCHEME'), ('can', 'TOPIC')]
[('Chemtrail', 'TOPIC'), ('conspiracy', 'TOPIC'), ("theory',", 'TOPIC')]
[('position', 'SCHEME'), ('to', 'SCHEME'), ('know', 'SCHEME')]
[]
[('should', 'TOPIC')]
[('Chemtrail', 'SCHEME'), ('conspiracy', 'TOPIC'), ('theory', 'TOPIC')]
[('undermines', 'TOPIC')]
[]
[]
[]
[]
[('established', 'SCHEME'), ('rule', 'SCHEME'), ('established', 'SCHEME')]
[]
[]
[]
[]
[('is', 'TOPIC')]
[]
[('popular', 'SCHEME'), ('practice', 'SCHEME')]
[]
[('in', 'STANCE'), ('favor', 'STANCE')]
[('Chemtrail', 'TOPIC'), ('conspiracy', 'TOPIC'), ("theory.',", 'TOPIC')]
[('against', 'STANCE')]
[('against', 'STANCE')]
[]
[('Chemtrail', 'TOPIC'), ('conspiracy', 'TOPIC'), ('theory', 'TOPIC'), ('slippery', 'SCHEME'), ('slope', 'SCHEME')]
[]
[('sign,', 'SCHEME')]
[]
[]
[]
[]
[]
[('Chemtrail', 'TOPIC'), ('conspiracy', 'TOPIC'), ('theory', 'TOPIC')]
[]
[]
[('established', 'SCHEME'), ('established', 'SCHEME')]
[('Chemtrail', 'TOPIC'), ('conspirac

### Create a simple Tkinter user interface application to test the model

In [None]:
import tkinter as tk
from tkinter import scrolledtext
import spacy

# Load the trained NER model
nlp = spacy.load("ner_model")

# Function to annotate text using the NER model
def annotate_text():
    text = input_text.get("1.0", tk.END).strip()
    doc = nlp(text)
    annotated_text = ""
    for token in doc:
        if token.ent_type_:
            annotated_text += f"{token.text}({token.ent_type_}) "
        else:
            annotated_text += f"{token.text} "
    output_text.delete("1.0", tk.END)
    output_text.insert(tk.END, annotated_text)

# Create the main window
root = tk.Tk()
root.title("NER Annotator")

# Create a text widget for input
input_label = tk.Label(root, text="Input Text:")
input_label.pack()
input_text = scrolledtext.ScrolledText(root, wrap=tk.WORD, width=50, height=10)
input_text.pack(padx=10, pady=10)

# Create a button to annotate text
annotate_button = tk.Button(root, text="Annotate", command=annotate_text)
annotate_button.pack(pady=10)

# Create a text widget for output
output_label = tk.Label(root, text="Annotated Text:")
output_label.pack()
output_text = scrolledtext.ScrolledText(root, wrap=tk.WORD, width=50, height=10)
output_text.pack(padx=10, pady=10)

# Run the Tkinter event loop
root.mainloop()
