In [1]:
# Install the spaCy transformer extension and download the English transformer
# pipeline that is later loaded with spacy.load("en_core_web_trf").
# NOTE(review): neither command pins a version; consider pinning for reproducible runs.
!pip install spacy-transformers
!python -m spacy download en_core_web_trf

Collecting spacy-transformers
  Downloading spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.37.0,>=3.4.0 (from spacy-transformers)
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers<4.37.0,>=3.4.0->spacy-transformers)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (197 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.8/197.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:

# **IMPORTS**

In [13]:
from google.colab import drive
import pandas as pd
import ast
import torch
import time
import spacy
from spacy.tokens import Span
from spacy import displacy
from spacy.training import Example
from spacy.tokens import DocBin, Span
from spacy import displacy
from sklearn.model_selection import train_test_split
# spacy.require_gpu()
import re
import pandas as pd
import ast
from collections import defaultdict
from google.colab import runtime
from collections import Counter

In [28]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Base folder on Drive; the training CSVs are read as `drive_path + <file name>`.
drive_path = '/content/drive/MyDrive/And Elements/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **HELPER FUNCTIONS**

In [4]:
def load_training_single_entity_data_from_csv(input_file, entity_limit=50000):
    """
    Load (text, annotations) training pairs from a CSV file, keeping every entity
    of each accepted row regardless of its label.

    Rows are consumed in file order. A row is accepted only while the running
    entity total is still below ``entity_limit``; the first row reached after the
    total has hit the limit stops the load. Because the check happens before a row
    is added, the final total can exceed ``entity_limit`` by the size of the last
    accepted row.

    Args:
    - input_file (str): The path to the input CSV file. It must have an "article"
      column and an "Entities" column holding the string form of a Python list
      of entity tuples.
    - entity_limit (int): Soft cap on the total number of entities loaded.
      Defaults to 50000.

    Returns:
    - training_data (list): A list of (article, {"entities": [...]}) tuples.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(input_file)
    print(f"Total rows in CSV: {len(df)}")

    training_data = []
    total_entity_count = 0

    for article, raw_entities in zip(df["article"], df["Entities"]):
        # Stop before parsing anything once the cap has been reached.
        if total_entity_count >= entity_limit:
            break

        # The Entities column stores a Python literal; literal_eval parses it
        # without executing arbitrary code.
        entities = ast.literal_eval(raw_entities)
        training_data.append((article, {"entities": entities}))
        total_entity_count += len(entities)

    # Debugging/verification logs
    print(f"Total entities processed: {total_entity_count}")

    return training_data

def load_training_data_from_csv(input_file, labels=("Setting", "Tags"), entity_limit=300000):
    """
    Load training data from a CSV file, keeping only entities whose label is in
    ``labels`` and at most ``entity_limit`` entities per label.

    Rows are read in file order. Entities with other labels, and entities of a
    label whose quota is already used up, are dropped. A row that ends up with no
    entities at all is left out of the result.

    Args:
    - input_file (str): The path to the input CSV file. It must have an "article"
      column and an "Entities" column holding the string form of a Python list
      of tuples whose third element is the label.
    - labels (iterable of str): Labels to keep. Defaults to ("Setting", "Tags").
    - entity_limit (int): Maximum number of entities kept per label.
      Defaults to 300000.

    Returns:
    - training_data (list): A list of (article, {"entities": [...]}) tuples.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(input_file)

    allowed_labels = tuple(labels)

    # Running count of kept entities for each label
    entity_counts = defaultdict(int)

    training_data = []
    total_entity_count = 0

    for article, raw_entities in zip(df["article"], df["Entities"]):
        filtered_entities = []
        # The Entities column stores a Python literal; literal_eval parses it safely.
        for entity in ast.literal_eval(raw_entities):
            entity_type = entity[2]
            # The label test comes first so that only allowed labels ever get a
            # key in entity_counts (it is printed below).
            if entity_type in allowed_labels and entity_counts[entity_type] < entity_limit:
                filtered_entities.append(entity)
                entity_counts[entity_type] += 1

        # Keep the article only if at least one entity survived the filter
        if filtered_entities:
            training_data.append((article, {"entities": filtered_entities}))
            total_entity_count += len(filtered_entities)

    # Debugging/verification logs
    print(f"Total entities processed: {total_entity_count}")
    print("Entity type counts:", dict(entity_counts))

    return training_data

def count_entities(data):
    """
    Tally how often each entity label appears across a dataset.

    Args:
    - data (list): (text, annotations) tuples, where annotations["entities"] is a
      list of entities whose third element is the label.

    Returns:
    - Counter: label -> number of occurrences.
    """
    # The text part of each pair is irrelevant here; only the labels are counted.
    return Counter(
        entity[2]
        for _, annotations in data
        for entity in annotations['entities']
    )

def preprocess_text(text):
    """
    Normalise text into a lowercase, single-spaced form with most punctuation
    replaced by spaces.

    Steps, applied in this order:
    1. "http://" and "https://" both become "http ".
    2. Every remaining ":" and "/" becomes a space.
    3. Any "." with a non-space character on both sides becomes a space
       (this hits domains and e-mail addresses, and also decimals such as "3.9").
    4. !, +, -, @, straight and curly double quotes, curly single quotes,
       em dashes and en dashes become spaces.
    5. Any other non-word, non-space, non-comma character sitting directly
       between two word characters becomes a space (e.g. "people's" -> "people s").
    6. The text is lowercased.
    7. Runs of whitespace collapse to one space; leading/trailing space is removed.

    Args:
    - text: The input to clean; it is converted with str() first.

    Returns:
    - str: The cleaned text.
    """
    cleaned = str(text)

    # Order matters: the URL scheme must be rewritten before ":" and "/" are
    # blanked, and the generic between-words rule runs last.
    substitutions = (
        (r"http[s]?://", "http "),
        (r"[/:]", " "),
        (r"(?<=\S)\.(?=\S)", " "),
        (r"[!+\-@“”\"‘’—–]", " "),
        (r"(?<=\w)[^\w\s,](?=\w)", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    cleaned = cleaned.lower()

    return re.sub(r"\s+", " ", cleaned).strip()

In [None]:
def create_docbin_with_negatives(data, spans_key="sc"):
    """
    Build a DocBin for span categorization that also keeps texts with no spans.

    Each text is tokenized with the notebook-level `nlp` (nlp.make_doc) and is
    always added to the DocBin, so an entry whose "entities" list is missing or
    empty is stored as a Doc with an empty span group (a negative example).

    Args:
        data (list): A list of tuples (text, annotations).
                     annotations may have the format {"entities": [(start, end, label), ...]};
                     a missing "entities" key is treated as an empty list.
        spans_key (str): The key under which spans are stored in doc.spans.

    Returns:
        DocBin: A DocBin containing one Doc per input text.
    """
    doc_bin = DocBin()

    for i, (text, annotations) in enumerate(data):
        doc = nlp.make_doc(text)

        # Order by start offset, longer span first when starts are equal.
        ordered = sorted(
            annotations.get("entities", []),
            key=lambda item: (item[0], -(item[1] - item[0])),
        )

        kept = []
        for start, end, label in ordered:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                # char_span could not map these character offsets onto tokens.
                print(f"Skipping invalid span: {start}-{end} for label {label} in text {i}: {text[:50]}...")
                continue
            kept.append(span)

        # Drop overlapping spans before storing them
        doc.spans[spans_key] = spacy.util.filter_spans(kept)

        # Added unconditionally so that span-less docs are kept too
        doc_bin.add(doc)

    return doc_bin

def create_docbin_spancat(data, spans_key="sc"):
    """
    Build a DocBin for span categorization from annotated texts.

    Each text is tokenized with the notebook-level `nlp` (nlp.make_doc); its
    character-offset annotations are converted to token spans and stored under
    doc.spans[spans_key].

    Args:
        data (list): A list of tuples (text, annotations).
                     annotations must have the format {"entities": [(start, end, label), ...]}.
        spans_key (str): The key under which spans are stored in doc.spans.

    Returns:
        DocBin: A DocBin containing one Doc per input text.
    """
    doc_bin = DocBin()

    for i, (text, annotations) in enumerate(data):
        doc = nlp.make_doc(text)

        # Order by start offset, longer span first when starts are equal.
        ordered = sorted(
            annotations["entities"],
            key=lambda item: (item[0], -(item[1] - item[0])),
        )

        kept = []
        for start, end, label in ordered:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                # char_span could not map these character offsets onto tokens.
                print(f"Skipping invalid span: {start}-{end} for label {label} in text {i}: {text[:50]}...")
                continue
            kept.append(span)

        # Overlapping spans are removed before storing.
        # NOTE(review): spancat can represent overlapping spans; confirm that
        # filtering them out is still the desired behaviour.
        doc.spans[spans_key] = spacy.util.filter_spans(kept)

        doc_bin.add(doc)

    return doc_bin

def create_docbin_ner(data):
    """
    Build a DocBin for NER training from annotated texts.

    Each text is tokenized with the notebook-level `nlp` (nlp.make_doc); its
    character-offset annotations are converted to token spans, overlaps are
    removed, and the result is assigned to doc.ents.

    Args:
        data (list): A list of tuples (text, annotations).
                     annotations must have the format {"entities": [(start, end, label), ...]}.

    Returns:
        DocBin: A DocBin containing one Doc per input text, with doc.ents set.
    """
    doc_bin = DocBin()

    for i, (text, annotations) in enumerate(data):
        doc = nlp.make_doc(text)

        # Order by start offset, longer span first when starts are equal.
        ordered = sorted(
            annotations["entities"],
            key=lambda item: (item[0], -(item[1] - item[0])),
        )

        candidates = []
        for start, end, label in ordered:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                # char_span could not map these character offsets onto tokens.
                print(f"Skipping invalid entity: {start}-{end} for label {label} in text {i}: {text[:50]}...")
                continue
            candidates.append(span)

        # Overlaps are filtered out before the spans are assigned as entities.
        doc.ents = spacy.util.filter_spans(candidates)

        doc_bin.add(doc)

    return doc_bin


In [None]:
# Load the Setting dataset (all entities per row, capped inside the loader).
# NOTE(review): the next cell also binds `training_data`; run only the loader
# for the model being trained, otherwise the later cell replaces this result.
training_data = load_training_single_entity_data_from_csv(drive_path + "preprocessed_data_setting.csv")

Total rows in CSV: 17138
Total entities processed: 19203
17138


In [None]:
# Load the Tags dataset (entities filtered by label inside the loader).
# NOTE(review): this rebinds `training_data`, replacing whatever an earlier
# loader cell produced.
training_data = load_training_data_from_csv(drive_path + "preprocessed_data_tags.csv")


Total entities processed: 117326
Entity type counts: {'Tags': 117326}
("the 2024 people s choice awards host and nominees have been revealed. vote for your favorites here the people s choice awards will excitingly air during awards season in 2024, amplifying the fans' voices more than ever. read, below, what to expect. the mission of the people s choice awards continues to be about giving a voice to those whose opinions matter most the people, cassandra tryon, senior vice president, entertainment live events, nbcuniversal television and streaming, said to billboard. moving the telecast to the heart of awards season and expanding our reach to peacock s audience creates a platform for the people s voice to be heard louder than ever, giving stars and their fans an opportunity to celebrate together related mariska hargitay got a new cat and named it after taylor swift see pic who is the 2024 people s choice awards host? actor simu liu will host this year s pcas. i m so grateful to have bee

In [None]:
# Training data format: each entry is (text, {"entities": [(start, end, label), ...]}).
# NOTE(review): TRAIN_DATA is not referenced again in the visible notebook.
TRAIN_DATA = training_data
# Load the pretrained spaCy transformer pipeline. The create_docbin_* helpers
# read this global `nlp` and use it for tokenization only (nlp.make_doc).
nlp = spacy.load("en_core_web_trf")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  model.load_state_dict(torch.load(filelike, map_location=device))


# **SPLITTING DATA**

In [None]:
# Three-way split: 85% train, then the 15% hold-out is halved into validation
# and test. Fixed seeds keep the split reproducible.
train_data, holdout_data = train_test_split(training_data, test_size=0.15, random_state=42)
valid_data, test_data = train_test_split(holdout_data, test_size=0.5, random_state=42)

In [None]:
# Tally entity labels per split
train_entity_counts = count_entities(train_data)
valid_entity_counts = count_entities(valid_data)
test_entity_counts = count_entities(test_data)

# Report the tallies in train / validation / test order
for heading, counts in (
    ("Entity counts in training data:", train_entity_counts),
    ("\nEntity counts in validation data:", valid_entity_counts),
    ("\nEntity counts in test data:", test_entity_counts),
):
    print(heading)
    print(counts)

Entity counts in training data:
Counter({'Tags': 99623})

Entity counts in validation data:
Counter({'Tags': 8832})

Entity counts in test data:
Counter({'Tags': 8871})


In [None]:
# Number of documents in each split
print(f"Training data: {len(train_data)}")
print(f"Validation data: {len(valid_data)}")
print(f"Testing data: {len(test_data)}")

Training data: 10822
Validation data: 955
Testing data: 955


# **FORMATTING DATA FOR SPACY**

In [None]:
# Convert the train / validation / test splits to DocBins for the NER model
# (annotations are stored in doc.ents).
# NOTE(review): the SPANCAT cell binds the same three names; run only the cell
# that matches the model being trained.
train_db = create_docbin_ner(train_data)
valid_db = create_docbin_ner(valid_data)
test_db = create_docbin_ner(test_data)

In [None]:
# Convert the train / validation / test splits to DocBins for the SPANCAT model
# (annotations are stored in doc.spans["sc"], the helper's default key).
# NOTE(review): this rebinds train_db / valid_db / test_db, replacing the NER
# DocBins if the NER cell was run first.
train_db = create_docbin_spancat(train_data)
valid_db = create_docbin_spancat(valid_data)
test_db = create_docbin_spancat(test_data)

Skipping invalid span: 1129-1132 for label Tags in text 1626: 3 year old boy with autism missing after family sa...
Skipping invalid span: 1289-1297 for label Tags in text 4308: pamela anderson bashes #metoo, says the movement i...
Skipping invalid span: 901-909 for label Tags in text 9867: students asked to list positive aspects of slavery...


In [None]:
# Serialize the three DocBins to .spacy files in the working directory; the
# spaCy CLI commands in the cells below read them by these paths.
train_db.to_disk("./train.spacy")
valid_db.to_disk("./valid.spacy")
test_db.to_disk("./test.spacy")

# **CREATING CONFIG**

In [None]:
# Expand base_config.cfg into a complete training config (config.cfg) with all
# remaining settings auto-filled. Paths are quoted because the folder name contains a space.
!python -m spacy init fill-config '/content/drive/MyDrive/And Elements/base_config.cfg' '/content/drive/MyDrive/And Elements/config.cfg'

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/drive/MyDrive/And Elements/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


# **DEBUGGING DATA FOR SPACY**

In [None]:
# Validate the serialized train/dev corpora against a config before training.
# Only one of the two commands is active; swap the comment to check the other model.
# Debugging data for SpanCat
!python -m spacy debug data '/content/drive/MyDrive/And Elements/config_sc.cfg' --paths.train ./train.spacy --paths.dev ./valid.spacy

# Debugging data for NER
# !python -m spacy debug data '/content/drive/MyDrive/And Elements/config_ner.cfg' --paths.train ./train.spacy --paths.dev ./valid.spacy

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[1m
tokenizer_config.json: 100% 25.0/25.0 [00:00<00:00, 224kB/s]
config.json: 100% 481/481 [00:00<00:00, 2.46MB/s]
vocab.json: 100% 899k/899k [00:00<00:00, 14.3MB/s]
merges.txt: 100% 456k/456k [00:00<00:00, 13.6MB/s]
tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 21.3MB/s]
  _torch_pytree._register_pytree_node(
model.safetensors: 100% 499M/499M [00:02<00:00, 215MB/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.amp.autocast(self._mixed_precision):
[38;5;2m✔ Pipeline can be initialized with data[0m
[38;5;2m✔ Corpus is loadable[0m
[1m
Language: en
Training pipeline: transformer, spancat
10822 training docs
955 evaluation docs
[38;5;2m✔ No overla

# **TRAINING**

In [None]:
# Training config file and the directory that receives the trained pipeline.
# NOTE(review): the debug-data cell uses config_sc.cfg / config_ner.cfg while this
# points at config.cfg — confirm which config is meant to be trained.
CONFIG_PATH = '/content/drive/MyDrive/And Elements/config.cfg'
OUTPUT_PATH = '/content/drive/MyDrive/And Elements/output_ner'

In [None]:
torch.cuda.empty_cache()
!python -m spacy train CONFIG_PATH --output OUTPUT_PATH --paths.train ./train.spacy --paths.dev ./valid.spacy --gpu-id 0

  _torch_pytree._register_pytree_node(
[38;5;4mℹ Saving to output directory: /content/drive/MyDrive/And
Elements/output_ner[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.amp.autocast(self._mixed_precision):
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  0       0        6419.37    388.86  

# **EVALUATION**

In [None]:
!python -m spacy evaluate OUTPUT_PATH +'/model-best' ./test.spacy --gpu-id 0

  _torch_pytree._register_pytree_node(
[38;5;4mℹ Using GPU: 0[0m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  self._model.load_state_dict(torch.load(filelike, map_location=device))
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
[1m

TOK     100.00
NER P   40.08 
NER R   34.04 
NER F   36.82 
SPEED   8521  

[1m

              P       R       F
Tags      31.98   33.20   32.58
Setting   48.92   34.67   40.58



In [None]:
# Disconnect and release the Colab runtime once training/evaluation has finished.
# NOTE(review): cells after this one need a new runtime (and re-running setup) — confirm
# this cell is meant to sit before the remaining sections.
runtime.unassign()

# **EVALUATING BEST MODELS**

In [None]:
# Score the saved Setting NER model on ./test.spacy.
# NOTE(review): ./test.spacy holds whichever dataset/format was serialized last;
# confirm it was built from the Setting data with create_docbin_ner.
!python -m spacy evaluate '/content/drive/MyDrive/And Elements/output/model-best-ner-setting' ./test.spacy --gpu-id 0

  _torch_pytree._register_pytree_node(
[38;5;4mℹ Using GPU: 0[0m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  self._model.load_state_dict(torch.load(filelike, map_location=device))
Token indices sequence length is longer than the specified maximum sequence length for this model (580 > 512). Running this sequence through the model will result in indexing errors
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
[1m

TOK     100.00
NER P   47.61 
NER R   63.54 
NER F   54.43 
SPEED   7602  

[1m

              P       R       F
Setting   47.61   63.54   54.43



In [None]:
# Score the saved Tags NER model on ./test.spacy.
# NOTE(review): ./test.spacy holds whichever dataset/format was serialized last;
# confirm it was built from the Tags data with create_docbin_ner.
!python -m spacy evaluate '/content/drive/MyDrive/And Elements/output/model-best-ner-tags' ./test.spacy --gpu-id 0

  _torch_pytree._register_pytree_node(
[38;5;4mℹ Using GPU: 0[0m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  self._model.load_state_dict(torch.load(filelike, map_location=device))
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
[1m

TOK     100.00
NER P   49.68 
NER R   55.70 
NER F   52.52 
SPEED   8449  

[1m

           P       R       F
Tags   49.68   55.70   52.52



In [None]:
# Score the saved span-categorizer model (model-best-1.7) on ./test.spacy.
# NOTE(review): ./test.spacy holds whichever dataset/format was serialized last;
# confirm it was built with create_docbin_spancat from the matching data.
!python -m spacy evaluate '/content/drive/MyDrive/And Elements/output/model-best-1.7' ./test.spacy --gpu-id 0

  _torch_pytree._register_pytree_node(
[38;5;4mℹ Using GPU: 0[0m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  self._model.load_state_dict(torch.load(filelike, map_location=device))
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
[1m

TOK      100.00
SPAN P   72.38 
SPAN R   65.11 
SPAN F   68.56 
SPEED    7978  

[1m

                    P       R       F
Killers         79.60   72.80   76.05
Type_of_Story   81.04   62.89   70.82
Victim          76.06   57.36   65.40
Setting          0.00    0.00    0.00
Social_Media    83.92   77.76   80.72



# **INFERENCE**

In [None]:
# Load the four pipelines whose predictions are merged at inference time.
# The span model's output is read from doc.spans['sc']; the other three from doc.ents.
custom_nlp_span = spacy.load('/content/drive/MyDrive/And Elements/output/model-best-1.7') # Killers, Victims, Type of Story, Social Media
custom_nlp_setting = spacy.load('/content/drive/MyDrive/And Elements/output/model-best-ner-setting') # Setting
custom_nlp_tag = spacy.load('/content/drive/MyDrive/And Elements/output/model-best-ner-tags') # Tags
# Built-in labels PERSON, DATE and LOC are kept and shown as Character, Crime and LOC.
pre_trained_nlp = spacy.load('en_core_web_trf') # Crime, Characters, Location

In [19]:
# Sample article used to try out the combined inference pipeline; the next cell
# preprocesses it and runs it through all four models.
text = """
1 1,361 VOTES Phil Spector’s Castle: $3,998,000 Photo: Zillow In a 2009 retrial, record producer Phil Spector was convicted of slaying actor Lana Clarkson. The incident occurred six years prior when Spector shot Clarkson at his Alhambra, CA, home. As part of a divorce settlement with his wife in 2016, Spector was forced to sell this property. The house itself - often referred to as the Pyrenees Castle - is a mansion boasting 35 rooms in total and ample space

Its estimated value is $3. 9 million. 1,361 VOTES Would you live here? 2 1,280 VOTES Gardette-LePrete Mansion: $4,900,000 Photo: Frances Benjamin Johnston / Wikimedia Commons / Public Domain According to New Orleans legend, the Gardette-LePrete Mansion was the site of a grim set of slayings in the late 19th century. The story states a relative of the Turkish sultan was slain, along with other members of his household

The home has since passed through many different hands, but it was put up for sale in 2016. The property lives up to its mansion moniker, with the building sprawling across more than 13,000 square feet with 15 bedrooms and over a dozen bathrooms - the grounds even boast several courtyards. The price at the time of listing was $4. 9 million. 1,280 VOTES Would you live here? 3 1,172 VOTES The Los Feliz Murder House: $2

3 Million Photo: realtor. com This notorious property is located at 2475 Glendower Place in the Los Feliz neighborhood of Los Angeles, CA. In 1959, Dr. Harold N. Perelson killed his wife in this house. He then went after his teenage daughter - who managed to escape to the safety of a neighbor’s home - before taking his own life. By the time police arrived, Perelson was already gone. A couple bought the expansive home for just under $2

3 million when it was put into probate in 2016. According to an advertisement for the building, the 5,000-square-foot house features a ballroom and a library. 1,172 VOTES Would you live here? 4 1,365 VOTES The JonBenet Ramsey Murder Home: $2 Million Photo: realtor. com The property located at 749 15th Street in Boulder, CO, may not look menacing, but it happens to be the site of one of the most infamous cases in US history

In 1996, 6-year-old JonBenet Ramsey was tragically slain in her family's home. The case has never officially been solved, despite confessions from possible culprits. Investigations mostly concentrated on both the family and a possible intruder. The Ramseys moved out of the house following the passing of their daughter, after which it was bought by Carol Schuller Milner, who left after a brief period

The 11,000-square-foot home was put on the market for $2 million, but it garnered few bids. 1,365 VOTES Would you live here? 5 1,535 VOTES Amityville Horror House: $605,000 Photo: Paul Hawthorne/Getty Images You might not know its address - 108 Ocean Ave, Amityville, NY - but you know it as the site of the Amityville Horror. The property needs little introduction, as the home and its history has featured in many books, films, and TV shows over the last five decades

The building first became embroiled in controversy in 1974 when resident Ronald DeFeo Jr. slew several members of his family inside. Following the police investigation and subsequent trial, the house was sold to the Lutz family. The family left just 28 days after moving in, citing paranormal activity. The house, which sold for $605,000 in 2017, contains 3,600 square feet of living space, five bedrooms, a boathouse, and a sprinkler system

1,535 VOTES Would you live here? 6 1,221 VOTES O. J. And Nicole Brown Simpson's House: $1,720,000 Photo: Google In 1994, Nicole Brown Simpson lived at 879 S Bundy Drive in Los Angeles. It's also where she was slain, along with Ron Goldman, in one of the most notorious cases in American history. Police accused Brown Simpson's ex-husband, football star O. J. Simpson. The media covered the trial - and Simpson's eventual acquittal - closely

"""

In [25]:
# Run the current `text` through all four pipelines, time it, and render the merged spans.
# (To try a held-out example instead, set `text = test_data[10][0]` before this cell.)
t0 = time.time()
text = preprocess_text(text)
custom_doc = custom_nlp_span(text)
custom_doc_tag = custom_nlp_tag(text)
custom_doc_setting = custom_nlp_setting(text)
ner_doc = pre_trained_nlp(text)
t1 = time.time()
print(f"\n > Inference time on CPU: {t1-t0} seconds")

# Predictions of the three custom models, re-created on custom_doc by token index.
# NOTE(review): this assumes all four pipelines tokenize the text identically — confirm.
combined_spans = [
    Span(custom_doc, span.start, span.end, label=span.label_)
    for source in (custom_doc.spans['sc'], custom_doc_setting.ents, custom_doc_tag.ents)
    for span in source
]

# From the pretrained NER keep only DATE, LOC and PERSON; PERSON is shown as
# "Character", DATE as "Crime", and LOC keeps its own name.
display_label = {"PERSON": "Character", "DATE": "Crime", "LOC": "LOC"}
combined_spans.extend(
    Span(custom_doc, entity.start, entity.end, label=display_label[entity.label_])
    for entity in ner_doc.ents
    if entity.label_ in display_label
)

# Store everything under one span group and visualise it
custom_doc.spans["combined"] = combined_spans
displacy.render(custom_doc, style="span", options={"spans_key": "combined"})


 > Inference time: 18.420571327209473 seconds


# **INFERENCE ON TEST DATA**

In [26]:
# Articles to annotate; the inference loop in the next cell reads the 'content' and 'id' columns.
# NOTE(review): relative path — resolved against the current working directory.
df = pd.read_csv("articles.csv")

In [27]:
# Annotate every article with all four pipelines and save one displaCy HTML
# visualization per article.
import os  # local to this cell: only needed to create the output folder

OUTPUT_DIR = "./test_articles"
# A fresh runtime has no output folder; create it so open() below cannot fail.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Pretrained-NER labels to keep, mapped to the label shown in the visualization.
# NOTE(review): DATE is displayed as "Crime" and LOC keeps its raw name, exactly
# as before — confirm these are the intended display labels.
NER_LABEL_MAP = {"PERSON": "Character", "DATE": "Crime", "LOC": "LOC"}

# displaCy colours, built once because they do not change between articles.
# Keys must equal the emitted span labels: "Victim" and "LOC" are added because
# the previous "Victims" / "Location" keys never matched a label; the old keys
# are kept for compatibility. "Setting" was listed twice and the later value
# (#FFE4E1) silently won, so only that effective entry is kept.
options = {
    "colors": {
        "Character": "#FFFACD",      # Lemon Chiffon (Light Yellow)
        "Location": "#ADD8E6",       # Light Blue
        "LOC": "#ADD8E6",            # Light Blue (label actually emitted for locations)
        "Crime": "#FFC0CB",          # Light Pink
        "Type_of_Story": "#E6E6FA",  # Lavender (Light Purple)
        "Social_Media": "#FAFAD2",   # Light Goldenrod Yellow
        "Killers": "#D8BFD8",        # Thistle (Soft Purple)
        "Victims": "#FFE4E1",        # Misty Rose (Light Rose)
        "Victim": "#FFE4E1",         # Misty Rose (label actually emitted by the span model)
        "Setting": "#FFE4E1",
        "Tags": "#FFE4E1",
    }
}

for _, row in df.iterrows():
    article_id = row['id']
    text = preprocess_text(row['content'])

    custom_doc = custom_nlp_span(text)
    custom_doc_tag = custom_nlp_tag(text)
    custom_doc_setting = custom_nlp_setting(text)
    ner_doc = pre_trained_nlp(text)

    # Predictions of the three custom models, re-created on custom_doc by token index.
    # NOTE(review): this assumes all four pipelines tokenize the text identically — confirm.
    combined_spans = [
        Span(custom_doc, span.start, span.end, label=span.label_)
        for source in (custom_doc.spans['sc'], custom_doc_setting.ents, custom_doc_tag.ents)
        for span in source
    ]

    # Keep only the mapped labels from the pretrained NER model
    combined_spans.extend(
        Span(custom_doc, entity.start, entity.end, label=NER_LABEL_MAP[entity.label_])
        for entity in ner_doc.ents
        if entity.label_ in NER_LABEL_MAP
    )

    # Assign the combined spans back to the doc
    custom_doc.spans["combined"] = combined_spans

    # jupyter=False makes displaCy return the markup instead of displaying it
    html = displacy.render(custom_doc, style="span", options={"spans_key": "combined", **options}, jupyter=False)

    if html:
        with open(f"{OUTPUT_DIR}/test_article_{article_id}.html", "w", encoding="utf-8") as f:
            f.write(html)
        print(f"Visualization saved to 'test_article_{article_id}.html'")
    else:
        print("Failed to generate HTML from displaCy render")

  with torch.cuda.amp.autocast(self._mixed_precision):


Visualization saved to 'test_article_67408.html'
Visualization saved to 'test_article_67407.html'
Visualization saved to 'test_article_67406.html'
Visualization saved to 'test_article_67405.html'
Visualization saved to 'test_article_67404.html'
Visualization saved to 'test_article_67403.html'
Visualization saved to 'test_article_67402.html'
Visualization saved to 'test_article_67401.html'
Visualization saved to 'test_article_67400.html'
Visualization saved to 'test_article_67399.html'
Visualization saved to 'test_article_67398.html'
Visualization saved to 'test_article_67397.html'
Visualization saved to 'test_article_67396.html'
Visualization saved to 'test_article_67395.html'
Visualization saved to 'test_article_67394.html'
Visualization saved to 'test_article_67393.html'
Visualization saved to 'test_article_67392.html'
Visualization saved to 'test_article_67391.html'
Visualization saved to 'test_article_67390.html'
Visualization saved to 'test_article_67389.html'
Visualization saved 