In [1]:
import json
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
from spacy import displacy
import random
import os

ValueError: 'in' is not a valid parameter name

In [8]:
# Function to load JSON data from a file
def load_json_data(filepath):
    """Read the UTF-8 encoded JSON file at ``filepath`` and return the parsed content."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [9]:
# Function to convert data to spaCy format
def convert_data(data):
    """Convert a loaded annotation export into spaCy-style training tuples.

    Args:
        data: Mapping with an "annotations" list. Each entry is expected to be
            a ``[text, {"entities": [[start, end, label], ...]}]`` pair.
            Entries that are ``None`` (e.g. a JSON ``null`` placeholder) are
            skipped, and a missing or ``None`` "entities" value is treated as
            "no entities".

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples
        in input order.

    Raises:
        KeyError: if ``data`` has no "annotations" key.
    """
    training_data = []
    for item in data["annotations"]:
        # Unpacking None would raise TypeError, so drop placeholder entries.
        if item is None:
            continue
        text, annotations = item
        spans = (annotations or {}).get("entities") or []
        # Normalise each span to an immutable (start, end, label) tuple.
        entities = [(start, end, label) for start, end, label in spans]
        training_data.append((text, {"entities": entities}))
    return training_data

# Folder that is scanned for labelled *.json annotation files
data_directory = '/home/hp/Documents/Mini_Project/Labelled/json'


In [10]:
# Collect all training data from every .json file in data_directory.
# os.listdir() returns entries in arbitrary order, so iterate in sorted order
# to assemble the combined data set identically on every run.
all_training_data = []
for filename in sorted(os.listdir(data_directory)):
    if not filename.endswith('.json'):
        continue
    file_path = os.path.join(data_directory, filename)
    data = load_json_data(file_path)
    training_data = convert_data(data)
    all_training_data.extend(training_data)


In [11]:
# Start from an empty English pipeline
nlp = spacy.blank("en")

# Attach a new "ner" component and keep a handle to it
ner = nlp.add_pipe("ner")

# Register the label (third field) of every annotated entity span
for _, annotations in all_training_data:
    entity_spans = annotations.get("entities")
    for ent in entity_spans:
        ner.add_label(ent[2])

# Names of all pipeline components other than "ner" (to be disabled while training)
pipe_exceptions = ["ner"]
unaffected_pipes = [
    name
    for name in nlp.pipe_names
    if name not in pipe_exceptions
]

In [12]:
# Training the NER model
n_iterations = 300  # number of full passes over the training data

# Build the Example objects once. Creating them inside the loop re-tokenizes
# every text on each pass and re-emits the same misaligned-entity warnings.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in all_training_data
]

# select_pipes(disable=...) is the spaCy v3 replacement for the deprecated
# disable_pipes(); the listed components are restored when the block exits.
with nlp.select_pipes(disable=unaffected_pipes):
    # initialize() is the spaCy v3 name for the deprecated begin_training();
    # it returns the optimizer used for the updates below.
    optimizer = nlp.initialize()
    for iteration in range(n_iterations):
        # Shuffles the example list in place; all_training_data is left untouched.
        random.shuffle(train_examples)
        losses = {}
        # A new compounding schedule per pass: batch size restarts at 4 and
        # grows by a factor of 1.001 per batch up to 32.
        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, drop=0.5, sgd=optimizer, losses=losses)
        print(f"Iteration {iteration + 1}, Losses: {losses}")

# Save the trained model
nlp.to_disk("trained_model")

Appeal by special le..." with entities "[(5, 28, 'CASE_NUMBER'), (93, 113, 'DATE'), (122, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Appeal by spec..." with entities "[(3, 34, 'CASE_NUMBER'), (95, 115, 'COURT'), (117,...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Appeal from a Judgm..." with entities "[(0, 29, 'CASE_NUMBER'), (71, 92, 'DATE'), (101, 1...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Appeal from an order of the High ..." with entities "[(0, 15, 'CASE_NUMBER'), (45, 64, 'COURT'), (71, 9...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored dur

Iteration 1, Losses: {'ner': 227848.77526474}
Iteration 2, Losses: {'ner': 25453.51418352127}
Iteration 3, Losses: {'ner': 14444.473355752882}
Iteration 4, Losses: {'ner': 9762.34137943387}
Iteration 5, Losses: {'ner': 9790.802645353368}
Iteration 6, Losses: {'ner': 8641.489277748391}
Iteration 7, Losses: {'ner': 8366.082490460947}
Iteration 8, Losses: {'ner': 8307.956423712429}
Iteration 9, Losses: {'ner': 8339.695376803167}
Iteration 10, Losses: {'ner': 7719.39710308332}
Iteration 11, Losses: {'ner': 8138.272821195424}
Iteration 12, Losses: {'ner': 8395.42183168605}
Iteration 13, Losses: {'ner': 7868.245239930227}
Iteration 14, Losses: {'ner': 7430.796526025049}
Iteration 15, Losses: {'ner': 6886.451936483383}
Iteration 16, Losses: {'ner': 7275.877849505487}
Iteration 17, Losses: {'ner': 7421.575437409803}
Iteration 18, Losses: {'ner': 7045.706613612361}
Iteration 19, Losses: {'ner': 6754.291660819203}
Iteration 20, Losses: {'ner': 6271.920367442071}
Iteration 21, Losses: {'ner': 691

In [13]:
# Test the model


# Load the pipeline stored in the "trained_model" directory.
# This rebinds the global `nlp`, so the code below uses the model read from disk.
nlp = spacy.load("trained_model")

# Function to visualize entities in text
def visualize_ner(text):
    """Run the global ``nlp`` pipeline on ``text`` and render its entities with displaCy in the notebook."""
    displacy.render(nlp(text), style="ent", jupyter=True)



In [14]:
# Judgement text file used to inspect the model's predictions
file_path = "/home/hp/Documents/Mini_Project/dataset/IN-Abs/test-data/judgement/6276.txt"

# Read the whole document as one UTF-8 string
with open(file_path, encoding='utf-8', mode='r') as file:
    text = file.read()

# Render the predicted entities for this document
visualize_ner(text)