In [12]:
import spacy
import pandas as pd
import json
from spacy.training.example import Example
from spacy.util import minibatch
import random
from spacy.scorer import Scorer



In [13]:
file_path = 'streetdata_01.json'
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)


In [14]:
TRAIN_DATA = []
for item in data:
    text = item['data']['Message']
    entities = []
    for annotation in item['annotations'][0]['result']:
        start = annotation['value']['start']
        end = annotation['value']['end']
        label = annotation['value']['labels'][0]
        entities.append((start, end, label))
    TRAIN_DATA.append((text, {"entities": entities}))

print(TRAIN_DATA[:2])

[('На Жибек Жолы, дом 15, уже неделю не убирают мусор во дворе. Становится грязно.', {'entities': [(3, 13, 'Street'), (15, 21, 'NUM')]}), ('Абылай хана, 23 — освещение на улице не работает. Вечером темно, ходить страшно.', {'entities': [(0, 11, 'Street'), (13, 15, 'NUM')]})]


In [6]:
nlp = spacy.load("ru_core_news_sm")
ner = nlp.get_pipe("ner")

for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [7]:
optimizer = nlp.resume_training()
n_iter = 70  #

for i in range(n_iter):
    random.shuffle(TRAIN_DATA)
    losses = {}
    batches = minibatch(TRAIN_DATA, size=8)
    for batch in batches:
        texts, annotations = zip(*batch)
        examples = [Example.from_dict(nlp.make_doc(text), ann) for text, ann in zip(texts, annotations)]
        nlp.update(examples, drop=0.3, losses=losses)
    print(f"Losses at iteration {i}: {losses}")

Не горит именно зеленый св..." with entities "[(0, 10, 'Street'), (12, 22, 'Street')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.




Losses at iteration 0: {'tok2vec': 0.0, 'morphologizer': 0.0, 'parser': 0.0, 'ner': 2285.78585636703}
Losses at iteration 1: {'tok2vec': 0.0, 'morphologizer': 0.0, 'parser': 0.0, 'ner': 816.6105131774129}
Losses at iteration 2: {'tok2vec': 0.0, 'morphologizer': 0.0, 'parser': 0.0, 'ner': 558.3277106643404}
Losses at iteration 3: {'tok2vec': 0.0, 'morphologizer': 0.0, 'parser': 0.0, 'ner': 456.27556227042595}
Losses at iteration 4: {'tok2vec': 0.0, 'morphologizer': 0.0, 'parser': 0.0, 'ner': 398.2143917155371}
Losses at iteration 5: {'tok2vec': 0.0, 'morphologizer': 0.0, 'parser': 0.0, 'ner': 336.354144773628}
Losses at iteration 6: {'tok2vec': 0.0, 'morphologizer': 0.0, 'parser': 0.0, 'ner': 374.2538978840434}
Losses at iteration 7: {'tok2vec': 0.0, 'morphologizer': 0.0, 'parser': 0.0, 'ner': 289.24045866616365}
Losses at iteration 8: {'tok2vec': 0.0, 'morphologizer': 0.0, 'parser': 0.0, 'ner': 280.6083080416447}
Losses at iteration 9: {'tok2vec': 0.0, 'morphologizer': 0.0, 'parser': 0

In [None]:
output_dir = "street_modelF"
nlp.to_disk(output_dir)

Модель сохранена в папке street_modelF


In [10]:
text = "На углу Байтурсынова и Фурманова обнаружена проблема."
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)


На углу Peresechenie
Байтурсынова Street
Фурманова Street


In [5]:
def remove_overlapping_entities(train_data):
    cleaned_data = []
    for text, annotations in train_data:
        entities = annotations["entities"]
        entities = sorted(entities, key=lambda x: x[0]) 
        non_overlapping_entities = []
        prev_start, prev_end = -1, -1

        for start, end, label in entities:
            if start >= prev_end:  # добавляем, если нет пересечения
                non_overlapping_entities.append((start, end, label))
                prev_start, prev_end = start, end
            else:
                print(f"Overlap detected and removed: {(start, end, label)} in text: {text}")

        cleaned_data.append((text, {"entities": non_overlapping_entities}))

    return cleaned_data

# Применяем функцию к данным
TRAIN_DATA = remove_overlapping_entities(TRAIN_DATA)