In [None]:
import pandas as pd
import json

In [None]:
# Display settings: no limit on the width of cell text, at most 5 rows per frame.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 5

In [None]:
# Location of the labeled text file that is read with pd.read_csv below.
# NOTE(review): absolute, machine-specific Windows path — the notebook cannot run
# on another machine without editing this line.
path_to_labeled_lpu = r'D:\Veeple\IT\Python_scripts\Scripts\Обучение aidar_musin\инфа из Согласия\Список_ЛПУ_labeled.txt'

In [None]:
# Load the labeled file using pandas' default CSV parsing options.
df_lpu_labeled = pd.read_csv(path_to_labeled_lpu)

In [None]:
# Decode the JSON-encoded 'label' column into Python objects.
# Values that are already lists are left untouched so this cell can be re-run:
# json.loads raises TypeError on a list, which used to break a second execution.
df_lpu_labeled['label'] = df_lpu_labeled['label'].apply(
    lambda raw_label: raw_label if isinstance(raw_label, list) else json.loads(raw_label)
)
df_lpu_labeled

In [None]:
def transform_lpu_dataframe(df):
    """
    Convert labeled rows into a list of address records.

    The "text" and "label" columns of ``df`` are read in parallel. Every item
    of a row's label must provide the keys "labels", "text", "start" and
    "end"; only the first entry of "labels" is kept as the entity type.

    Returns a list of dicts with the keys "address" and "extracted_entities".
    """
    def _to_entity(annotation):
        # Only the first label of an annotation is used as its type.
        return {
            "type": annotation["labels"][0],
            "text": annotation["text"],
            "start": annotation["start"],
            "end": annotation["end"],
        }

    return [
        {
            "address": address_text,
            "extracted_entities": [_to_entity(annotation) for annotation in annotations],
        }
        for address_text, annotations in zip(df["text"], df["label"])
    ]

In [None]:
# Convert the labeled frame into a list of {"address", "extracted_entities"} records.
new_result = transform_lpu_dataframe(df_lpu_labeled)

In [None]:
from transformers import pipeline
import torch
import logging

In [None]:
# INFO records are below the root logger's default WARNING threshold, so without
# this call the device message is silently dropped. basicConfig() does nothing
# when the root logger already has handlers configured.
logging.basicConfig(level=logging.INFO)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
logging.info("using device: %s", device)  # lazy %-formatting; message text unchanged

# NER pipeline backed by the "aidarmusin/address-ner-ru" model, placed on `device`.
address_ner_pipeline = pipeline("ner", model="aidarmusin/address-ner-ru", device=device)

In [None]:
def process_address(address):
    """Run the address NER pipeline on ``address`` and return its output unchanged."""
    return address_ner_pipeline(address)

In [None]:
# Location of the Excel workbook that is read with pd.read_excel below.
# NOTE(review): absolute, machine-specific Windows path — the notebook cannot run
# on another machine without editing this line.
path_to_5000lpu = r'D:\Veeple\IT\Python_scripts\Scripts\Обучение aidar_musin\инфа из Согласия\Список_ЛПУ.xlsx'

In [None]:
# Load the workbook and keep only the address column. The explicit .copy() makes
# `df` an independent frame, so adding a column to it later does not raise
# pandas' SettingWithCopyWarning.
df = pd.read_excel(path_to_5000lpu)[['Адрес ЛПУ']].copy()
df

In [None]:
# Run process_address on every address, one row at a time, and store the result
# in a new column of `df`.
# NOTE(review): this mutates `df` in place and invokes the model once per row,
# so it may be slow on the full file.
df['Адрес ЛПУ_aidared'] = df['Адрес ЛПУ'].apply(process_address)
df

In [None]:
def merge_subtokens(entities, address):
    """
    Merge token-level predictions into whole entities using the original text.

    Args:
        entities: iterable of dicts with the keys "entity" (a tag such as
            "B-<type>" or "I-<type>"), "start" and "end" (offsets into
            ``address``).
        address: the string that the offsets refer to.

    Returns:
        A list of dicts with the keys "type", "text", "start" and "end".
        A new entity begins on every "B-" tag and whenever the type changes;
        otherwise the current entity is extended to the token's end offset and
        its text is re-sliced from ``address``.
    """
    merged_entities = []
    current_entity = None

    for entity in entities:
        tag = entity["entity"]
        # Strip only the leading B-/I- marker. Replacing the substrings anywhere
        # in the tag would corrupt types that contain them (e.g. "SUB-AREA").
        entity_type = tag[2:] if tag.startswith(("B-", "I-")) else tag
        start, end = entity["start"], entity["end"]

        starts_new_entity = (
            tag.startswith("B-")
            or current_entity is None
            or current_entity["type"] != entity_type
        )

        if starts_new_entity:
            # Flush the entity collected so far before opening a new one.
            if current_entity is not None:
                merged_entities.append(current_entity)
            current_entity = {
                "type": entity_type,
                "text": address[start:end],  # take the original text, not the sub-token
                "start": start,
                "end": end,
            }
        else:
            # Continuation: extend the span and re-slice the text from the source.
            current_entity["end"] = end
            current_entity["text"] = address[current_entity["start"]:end]

    # Do not lose the entity that was still open when the input ended.
    if current_entity is not None:
        merged_entities.append(current_entity)

    return merged_entities

In [None]:
def transform_dataframe(df):
    """
    Convert the frame into a list of address records.

    For each row, the "Адрес ЛПУ" value is taken as the address (a list value
    is joined with ", "), and the predictions stored in "Адрес ЛПУ_aidared"
    are merged against that address with merge_subtokens.

    Returns a list of dicts with the keys "address" and "extracted_entities".
    """
    def _row_to_record(row):
        raw_address = row["Адрес ЛПУ"]
        # A list-valued address is flattened into one comma-separated string.
        address = ", ".join(raw_address) if isinstance(raw_address, list) else raw_address
        return {
            "address": address,
            "extracted_entities": merge_subtokens(row["Адрес ЛПУ_aidared"], address),
        }

    return [_row_to_record(row) for _, row in df.iterrows()]

In [None]:
# Convert the model-annotated frame into a list of {"address", "extracted_entities"} records.
old_result = transform_dataframe(df)

In [None]:
# Keep the first 2500 records; slicing already produces a new (shallow) list.
old_dataset = old_result[:2500]

In [None]:
# Shallow copy of the labeled records, so the original list itself is not shared.
new_dataset = list(new_result)

In [None]:
# Concatenate the records: everything from old_dataset followed by new_dataset.
mixed_dataset = old_dataset + new_dataset
# Preview only the first records; echoing the whole list floods the cell output.
mixed_dataset[:3]

In [None]:
# Save to file: indented JSON, non-ASCII characters written as-is (UTF-8).
with open("mixed_dataset.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(mixed_dataset, ensure_ascii=False, indent=4))