# 객체-관계 추출

In [2]:
import os
import re
import json
import torch
from model_loader.config import generation_loader
from model_loader.local_loader import LocalModelLoader
from model_loader.ollama_loader import OllamaModelLoader

def _build_extraction_prompt(text_to_analyze):
    """Return the instruction prompt asking the LLM for entities/relations as JSON.

    The text to analyze is interpolated verbatim into the ``sentence`` slot.
    """
    # NOTE(review): the template opens a ```json fence and never closes it.
    # It is left untouched because changing the prompt changes model output.
    return f"""
        Extract entities and their relations from the following sentence.

        **Entities** should be **unique nouns or concepts**, extracted as **noun phrases** whenever possible. Identify **concrete objects or concepts** rather than complex activities or phenomena as entities.

        **Relations** should clearly describe the connection between two entities, preferring **reusable predicate verbs** for a knowledge graph. Use **concise verbs** or clear, hyphenated forms like **'part_of' or 'includes'**.

        Use the following specific and concise relation verbs:
        * 'X circulates through Y' -> **'circulates_through'**
        * 'X supplies Y' -> **'supplies'**
        * 'X is a part of Y' -> **'part_of'**
        * 'X causes Y' -> **'causes'**
        * 'X generates Y' -> **'generates'**
        * 'X exists_in Y' -> **'exists_in'**
        * 'X is the central organ of Y' -> **'central_organ_of'**
        * 'X regulates Y' -> **'regulates'**
        * 'X is caused_by Y' -> **'caused_by'**
        * 'X mediates Y' -> **'mediates'**

        Avoid descriptive or ambiguous relations (e.g., 'X is related to Y', 'X is responsible for Y', 'X performs Y', 'X plays an important role in Y'), and convert them into more precise relations.

        Output the result **only in the following JSON format**, with no other explanations or text:

        ```json
        {{
            "entities": [
                {{"name": "Entity1", "type": "Type (e.g., Organ, System, Substance, Function, Disease)"}},
                {{"name": "Entity2", "type": "Type"}}
            ],
            "relations": [
                {{"head": "Entity1", "relation": "Relation_Type (e.g., part_of, causes)", "tail": "Entity2"}},
                {{"head": "Entity3", "relation": "generates", "tail": "Entity4"}}
            ]
        }}

        sentence : "{text_to_analyze}"
        JSON result : 
    """


def _parse_extraction_output(generated_text):
    """Extract ``(entities, relations)`` from raw LLM output.

    Markdown code fences are removed, then the span from the first ``{`` to the
    last ``}`` is decoded as JSON. Missing ``entities`` / ``relations`` keys
    default to empty lists.

    Returns:
        ``(entities, relations)`` on success, ``(None, None)`` when no JSON
        object can be located or decoded. Every failure prints its reason.
    """
    try:
        generated_text = re.sub(r"```json\s*", '', generated_text, flags=re.IGNORECASE)
        generated_text = re.sub(r"\s*```", '', generated_text)

        json_start = generated_text.find('{')
        json_end = generated_text.rfind('}')

        # A closing brace that is absent (-1) or precedes the opening brace
        # means there is nothing to decode.
        if json_start == -1 or json_end <= json_start:
            print("LLM 출력에서 JSON 객체를 찾지 못함")
            return None, None

        parsed_result = json.loads(generated_text[json_start:json_end + 1].strip())
        return parsed_result.get("entities", []), parsed_result.get("relations", [])

    except json.JSONDecodeError as e:
        print(f"JSON 파싱 오류 : {e}")
        return None, None
    except Exception as e:
        # Best-effort boundary: one malformed response must not abort a batch.
        print(f"JSON 파싱 중 오류 : {e}")
        return None, None


def extract_entities_and_relations_with_llm(text_to_analyze):
    """Ask the configured LLM to extract entities and relations from a text.

    The module-level ``generation_loader`` selects the backend:

    * ``LocalModelLoader``: its ``model`` / ``tokenizer`` attributes are used
      to sample up to 512 new tokens with ``model.generate``.
    * ``OllamaModelLoader``: the prompt is passed to ``generation_loader.generate``.

    The raw model output is printed, then parsed as JSON.

    Args:
        text_to_analyze: Text inserted into the extraction prompt.

    Returns:
        ``(entities, relations)`` as decoded from the model's JSON, or
        ``(None, None)`` when the backend is unusable or the output cannot be
        parsed. Each failure path prints a short diagnostic.
    """
    if isinstance(generation_loader, LocalModelLoader):
        llm_model = generation_loader.model
        llm_tokenizer = generation_loader.tokenizer

        if llm_model is None or llm_tokenizer is None:
            print("LLM 모델 또는 토크나이저가 로드되지 않음")
            return None, None

        try:
            llm_model.eval()
        except Exception as e:
            print(f"모델 eval 모드 전환 실패 : {e}")
            return None, None

        is_local_model = True

    elif isinstance(generation_loader, OllamaModelLoader):
        is_local_model = False

    else:
        print(f"지원하지 않는 generation_loader 타입 : {type(generation_loader).__name__}")
        return None, None

    prompt = _build_extraction_prompt(text_to_analyze)

    if is_local_model:
        inputs = llm_tokenizer(prompt, return_tensors="pt").to(llm_model.device)
        max_new_tokens = 512

        with torch.no_grad():
            outputs = llm_model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                num_return_sequences=1,
                do_sample=True,
                temperature=0.4,
                top_p=0.9,
                repetition_penalty=1.2,
                eos_token_id=llm_tokenizer.eos_token_id,
            )
        # Decode only the newly generated tokens, skipping the echoed prompt.
        generated_text = llm_tokenizer.decode(
            outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
        )
    else:
        generated_text = generation_loader.generate(prompt)

    print("--- LLM 원본 출력 ---")
    print(generated_text)

    return _parse_extraction_output(generated_text)
    
if __name__ == "__main__":
    # Batch driver: run extraction on selected markdown chapters and store one
    # JSON result file per chapter. Chapters with an existing result are skipped.
    data_folder = "./data/split_file/anatomy"
    output_base_folder = "./data/extracted_results"
    os.makedirs(output_base_folder, exist_ok=True)
    file_list = ["1_Embryology.md", "2_Osteology.md", "3_Syndesmology.md"]

    for filename in os.listdir(data_folder):
        if filename not in file_list:
            continue

        output_filename = os.path.splitext(filename)[0] + "_1" + ".json"
        output_filepath = os.path.join(output_base_folder, output_filename)
        if os.path.exists(output_filepath):
            print(f"파일 {output_filename}이 존재")
            continue

        with open(os.path.join(data_folder, filename), 'r', encoding="utf-8") as source:
            text_content = source.read()

        document_extracted_data = {
            "filename": filename,
            "full_original_text": text_content,
            "paragraphs_data": [],
        }
        paragraph_records = document_extracted_data["paragraphs_data"]

        # Paragraphs are separated by one or more blank lines.
        for index, block in enumerate(re.split(r"\n\s*\n+", text_content)):
            trimmed = block.strip()
            if not trimmed:
                continue

            entities, relations = extract_entities_and_relations_with_llm(block)

            # A failed extraction is still recorded, with empty lists.
            paragraph_records.append({
                "paragraph_id": index,
                "original_paragraph_text": trimmed,
                "entities": [] if entities is None else entities,
                "relations": [] if relations is None else relations,
            })

            if entities is None and relations is None:
                print("관게 추출 실패")

        if not paragraph_records:
            print(f"파일 {filename}에서 개체, 관계 추출 실패")
        else:
            with open(output_filepath, 'w', encoding="utf-8") as target:
                json.dump(document_extracted_data, target, ensure_ascii=False, indent=4)
        print("\n" + '=' * 50 + "\n")

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


--- LLM 원본 출력 ---
```json
{
    "entities": [
        {"name": "Embryology", "type": "Field_of_Study"}
    ],
    "relations": []
}
```
--- LLM 원본 출력 ---
```json
{
    "entities": [
        {"name": "Embryology", "type": "Field"},
        {"name": "changes", "type": "Process"},
        {"name": "animal", "type": "Organism"},
        {"name": "egg", "type": "Biological_Structure"},
        {"name": "adult condition", "type": "Life_Stage"},
        {"name": "ontogeny", "type": "Field"},
        {"name": "individual", "type": "Organism"},
        {"name": "phylogeny", "type": "Field"},
        {"name": "animal kingdom", "type": "Taxonomic_Group"}
    ],
    "relations": [
        {"head": "Embryology", "relation": "deals_with", "tail": "changes"},
        {"head": "changes", "relation": "occur_during", "tail": "growth"},
        {"head": "animal", "relation": "undergoes", "tail": "growth"},
        {"head": "growth", "relation": "starts_from", "tail": "egg"},
        {"head": "growth", "r

# 고유 관계, 객체 추출

In [3]:
import json

def extract_unique_entites_and_relations(json_file_path):
    """Collect de-duplicated entities and relations from an extraction result file.

    The file is expected to hold a JSON object whose ``"paragraphs_data"`` list
    contains per-paragraph ``"entities"`` and ``"relations"`` lists.

    Entity handling:
        * An entity becomes a ``(name, type)`` tuple.
        * A missing or empty name is replaced by ``"UNKNOWN_NAME"``.
        * A missing or empty type is replaced by ``"UNKNOWN_TYPE"``.
        * Entries with neither value, or that are not dicts, are skipped.

    Relation handling:
        * Relations are kept as-is, in first-seen order.
        * Duplicates are detected via a canonical JSON key, so key order inside
          a relation does not matter.

    Args:
        json_file_path: Path to the extraction result JSON file.

    Returns:
        ``(unique_entities, unique_relations)``: a set of ``(name, type)``
        tuples and a list of relation objects. ``(set(), [])`` is returned
        (after printing a message) when the file is missing or is not valid
        JSON.
    """
    try:
        with open(json_file_path, 'r', encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"경로 없음 : {json_file_path}")
        return set(), []
    except json.JSONDecodeError:
        print(f"json 디코딩 실패 : {json_file_path}")
        return set(), []

    print(f"json 파일 : {data}")

    unique_entities = set()
    unique_relations = []
    seen_relation_keys = set()

    paragraphs = data.get("paragraphs_data") if isinstance(data, dict) else None
    for paragraph in paragraphs or []:
        if not isinstance(paragraph, dict):
            continue

        for entity in paragraph.get("entities") or []:
            if not isinstance(entity, dict):
                continue
            name = entity.get("name")
            entity_type = entity.get("type")
            if not name and not entity_type:
                continue
            unique_entities.add((name or "UNKNOWN_NAME", entity_type or "UNKNOWN_TYPE"))

        for relation in paragraph.get("relations") or []:
            # Relation dicts are unhashable; a canonical JSON string gives an
            # O(1) membership test instead of scanning the result list.
            relation_key = json.dumps(relation, sort_keys=True, ensure_ascii=False)
            if relation_key not in seen_relation_keys:
                seen_relation_keys.add(relation_key)
                unique_relations.append(relation)

    return unique_entities, unique_relations

if __name__ == "__main__":
    # Print the unique entities/relations of one extraction result and save the
    # same listing to a text file.
    json_file_path = "./data/extracted_results/5_Angiology.json"
    unique_entity_path = "./data/unique/5_Angiology_unique.txt"
    entities, relations = extract_unique_entites_and_relations(json_file_path)

    print("고유 엔티티 : ")
    for name, entity_type in sorted(entities):
        print(f"name : {name}, type : {entity_type}")

    print("고유 관계 : ")
    if not relations:
        print("추출된 관계 없음")
    else:
        for item in relations:
            print(item)

    try:
        with open(unique_entity_path, 'w', encoding="utf-8") as out:
            out.write("<고유 엔티티>\n")
            for name, entity_type in sorted(entities):
                out.write(f"이름 : {name}, 타입 : {entity_type}\n")

            out.write("\n<고유 관계> : \n")
            if not relations:
                out.write("추출된 관계 없음\n")
            else:
                # One JSON object per line.
                for item in relations:
                    out.write(json.dumps(item, ensure_ascii=False) + "\n")
    except IOError as e:
        print(f"파일 저장 중 오류 : {e}")

json 파일 : {'filename': '5_Angiology.md', 'full_original_text': '#### Page 333\n# V. Angiology\n## Introduction\n\nT HE VASCULA R system is divided for descriptive purposes into ( a ) the blood vascular system, which\ncomprises the heart and bloodvessels for the circulation of the blood; and ( b ) the lymph vascular system,\nconsisting of lymph glands and lymphatic vessels, through which a colorless fluid, the lymph, circulates. It\nmust be noted, however, that the two systems communicate with each other and are intimately associated\ndevelopmentally.\n\nThe heart is the central organ of the blood vascular system, and consists of a hollow muscle; by its\ncontraction the blood is pumped to all parts of the body through a complicated series of tubes, termed\narteries. The arteries undergo enormous ramification in their course throughout the body, and end in minute\nvessels, called arterioles, which in their turn open into a close-meshed network of microscopic vessels,\ntermed capillaries.

# 고유 객체-관계 정제 후 json파일에 반영

In [13]:
import json
import os

def parse_txt_data(txt_filepath):
    """Parse a unique-entity/relation listing back into lookup structures.

    Recognised lines (after stripping whitespace):

    * ``이름 : <name>, 타입 : <type>`` — an entity. The line is split on the
      ``", 타입 : "`` marker, so names and types may themselves contain
      ``", "``. Lines without that marker fall back to a plain ``", "`` split
      and are accepted only if it yields exactly two parts.
    * ``{...}`` — a JSON object with ``head``, ``relation`` and ``tail`` keys.

    All other lines (section headers, placeholders) are ignored.

    Args:
        txt_filepath: Path to the UTF-8 text listing.

    Returns:
        ``(unique_entities, unique_relations)``: a dict mapping entity name to
        type (a later line wins for a repeated name) and a set of
        ``(head, relation, tail)`` tuples.
    """
    name_prefix = "이름 :"
    type_marker = ", 타입 : "

    unique_entities = {}
    unique_relations = set()

    with open(txt_filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            if line.startswith(name_prefix):
                body = line[len(name_prefix):]
                name_part, marker, type_part = body.partition(type_marker)
                if not marker:
                    # Fallback for lines that lack the exact type marker.
                    parts = body.split(', ')
                    if len(parts) != 2:
                        continue
                    name_part = parts[0]
                    type_part = parts[1].replace("타입 : ", "")
                unique_entities[name_part.strip()] = type_part.strip()

            elif line.startswith("{") and line.endswith("}"):
                try:
                    rel_data = json.loads(line)
                    if all(k in rel_data for k in ["head", "relation", "tail"]):
                        unique_relations.add((rel_data["head"], rel_data["relation"], rel_data["tail"]))
                except json.JSONDecodeError:
                    print(f"경고: 관계 파싱 오류 - {line}")
                except TypeError:
                    # A list/dict value cannot be part of a set member.
                    print(f"경고: 관계 파싱 오류 - {line}")
    return unique_entities, unique_relations

def _is_hashable(value):
    """Return True when *value* can be used as a dict key or set member."""
    try:
        hash(value)
    except TypeError:
        return False
    return True


def update_json_data(json_data, unique_entities, unique_relations):
    """Apply refined entity types and the relation whitelist to extraction data.

    For every paragraph in ``json_data["paragraphs_data"]``:

    * an entity whose name is a key of ``unique_entities`` gets its ``"type"``
      overwritten with the mapped value; other entities are left untouched;
    * ``"relations"`` is replaced by the subset whose
      ``(head, relation, tail)`` tuple is in ``unique_relations``.

    Entities without a ``"name"``, non-dict items, ``null`` lists and
    unhashable field values are tolerated instead of raising.

    Args:
        json_data: Decoded extraction result; modified in place.
        unique_entities: Mapping of entity name to refined type.
        unique_relations: Set of allowed ``(head, relation, tail)`` tuples.

    Returns:
        The same ``json_data`` object, after modification.
    """
    for paragraph_data in json_data.get("paragraphs_data") or []:
        if not isinstance(paragraph_data, dict):
            continue

        for entity in paragraph_data.get("entities") or []:
            if not isinstance(entity, dict):
                continue
            name = entity.get("name")
            if _is_hashable(name) and name in unique_entities:
                entity["type"] = unique_entities[name]

        filtered_relations = []
        for relation in paragraph_data.get("relations") or []:
            if not isinstance(relation, dict):
                continue
            rel_tuple = (relation.get("head"), relation.get("relation"), relation.get("tail"))
            # An unhashable tuple can never be a member of the whitelist set.
            if _is_hashable(rel_tuple) and rel_tuple in unique_relations:
                filtered_relations.append(relation)
        paragraph_data["relations"] = filtered_relations
    return json_data

def main():
    """Rewrite the Angiology extraction JSON using its refined unique listing.

    Reads the refined entity/relation text file, applies it to the extraction
    JSON, and writes the result back to the same JSON path. Prints an error and
    returns early when either input file does not exist.
    """
    txt_filepath = os.path.join("data", "unique", "5_Angiology_unique.txt")
    json_filepath = os.path.join("data", "extracted_results", "5_Angiology.json")

    # The text listing is checked first, then the JSON file.
    for required_path in (txt_filepath, json_filepath):
        if not os.path.exists(required_path):
            print(f"오류: {required_path} 파일을 찾을 수 없습니다.")
            return

    unique_entities, unique_relations = parse_txt_data(txt_filepath)

    with open(json_filepath, 'r', encoding='utf-8') as source:
        loaded = json.load(source)

    refined = update_json_data(loaded, unique_entities, unique_relations)

    # The input JSON file is overwritten with the refined data.
    with open(json_filepath, 'w', encoding='utf-8') as target:
        json.dump(refined, target, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()

In [None]:
def _iter_extraction_sections(data):
    """Yield each dict in *data* that may hold "entities"/"relations" lists.

    The top-level object is yielded first, then every dict found in its
    ``"paragraphs_data"`` list.
    """
    yield data
    for paragraph in data.get("paragraphs_data") or []:
        if isinstance(paragraph, dict):
            yield paragraph


def generate_knowledge_graph_from_json(json_folder = "./data/extracted_results", output_kg_file = "./data/knowledge_graphs") :
    """Write a plain-text knowledge-graph summary for every JSON file in a folder.

    For each ``*.json`` file in ``json_folder``, entities (name -> type, later
    occurrences overwrite earlier ones) and relations are gathered from the
    top-level ``"entities"`` / ``"relations"`` lists and from the same lists
    inside each ``"paragraphs_data"`` item. They are written to
    ``<output_kg_file>/<basename>_kg.txt`` with one entity or relation per line.

    Args:
        json_folder: Folder containing the extraction result JSON files.
        output_kg_file: Despite its name, the output *directory*; it is
            created if missing.

    Returns:
        None. Files that cannot be read or parsed are reported with a printed
        warning and skipped, so one bad file does not stop the batch.
    """
    os.makedirs(output_kg_file, exist_ok=True)

    # Sorted for a deterministic processing order.
    for filename in sorted(os.listdir(json_folder)):
        if not filename.endswith(".json"):
            continue

        filepath = os.path.join(json_folder, filename)
        all_entities = {}
        all_relations = []
        try:
            with open(filepath, 'r', encoding="utf-8") as f:
                data = json.load(f)

            if not isinstance(data, dict):
                print(f"경고 : {filename} 파일의 최상위 JSON 값이 객체가 아님")
                continue

            for section in _iter_extraction_sections(data):
                for entity in section.get("entities") or []:
                    if not isinstance(entity, dict):
                        continue
                    name = entity.get("name")
                    entity_type = entity.get("type")
                    if name and entity_type:
                        all_entities[name] = entity_type

                for relation in section.get("relations") or []:
                    if not isinstance(relation, dict):
                        continue
                    head = relation.get("head")
                    rel_type = relation.get("relation")
                    tail = relation.get("tail")
                    if head and rel_type and tail:
                        all_relations.append({"head": head, "relation": rel_type, "tail": tail})

            base_filename = os.path.splitext(filename)[0]
            # Use a separate per-file path: overwriting the directory
            # parameter would nest every later file under the previous one.
            kg_filepath = os.path.join(output_kg_file, f"{base_filename}_kg.txt")

            with open(kg_filepath, 'w', encoding="utf-8") as f:
                f.write("<엔티티> : \n")
                for name in sorted(all_entities.keys()):
                    f.write(f"Entity : {name}, Type : {all_entities[name]}\n")
                f.write("<관계> : \n")
                for rel in all_relations:
                    f.write(f'Relation : {rel["head"]} -- ({rel["relation"]}) --> {rel["tail"]}\n')

            print(f"지식 그래프 요약 정보 : {kg_filepath}")
            print(f"총 {len(all_entities)}개의 고유 엔티티와 {len(all_relations)}개의 관계 추출")
            print("=" * 100)
        except json.JSONDecodeError as e:
            print(f"경고 : {filename} 파일 파싱 오류 : {e}")
        except Exception as e:
            # Best-effort batch boundary: report and move on to the next file.
            print(f"경고 : {filename} 파일 처리 중 알 수 없는 오류 : {e}")

    

if __name__ == "__main__":
    # Summaries go to a "knowledge_graphs" sub-folder of the results folder.
    extracted_json_folder = "./data/extracted_results"
    knowledge_graph_output_file = "./data/extracted_results/knowledge_graphs"
    generate_knowledge_graph_from_json(
        json_folder=extracted_json_folder,
        output_kg_file=knowledge_graph_output_file,
    )


지식 그래프 요약 정보 : ./data/extracted_results/knowledge_graphs/1_Embryology_kg.txt
총 1130개의 고유 엔티티와 1308개의 관계 추출


이름 : 19998
----------------------------------------------------------------------------------------------------
타입 : type
