# 프로세스 순서
1. _extract_entities_relations() : 사용자의 질문에서 핵심 엔티티/관계 추출
2. _search_knowledge_graph() : 1번에서 추출된 단어를 바탕으로 지식그래프에서 유사한 단어를 찾고, 연결된 엣지를 가져옴
3. _retrieve_and_rerank_context() : 2번에서 찾은 정보를 이용해 전체 문서에서 관련 있는 후보 문단을 가져온 뒤, Reranker 모델을 이용해 사용자 질문과 가장 관련성이 높은 순서대로 정렬
4. generate_response() : 3번에서 정렬된 문단들을 관련성이 높은 것부터 차례로 담아, LLM이 처리할 수 있는 최대 길이를 넘지 않도록 최종 context를 만듦
5. _build_llm_prompt() : context와 질문을 정해진 템플릿에 포맷팅
6. _call_llm_generate() : 최종 답변 생성

In [None]:
import os
import re
import nltk
import json
import torch
import chromadb
import numpy as np
import networkx as nx
import nltk.downloader
from datetime import datetime
from pykeen.models import ComplEx
from model_loader.config import *
from pykeen.pipeline import pipeline
from transformers import AutoTokenizer
from nltk.tokenize import sent_tokenize
from pykeen.triples import TriplesFactory
from sentence_transformers import CrossEncoder
from pykeen.optimizers import AdamW as PyKeenAdamW
from typing import List, Dict, Any, Tuple, Optional

# Download the NLTK "punkt" / "punkt_tab" data packages at import time.
# `sent_tokenize` (imported above) is used by QASystem for sentence-level chunking.
nltk.download("punkt")
nltk.download("punkt_tab")

# NOTE(review): self-assignment. `generation_loader` is not defined anywhere in this file, so it
# presumably comes from `from model_loader.config import *` — confirm. The statement has no effect
# other than raising NameError here if that name is missing.
generation_loader = generation_loader

class QASystem:
    def __init__(self, 
                 graphml_path: str, 
                 md_path: str,
                 vector_db_path: str = "./chroma_db_split", 
                 similarity_threshold: float = 0.5,
                 chunk_token_threshold: int = 500) -> None:
        """Load models, the knowledge graph and the vector collections, then build the indexes.

        Construction is heavyweight: it loads a tokenizer and a cross-encoder, reads the
        GraphML file, opens a persistent Chroma client, fills the entity/relation/chunk
        collections when they are empty, and trains a KGE model.

        Args:
            graphml_path: Path of the GraphML knowledge graph read with ``nx.read_graphml``.
            md_path: Directory that ``_initialize_chunk_db`` scans for ``.md`` files.
            vector_db_path: Directory handed to ``chromadb.PersistentClient``; it is not
                stored on the instance.
            similarity_threshold: Maximum vector distance for an entity hit to be accepted
                in ``_search_knowledge_graph``.
            chunk_token_threshold: Maximum number of tokenizer tokens per chunk in
                ``_create_chunks_from_text``.
        """
        self.graphml_path = graphml_path
        self.md_path = md_path
        self.similarity_threshold = similarity_threshold
        self.chunk_token_threshold = chunk_token_threshold
        # The LLM loader is injected by the caller after construction (see the __main__ block).
        self.llm_loader = None
        
        # Tokenizer used for chunk sizing and as the fallback length estimator in generate_response.
        self.tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
        # Cross-encoder that scores (question, chunk) pairs during reranking.
        self.reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2")

        self.graph = nx.read_graphml(graphml_path)
        self.client = chromadb.PersistentClient(path=vector_db_path)
        
        self.entity_collection = self.client.get_or_create_collection(name="entities_split")
        self.relation_collection = self.client.get_or_create_collection(name="relations_split")
        self.chunk_collection = self.client.get_or_create_collection(name="chunks")
        # Prompt for entity/relation extraction. It is rendered with str.format, which is why the
        # literal JSON braces are doubled and only {text_to_analyze} is a placeholder.
        # NOTE(review): the ```json fence opened inside the template is never closed before the
        # "sentence :" line — confirm whether that is intentional.
        self.entity_relation_extraction_prompt_template = """
            Extract entities and their relations from the following sentence.

            **Entities** should be **unique nouns or concepts**, extracted as **noun phrases** whenever possible. Identify **concrete objects or concepts** rather than complex activities or phenomena as entities.

            **Relations** should clearly describe the connection between two entities, preferring **reusable predicate verbs** for a knowledge graph. Use **concise verbs** or clear, hyphenated forms like **'part_of' or 'includes'**.

            Output the result **only in the following JSON format**, with no other explanations or text:

            ```json
            {{
                "entities": [
                    {{"name": "Entity1", "type": "Type (e.g., Organ, System, Substance, Function, Disease)"}},
                    {{"name": "Entity2", "type": "Type"}}
                ],
                "relations": [
                    {{"head": "Entity1", "relation": "Relation_Type (e.g., part_of, causes)", "tail": "Entity2"}},
                    {{"head": "Entity3", "relation": "generates", "tail": "Entity4"}}
                ]
            }}

            sentence : "{text_to_analyze}"
            JSON result :
        """

        # These steps read self.graph and the collections created above, so they must run last.
        self._initialize_vector_db()
        self._initialize_chunk_db()
        self.kge_model, self.triples_factory = self._train_kge_model()

    def _preprocess_text(self, text: str) -> str:
        return text.upper().replace(' ', '_')

    def _create_chunks_from_text(self, text: str, page_num: str) -> List[Dict[str, Any]] :
        chunks = []
        paragraphs = re.split("\n\n+", text)
        for para in paragraphs :
            para = para.strip()
            if not para :
                continue

            para_tokens = self.tokenizer.tokenize(para)

            if len(para_tokens) <= self.chunk_token_threshold :
                chunks.append({"document": para, "metadata": {"source_page": page_num}})
            else :
                sentences = sent_tokenize(para)
                current_chunk_sentences = []
                current_chunk_tokens = 0

                for sentence in sentences :
                    sentence_tokens = self.tokenizer.tokenize(sentence)

                    if current_chunk_tokens + len(sentence_tokens) > self.chunk_token_threshold and current_chunk_sentences :
                        chunk_text = " ".join(current_chunk_sentences)
                        chunks.append({"document": chunk_text, "metadata": {"source_page": page_num}})
                        current_chunk_sentences = [sentence]
                        current_chunk_tokens = len(sentence_tokens)
                    else :
                        current_chunk_sentences.append(sentence)
                        current_chunk_tokens += len(sentence_tokens)

                if current_chunk_sentences :
                    chunk_text = " ".join(current_chunk_sentences)
                    chunks.append({"document": chunk_text, "metadata": {"source_page": page_num}})

        return chunks


    def _initialize_chunk_db(self) :
        if self.chunk_collection.count() > 0 :
            print("DB가 이미 초기화되어있음")
            return
        
        print("청크 DB 초기화 시작")
        all_chunks = []
        all_md_files = [f for f in os.listdir(self.md_path) if f.endswith(".md")]

        for md_file in all_md_files :
            with open(os.path.join(self.md_path, md_file), 'r', encoding="utf-8") as f :
                content = f.read()

            page_matches = re.finditer(r"####\s+Page\s+(\d+)\b(.*?)(?=####\s+Page|\Z)", content, re.S)
            for match in page_matches :
                page_num = match.group(1).strip()
                page_content = match.group(2).strip()
                if page_content :
                    chunks = self._create_chunks_from_text(page_content, page_num)
                    all_chunks.extend(chunks)

        if all_chunks :
            documents = [chunk["document"] for chunk in all_chunks]
            metadatas = [chunk["metadata"] for chunk in all_chunks]

            ids = [f"chunk_{i}_{datetime.now().timestamp()}" for i in range(len(documents))]
            self.chunk_collection.add(ids=ids, documents=documents, metadatas=metadatas)
        print(f"청크DB 초기화 완료. {self.chunk_collection.count()}개의 청크 추가")


    def _initialize_vector_db(self):
        if self.entity_collection.count() == 0:
            nodes_to_add = []
            unique_nodes = set()
            for node, data in self.graph.nodes(data=True):
                processed_node = self._preprocess_text(node)
                if processed_node not in unique_nodes :
                    metadata = {k: str(v) for k, v in data.items()}
                    metadata['original_name'] = node
                    nodes_to_add.append({'id': processed_node, 'document': processed_node, 'metadata': metadata})
                    unique_nodes.add(processed_node)
            
            if nodes_to_add:
                ids = [item['id'] for item in nodes_to_add]
                documents = [item['document'] for item in nodes_to_add]
                metadatas = [item['metadata'] for item in nodes_to_add]
                self.entity_collection.add(ids=ids, documents=documents, metadatas=metadatas)

        if self.relation_collection.count() == 0:
            edges_to_add = []
            unique_processed_relations = set()
            for u, v, data in self.graph.edges(data=True):
                relation_type = data.get('type')
                if relation_type:
                    processed_relation = self._preprocess_text(relation_type)
                    if processed_relation not in unique_processed_relations :
                        metadata = {'original_name': relation_type}
                        edges_to_add.append({'id': processed_relation, 'document': processed_relation, 'metadata': metadata})
                        unique_processed_relations.add(processed_relation)

            if edges_to_add:
                ids = [item['id'] for item in edges_to_add]
                documents = [item['document'] for item in edges_to_add]
                metadatas = [item['metadata'] for item in edges_to_add]
                self.relation_collection.add(ids=ids, documents=documents, metadatas=metadatas)
    
    def _train_kge_model(self) :
        triples = []
        for u, v, data in self.graph.edges(data=True) :
            relation_type = data.get("type")
            if relation_type and isinstance(relation_type, str):
                triples.append((str(u), str(relation_type), str(v)))

        if not triples :
            print("KGE 모델 학습을 위한 트리플이 없음")
            return None, None
        # print(f"생성된 트리플 : {len(triples)}")
        # if len(triples) > 0 :
        #     print(f"첫 5개 : {triples[:5]}")

        triples_array = np.array(triples)
        # print(f"Numpy 배열의 형태 : {triples_array.shape}")

        training_triples_factory = TriplesFactory.from_labeled_triples(
            triples=triples_array,
            create_inverse_triples=True
        )
        # print(f"TriplesFactory 생성 완료. 엔티티 수 : {training_triples_factory.num_entities}, 관계 수 : {training_triples_factory.num_relations}")

        training_set, testing_set = training_triples_factory.split()

        result = pipeline(
            training=training_set,
            testing=testing_set,
            model=ComplEx,
            optimizer=PyKeenAdamW,
            training_kwargs=dict(num_epochs=100, batch_size=256, use_tqdm_batch=False),
            optimizer_kwargs=dict(lr=0.01),
            device="cuda" if torch.cuda.is_available() else "cpu"
        )

        print("KGE 모델 학습 완료")
        return result.model, training_triples_factory
    
    def _get_kge_embedding(self, entity_name: str) -> Optional[torch.Tensor] :
        if self.kge_model is None or self.triples_factory is None :
            return None
        
        if entity_name in self.triples_factory.entity_to_id :
            entity_id = self.triples_factory.entity_to_id[entity_name]
            return self.kge_model.entity_representations[0](torch.tensor([entity_id], device=self.kge_model.device)).real.detach().cpu()
        return None
    
    def _get_kge_relation_embedding(self, relation_name: str) -> Optional[torch.Tensor] :
        if self.kge_model is None or self.triples_factory is None :
            return None
        
        if relation_name in self.triples_factory.relation_to_id :
            relation_id = self.triples_factory.relation_to_id[relation_name]
            return self.kge_model.relation_representations[0](torch.tensor([relation_id], device=self.kge_model.device)).real.detach().cpu()
        return None

    def _extract_entities_relations(self, question) :
        prompt = self.entity_relation_extraction_prompt_template.format(text_to_analyze=question)
        raw_llm_output = self._call_llm_generate(prompt)

        try :
            json_start = raw_llm_output.find("{")
            json_end = raw_llm_output.rfind("}") + 1
            if json_start != -1 and json_end != -1 and json_end > json_start :
                json_str = raw_llm_output[json_start:json_end]
                extracted_data = json.loads(json_str)
                return extracted_data.get("entities", []), extracted_data.get("relations", [])
            else :
                print(f"LLM 답변에서 유효한 JSON 형태를 찾을 수 없음 : {raw_llm_output}")
                return [], []
            
        except json.JSONDecodeError as e :
            print(f"개체 추출 과정에서 JSON 디코딩 오류 발생: {e}")
            print(f"오류 발생 원문: {raw_llm_output}")
            return [], []

    def _search_knowledge_graph(self, entities: List[str]) -> List[Dict[str, Any]]:
        if not entities:
            return []

        # 엔티티 리스트
        query_texts = [e['name'] for e in entities if 'name' in e]
        if not query_texts:
            return []

        # 벡터DB를 이용해 유사 엔티티 검색
        entity_results = self.entity_collection.query(
            query_texts=query_texts,
            n_results=5,
            include=["metadatas", "documents", "distances"]
        )
        
        # 유사도 거리가 작은 엔티티 필터링
        similar_entities = set()
        if entity_results.get('distances'):
            for i, dists in enumerate(entity_results['distances']):
                for j, dist in enumerate(dists):
                    if dist <= self.similarity_threshold:
                        meta = entity_results['metadatas'][i][j]
                        similar_entities.add(meta['original_name'])
        
        found_results = []
        seen = set()
        # u : 시작노드, v : 끝노드, data : 엣지의 데이터
        for u, v, data in self.graph.edges(data=True): # 그래프 객체를 순회하며 유사 엔티티를 찾음
            if u in similar_entities or v in similar_entities:
                identifier = (u, v, data.get('type'))
                if identifier not in seen:
                    result = data.copy()
                    result['source_node'] = u
                    result['target_node'] = v
                    found_results.append(result)
                    seen.add(identifier)
                    
        return found_results

    def _retrieve_and_rerank_context(self, question: str, search_results: List[Dict[str, Any]], top_k_retrieval: int = 20, top_k_rerank: int = 5) -> List[Dict[str, Any]]:
        mentioned_entities = set()
        for res in search_results: # 검색된 결과의 양 끝 노드 추가
            mentioned_entities.add(res.get('source_node'))
            mentioned_entities.add(res.get('target_node'))
        
        mentioned_entities = {e for e in mentioned_entities if e}

        if not mentioned_entities:
            return []

        query_text = " ".join(list(mentioned_entities))
        
        results = self.chunk_collection.query(
            query_texts=[query_text],
            n_results=top_k_retrieval,
            include=["documents", "metadatas"]
        )
        
        candidate_chunks = []
        seen_chunks = set()
        if results['documents'] and results['documents'][0]:
            for i in range(len(results['documents'][0])):
                doc = results['documents'][0][i]
                if doc not in seen_chunks:
                    candidate_chunks.append({
                        "document": doc,
                        "metadata": results['metadatas'][0][i]
                    })
                    seen_chunks.add(doc)

        if not candidate_chunks:
            return []

        rerank_pairs = [(question, chunk['document']) for chunk in candidate_chunks]
        if not rerank_pairs:
            return []

        scores = self.reranker.predict(rerank_pairs)

        reranked_results = []
        for score, chunk in zip(scores, candidate_chunks) :
            chunk["rerank_score"] = score
            reranked_results.append(chunk)

        reranked_results.sort(key=lambda x : x["rerank_score"], reverse=True)

        return reranked_results[:top_k_rerank]
    
    # def _build_llm_prompt(self, question: str, context: str, pages: List[str]) -> str:
    def _build_llm_prompt(self, question: str, context: str) -> str:
        prompt = f"""
        You are a helpful assistant who answers questions based on the provided context.
        You MUST cite the source page number for every piece of information you use.

        **Instructions:**
        1. Answer the user's question clearly and concisely using ONLY the provided context and knowledge graph information.
        2. For every statement, you MUST provide the source page number in parentheses, like this: (Page XX).
        3. If a single piece of information is supported by multiple pages, cite all of them: (Page X, Y, Z).
        4. If no context is available, state that you are answering based on the graph structure alone.

        **Example of a GOOD answer:**
        The ductus arteriosus degenerates into the ligamentum arteriosum after birth(page 360). This is a normal physiological change that happens post-delivery(page 361).

        **Example of a BAD answer:** -> (This is a bad answer because it lacks the mandatory citation)
        The ductus arteriosus becomes the ligamentum arteriosum.

        ---
        **Context:**
        {context}
        ---
        **Question:**
        {question}
        ---
        **Answer:**
        """
        return prompt.strip()
    
    def _call_llm_generate(self, prompt: str) -> str:
        if self.llm_loader:
            if hasattr(self.llm_loader, "tokenizer") and hasattr(self.llm_loader, "model"):
                tokenizer = self.llm_loader.tokenizer
                model = self.llm_loader.model

                input_ids = tokenizer.encode(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
                attention_mask = (input_ids != tokenizer.pad_token_id).long().to(model.device)

                output = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=500,
                    temperature=0.0,
                    do_sample=False,
                    top_p=0.85,
                    repetition_penalty=1.2,
                    early_stopping=True,
                    num_beams=3,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id
                )
                generated_ids = output[0][input_ids.shape[-1]:]
                raw_answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
                return raw_answer
            else:
                raw_answer = self.llm_loader.generate(prompt)
                return raw_answer
        else:
            print("generation_loader가 로드되지 않음")
            return "LLM 로더가 설정되지 않았습니다."

    def generate_response(self, question: str) -> Tuple[str, str]:
        entities, relations = self._extract_entities_relations(question)
        if not entities and not relations:
            return "질문에서 유효한 엔티티나 릴레이션을 추출할 수 없습니다.", ""
        
        search_results = self._search_knowledge_graph(entities)
        if not search_results:
            return "관련 정보를 지식 그래프에서 찾을 수 없습니다.", ""
            
        reranked_chunks = self._retrieve_and_rerank_context(question, search_results)
        if not reranked_chunks:
            no_context_message = "지식 그래프에서 관련 엔티티를 찾았지만, 문서에서 해당 내용을 포함하는 구체적인 컨텍스트를 찾을 수 없습니다."
            simple_answer_parts = []
            for res in search_results[:3] :
                simple_answer_parts.append(f"{res['source_node']} -[{res.get('type')}]-> {res['target_node']}")
            if simple_answer_parts :
                no_context_message += "\n그래프 기반 정보: " + ", ".join(simple_answer_parts)
            return no_context_message, ""
        
        final_context_parts = []
        final_pages = set()
        current_len = 0

        if hasattr(self.llm_loader, 'tokenizer') and self.llm_loader.tokenizer is not None:
            print("LLM 로더의 특정 토크나이저를 사용하여 길이를 계산합니다.")
            llm_tokenizer = self.llm_loader.tokenizer
            max_len = getattr(llm_tokenizer, 'model_max_length', 512) - 150
            
            base_prompt = self._build_llm_prompt(question, "")
            base_prompt_len = len(llm_tokenizer.tokenize(base_prompt))
            current_len += base_prompt_len

            for chunk in reranked_chunks:
                page_num = chunk['metadata'].get('source_page', 'N/A')
                context_snippet = f"... {chunk['document']} ... (출처: Page {page_num})"
                chunk_token_len = len(llm_tokenizer.tokenize(context_snippet))
                
                if current_len + chunk_token_len <= max_len:
                    final_context_parts.append(context_snippet)
                    current_len += chunk_token_len
                    if page_num != 'N/A':
                        final_pages.add(page_num)
                else:
                    break
        else:
            print("범용 토크나이저를 사용하여 길이를 근사치로 계산합니다. (Ollama 등)")
            proxy_tokenizer = self.tokenizer  
            max_len = 2048 - 500 

            for chunk in reranked_chunks:
                page_num = chunk['metadata'].get('source_page', 'N/A')
                context_snippet = f"... {chunk['document']} ... (출처: Page {page_num})"
                chunk_token_len = len(proxy_tokenizer.tokenize(context_snippet))
                
                if current_len + chunk_token_len <= max_len:
                    final_context_parts.append(context_snippet)
                    current_len += chunk_token_len
                    if page_num != 'N/A':
                        final_pages.add(page_num)
                else:
                    break

        if not final_context_parts:
            return "관련 정보를 찾았으나, 모델의 입력 길이 제한으로 인해 컨텍스트를 구성할 수 없습니다.", ""

        context = "\n\n".join(final_context_parts)
        
        prompt = self._build_llm_prompt(question, context)
        
        answer = self._call_llm_generate(prompt)
        return answer, context

def save_results_to_file(question: str, answer: str, context: str, output_dir: str, file_index: int):
    """Write one question/context/answer triple to a new text file in ``output_dir``.

    The directory is created if needed. The file is named
    ``result_<file_index>_<HHMMSS_microseconds>.txt`` and is UTF-8 encoded.
    """
    os.makedirs(output_dir, exist_ok=True)

    stamp = datetime.now().strftime("%H%M%S_%f")
    target_path = os.path.join(output_dir, f"result_{file_index}_{stamp}.txt")

    report = "".join((
        f"[질문]\n{question}\n\n",
        f"[근거]\n{context}\n\n",
        f"[답변]\n{answer}\n",
    ))
    with open(target_path, 'w', encoding='utf-8') as out_file:
        out_file.write(report)

# Script entry point: build the QA system, run a fixed list of evaluation questions through it
# and save each question/context/answer triple to a dated result folder.
if __name__ == '__main__':
    qa_system = QASystem(
        graphml_path="./data/knowledge_graph/knowledge_graph.graphml",
        md_path="./data/split_file/anatomy/"
    )
    
    # The LLM loader is attached after construction; QASystem.__init__ leaves it as None.
    qa_system.llm_loader = generation_loader
    
    # Evaluation questions grouped by source markdown file; the trailing comment on each
    # line is the page number the question was written against.
    questions = [
        ############## 1_Embryology.md
        "What are the two essential components of a higher organism cell as defined in the text?", # page 7
        "Describe the four main phases of indirect cell division (karyokinesis) as outlined in the text.", # page 7
        "What is the primary role of the yolk-sac in the embryo's early development?", # page 20
        "How does the embryo separate from the yolk-sac, and what does the enclosed part of the yolk-sac form?", # page 19
        "What significant developments occur in a human embryo during the Second Week?", # page 33
        "What are the key characteristics of the human embryo by the end of the Third Week?", # page 33
        
        ############## 2_Osteology.md
        "What are the three groups into which the cells of a primitive segment differentiate, and what do they form?", # page 38
        "How is each vertebral body formed from primitive segments during development?", # page 38
        "What are the sphenoidal air sinuses, and where are they located within the sphenoid bone?", # page 88
        "Describe the sphenoidal rostrum and its articulation.",# 88
        "What is the tibia, and where is it located in the human leg?", # 158
        "Describe the superior articular surface of the tibia's upper extremity.", # 158

        ############## 3_Syndesmology.md
        "What are joints or articulations, and how are immovable joints characterized?", # 174
        "How does the articular lamella differ from ordinary bone tissue?", # 174
        "Where is the synovial membrane located in relation to the glenoid cavity and humerus, and how does it interact with the Biceps brachii tendon?", # 207
        "List some of the bursae located near the shoulder-joint and specify which ones communicate with the synovial cavity.", # 207
        "What is the function of the plantar calcaneonavicular ligament, and what condition results if it yields?", # 236
        "How are the navicular bone and the three cuneiform bones connected, and what type of movement do they permit?", # 236

        ############## 4_Myology.md
        "How does the nervous system serve as an indicator for the origin and migration paths of developing muscles, despite not influencing muscle differentiation?", # 250
        "Describe the structural components of striped or voluntary muscle, from bundles to individual fibers.", # 250
        "What is the triangular ligament and where is it located?", # 290
        "What structures perforate the superficial layer (inferior fascia) of the urogenital diaphragm?", # 290
        "Where does the Extensor digitorum longus muscle originate, and what structures are located between it and the Tibialis anterior?", # 322
        "What is the Peronæus tertius, and where is it inserted?", # 322

        ############## 5_Angiology.md
        "What are the main characteristics of the middle coat (tunica media) of arteries, and how does its composition vary with vessel size?", # 334
        "Describe the composition and variations of the external coat (tunica adventitia) in arteries.", # 334
        "How do the Vitelline Veins develop into parts of the portal and hepatic veins?", # 345
        "What happens to the Umbilical Veins during embryonic development and after birth?", # 345
        "What are the three phases of a cardiac cycle and what happens during each?", # 358
        "What are the main peculiarities observed in the fetal heart's vascular system?" # 359
    ]   

    # Results go to ./result/knowledge_graph/<month>월<day>일/ (folder named after today's date).
    today = datetime.now()
    folder_name = f"{today.month}월{today.day}일"
    output_dir = os.path.join("./result", "knowledge_graph", folder_name)
    for i, q in enumerate(questions):
        print(f"질문: {q}")
        response, context = qa_system.generate_response(q)
        print(f"답변: {response}\n")
        # File indices are 1-based.
        save_results_to_file(q, response, context, output_dir, i + 1)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
using automatically assigned random_state=756960678
No random seed is specified. Setting to 1382302168.
INFO:pykeen.triples.triples_factory:Creating inverse triples.


DB가 이미 초기화되어있음


Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

INFO:pykeen.triples.triples_factory:Creating inverse triples.


Evaluating on cuda:0:   0%|          | 0.00/827 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.07s seconds


KGE 모델 학습 완료
질문: What are the two essential components of a higher organism cell as defined in the text?
범용 토크나이저를 사용하여 길이를 근사치로 계산합니다. (Ollama 등)
답변: According to the text, the two essential components of a higher organism cell are cytoplasm and a nucleus (Page 6). The cytoplasm is described as a soft, jelly-like material, while the nucleus is a small spherical body embedded within it (Page 6).

질문: Describe the four main phases of indirect cell division (karyokinesis) as outlined in the text.
답변: 관련 정보를 지식 그래프에서 찾을 수 없습니다.

질문: What is the primary role of the yolk-sac in the embryo's early development?
범용 토크나이저를 사용하여 길이를 근사치로 계산합니다. (Ollama 등)
답변: The yolk-sac provides nourishment to the embryo during its earlier stages of existence (Page 20). It contains vitelline fluid which may be utilized for this nourishment (Page 20). Nutritive material is absorbed from the yolk-sac and conveyed to the embryo via the vitelline circulation (Page 20).

질문: How does the embryo separate from the yo

: 

In [None]:
# def _search_knowledge_graph(self, entities: List[str], relations: List[str]) -> List[Dict[str, Any]]:
    #     processed_entities = [self._preprocess_text(e["name"]) for e in entities if "name" in e]
    #     processed_relations = [self._preprocess_text(r["relation"]) for r in relations if "relation" in r]

    #     found_results = []

    #     similar_entities = []
    #     if processed_entities:
    #         entity_results = self.entity_collection.query(
    #             query_texts=processed_entities,
    #             n_results=2, 
    #             include=["metadatas", "distances"]
    #         )
    #         if entity_results['distances']:
    #             for i, dists in enumerate(entity_results['distances']):
    #                 for j, dist in enumerate(dists):
    #                     if dist <= self.similarity_threshold:
    #                         meta = entity_results['metadatas'][i][j]
    #                         similar_entities.append(meta['original_name'])
        
    #     similar_relations = []
    #     if processed_relations:
    #         relation_results = self.relation_collection.query(
    #             query_texts=processed_relations,
    #             n_results=1,
    #             include=["metadatas", "distances"]
    #         )
    #         if relation_results['distances'] and relation_results['distances'][0]:
    #             if relation_results['distances'][0][0] <= self.similarity_threshold:
    #                 meta = relation_results['metadatas'][0][0]
    #                 similar_relations.append(meta['original_name'])

    #     similar_entities = list(set(similar_entities))

    #     inferred_relations = []
    #     if self.kge_model and entities :
    #         for entity_data in entities :
    #             entity_name = entity_data["name"]
    #             head_emb = self._get_kge_embedding(entity_name)
    #             if head_emb is not None :
    #                 for rel_name in self.triples_factory.relation_to_id.keys() :
    #                     rel_emb = self._get_kge_relation_embedding(rel_name)
    #                     if rel_emb is not None :
    #                         if rel_name not in similar_relations :
    #                             inferred_relations.append(rel_name)

    #     all_relevant_relations = list(set(similar_relations + inferred_relations))

    #     if similar_entities and all_relevant_relations:
    #         for u, v, data in self.graph.edges(data=True):
    #             if (u in similar_entities or v in similar_entities) and data.get('type') in all_relevant_relations:
    #                 result = data.copy()
    #                 result['source_node'] = u
    #                 result['target_node'] = v
    #                 if 'source_page' in result:
    #                     found_results.append(result)
        
    #     if not found_results and similar_entities:
    #         for u, v, data in self.graph.edges(data=True):
    #             if u in similar_entities or v in similar_entities:
    #                 result = data.copy()
    #                 result['source_node'] = u
    #                 result['target_node'] = v
    #                 if 'source_page' in result:
    #                     found_results.append(result)
        
    #     if not found_results:
    #         return []
            
    #     unique_results = []
    #     seen = set()
    #     for res in found_results:
    #         identifier = (res.get('source_node'), res.get('target_node'), res.get('type'))
    #         if identifier not in seen:
    #             unique_results.append(res)
    #             seen.add(identifier)
                
    #     return unique_results
    
    # def _retrieve_context_from_md(self, search_results: List[Dict[str, Any]], n_sentences: int = 2) -> Tuple[str, List[str]]:
    #     context = ""
    #     pages = sorted(list(set(res.get('source_page') for res in search_results if res.get('source_page'))))
        
    #     if not pages:
    #         return "", []

    #     all_md_files = [f for f in os.listdir(self.md_path) if f.endswith('.md')]
        
    #     page_texts = {}
    #     for page_num_str in pages:
    #         page_num = int(page_num_str)
    #         for md_file in all_md_files:
    #             with open(os.path.join(self.md_path, md_file), 'r', encoding='utf-8') as f:
    #                 content = f.read()
                
    #             match = re.search(rf"####\s+Page\s+{page_num}\b(.*?)(?=####\s+Page|\Z)", content, re.S)
    #             if match:
    #                 page_texts[page_num_str] = match.group(1).strip()
    #                 break
        
    #     context_parts = []
    #     for result in search_results:
    #         page_num = result.get('source_page')
    #         page_content = page_texts.get(page_num)
            
    #         if not page_content:
    #             continue
                
    #         source_node = result.get('source_node')
    #         target_node = result.get('target_node')
            
    #         sentences = sent_tokenize(page_content)
    #         for i, sent in enumerate(sentences):
    #             if source_node and source_node in sent and target_node and target_node in sent:
    #                 start = max(0, i - n_sentences)
    #                 end = min(len(sentences), i + n_sentences + 1)
    #                 context_snippet = " ".join(sentences[start:end])
    #                 context_parts.append(f"... {context_snippet} ... (출처: Page {page_num})")
    #                 break
        
    #     context = "\n".join(context_parts)
    #     return context, pages