In [None]:
import numpy as np
import re
import os
import json
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from model_loader.config import embedding_loader, generation_loader
from chroma_db import ChromaVectorDB

class HybridSearcher:
    def __init__(self, embedding_model, chunk_size=200, chunk_overlap=50, 
                 persist_directory="./chroma_db", collection_name="Gray"):
        """
        하이브리드 검색 클래스 초기화
        
        Args:
            embedding_model: 임베딩 모델
            chunk_size (int): 청크 크기
            chunk_overlap (int): 청크 간 중복 크기
            persist_directory (str): 벡터 DB 저장 경로
            collection_name (str): 벡터 DB 컬렉션 이름
        """
        self.embedding_model = embedding_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.chunks = None
        self.chunk_metadata = None
        self.bm25_index = None
        self.vector_index = None
        
        # ChromaDB 벡터 DB 클래스 초기화 - 동일한 임베딩 모델 사용
        self.vector_db = ChromaVectorDB(
            embedding_model=self.embedding_model,
            persist_directory=persist_directory,
            collection_name=collection_name
        )
        
        # 검색 결과 비교용 플래그
        self.debug_mode = False
        
        # 프롬프트 로드
        self.complexity_prompt = self._load_prompt("prompts/en/complex/complex_prompt.txt")
        self.decompose_prompt = self._load_prompt("prompts/en/decompose/decompose_prompt.txt")

    def _load_prompt(self, file_path):
        """프롬프트 파일 로드"""
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        except Exception as e:
            print(f"프롬프트 파일 로드 중 오류 발생: {e}")
            # 기본 프롬프트 반환
            if "complex" in file_path:
                return "질문을 분석하여 복합적인 질문인지 판단하세요. 복합적인 질문은 여러 하위 질문으로 분해할 수 있습니다. '예' 또는 '아니오'로만 답변하세요."
            elif "decompose" in file_path:
                return "다음 복합적인 질문을 여러 개의 간단한 하위 질문으로 분해하세요. JSON 형식으로 하위 질문 목록을 반환하세요."

    def load_document(self, file_path):
        """문서 로드 및 청크 분할"""
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
        
        # 문서를 청크로 분할
        self.chunks, self.chunk_metadata = self._split_into_chunks_with_metadata(content)
        
        # BM25 인덱스 생성
        tokenized_chunks = [self._simple_tokenize(chunk) for chunk in self.chunks]
        self.bm25_index = BM25Okapi(tokenized_chunks)
        
        # 벡터 임베딩 생성 및 인메모리 인덱스 구축
        self.vector_index = self.embedding_model.encode(self.chunks)
        
        # ChromaDB에 문서 추가
        # 문서 ID 생성
        doc_name = os.path.basename(file_path)
        doc_ids = [f"{doc_name}_{i}" for i in range(len(self.chunks))]
        
        # 메타데이터에 페이지 정보 포함
        metadatas = [{"page": meta["page"], "source": doc_name, "chunk_index": i} 
                    for i, meta in enumerate(self.chunk_metadata)]
        
        # 벡터 DB에 문서 추가
        self.vector_db.add_documents(
            documents=self.chunks,
            metadatas=metadatas,
            ids=doc_ids
        )
        
        return len(self.chunks)
    
    def search(self, query, top_n=5, alpha=0.5, use_vector_db=True):
        """
        하이브리드 검색 수행
        
        Args:
            query (str): 검색 쿼리
            top_n (int): 반환할 결과 수
            alpha (float): BM25와 벡터 검색 결과 결합 비율 (0에 가까울수록 벡터 검색 중시)
            use_vector_db (bool): ChromaDB 벡터 DB 사용 여부
        
        Returns:
            list: 검색 결과 리스트
        """
        if self.chunks is None or self.bm25_index is None:
            raise ValueError("문서가 로드되지 않았습니다. load_document()를 먼저 호출하세요.")
        
        if not isinstance(query, str) :
            query = str(query)
            
        # BM25 검색 수행
        bm25_scores = self.bm25_index.get_scores(self._simple_tokenize(query))
        
        # 두 검색 방식 모두 수행 (디버그 모드)
        query_embedding = self.embedding_model.encode([query])[0]
        memory_scores = cosine_similarity([query_embedding], self.vector_index)[0]
        
        if use_vector_db:
            # ChromaDB를 사용한 벡터 검색
            db_results = self.vector_db.search(query, top_n=len(self.chunks))
            
            # 결과를 벡터 점수로 변환
            vector_scores = np.zeros(len(self.chunks))
            for res in db_results:
                # 문서 ID에서 인덱스 추출
                chunk_id = res["id"]
                if "_" in chunk_id:
                    try:
                        chunk_index = int(chunk_id.split("_")[-1])
                        if chunk_index < len(self.chunks):
                            # 거리를 유사도로 변환 (코사인 거리는 1 - 코사인 유사도)
                            if res["distance"] is not None:
                                # ChromaDB가 코사인 거리를 사용하는 경우
                                similarity = 1.0 - res["distance"]
                                vector_scores[chunk_index] = similarity
                    except ValueError:
                        continue
            
            # 디버그 모드에서 점수 비교
            if self.debug_mode:
                print("\n벡터 검색 점수 비교:")
                for i in range(min(5, len(self.chunks))):
                    print(f"Chunk {i}: Memory={memory_scores[i]:.4f}, ChromaDB={vector_scores[i]:.4f}, 차이={memory_scores[i]-vector_scores[i]:.4f}")
        else:
            # 메모리 내 벡터 검색 수행 (기존 방식)
            vector_scores = memory_scores
        
        # 검색 결과 결합
        combined_scores = self._combine_scores(bm25_scores, vector_scores, alpha)
        
        # 상위 N개 결과 반환
        top_indices = np.argsort(combined_scores)[-top_n:][::-1]
        results = [
            {
                "chunk": self.chunks[i],
                "score": combined_scores[i],
                "bm25_score": bm25_scores[i],
                "vector_score": vector_scores[i],
                "memory_score": memory_scores[i],  # 디버깅용
                "index": i,
                "page": self.chunk_metadata[i]["page"]
            }
            for i in top_indices
        ]
        
        return results

    def _split_into_chunks_with_metadata(self, text):
        """텍스트를 청크로 분할하고 페이지 정보를 메타데이터로 유지하는 함수"""
        chunks = []
        chunk_metadata = []
        
        # 페이지 패턴 정규식 (####으로 시작하는 페이지 헤더)
        page_pattern = re.compile(r'####\s*Page (\d+)')
        
        # 텍스트를 줄 단위로 처리
        lines = text.split('\n')
        current_page = "unknown"
        current_chunk = ""
        
        for line in lines:
            # 페이지 헤더 확인
            page_match = page_pattern.match(line)
            
            if page_match:
                # 새 페이지 시작
                # 현재 청크가 있으면 저장
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                    chunk_metadata.append({"page": current_page})
                    current_chunk = ""
                
                # 새 페이지 번호 설정
                current_page = page_match.group(1)
                continue
            
            # 현재 청크에 라인 추가
            current_chunk += line + "\n"
            
            # 청크 크기 확인
            if len(current_chunk) >= self.chunk_size:
                chunks.append(current_chunk.strip())
                chunk_metadata.append({"page": current_page})
                current_chunk = ""  # 새 청크 시작
        
        # 마지막 청크 처리
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
            chunk_metadata.append({"page": current_page})
        
        # 너무 작은 청크 결합 (메타데이터 유지)
        i = 0
        while i < len(chunks) - 1:
            if len(chunks[i]) + len(chunks[i+1]) < self.chunk_size:
                # 같은 페이지인 경우에만 결합
                if chunk_metadata[i]["page"] == chunk_metadata[i+1]["page"]:
                    chunks[i] = chunks[i] + "\n\n" + chunks[i+1]
                    chunks.pop(i+1)
                    chunk_metadata.pop(i+1)
                else:
                    i += 1
            else:
                i += 1
        
        return chunks, chunk_metadata
    
    def _simple_tokenize(self, text):
        """텍스트를 간단히 토크나이징하는 함수"""
        if not isinstance(text, str):
            text = str(text)
        return re.findall(r'\w+', text.lower())
    
    def _combine_scores(self, bm25_scores, vector_scores, alpha=0.5):
        """BM25와 벡터 검색 점수를 결합"""
        # 점수 정규화
        if np.max(bm25_scores) > 0:
            bm25_scores = bm25_scores / np.max(bm25_scores)
        if np.max(vector_scores) > 0:
            vector_scores = vector_scores / np.max(vector_scores)
        
        # 가중 평균 계산
        combined = alpha * bm25_scores + (1 - alpha) * vector_scores
        return combined
    
    def get_chunks_with_page_info(self, indices=None):
        """청크와 페이지 정보 반환"""
        if indices is None:
            return [(chunk, self.chunk_metadata[i]["page"]) for i, chunk in enumerate(self.chunks)]
        else:
            return [(self.chunks[i], self.chunk_metadata[i]["page"]) for i in indices if i < len(self.chunks)]
    
    # 1. 질문이 복합적인지 확인하는 메서드
    def is_complex_question(self, question):
        """
        LLM을 사용하여 질문이 복합적인지 판단하는 메서드
        
        Args:
            question (str): 사용자 질문
            
        Returns:
            bool: 복합적인 질문이면 True, 아니면 False
        """
        prompt = self.complexity_prompt.format(question=question)
        response = generation_loader.generate(prompt)
        
        # '예' 또는 'Yes'가 응답에 포함되어 있으면 복합적인 질문으로 간주
        response = response.lower().strip()
        print(f"###############질문이 복잡한가요? : {response}")
        return '예' in response or 'yes' in response
    
    # 2. 복합적인 질문을 하위 질문으로 분해하는 메서드
    def decompose_question(self, complex_question):
        """
        LLM을 사용하여 복합적인 질문을 여러 개의 하위 질문으로 분해하는 메서드
        
        Args:
            complex_question (str): 복합적인 사용자 질문
            
        Returns:
            list: 하위 질문 목록
        """
        prompt = self.decompose_prompt.format(question=complex_question)
        response = generation_loader.generate(prompt)

        try:
            # JSON 형식으로 반환된 하위 질문 파싱
            # JSON 블록 추출 (```json과 ```로 감싸져 있을 수 있음)
            json_match = re.search(r'```json\s*(.+?)\s*```', response, re.DOTALL)
            if json_match:
                response = json_match.group(1)
            
            # 중괄호 블록 추출
            json_match = re.search(r'(\{.+?\})', response, re.DOTALL)
            if json_match:
                response = json_match.group(1)
                
            sub_questions_data = json.loads(response)
            # 다양한 JSON 형식 처리
            if isinstance(sub_questions_data, list):
                return sub_questions_data
            elif isinstance(sub_questions_data, dict):
                if "questions" in sub_questions_data:
                    return sub_questions_data["questions"]
                elif "subQuestions" in sub_questions_data:
                    return sub_questions_data["subQuestions"]
                else:
                    # 딕셔너리의 값들을 리스트로 반환
                    return list(sub_questions_data.values())
                
        except (json.JSONDecodeError, AttributeError) as e:
            print(f"하위 질문 파싱 오류: {e}")
            print(f"LLM 응답: {response}")
            
            # 파싱 실패 시 줄바꿈을 기준으로 질문 추출 시도
            questions = []
            for line in response.split('\n'):
                line = line.strip()
                if line and ('?' in line or '질문' in line):
                    # 숫자, 점, 괄호 등의 접두어 제거
                    cleaned_line = re.sub(r'^[\d\.\)\-\s]+', '', line).strip()
                    if cleaned_line:
                        questions.append(cleaned_line)
            
            if questions:
                return questions
            # 단일 질문으로 처리
            return [complex_question]

In [None]:
from model_loader.config import *
from save_utils import *
from translate import *

class CarManualQA:
    def __init__(self, generation_loader, data_folder="./data/split_file", 
                 prompt_path_ko="./prompts/ko/generation/gemma3/generation_prompt2.txt", 
                 prompt_path_en="./prompts/en/generation/gemma3/generation_prompt2.txt", 
                 result_path="./result/5월12일/gemma3",
                 use_vector_db=True, 
                 persist_directory="./chroma_db",
                 collection_name="Gray",
                 language="en"):
        """
        자동차 매뉴얼 Q&A 시스템 초기화
        
        Args:
            generation_loader: 텍스트 생성 모델 로더
            data_folder (str): 분할된 데이터 파일들이 있는 폴더 경로
            prompt_path (str): 프롬프트 템플릿 파일 경로
            result_path (str): 결과를 저장할 경로
            use_vector_db (bool): ChromaDB 벡터 DB 사용 여부
            persist_directory (str): 벡터 DB 저장 경로
            collection_name (str): 벡터 DB 컬렉션 이름
        """
        self.data_folder = data_folder
        self.result_path = result_path
        self.use_vector_db = use_vector_db
        self.language = language

        if collection_name == "Gray" :
            collection_name = f"Gray_{language}"
        
        # 하이브리드 검색기 초기화 - ChromaDB 지원 버전
        self.searcher = HybridSearcher(
            embedding_model=embedding_loader,
            chunk_size=200, 
            chunk_overlap=50,
            persist_directory=persist_directory,
            collection_name=collection_name
        )
        
        self.loader = generation_loader
        self.prompt_path_ko = prompt_path_ko
        self.prompt_path_en = prompt_path_en

        if language=="ko" :
            self.prompt_template = self._load_prompt(prompt_path_ko)
        else :
            self.prompt_template = self._load_prompt(prompt_path_en)
        
        # 카테고리별 파일 매핑
        if language == "en" :
            self.category_to_file = {
                "16": "GrayAnatomy_Formatted.md"
            }
        else :
            self.category_to_file = {
                "16": "full.txt"
            }
        
        try :
            self.translate_en_to_ko = en_to_ko
        except ImportError :
            print("translate.py 모듈을 임포트할 수 없습니다.")
            self.translate_en_to_ko = en_to_ko

        # 이미 처리된 파일 추적
        self.loaded_files = set()
        
        # 전체 파일을 벡터 DB에 미리 로드할지 여부
        self.preload_all = False
        
        # 디버그 모드
        self.debug_mode = False
        
    def preload_documents(self):
        """
        모든 카테고리의 파일을 미리 벡터 DB에 로드
        """
        print("모든 문서를 벡터 DB에 로드 중...")
        for category, filename in self.category_to_file.items():
            file_path = os.path.join(self.data_folder, filename)
            if os.path.exists(file_path):
                print(f"카테고리 {category}: {filename} 로드 중...")
                self._load_document(file_path)
                self.loaded_files.add(file_path)
            else:
                print(f"[경고] 파일을 찾을 수 없습니다: {file_path}")
        
        print(f"총 {len(self.loaded_files)}개 파일이 벡터 DB에 로드되었습니다.")
        self.preload_all = True

    def _load_document(self, file_path):
        """
        문서를 로드하여 검색기에 추가
        
        Args:
            file_path (str): 로드할 파일 경로
            
        Returns:
            int: 로드된 청크 수
        """
        # 이미 로드된 파일이면 건너뛰기
        if file_path in self.loaded_files and self.preload_all:
            if self.debug_mode:
                print(f"이미 로드된 파일입니다: {file_path}")
            return 0
        
        # 파일 로드 및 벡터 DB에 추가
        num_chunks = self.searcher.load_document(file_path)
        
        # 파일 추적 목록에 추가
        self.loaded_files.add(file_path)
        
        if self.debug_mode:
            print(f"파일 로드 완료: {file_path} ({num_chunks}개 청크)")
        
        return num_chunks

    def filter_relevant_content(self, query, search_results, threshold=0.7, top_k=5):
        """
        LLM을 사용하여 검색 결과에서 질문과 관련이 있는 내용을 평가하고 
        관련성 점수가 높은 상위 k개 결과만 반환
        
        Args:
            query (str): 사용자 질의
            search_results (list): 검색 결과 목록
            threshold (float): 관련성 점수 임계값 (0.0~1.0)
            top_k (int): 반환할 상위 결과 개수
            
        Returns:
            list: 점수가 높은 상위 k개의 필터링된 관련 정보 목록
        """
        scored_results = []
        
        if self.debug_mode:
            print(f"검색 결과 {len(search_results)}개에 대한 관련성 필터링 시작")
        
        # 각 검색 결과에 대해 관련성 평가
        for result in search_results:
            # 프롬프트 구성 - 관련성 평가용
            relevance_prompt = f"""
    다음은 사용자의 질문입니다:
    "{query}"
    다음은 검색된 텍스트 정보입니다:
    "{result['chunk']}"
    위 텍스트가 사용자 질문에 얼마나 관련이 있는지 평가해주세요.
    평가는 다음과 같이 응답해주세요:
    1. 관련성 점수: 0.0 ~ 1.0 사이의 숫자 (1.0이 가장 관련성 높음)
    2. 이유: 관련성이 높거나 낮은 이유를 간략하게 설명
    응답 형식:
    {{
    "score": 0.0~1.0,
    "reason": "평가 이유"
    }}
    """
            
            relevance_result = self.loader.generate(relevance_prompt)
            
            try:
                # JSON 응답 파싱 (정확한 JSON이 아닐 수 있으므로 예외 처리)
                import re
                import json
                
                # JSON 형식 추출 시도
                json_match = re.search(r'\{.*"score".*:.*,.*"reason".*:.*\}', relevance_result, re.DOTALL)
                if json_match:
                    relevance_data = json.loads(json_match.group(0))
                    score = float(relevance_data.get("score", 0))
                else:
                    # 숫자만 추출 시도
                    score_match = re.search(r'score"?\s*:?\s*(\d+\.\d+|\d+)', relevance_result)
                    score = float(score_match.group(1)) if score_match else 0.0
                
                # 점수와 함께 결과 저장 (임계값 이상인 것만)
                if score >= threshold:
                    if self.debug_mode:
                        print(f"관련성 높음 (점수: {score:.2f}): {result['page']}페이지")
                    # 원본 결과에 점수 정보 추가
                    result_with_score = result.copy()
                    result_with_score['relevance_score'] = score
                    scored_results.append(result_with_score)
                else:
                    if self.debug_mode:
                        print(f"관련성 낮음 (점수: {score:.2f}): {result['page']}페이지")
                    
            except Exception as e:
                # 파싱 실패 시 안전을 위해 낮은 점수로 포함
                if self.debug_mode:
                    print(f"관련성 평가 파싱 오류: {str(e)}")
                result_with_score = result.copy()
                result_with_score['relevance_score'] = 0.1  # 낮은 기본 점수
                scored_results.append(result_with_score)
        
        # 점수를 기준으로 내림차순 정렬
        sorted_results = sorted(scored_results, key=lambda x: x['relevance_score'], reverse=True)
        
        # 상위 k개 결과만 선택
        top_results = sorted_results[:top_k]
        
        if self.debug_mode:
            print(f"필터링 결과: 총 {len(search_results)}개 중 {len(scored_results)}개가 임계값 통과")
            print(f"상위 {min(top_k, len(top_results))}개 결과만 선택됨")
            for i, res in enumerate(top_results):
                print(f"  {i+1}위: 점수 {res['relevance_score']:.2f} - {res['page']}페이지")
        
        # 결과가 없는 경우 원본의 일부라도 반환
        if not top_results and search_results:
            if self.debug_mode:
                print(f"필터링 결과가 없어 원본 상위 결과 {min(top_k, len(search_results))}개 포함")
            top_results = search_results[:min(top_k, len(search_results))]
        
        return top_results
    
    def _format_filtered_results(self, filtered_results):
        """
        필터링된 결과를 문자열로 변환
        
        Args:
            filtered_results (list): 필터링된 검색 결과 리스트
            
        Returns:
            str: 포맷된 문자열
        """
        result_str = ""
        
        for item in filtered_results:
            result_str += f"# 질문: {item['sub_question']}\n\n"
            
            # results가 이미 점수로 정렬되었다고 가정
            for res in item['results']:
                page_info = res['page']
                score_info = ""
                if 'relevance_score' in res:
                    score_info = f" (관련성: {res['relevance_score']:.2f})"
                
                result_str += f"## {page_info}페이지{score_info}\n"
                result_str += f"{res['chunk']}\n\n"
        
        return result_str

    def generate_response(self, query, category, top_n=5, alpha=0.5, target_language=None):
        """
        질의에 대한 응답 생성
        
        Args:
            query (str): 사용자 질의
            category (str): 검색할 카테고리
            top_n (int): 검색 결과 수
            alpha (float): BM25와 벡터 검색 가중치 (높을수록 BM25 중시)
            
        Returns:
            dict: 생성된 응답 및 메타데이터
        """
        try:
            response_language = target_language if target_language else self.language
            source_language = self.language

            # 파일 로드 확인
            filename = self.category_to_file[category]
            file_path = os.path.join(self.data_folder, filename)
            
            # 해당 파일이 존재하지 않는 경우
            if not os.path.exists(file_path):
                error_msg = f"[오류] 파일을 찾을 수 없습니다: {file_path}"
                print(error_msg)
                return {"답변": error_msg, "후처리": error_msg}
            
            # 파일 로드 (이미 로드된 경우 건너뜀)
            if not self.preload_all:
                self._load_document(file_path)
            
            # 단계 1: 질문 복잡성 판단
            # is_complex = self.searcher.is_complex_question(query)
            
            # if self.debug_mode:
            #     print(f"질문 복잡성 분석: {'복합적인 질문' if is_complex else '단순 질문'}")
            
            # if is_complex:
            #     # 복합적인 질문 처리
            #     # 단계 2: 질문 분해
            #     sub_questions = self.searcher.decompose_question(query)
            #     print(f"하위 질문 분해 결과: {sub_questions}")
                
            #     # 모든 하위 질문에 대한 검색 결과를 저장할 리스트
            #     all_search_results = []
            #     all_filtered_results = []

            #     # 단계 3: 각 하위 질문에 대해 검색 수행
            #     for i, sub_q in enumerate(sub_questions):
            #         if self.debug_mode:
            #             print(f"하위 질문 {i+1} 검색 중: '{sub_q}'")
                    
            #         if not isinstance(query, str) :
            #             query = str(query)
            #         # 하위 질문에 대한 검색 수행
            #         search_results = self.searcher.search(
            #             query=sub_q, 
            #             top_n=top_n, 
            #             alpha=alpha,
            #             use_vector_db=self.use_vector_db
            #         )
            #         filtered_results = self.filter_relevant_content(query, search_results)

            #         # 검색 결과 저장
            #         all_search_results.append({
            #             "sub_question": sub_q,
            #             "results": search_results
            #         })

            #         all_filtered_results.append({
            #             "sub_question" : sub_q,
            #             "results" : filtered_results
            #         })

            #     # 컨텍스트 구성 - 모든 하위 질문 결과 통합
            #     context = ""
            #     for i, sr in enumerate(all_filtered_results):
            #         context += f"\n### 하위 질문 {i+1}: {sr['sub_question']}\n"
                    
            #         for _, res in enumerate(sr["results"]):
            #             context += f"#### Page {res['page']}\n"
            #             context += f"{res['chunk']}\n\n"
                
            #     # 프롬프트 구성 - 복합 질문용
            #     prompt = self.prompt_template.format(context=context, query=query)
                
            # else:
            #     # 단순 질문 처리 - 기존 방식대로 검색
            #     search_results = self.searcher.search(
            #         query=query, 
            #         top_n=top_n, 
            #         alpha=alpha,
            #         use_vector_db=self.use_vector_db
            #     )
                
            #     filtered_results = self.filter_relevant_content(query, search_results)

            #     # 컨텍스트 구성 - 단순 질문용
            #     context = "\n\n".join([f"#### Page {result['page']}\n{result['chunk']}" for result in filtered_results])
                
            #     # 프롬프트 구성 - 단순 질문용
            #     prompt = self.prompt_template.format(context=context, query=query)
            

            search_results = self.searcher.search(
                query=query, 
                top_n=top_n, 
                alpha=alpha,
                use_vector_db=self.use_vector_db
            )
            
            # filtered_results = self.filter_relevant_content(query, search_results)

            # 컨텍스트 구성 - 단순 질문용
            # context = "\n\n".join([f"#### Page {result['page']}\n{result['chunk']}" for result in filtered_results])
            context = "\n\n".join([f"#### Page {result['page']}\n{result['chunk']}" for result in search_results])
            # 프롬프트 구성 - 단순 질문용
            prompt = self.prompt_template.format(context=context, query=query)

            # 응답 생성 파트
            if hasattr(self.loader, "tokenizer"):
                # Huggingface 모델 사용
                tokenizer = self.loader.tokenizer
                model = self.loader.model
                
                # 인풋 토크나이즈
                input_ids = tokenizer.encode(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
                attention_mask = (input_ids != tokenizer.pad_token_id).long().to(model.device)
                
                # 텍스트 생성
                output = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=400,
                    temperature=0.3,
                    do_sample=False,
                    top_p=0.85,
                    repetition_penalty=1.2,
                    early_stopping=True,
                    num_beams=3,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                )
                generated_ids = output[0][input_ids.shape[-1]:]
                raw_answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
            else:
                # OpenAI 등 API 기반 모델 사용 (Ollama 포함)
                raw_answer = self.loader.generate(prompt)
            
            # 응답 후처리
            final_answer = self._extract_answer_content(raw_answer)
            final_answer = self._remove_chinese_characters(final_answer)
            
            translated_answer = None
            if source_language == "en" and response_language == "ko" :
                if self.translate_en_to_ko :
                    try :
                        translated_answer = self.translate_en_to_ko(final_answer)
                        if self.debug_mode :
                            print("영어 -> 한국어 번역 완료")
                    except Exception as e :
                        print(f"번역 중 오류 발생 : {e}")
                        translated_answer = f"[번역 오류] {final_answer}"
                else :
                    translated_answer = f"[번역 모듈 없음] {final_answer}"
            # 결과 준비
            response = {
                "답변": raw_answer,
                "후처리": final_answer,
                "문서 일부": context,
                "question_en": query,
                "answer_en": raw_answer,
                # "is_complex": is_complex,
                # "필터링_결과": all_filtered_results
            }
            
            if translated_answer :
                response["번역된 답변"] = translated_answer
                
            # 복합 질문인 경우 하위 질문 정보 추가
            # if is_complex:
            #     response["sub_questions"] = sub_questions
            #     response["sub_search_results"] = all_search_results
            #     response["filtered_search_results"] = all_filtered_results
            # else:
            #     response["검색_결과"] = search_results
            #     response["필터링_결과"] = all_filtered_results
            
            # 결과 저장
            try:
                # result_path가 없으면 상위 스코프나 기본값으로 설정
                result_path = getattr(self, 'result_path', '../result')
                
                # 알파값으로 폴더 경로 생성
                # alpha_str = f"{alpha:.1f}"
                # db_type = "chromadb" if self.use_vector_db else "memory"
                # question_type = "complex" if is_complex else "simple"
                # result_path = os.path.join(result_path, f"{db_type}_{question_type}_alpha_{alpha_str}")
                
                # 폴더가 없으면 생성
                os.makedirs(result_path, exist_ok=True)
                
                filtered_results_str = ""
                filtered_results_str = self._format_filtered_results(response["필터링_결과"])

                # 파일 저장
                if self.debug_mode:
                    print(f"저장 시도: {result_path}")
                save_response_to_file(
                    query=query,
                    answer=response["답변"],
                    final_answer=response["번역된 답변"],
                    context=response["문서 일부"],
                    folder=result_path,
                    filtered_results=filtered_results_str
                )
                if self.debug_mode:
                    print(f"저장 완료: {result_path}")
            except Exception as e:
                print(f"결과 저장 중 오류 발생: {e}")
            
            return response
            
        except Exception as e:
            error_msg = f"[생성 오류] {str(e)}"
            print(error_msg)
            import traceback
            traceback.print_exc()
            return {"답변": error_msg, "후처리": error_msg}
    
    def _load_prompt(self, path):
        """프롬프트 템플릿 로드"""
        try:
            with open(path, "r", encoding="utf-8") as f:
                return f.read()
        except Exception as e:
            print(f"[ERROR] 프롬프트 로드 실패: {e}")
            raise
    
    
    def _extract_answer_content(self, text):
        """응답에서 답변 부분만 추출"""
        pattern = r"(?:<\|?|<|)?\|?answer\|?(?:\|?>|>)?(.*?)(?:<\|?|<|)?\|?endanswer\|?(?:\|?>|>)?"
        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        return match.group(1).strip() if match else text.strip()
    
    def _remove_chinese_characters(self, text):
        """중국어 문자 제거"""
        return re.sub(r'[\u4E00-\u9FFF]', '', text)
    
    def get_collection_stats(self):
        """벡터 DB 컬렉션 통계 조회"""
        if hasattr(self.searcher, 'vector_db') and hasattr(self.searcher.vector_db, 'get_collection_stats'):
            return self.searcher.vector_db.get_collection_stats()
        return {"error": "벡터 DB 통계를 조회할 수 없습니다."}
    
    def toggle_vector_db(self, use_vector_db=None):
        """벡터 DB 사용 여부 전환"""
        if use_vector_db is not None:
            self.use_vector_db = use_vector_db
        else:
            self.use_vector_db = not self.use_vector_db
        
        print(f"벡터 DB 사용 여부: {self.use_vector_db}")
        return self.use_vector_db
    
    def set_debug_mode(self, debug_mode=True):
        """디버그 모드 설정"""
        self.debug_mode = debug_mode
        return self.debug_mode

In [None]:
import os
import chromadb
from chromadb.utils import embedding_functions
import re
import pathlib

class ChromaVectorDB:
    """ChromaDB를 이용한 벡터 DB 관리 클래스"""
    
    # def __init__(self, embedding_model, persist_directory="./chroma_db", collection_name="Gray"):
    #     """
    #     ChromaDB 초기화
        
    #     Args:
    #         embedding_model: 임베딩 모델 (SentenceTransformer 또는 호환 래퍼)
    #         persist_directory (str): 벡터 DB가 저장될 디렉토리 경로
    #         collection_name (str): 컬렉션 이름
    #     """
    #     # 디렉토리 생성 확인
    #     os.makedirs(persist_directory, exist_ok=True)
        
    #     # 임베딩 모델 저장
    #     self.embedding_model = embedding_model
        
    #     # ChromaDB 클라이언트 설정
    #     self.client = chromadb.PersistentClient(path=persist_directory)
        
    #     # 사용자 정의 임베딩 함수 생성 - 기존 embedding_model을 활용
    #     self.embedding_function = CustomEmbeddingFunction(self.embedding_model)
        
    #     # 컬렉션 생성 또는 가져오기
    #     try:
    #         self.collection = self.client.get_collection(
    #             name=collection_name,
    #             embedding_function=self.embedding_function
    #         )
    #         print(f"기존 컬렉션 '{collection_name}'을 로드했습니다.")
    #     except Exception as e:
    #         print(f"컬렉션 로드 중 오류 발생: {e}")
    #         print(f"새 컬렉션 '{collection_name}'을 생성합니다.")
    #         self.collection = self.client.create_collection(
    #             name=collection_name,
    #             embedding_function=self.embedding_function
    #         )
    def __init__(self, embedding_model, persist_directory="./chroma_db", collection_name="Gray"):
        """
        ChromaDB 초기화
        
        Args:
            embedding_model: 임베딩 모델 (SentenceTransformer 또는 호환 래퍼)
            persist_directory (str): 벡터 DB가 저장될 디렉토리 경로
            collection_name (str): 컬렉션 이름
        """
        # 디렉토리 생성 확인
        os.makedirs(persist_directory, exist_ok=True)
        
        # 임베딩 모델 저장
        self.embedding_model = embedding_model
        
        # 기존 데이터베이스 디렉토리 확인 및 처리
        db_path = pathlib.Path(persist_directory)
        db_exists = db_path.exists() and any(db_path.iterdir())
        
        # 사용자 정의 임베딩 함수 생성 - 기존 embedding_model을 활용
        self.embedding_function = CustomEmbeddingFunction(self.embedding_model)
        
        # 만약 데이터베이스가 손상되었거나 호환성 문제가 있다면 다시 생성
        try:
            self.client = chromadb.PersistentClient(path=persist_directory)
            
            # 컬렉션 가져오기
            try:
                self.collection = self.client.get_collection(
                    name=collection_name,
                    embedding_function=self.embedding_function
                )
                print(f"기존 컬렉션 '{collection_name}'을 로드했습니다.")
            except Exception as e:
                print(f"컬렉션 로드 중 오류 발생: {e}")
                
                # 오류가 'max_seq_id' 관련 문제인지 확인
                if "max_seq_id" in str(e) or "PersistentData" in str(e):
                    print("데이터베이스 구조 호환성 문제가 발견되었습니다. 컬렉션을 재생성합니다.")
                    
                    # 기존 컬렉션 제거 시도
                    try:
                        self.client.delete_collection(name=collection_name)
                        print(f"기존 컬렉션 '{collection_name}'을 삭제했습니다.")
                    except:
                        pass
                    
                    # 새 컬렉션 생성
                    self.collection = self.client.create_collection(
                        name=collection_name,
                        embedding_function=self.embedding_function
                    )
                    print(f"새 컬렉션 '{collection_name}'을 생성했습니다.")
                else:
                    # 다른 종류의 오류면 새 컬렉션 생성
                    print(f"새 컬렉션 '{collection_name}'을 생성합니다.")
                    self.collection = self.client.create_collection(
                        name=collection_name,
                        embedding_function=self.embedding_function
                    )
                    
        except Exception as outer_e:
            print(f"치명적 오류: ChromaDB 클라이언트 초기화 실패: {outer_e}")
            print("백업 솔루션: 기존 데이터베이스를 재설정합니다.")
            
            # 데이터베이스 디렉토리가 존재하면 백업 후 재생성
            if db_exists:
                # 기존 폴더 백업 (이름 변경)
                import shutil
                import time
                
                backup_dir = f"{persist_directory}_backup_{int(time.time())}"
                try:
                    shutil.move(persist_directory, backup_dir)
                    print(f"기존 데이터베이스를 {backup_dir}로 백업했습니다.")
                except Exception as move_err:
                    print(f"백업 실패: {move_err}")
                    # 기존 폴더 삭제 시도
                    try:
                        shutil.rmtree(persist_directory)
                        print(f"기존 데이터베이스 폴더를 삭제했습니다.")
                    except Exception as rm_err:
                        print(f"폴더 삭제 실패: {rm_err}")
            
            # 폴더 재생성
            os.makedirs(persist_directory, exist_ok=True)
            
            # 클라이언트와 컬렉션 새로 생성
            self.client = chromadb.PersistentClient(path=persist_directory)
            self.collection = self.client.create_collection(
                name=collection_name,
                embedding_function=self.embedding_function
            )
            print(f"ChromaDB를 재설정하고 새 컬렉션 '{collection_name}'을 생성했습니다.")
    
    def add_documents(self, documents, metadatas=None, ids=None):
        """
        문서를 벡터 DB에 추가
        
        Args:
            documents (list): 문서 텍스트 리스트
            metadatas (list, optional): 각 문서에 대한 메타데이터 리스트
            ids (list, optional): 각 문서에 대한 고유 ID 리스트
        
        Returns:
            int: 추가된 문서 수
        """
        if ids is None:
            # 고유한 ID 생성 (timestamp + index)
            import time
            timestamp = int(time.time())
            ids = [f"doc_{timestamp}_{i}" for i in range(len(documents))]
        
        # 메타데이터가 제공되지 않은 경우 빈 딕셔너리 생성
        if metadatas is None:
            metadatas = [{} for _ in range(len(documents))]
        
        # 문서 임베딩 미리 계산 - 디버깅 용도
        # embeddings = self.embedding_model.encode(documents)
        
        # 문서 추가 (임베딩은 임베딩 함수가 자동 계산)
        self.collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )
        
        return len(documents)
    
    def load_document_file(self, file_path, chunk_size=None):
        """
        파일을 로드하여 벡터 DB에 저장
        
        Args:
            file_path (str): 로드할 파일 경로
            chunk_size (int, optional): 청크 분할 크기 (None이면 페이지 단위로 분할)
            
        Returns:
            int: 추가된 청크 수
        """
        # 파일 확장자 확인
        # _, ext = os.path.splitext(file_path)
        
        # 파일 읽기
        try:
            print(f"파일 로드 중: {file_path}")
            text_content = pathlib.Path(file_path).read_text(encoding="utf-8")
            print(f"파일 크기: {len(text_content)} 문자")
        except Exception as e:
            print(f"파일 로드 오류: {e}")
            return 0
        
        # 청크 분할 방식 결정
        chunks = []
        metadatas = []
        
        if chunk_size:
            # 지정된 청크 크기로 분할
            current_pos = 0
            while current_pos < len(text_content):
                chunk = text_content[current_pos:current_pos + chunk_size]
                chunks.append(chunk)
                metadatas.append({"source": file_path, "chunk_index": len(chunks)})
                current_pos += chunk_size
            
            print(f"크기 기반 분할: {len(chunks)}개 청크 생성")
        else:
            # 페이지 단위 분할 (#### Page X 패턴 사용)
            page_pattern = r'(####\s+Page\s+\d+\s*\n(?:[\s\S]*?)(?=####\s+Page\s+\d+\s*\n|$))'
            page_chunks = re.findall(page_pattern, text_content)
            
            if page_chunks:
                # 페이지 패턴 찾음
                for page_chunk in page_chunks:
                    # 페이지 번호 추출
                    page_match = re.match(r'####\s+Page\s+(\d+)', page_chunk)
                    page_num = page_match.group(1) if page_match else "unknown"
                    
                    chunks.append(page_chunk)
                    metadatas.append({"source": file_path, "page": f"Page {page_num}"})
                
                print(f"페이지 기반 분할: {len(chunks)}개 페이지 찾음")
            else:
                # 페이지 패턴 못찾음 - 문단 단위로 분할
                paragraphs = re.split(r'\n\s*\n', text_content)
                paragraphs = [p.strip() for p in paragraphs if p.strip()]
                
                for i, para in enumerate(paragraphs):
                    chunks.append(para)
                    metadatas.append({"source": file_path, "paragraph": i+1})
                
                print(f"문단 기반 분할: {len(chunks)}개 문단 생성")
        
        # 청크가 없으면 전체 텍스트를 하나의 청크로 처리
        if not chunks:
            chunks = [text_content]
            metadatas = [{"source": file_path, "full_document": True}]
            print("분할 실패: 전체 텍스트를 하나의 청크로 처리")
        
        # 벡터 DB에 추가
        added_count = self.add_documents(
            documents=chunks,
            metadatas=metadatas
        )
        
        print(f"벡터 DB에 {added_count}개 청크 추가 완료")
        return added_count
    
    def search(self, query, top_n=5, metadata_filter=None):
        """
        벡터 유사도 기반 검색 수행
        
        Args:
            query (str): 검색 쿼리
            top_n (int): 반환할 결과 수
            metadata_filter (dict, optional): 메타데이터 필터링 조건
        
        Returns:
            dict: 검색 결과
        """
        # 진단용 - 쿼리 임베딩 계산
        query_embedding = self.embedding_model.encode([query])[0]
        print(f"쿼리 임베딩 shape: {query_embedding.shape}")
        
        # ChromaDB를 통한 검색
        results = self.collection.query(
            query_texts=[query],
            n_results=top_n,
            where=metadata_filter,
            include=["documents", "metadatas", "distances", "embeddings"]
        )
        
        # 결과 포맷팅
        formatted_results = []
        for i in range(len(results["documents"][0])):
            formatted_results.append({
                "chunk": results["documents"][0][i],
                "id": results["ids"][0][i],
                "metadata": results["metadatas"][0][i],
                "distance": results["distances"][0][i] if "distances" in results else None,
                # "embedding": results["embeddings"][0][i] if "embeddings" in results else None
            })
        
        return formatted_results
    
    # def get_all_document_embeddings(self, ids=None):
    #     """
    #     저장된 모든 문서의 임베딩 조회 (디버깅 용도)
    #     """
    #     result = self.collection.get(ids=ids, include=["embeddings", "documents"])
    #     return result
    
    # def get_document_by_id(self, doc_id):
    #     """ID로 문서 조회"""
    #     result = self.collection.get(ids=[doc_id])
    #     if result["documents"]:
    #         return {
    #             "chunk": result["documents"][0],
    #             "metadata": result["metadatas"][0]
    #         }
    #     return None
    
    # def delete_document(self, doc_id):
    #     """ID로 문서 삭제"""
    #     self.collection.delete(ids=[doc_id])
    
    def delete_collection(self):
        """컬렉션 삭제"""
        self.client.delete_collection(self.collection.name)
    
    def get_collection_stats(self):
        """컬렉션 통계 조회"""
        count = self.collection.count()
        return {
            "document_count": count,
            "collection_name": self.collection.name
        }
    
class CustomEmbeddingFunction(embedding_functions.EmbeddingFunction):
    """기존 임베딩 모델을 ChromaDB에서 사용하기 위한 래퍼 클래스"""
    
    def __init__(self, embedding_model):
        """
        생성자
        
        Args:
            embedding_model: 임베딩 모델 (encode 메서드 제공)
        """
        self.embedding_model = embedding_model
    
    def __call__(self, texts):
        """
        텍스트를 임베딩 벡터로 변환
        
        Args:
            texts: 인코딩할 텍스트 리스트
        
        Returns:
            임베딩 벡터 리스트
        """
        return self.embedding_model.encode(texts).tolist()

In [None]:
# vector_db = ChromaVectorDB(
#     embedding_model=embedding_loader,
#     persist_directory="./chroma_db",
#     collection_name="Gray_en"
# )
# vector_db.delete_collection()
# print("기존 컬렉션 삭제 완료")


In [None]:
from model_loader.config import *
from chroma_db import *
import chromadb
import numpy as np
import time

questions = [
    "How do the processes of ossification differ between the skull and the vertebral column, and what are the developmental implications of these differences?",
    "Compare and contrast the ovum and spermatozoon in terms of their development, maturation process, and contribution to fertilization, explaining how their specialized features facilitate their respective roles.",
    "How does the structure and development of the vertebral column demonstrate both functional adaptation and evolutionary history, and what role does the notochord play in these processes?",
    "How do the various stages of embryonic implantation, placental development, and fetal membrane formation collectively establish the maternal-fetal interface, and what are the critical structural and functional relationships that develop during this process?",
    "What are the primary and secondary curves of the vertebral column, how do they develop chronologically, and what functional advantages do they provide for bipedal locomotion?",
    "How does the skull's composition of dermal and cartilaginous bones reflect both its developmental origins and its functional requirements, and what are the key differences in the ossification patterns of these two bone types?",
    "Explain the process of fertilization and early embryonic development, describing how the male and female pronuclei interact to form the segmentation nucleus, and how this leads to the formation of the blastocyst and subsequent implantation.",
    "How does the development of the thoracic cage from its embryonic origins to its adult form demonstrate both functional adaptation and regional specialization, and what are the key differences between true and false ribs in terms of their structure and functions?",
    "Compare and contrast the development and structural features of the cervical, thoracic, and lumbar vertebrae, explaining how their specialized characteristics relate to their differing functional roles.",
    "How does the structure of the placenta facilitate its multiple physiological functions while maintaining separation between maternal and fetal circulations, and what developmental stages are critical to establishing this complex organ?",
    "How do the primitive embryonic germ layers (ectoderm, mesoderm, and endoderm) contribute to the formation of the skeletal, nervous, and digestive systems, and what key developmental interactions occur between these systems?",
    "Compare the processes of intramembranous and intracartilaginous ossification, detailing their anatomical sites, cellular mechanisms, and developmental timelines, and explain how these different processes create the various bones of the skeletal system.",
    "Describe the embryonic development of the branchial region and explain how its derivatives contribute to adult craniofacial structures, detailing the specific fates of each branchial arch and the developmental abnormalities that can occur when this process is disrupted.",
    "What is the sequence of events that occurs during the maturation of an ovum, and how does this process differ from spermatogenesis in terms of cellular divisions, chromosome reduction, and the number and viability of resulting cells?",
    "Describe the development of the limbs from their initial appearance as limb buds to their final form, explaining the molecular signals, tissue interactions, and morphogenetic movements involved in this process, and how developmental abnormalities can lead to congenital limb defects.",
    "Compare the structure and ossification patterns of the vertebral column, sternum, and ribs, explaining how these components develop together to form the thoracic cage, and how variations in their development can lead to congenital abnormalities.",
    "How does the skull's development from its embryonic origins demonstrate both segmentation patterns similar to the vertebral column and unique craniofacial adaptations, and what are the key differences in ossification between the cranial base, vault, and facial bones?",
    "Describe the embryonic development of the body cavities, explaining how the coelom divides to form the pericardial, pleural, and peritoneal cavities, and how the diaphragm develops to separate the thoracic and abdominal compartments.",
    "How do the neural tube and notochord interact during development to establish the central nervous system and axial skeleton, and what are the key signaling molecules and inductive interactions that guide this process?",
    "Explain the processes of implantation and placentation, detailing the roles of the trophoblast, decidua, and extraembryonic membranes in establishing the maternal-fetal interface, and how abnormalities in these processes can lead to pregnancy complications.",
    "Compare and contrast the three parts of the axillary artery in terms of their anatomical relations and clinical significance, explaining how these relationships might impact surgical approaches and potential complications during procedures in the axilla.",
    "Describe the course and major branches of the brachial artery, explaining how its anatomical variations affect the collateral circulation after ligation of different segments of the artery, and discuss the clinical implications of these variations for traumatic injuries and surgical procedures in the upper limb.",
    "Analyze the arterial supply of the hand, detailing the formation and clinical significance of the superficial and deep palmar arches, their anastomotic patterns, and how these dual systems provide redundancy that affects treatment strategies for hand injuries and vascular disorders.",
    "Examine the pathway and variations of the radial artery from its origin to the formation of the deep palmar arch, discussing the surgical implications of these variations for procedures such as radial artery harvesting for coronary bypass grafting and the creation of arteriovenous fistulas for hemodialysis access.",
    "Compare and contrast the anatomical course, relations, and branching patterns of the ulnar and radial arteries in the forearm and hand, analyzing how these differences impact the clinical assessment and management of vascular injuries to these vessels.",
    "Analyze the anatomical basis of collateral circulation around the elbow joint following ligation or obstruction of the brachial artery, explaining how the network of anastomosing vessels ensures tissue viability and discussing the clinical implications of this vascular arrangement for traumatic injuries and surgical interventions.",
    "Describe the anatomical boundaries and contents of the axilla, explaining how the neurovascular structures are arranged within this space and discussing the clinical implications of these relationships for procedures such as axillary lymph node dissection, brachial plexus blocks, and approaches to the shoulder joint.",
    "Explain the origin, course, and branching pattern of the abdominal aorta, describing how its major visceral and parietal branches supply different organ systems, and analyzing the clinical significance of important vascular landmarks and anastomoses for surgical procedures and management of vascular diseases.",
    "Compare the anatomical relationships and branching patterns of the axillary and brachial arteries, discussing how these relationships provide landmarks for surgical approaches, and analyzing the implications of anatomical variations for diagnostic procedures, treatment of traumatic injuries, and vascular access in these regions.",
    "Describe the arterial supply to the rectum and anal canal, explaining how the superior, middle, and inferior rectal arteries form a complex anastomotic network, and analyze the clinical significance of this vascular arrangement for colorectal surgical procedures, management of hemorrhoids, and potential consequences of vascular compromise in this region.",
    "Analyze the vascular supply of the hand through the radial and ulnar arteries, explaining how their terminal branches form the superficial and deep palmar arches, and discuss the clinical implications of these anastomotic patterns for hand injuries, surgical approaches, and the assessment of hand perfusion.",
    "Describe the origin, course, and major branches of the internal iliac (hypogastric) artery, explaining how its anatomy differs in the fetus compared to the adult, and discuss the clinical significance of these vessels in pelvic surgery, trauma management, and vascular embolization procedures.",
    "Explain the branching pattern and distribution of the celiac trunk, detailing how its three major divisions (left gastric, splenic, and hepatic arteries) supply the upper abdominal organs, and analyze the clinical significance of their anatomical variations, collateral pathways, and relationships to surrounding structures.",
    "Compare and contrast the anatomical relationships and distribution patterns of the superior and inferior mesenteric arteries, discussing their embryological significance, anastomotic connections, and the clinical implications of these vascular arrangements for intestinal ischemia, surgical resections, and collateral circulation in occlusive disease.",
    "Analyze the anatomical relationships and variations of the renal arteries, explaining their importance in surgical approaches to the kidney, renal transplantation, and interventional procedures, while also discussing the clinical significance of accessory renal vessels and their potential role in renovascular hypertension.",
    "Describe the origin, course, and branches of the external iliac artery, explaining its relationships to surrounding structures and analyzing the clinical implications of these anatomical features for vascular access procedures, traumatic injuries, and surgical approaches to the lower abdomen and pelvis.",
    "Compare and contrast the arterial supply of the male and female pelvic organs, describing how the internal iliac (hypogastric) artery and its branches distribute blood to these structures, and discuss the clinical significance of these vascular patterns for surgical procedures, hemorrhage control, and interventional radiological techniques.",
    "Describe the anatomical course and branching pattern of the internal pudendal artery in males and females, explaining how this vessel supplies the perineum and external genitalia, and discuss the clinical significance of its relationships for urogenital and anorectal procedures, traumatic injuries, and erectile dysfunction.",
    "Explain the anatomical relationships and branching pattern of the thoracic aorta, describing how its branches supply the thoracic walls and viscera, and discuss the clinical significance of these vascular patterns in thoracic surgery, trauma management, and the development of collateral circulation in cases of aortic coarctation.",
    "Analyze the surgical approaches to the axillary artery and its branches, discussing the potential complications of these procedures, strategies for minimizing vascular injury, and the management of axillary artery trauma in relation to the surrounding neurovascular structures.",
    "How does the female urethra compare to the male urethra in terms of structure, length, and coats? What specific differences in epithelial lining exist between them?",
    "What is the developmental journey of the testes from fetal life to adulthood, and what abnormalities can occur during this process? How do these abnormalities affect fertility and health?",
    "What are the key components of the spermatic cord, and how do they relate to the blood supply, lymphatic drainage, and innervation of the testes? What clinical implications might arise from this anatomical arrangement?",
    "Describe the structure of the tunica vaginalis and its relationship to the peritoneum. How does it contribute to various testicular pathologies, especially different types of hydrocele, and what distinguishes these conditions from each other?",
    "Compare and contrast the structure and function of the vesiculae seminales, prostate gland, and bulbourethral glands. How do their secretions contribute to the composition of semen, and what are the clinical implications of diseases affecting these accessory glands?",
    "How is the penis structured for its dual role in urination and reproduction? What vascular and nervous mechanisms enable erection, and how does this relate to the anatomical organization of the corpora cavernosa and corpus spongiosum?",
    "What is the anatomical relationship between the ovaries, uterine tubes, and uterus? How does this arrangement facilitate conception, and what pathologies might develop when abnormalities occur in this system?",
    "How do the structural and functional changes of the uterus vary across a woman's lifecycle, from fetal development through puberty, pregnancy, and menopause? What cyclical changes occur during the menstrual cycle?",
    "Compare the anatomy of the vagina and its relationship to surrounding structures with that of the male urethra. How do these differences affect susceptibility to infections and other pathological conditions?",
    "What is the embryological origin and development of the ovaries? How does this relate to the structures found in the adult ovary, and what implications does this have for the development of ovarian pathologies?",
    "Describe the structure and function of the clitoris and penis as homologous organs. How do they compare in terms of erectile tissue arrangement, vascular supply, innervation, and embryological development? What implications does this have for sexual response and dysfunction?",
    "How do the vestibular bulbs in females compare anatomically and functionally to the corpus spongiosum/bulb of the penis in males? What is their role in sexual arousal, and how do they relate to the greater vestibular glands and other external genital structures?",
    "What is the anatomical and functional relationship between the mammary glands and the reproductive system? How do they develop embryologically, and what hormonal changes affect their structure and function throughout a woman's life from puberty through pregnancy, lactation, and menopause?",
    "How does the structure of the uterine tube facilitate the transport of ova and fertilization? What are the different segments of the tube, their specific functions, and how does ectopic pregnancy develop in relation to tubal anatomy?",
    "What is the structure and function of the prostate gland, and how does its anatomy relate to common pathologies? How do the different anatomical zones of the prostate correspond to the development of conditions like benign prostatic hyperplasia and prostate cancer?",
    "Compare and contrast the structure, development, and functions of the thyroid gland with the other major endocrine glands. How does the embryological origin of the thyroid relate to its adult position and function, and what common pathologies affect this gland?",
    "How do the anatomical features of the female reproductive system contribute to both normal fertility and common infertility issues? Consider the ovaries, fallopian tubes, uterus, cervix, and vagina in your analysis, and explain how structural or functional abnormalities in each can impact conception and pregnancy.",
    "How does the structure of the scrotum and its layers provide optimal temperature regulation for the testes? What vascular adaptations contribute to this function, and how do pathological conditions like varicocele affect testicular function?",
    "How do the anatomical and histological structures of the breast change throughout a woman's life, particularly during pregnancy and lactation? What cellular mechanisms enable milk production and ejection, and how does the transition from colostrum to mature milk occur?",
    "Compare and contrast the structure, embryological origin, and function of the male and female external genitalia. How do homologous structures differ in their adult form, and what hormonal factors influence their development and later function?"
]
category_list = ["16"] * 60



generation_loader = generation_loader
# result_base_path = "../result"
qa_system = CarManualQA(
        generation_loader=generation_loader,
        data_folder="./data",
        prompt_path_en="./prompts/en/generation/gemma3/generation_prompt3.txt",
        result_path="./result/5월16일/en-gemma3",
        use_vector_db=True,
        persist_directory="./chroma_db",
        collection_name="Gray",
        language="en"
    )
    
for q, c in zip(questions, category_list):
    print(f"\n새 쿼리: {q}, 카테고리: {c}")
    
    
    # 검색 결과 가져오기 (alpha 값 전달)
    # search_results = test_search_only(q, c, alpha=alpha_formatted)
    # context = "\n\n".join([result["chunk"] for result in search_results])
    
    start_time = time.time()
    # 응답 생성 (alpha 값 전달)
    chroma_response = qa_system.generate_response(q, c, top_n=5, alpha=0.3, target_language="ko")
    end_time = time.time()
    
    
    
    elapsed_time = end_time - start_time

    print(f"추론시간 : {elapsed_time}")
    print(f"LLM 답변: {chroma_response['후처리']}")

In [None]:
template="""Given the question '{query}', generate a hypothetical document that directly answers this question. The document should be detailed and in-depth.
            the document size has be exactly {chunk_size} characters.""",

In [None]:
from translate import *
sent = "The body is the largest part of a vertebra, and is more or less cylindrical in shape. Its upper and lower surfaces are flattened and rough, and give attachment to the intervertebral fibrocartilages, and each presents a rim around its circumference. In front, the body is convex from side to side and concave from above downward. Behind, it is flat from above downward and slightly concave from side to side. Its anterior surface presents a few small apertures, for the passage of nutrient vessels; on the posterior surface is a single large, irregular aperture, or occasionally more than one, for the exit of the basi-vertebral veins from the body of the vertebra."
print(en_to_ko(sent))

In [None]:
from huggingface_hub import snapshot_download

# safetensors 파일만 다운로드
snapshot_download(
    repo_id="jbochi/madlad400-10b-mt",
    ignore_patterns=["*.gguf"],
    local_dir="./model/translate/madlad400-10b"
)

In [None]:
# from transformers import T5ForConditionalGeneration, T5Tokenizer
# import torch
# import time

# def load_madlad_model(model_name='jbochi/madlad400-3b-mt'):
#     """
#     MADLAD 모델과 토크나이저 로드
    
#     Args:
#         model_name (str): 사용할 MADLAD 모델 이름
    
#     Returns:
#         tuple: (model, tokenizer) - 로드된 모델과 토크나이저
#     """
#     model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")
#     tokenizer = T5Tokenizer.from_pretrained(model_name)
#     return model, tokenizer

# def translate_text(text, source_lang, target_lang, model=None, tokenizer=None):
#     """
#     MADLAD 모델을 사용하여 텍스트 번역
    
#     Args:
#         text (str): 번역할 텍스트
#         source_lang (str): 원본 언어 코드 (예: 'ko', 'en')
#         target_lang (str): 대상 언어 코드 (예: 'en', 'ko')
#         model: 사전 로드된 모델 (없으면 새로 로드)
#         tokenizer: 사전 로드된 토크나이저 (없으면 새로 로드)
    
#     Returns:
#         str: 번역된 텍스트
#     """
#     # 모델과 토크나이저가 제공되지 않은 경우 로드
#     if model is None or tokenizer is None:
#         model, tokenizer = load_madlad_model()
    
#     # 텍스트 앞에 대상 언어 토큰 추가
#     prefix = f"<2{target_lang}> "
#     input_text = prefix + text
    
#     # 입력 텍스트를 토큰화
#     input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
    
#     # 번역 생성
#     outputs = model.generate(
#         input_ids=input_ids,
#         max_length=512,
#         num_beams=4,
#         length_penalty=0.6
#     )
    
#     # 출력 토큰을 텍스트로 디코딩
#     translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
#     return translated_text

# def ko_to_en(text, model=None, tokenizer=None):
#     """
#     한국어를 영어로 번역
    
#     Args:
#         text (str): 번역할 한국어 텍스트
#         model: 사전 로드된 모델 (없으면 새로 로드)
#         tokenizer: 사전 로드된 토크나이저 (없으면 새로 로드)
    
#     Returns:
#         str: 번역된 영어 텍스트
#     """
#     return translate_text(text, 'ko', 'en', model, tokenizer)

# def en_to_ko(text, model=None, tokenizer=None):
#     """
#     영어를 한국어로 번역
    
#     Args:
#         text (str): 번역할 영어 텍스트
#         model: 사전 로드된 모델 (없으면 새로 로드)
#         tokenizer: 사전 로드된 토크나이저 (없으면 새로 로드)
    
#     Returns:
#         str: 번역된 한국어 텍스트
#     """
#     return translate_text(text, 'en', 'ko', model, tokenizer)

In [2]:
# 번역 평가용 한국어 문장 리스트
korean_sentences = [
    "현대 사회에서 인공지능 기술의 급속한 발전은 우리의 일상생활뿐만 아니라 산업 전반에 걸쳐 혁명적인 변화를 가져오고 있으며, 이러한 변화에 적응하지 못하는 기업들은 시장에서 도태될 가능성이 높아지고 있다.",
    "글로벌 기후 변화로 인한 극단적인 기상 현상이 증가함에 따라, 전 세계 정부와 기관들은 지속 가능한 발전 목표를 달성하기 위해 다양한 환경 정책을 수립하고 국제적 협력을 강화하고 있으나, 여전히 많은 국가들이 경제적 이익을 우선시하는 경향이 있다.",
    "한국의 전통문화는 오랜 역사를 통해 형성된 독특한 예술 형태와 철학적 개념을 포함하고 있으며, 특히 한식, 한복, 한옥과 같은 문화적 요소들은 현대 사회에서도 그 가치를 인정받아 세계적으로 주목받고 있는 중이다.",
    "급변하는 디지털 환경 속에서 정보 보안의 중요성은 날로 증가하고 있으며, 개인 정보 유출 사고가 빈번하게 발생함에 따라 기업들은 더욱 강화된 보안 시스템을 구축하고 있지만, 동시에 해커들의 공격 방식도 더욱 교묘해지고 있어 끊임없는 기술적 발전이 요구된다.",
    "현대 교육 시스템은 학생들의 창의성과 비판적 사고력을 키우는 데 중점을 두고 있으나, 여전히 많은 국가에서는 표준화된 시험과 암기식 학습에 의존하고 있어, 이러한 불균형을 해소하기 위한 교육 개혁의 필요성이 계속해서 제기되고 있다.",
    "글로벌 경제의 상호연결성이 증가함에 따라 한 국가의 경제적 위기는 빠르게 다른 국가들에게도 영향을 미치게 되었으며, 이로 인해 국제 금융 기관들은 위기 관리 시스템을 강화하고 각국 정부는 경제 안정화 정책을 더욱 중요시하게 되었다.",
    "현대 의학 기술의 발전으로 평균 수명이 크게 늘어났지만, 동시에 노인 인구 증가로 인한 사회적, 경제적 부담도 증가하고 있어 각국 정부는 노인 복지 정책과 의료 시스템의 효율성 개선에 많은 노력을 기울이고 있다.",
    "디지털 플랫폼의 확산은 사람들 간의 소통 방식을 근본적으로 변화시켰으며, 소셜 미디어를 통한 정보 공유가 활발해지면서 정보의 신뢰성 문제와 개인 정보 보호에 대한 우려가 함께 증가하고 있어 디지털 리터러시 교육의 중요성이 더욱 강조되고 있다.",
    "과학기술의 급속한 발전은 의료, 농업, 에너지 등 다양한 분야에서 혁신을 가져왔지만, 동시에 윤리적 문제와 직업 시장의 변화 등 새로운 사회적 도전들을 야기하고 있어, 이러한 변화에 대응하기 위한 사회적 합의와 제도적 준비가 필요하다.",
    "언어는 단순한 의사소통의 도구를 넘어 문화적 정체성과 세계관을 형성하는 중요한 요소로, 세계화가 진행됨에 따라 많은 소수 언어들이 사라질 위기에 처해있으며, 이는 인류 문화 다양성 보존의 관점에서 중요한 문제로 대두되고 있다."
]

# 번역 평가용 영어 문장 리스트
english_sentences = [
    "The rapid advancement of artificial intelligence technology in modern society is bringing revolutionary changes not only to our daily lives but also across industries, and companies that fail to adapt to these changes are increasingly likely to be eliminated from the market.",
    "As extreme weather phenomena increase due to global climate change, governments and institutions worldwide are establishing various environmental policies and strengthening international cooperation to achieve sustainable development goals, yet many countries still tend to prioritize economic interests.",
    "Korean traditional culture encompasses unique art forms and philosophical concepts formed throughout its long history, and cultural elements such as Korean cuisine, hanbok (traditional clothing), and hanok (traditional houses) are being recognized globally for their value even in modern society.",
    "The importance of information security is increasing day by day in the rapidly changing digital environment, and as personal information leakage incidents frequently occur, companies are building enhanced security systems, but at the same time, hackers' attack methods are becoming more sophisticated, requiring continuous technological advancement.",
    "The modern education system focuses on developing students' creativity and critical thinking skills, but many countries still rely on standardized tests and rote learning, leading to ongoing calls for educational reform to address this imbalance.",
    "As the interconnectedness of the global economy increases, an economic crisis in one country quickly affects others, leading international financial institutions to strengthen crisis management systems and governments to place greater emphasis on economic stabilization policies.",
    "While advancements in modern medical technology have significantly extended average life expectancy, the simultaneous increase in the elderly population has increased social and economic burdens, prompting governments to invest heavily in policies for elderly welfare and improving the efficiency of healthcare systems.",
    "The proliferation of digital platforms has fundamentally changed how people communicate, and as information sharing through social media becomes more active, concerns about information reliability and personal data protection have also increased, emphasizing the importance of digital literacy education.",
    "The rapid development of science and technology has brought innovation in various fields such as medicine, agriculture, and energy, but it has also created new social challenges such as ethical issues and changes in the job market, requiring social consensus and institutional preparation to respond to these changes.",
    "Language is more than just a tool for communication; it is an important element that forms cultural identity and worldview, and as globalization progresses, many minority languages are in danger of disappearing, which is emerging as an important issue from the perspective of preserving human cultural diversity."
]

# jbochi/madlad400-3b-mt

In [3]:
import time
from translate import *

en_to_ko_results = [en_to_ko(sent) for sent in english_sentences]
print(f"영어 -> 한국어 번역 결과")
for i, result in enumerate(en_to_ko_results) :
    start_time = time.time()
    print(f"영->한 {i+1}번째 결과 : {en_to_ko(result)}")
    end_time = time.time()
    trans_time = end_time - start_time
    print(f"번역 시간 : {trans_time}")

ko_to_en_results = [ko_to_en(sent) for sent in korean_sentences]
print(f"한국어 -> 영어 번역 결과")
for i, result in enumerate(ko_to_en_results) :
    start_time = time.time()
    print(f"한->영 {i+1}번째 결과 : {ko_to_en(result)}")
    end_time = time.time()
    trans_time = end_time - start_time
    print(f"번역 시간 : {trans_time}")



구두점으로 분할된 문장 리스트 : ['The rapid advancement of artificial intelligence technology in modern society is bringing revolutionary changes not only to our daily lives but also across industries, and companies that fail to adapt to these changes are increasingly likely to be eliminated from the market.']
구두점으로 분할된 문장 리스트 : ['As extreme weather phenomena increase due to global climate change, governments and institutions worldwide are establishing various environmental policies and strengthening international cooperation to achieve sustainable development goals, yet many countries still tend to prioritize economic interests.']
구두점으로 분할된 문장 리스트 : ['Korean traditional culture encompasses unique art forms and philosophical concepts formed throughout its long history, and cultural elements such as Korean cuisine, hanbok (traditional clothing), and hanok (traditional houses) are being recognized globally for their value even in modern society.']
구두점으로 분할된 문장 리스트 : ["The importance of information se

In [None]:
from translate import *

sent = "The body is the largest part of a vertebra, and is more or less cylindrical in shape. Its upper and lower surfaces are flattened and rough, and give attachment to the intervertebral fibrocartilages, and each presents a rim around its circumference. In front, the body is convex from side to side and concave from above downward. Behind, it is flat from above downward and slightly concave from side to side. Its anterior surface presents a few small apertures, for the passage of nutrient vessels; on the posterior surface is a single large, irregular aperture, or occasionally more than one, for the exit of the basi-vertebral veins from the body of the vertebra."
print(ko_to_en(sent))

In [1]:
from translate import *
sent = 'According to page 43, “others, such as the bones of the limbs, are preceded by rods of cartilage. Hence two kinds of ossification are described: the intramembranous and the intracartilaginous.” \nAccording to page 39, “Up to a certain stage the development of the skull corresponds with that of the vertebral column.”\nThe document does not contain information about the developmental implications of these differences.'
print("번역 전 : ", sent)
result = en_to_ko(sent)
print("번역 후 : ", result)

번역 전 :  According to page 43, “others, such as the bones of the limbs, are preceded by rods of cartilage. Hence two kinds of ossification are described: the intramembranous and the intracartilaginous.” 
According to page 39, “Up to a certain stage the development of the skull corresponds with that of the vertebral column.”
The document does not contain information about the developmental implications of these differences.
구두점으로 분할된 문장 리스트 : ['According to page 43, “others, such as the bones of the limbs, are preceded by rods of cartilage.', 'Hence two kinds of ossification are described: the intramembranous and the intracartilaginous.” \nAccording to page 39, “Up to a certain stage the development of the skull corresponds with that of the vertebral column.”\nThe document does not contain information about the developmental implications of these differences.']
번역 후 :  43쪽에 따르면, "사지의 뼈와 같은 다른 것들은 연골의 막대가 앞서 있다. 010년 10월 1일자 뉴욕타임스에 실린 논문에 따르면, “The development of the skull corresponds to 

In [1]:
import re

# 파일 경로
file_path = './data/GrayAnatomy_Formatted.md'

# 파일 읽기
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# 1. 취소선(~~) 제거
content = re.sub(r'~~', '', content)

# 2. 볼드체(**) 제거 - 단어 자체는 유지하고 표시만 제거
content = re.sub(r'\*\*', '', content)

# 3. 이탤릭체(*) 제거 - 단어 자체는 유지하고 표시만 제거
content = re.sub(r'(?<!\*)\*(?!\*)', '', content)

content = re.sub(r'F IG', 'FIG', content)
# 수정된 내용 저장
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(content)

print(f"파일이 성공적으로 수정되었습니다: {file_path}")

파일이 성공적으로 수정되었습니다: ./data/GrayAnatomy_Formatted.md
