# 터미널 설치 및 초기 설정

pip install -U langchain-classic langchain-chroma langchain-openai langchain-huggingface langchain-community



ChatGPT API KEY 설정 필요

# 파이썬 라이브러리로 PDF에서 텍스트 추출

In [1]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m86.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.6


In [2]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20251107 (from pdfplumber)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.0.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.8-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20251107-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import fitz  # PyMuPDF 라이브러리
import pdfplumber
import time

# 파일 업로드 시 경로 바꿔야 함
pdf_path = "/content/(무)하나1Q어린이보험_계약자용약관.pdf"

# PyMuPDF (Fitz) 사용 (1)
def extract_with_fitz(path):
    start = time.time()
    doc = fitz.open(path)
    text = ""
    for page in doc:
        text += page.get_text()
    end = time.time()
    print(f"[PyMuPDF] 소요시간: {end - start:.4f}초")
    return text

# pdfplumber 사용 (2)
def extract_with_plumber(path):
    start = time.time()
    text = ""
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            # 표가 있다면 표만 따로 추출하거나 텍스트와 섞어서 추출 가능
            text += page.extract_text()
    end = time.time()
    print(f"[pdfplumber] 소요시간: {end - start:.4f}초")
    return text

# (1)과 (2) 실행 및 비교
text_fitz = extract_with_fitz(pdf_path)
text_plumber = extract_with_plumber(pdf_path)

# 결과 일부 출력해서 비교해보기
print("-" * 30)
print("PyMuPDF 결과 (앞 200자):", text_fitz[:200])
print("-" * 30)
print("pdfplumber 결과 (앞 200자):", text_plumber[:200])

[PyMuPDF] 소요시간: 0.7441초
[pdfplumber] 소요시간: 31.9744초
------------------------------
PyMuPDF 결과 (앞 200자):  
1 
 
 
 
 
 
 
 
 
무배당  
하나1Q 어린이보험 
 
 
 
 
 
 
 
약관의 목차 
 
 
02 ········· 가입자 유의사항 
03 ········· 주요 민원사항 
04 ········· 주요내용 요약서 
06 ········· 보험용어 해설 
07 ········· 무배당 하나1Q 어린이보험 
55 ········· 무배당
------------------------------
pdfplumber 결과 (앞 200자): 무배당
하나 1Q 어린이보험
약관의 목차
02 ········· 가입자 유의사항
03 ········· 주요 민원사항
04 ········· 주요내용 요약서
06 ········· 보험용어 해설
07 ········· 무배당 하나1Q어린이보험
55 ········· 무배당 주산기질환입원특약
65 ········· 특정 신체부위·질병 보장제한부 인수 특약
7


# .txt와 JSON 파일로 저장하기

In [4]:
import json

# PyMuPDF 결과 저장
with open("result_fitz.txt", "w", encoding="utf-8") as f:
    f.write(text_fitz)

# pdfplumber 결과 저장
with open("result_plumber.txt", "w", encoding="utf-8") as f:
    f.write(text_plumber)

# .json 구조화 저장 -> ([페이지번호, 텍스트] 쌍으로 저장)
rag_data = []

# pdfplumber로 페이지별 데이터 만들기
with pdfplumber.open(pdf_path) as pdf:
    for i, page in enumerate(pdf.pages):
        text = page.extract_text()
        if text:
            rag_data.append({
                "page_number": i + 1,
                "content": text
            })

# JSON 파일로 저장
with open("insurance_data.json", "w", encoding="utf-8") as f:
    json.dump(rag_data, f, ensure_ascii=False, indent=4)

# PDF에서 추출한 텍스트를 Vector DB로 변환하고 저장하기

In [6]:
!pip uninstall -y langchain langchain-community langchain-core langchain-openai langchain-chroma
!pip install langchain langchain-community langchain-core langchain-openai langchain-chroma langchain-huggingface chromadb sentence-transformers

Found existing installation: langchain 0.3.27
Uninstalling langchain-0.3.27:
  Successfully uninstalled langchain-0.3.27
[0mFound existing installation: langchain-core 0.3.79
Uninstalling langchain-core-0.3.79:
  Successfully uninstalled langchain-core-0.3.79
[0mCollecting langchain
  Downloading langchain-1.0.8-py3-none-any.whl.metadata (4.9 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-core
  Downloading langchain_core-1.0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting langchain-openai
  Downloading langchain_openai-1.0.3-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-chroma
  Downloading langchain_chroma-1.0.0-py3-none-any.whl.metadata (1.9 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-1.0.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chromadb
  Downloading chromadb-1.3.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting la

In [7]:
import json
import time
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# 저장된 JSON 파일 불러오기
with open("insurance_data.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

# LangChain Document 형식으로 변환
docs = []
for entry in json_data:
    doc = Document(
        page_content=entry['content'], # page_content는 실제 내용
        metadata={"page": entry['page_number']} # metadata에는 페이지 번호 등 부가 정보 저장
    )
    docs.append(doc)

# 텍스트 Chunking
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
splits = text_splitter.split_documents(docs)

# 임베딩 모델 설정
embedding_model = HuggingFaceEmbeddings(
    model_name="jhgan/ko-sroberta-multitask",
    model_kwargs={'device': 'cpu'}, # GPU 있으면 'cuda'
    encode_kwargs={'normalize_embeddings': True}
)

# Vector DB 생성 및 로컬 저장 (persist_directory 지정 중요)
start_time = time.time()

vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embedding_model,
    persist_directory="./chroma_db"  # 해당 경로에 DB가 저장됨
)

end_time = time.time()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/744 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/585 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# LLM 기반 RAG Langchain으로 답변 생성하기





In [8]:
import sys
import os

# 라이브러리가 없으면 설치
try:
    from langchain.chains import create_retrieval_chain
except ImportError:
    os.system('pip install -U langchain langchain-community langchain-chroma langchain-openai langchain-huggingface chromadb sentence-transformers')

# 필요 라이브러리 설치
import time
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# OpenAI API Key 입력
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

# DB 존재여부 확인 및 불러오기
if not os.path.exists("./chroma_db"):
  print("저장된 DB 폴더(chroma_db)가 없습니다.")
else:
    # 임베딩 모델 설정
    embedding_model = HuggingFaceEmbeddings(
        model_name="jhgan/ko-sroberta-multitask",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )

    # DB 불러오기
    vectorstore = Chroma(
        persist_directory="./chroma_db",
        embedding_function=embedding_model
    )

    # 검색기 & LLM 설정 (GPT 모델 설정)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

    # 프롬프트 & Chain 연결 (프롬프트 작성 잘 할 것)
    system_prompt = (
        "당신은 보험 약관 분석가입니다. 문맥을 바탕으로 질문에 답하세요.\n\n"
        "{context}"
    )
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt), ("human", "{input}")
    ])

    # RAG 체인 생성
    rag_chain = create_retrieval_chain(retriever, create_stuff_documents_chain(llm, prompt))

    # 질문 설정
    def ask_question(query):
        start = time.time()

        try:
            response = rag_chain.invoke({"input": query})

            # 출처 페이지 표시
            pages = [doc.metadata.get("page", "?") for doc in response.get("context", [])]

        except Exception as e:

        print(f"소요 시간: {time.time() - start:.4f}초")

    # --- 테스트 질문 입력 ---
    ask_question("- 산모 프로필: 32세, 여성, 초산, 임신 10주차, 출산 예정일 26.03.17. , 단태아. 산모의 키는 160cm, 임신 전 체중 50kg, 임신 후 현재 체중 58kg, 과거 병력: 없음, 약물 복용 정보: 고혈압 약 복용 중, 직무 및 근무시간: IT 회사 근무 중이고 9시~18시 근무 중임. 현재 건강 상태: 빈혈 있지만 약 복용하고 있지 않음, 기존 가입한 보험 유무: 기존 가입한 태아 보험 없음원하는 보험 조건: 30세 만기형 - 질문: 위 산모 프로필에 대해서, 선택해야하는 특약을 추천해주고, 추천 이유 및 근거를 제시해줘.")

IndentationError: expected an indented block after 'if' statement on line 23 (ipython-input-215665463.py, line 24)