In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
import json
import uuid
import tiktoken
from openai import OpenAI
import pinecone
from tqdm import tqdm
import os
from dotenv import load_dotenv
from pinecone import Pinecone

# Pinecone API Key & 환경 정보 가져오기
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_ENV = os.getenv("PINECONE_ENV")


In [3]:
### 🔢 Tokenizer
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

def num_tokens(text):
    return len(tokenizer.encode(text))

# ### 📥 1. JSON 로드
# with open("C:/Users/USER/Downloads/apm-20099.json", "r", encoding="utf-8") as f:
#     data = json.load(f)

In [32]:
import os
import json

# 폴더 경로 설정 (역슬래시 대신 슬래시 또는 r 문자열로 작성)
folder_path = r"C:/Users/USER/OneDrive/바탕 화면/내 자료/보아즈/ADV 플젝/증례보고pdf"

# 모든 json 파일을 리스트로 로드
data = []

for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            try:
                json_data = json.load(f)
                data.append(json_data)
            except json.JSONDecodeError as e:
                print(f"⚠️ JSON 파싱 오류: {filename} - {e}")

# 이제 data에는 모든 json 파일의 내용이 리스트 형태로 저장됨

In [33]:
data[0]["merged_text"]

'신생아에서 발생한 기관내 삽관 후 기도 손상의 주술기 관리-증례 보고- Perioperative management of tracheal injuryfollowing endotracheal intubation in aneonate# -A case report- Tracheal injury in neonates is a rare but serious complication ofendotracheal intubation. The morbidity and mortality are associatedwith early recognition and adequate management. Herein, wereported a case of perioperative management of neonatal trachealinjury following multiple attempts at endotracheal intubation causedby unanticipated difficulty. (Anesth Pain Med 2017; 12: 52-55) Key Words: Complication, Endotracheal intubation, Neonate.소아에서 기관내 삽관의 합병증으로 기도 손상이 발생하는 경우는 드물지만, 치명적인 결과를 초래할 수 있으므로빠른 진단과 적절한 처치가 매우 중요하다. 특히 신생아는기도의 해부학적 특성상 기관내 삽관과 연관된 손상에 더욱 취약하다[1]. 저자들은 출생 당일 기관내 삽관의 어려움으로 인해 삽관을 반복하여 시도한 후 기도 손상이 발생하여 외과적 봉합을 시행한 환아를 경험하여 문헌 고찰과 함께 보고하고자 한다.제왕절개술로 출생한 제태 기간 36주 여자 환아(신장 48cm, 체중 2,980 g)가 출생 직후부터 청색증을 보이고 말초산소포화도가 85%로 측정되어 마스크로 보조호흡을 하면서산소포화도가 95% 정도로 유지되었으나 호흡수가 상승하면서 흉부 함몰을 보여 출생 당일에 본원으로 전원되어 신생아 중환자실에 입원하였다. 입원 당시 환

In [18]:
! pip install openai --upgrade

Collecting openai
  Downloading openai-1.90.0-py3-none-any.whl (734 kB)
     -------------------------------------- 734.6/734.6 kB 9.2 MB/s eta 0:00:00
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 0.28.0
    Uninstalling openai-0.28.0:
      Successfully uninstalled openai-0.28.0
Successfully installed openai-1.90.0


In [34]:
import openai
from langchain_openai import OpenAIEmbeddings

# OpenAI의 "text-embedding-3-small" 모델을 사용하여 임베딩을 생성
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=OPENAI_API_KEY)


In [35]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# OpenAI 임베딩 설정
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=OPENAI_API_KEY
)

# 청킹 설정
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", " ", ""]
)

# 전체 벡터 저장용 리스트
all_texts = []
all_vectors = []
all_metadata = []

for i, item in enumerate(data):
    text = item["merged_text"]
    
    # 청크 분리
    chunks = text_splitter.split_text(text)
    
    # 임베딩 생성
    vectors = embeddings.embed_documents(chunks)
    
    # 저장
    all_texts.extend(chunks)
    all_vectors.extend(vectors)
    all_metadata.extend([{"source": f"doc_{i}", "chunk": j} for j in range(len(chunks))])


In [36]:
# 조건에 따라 year 값을 설정하는 함수
def get_year_from_source(source):
    if source in [f"doc_{i}" for i in range(0, 6)]:
        return 2017
    elif source == "doc_6":
        return 2018
    elif source in ["doc_7", "doc_8"]:
        return 2019
    else:
        return None  # 예외 처리용 (혹시 source 값이 다를 경우)

# year 값 추가
for item in all_metadata:
    source = item.get("source")
    item["year"] = get_year_from_source(source)

In [37]:
# 1. 문서 ID와 파일명 매핑 테이블 (사진 기반 순서에 따라 수정)
doc_id_to_filename = {
    "doc_0": "APM-12-052",
    "doc_1": "APM-12-147",
    "doc_2": "APM-12-240",
    "doc_3": "APM-12-243",
    "doc_4": "APM-12-335",
    "doc_5": "APM-12-339",
    "doc_6": "APM-12-173",
    "doc_7": "APM-12-040",
    "doc_8": "APM-12-044"
}

# 2. 기존 메타데이터 예시
# all_metadata = [{'source': 'doc_0', 'chunk': 0}, {'source': 'doc_0', 'chunk': 1}, ...]

# 3. source 필드 매핑
for item in all_metadata:
    old_id = item["source"]
    if old_id in doc_id_to_filename:
        item["source"] = doc_id_to_filename[old_id]

In [47]:
for meta in all_metadata:
    meta["paper_title"] = meta.pop("source")

all_metadata

[{'chunk': 0, 'year': 2017, 'paper_title': 'APM-12-052'},
 {'chunk': 1, 'year': 2017, 'paper_title': 'APM-12-052'},
 {'chunk': 2, 'year': 2017, 'paper_title': 'APM-12-052'},
 {'chunk': 3, 'year': 2017, 'paper_title': 'APM-12-052'},
 {'chunk': 4, 'year': 2017, 'paper_title': 'APM-12-052'},
 {'chunk': 5, 'year': 2017, 'paper_title': 'APM-12-052'},
 {'chunk': 6, 'year': 2017, 'paper_title': 'APM-12-052'},
 {'chunk': 7, 'year': 2017, 'paper_title': 'APM-12-052'},
 {'chunk': 8, 'year': 2017, 'paper_title': 'APM-12-052'},
 {'chunk': 9, 'year': 2017, 'paper_title': 'APM-12-052'},
 {'chunk': 10, 'year': 2017, 'paper_title': 'APM-12-052'},
 {'chunk': 11, 'year': 2017, 'paper_title': 'APM-12-052'},
 {'chunk': 12, 'year': 2017, 'paper_title': 'APM-12-052'},
 {'chunk': 13, 'year': 2017, 'paper_title': 'APM-12-052'},
 {'chunk': 14, 'year': 2017, 'paper_title': 'APM-12-052'},
 {'chunk': 0, 'year': 2017, 'paper_title': 'APM-12-147'},
 {'chunk': 1, 'year': 2017, 'paper_title': 'APM-12-147'},
 {'chunk'

In [None]:
import pinecone
from tqdm import tqdm  # 진행 상황 보기 용도
import os
from pinecone import Pinecone, ServerlessSpec

# 인덱스 이름 지정
PINECONE_INDEX_NAME = 'boaz01'

# Initialize clients
openai_client = OpenAI(api_key=OPENAI_API_KEY)
#pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

# Pinecone 인스턴스 생성
pc = Pinecone(api_key=PINECONE_API_KEY)

# 인덱스가 존재하는지 확인 후 연결
existing_indexes = [index["name"] for index in pc.list_indexes()]
if PINECONE_INDEX_NAME not in existing_indexes:
    raise ValueError(f"❌ '{PINECONE_INDEX_NAME}' 인덱스가 존재하지 않습니다. Pinecone에서 생성해주세요!")

# 인덱스 연결
index = pc.Index(PINECONE_INDEX_NAME)

# 업서트할 벡터 묶음 생성 (id, vector, metadata 포함)
to_upsert = []
for i, (vec, meta) in enumerate(zip(all_vectors, all_metadata)):
    to_upsert.append((f"chunk_{i}", vec, meta))

# 배치로 업서트 (100개씩)
batch_size = 100
for i in tqdm(range(0, len(to_upsert), batch_size)):
    batch = to_upsert[i:i+batch_size]
    index.upsert(vectors=batch)

100%|██████████| 3/3 [00:08<00:00,  2.75s/it]


--------------여기 밑에는 522버전이고, 위에는 621 다은이 json가지고 증례보고 임베딩생성성

### document생성

In [51]:
data_list = data['elements']

In [52]:
for element in data_list:
    print(element)

{'category': 'heading1', 'content': {'html': "<h1 id='0' style='font-size:20px'>apm<br>ANESTHESIA & PAIN MEDICINE</h1>", 'markdown': '# apm\nANESTHESIA & PAIN MEDICINE', 'text': 'apm\nANESTHESIA & PAIN MEDICINE'}, 'coordinates': [{'x': 0.0427, 'y': 0.0273}, {'x': 0.2725, 'y': 0.0273}, {'x': 0.2725, 'y': 0.0953}, {'x': 0.0427, 'y': 0.0953}], 'id': 0, 'page': 1}
{'category': 'paragraph', 'content': {'html': "<br><p id='1' data-category='paragraph' style='font-size:14px'>Check for<br>updates</p>", 'markdown': 'Check for\nupdates', 'text': 'Check for\nupdates'}, 'coordinates': [{'x': 0.8787, 'y': 0.0481}, {'x': 0.9184, 'y': 0.0481}, {'x': 0.9184, 'y': 0.0615}, {'x': 0.8787, 'y': 0.0615}], 'id': 1, 'page': 1}
{'category': 'heading1', 'content': {'html': "<br><h1 id='2' style='font-size:20px'>Case Report</h1>", 'markdown': '# Case Report', 'text': 'Case Report'}, 'coordinates': [{'x': 0.7266, 'y': 0.0715}, {'x': 0.8604, 'y': 0.0715}, {'x': 0.8604, 'y': 0.0938}, {'x': 0.7266, 'y': 0.0938}], '

In [53]:
from langchain_core.documents import Document

documents = []
export_categories = {'heading1', 'paragraph', 'list'}
except_text = ['기본 기능','설정', '애플리케이션', '부록']

for element in data_list:

    text_content = element['content']['text'].strip()
    if text_content in except_text:
        continue

    category = element['category']

    if category != 'table' and category not in export_categories:
         continue

     # 기본 메타데이터 구성
    metadata = {
        'type': category,
        'page': element['page'],
        'text': element['content']['text']
        }
    

    documents.append(
        Document(page_content=element['content']['markdown'], metadata=metadata))


In [54]:
documents

[Document(metadata={'type': 'heading1', 'page': 1, 'text': 'apm\nANESTHESIA & PAIN MEDICINE'}, page_content='# apm\nANESTHESIA & PAIN MEDICINE'),
 Document(metadata={'type': 'paragraph', 'page': 1, 'text': 'Check for\nupdates'}, page_content='Check for\nupdates'),
 Document(metadata={'type': 'heading1', 'page': 1, 'text': 'Case Report'}, page_content='# Case Report'),
 Document(metadata={'type': 'paragraph', 'page': 1, 'text': 'Anesth Pain Med 2021;16:273-278\nhttps://doi.org/10.17085/apm.20099\npISSN 1975-5171 · eISSN 2383-7977'}, page_content='Anesth Pain Med 2021;16:273-278\nhttps://doi.org/10.17085/apm.20099\npISSN 1975-5171 · eISSN 2383-7977'),
 Document(metadata={'type': 'paragraph', 'page': 1, 'text': 'Received December 28, 2020\nRevised March 10, 2021\nAccepted March 14, 2021'}, page_content='Received December 28, 2020\nRevised March 10, 2021\nAccepted March 14, 2021'),
 Document(metadata={'type': 'paragraph', 'page': 1, 'text': 'Corresponding author'}, page_content='Correspond

### 청킹

In [55]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 청킹 함수 정의
def chunk_documents(documents, chunk_size=500, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )

    # 새로운 documents 리스트로 청크 생성
    new_documents = []
    
    for doc in documents:
        # page_content에서 청킹
        chunks = text_splitter.split_text(doc.page_content)
        
        # 청크마다 새로운 Document로 생성
        for i, chunk in enumerate(chunks):
            new_doc = Document(
                metadata={
                    #'source': doc.metadata['source'],
                    #'id': f"{doc.metadata['id']}-{i}",  # 각 청크마다 id 변경
                    #'header' : doc.metadata['header'],
                    'type': doc.metadata['type'],
                    'page': doc.metadata['page'],
                    'text': doc.metadata['text']
                    #'paper_title': doc.metadata['paper_title'],
                    #'author': doc.metadata['author']
                },
                page_content=chunk
            )
            new_documents.append(new_doc)
    
    return new_documents

# 청킹 실행
chunked_documents = chunk_documents(documents)


In [56]:
chunked_documents[0].page_content

'# apm\nANESTHESIA & PAIN MEDICINE'

### 임베딩

In [119]:
! pip install openai pymupdf



In [120]:
! pip install openai



In [121]:
! pip install langchain_openai




In [57]:
import os
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings

# .env 파일에서 환경 변수 로드
load_dotenv()

# 환경 변수에서 OpenAI API 키 가져오기
api_key = os.getenv("OPENAI_API_KEY")

# OpenAI의 "text-embedding-3-small" 모델을 사용하여 임베딩을 생성
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=api_key)

In [58]:
import os
import json

# 임베딩과 메타데이터 저장할 리스트
embedding_data = []
metadata_list = []

# 청킹된 문서 리스트 순회
for idx, doc in enumerate(documents):
    text = doc.page_content  # 청킹된 문서 내용
    metadata = doc.metadata  # 메타데이터

    # OpenAI 임베딩 생성
    embedding = embeddings.embed_query(text)

    # 임베딩 데이터 저장
    embedding_data.append({
        "id": f"chunk_{idx}",  # 고유 ID (추후 Pinecone에 저장 시 사용)
        "embedding": embedding
    })

    # 메타데이터 저장
    metadata_list.append({
        "id": f"chunk_{idx}",
        "metadata": metadata
    })

# JSON 파일로 저장 (임시)
with open("embeddings.json", "w", encoding="utf-8") as f:
    json.dump(embedding_data, f, ensure_ascii=False, indent=4)

with open("metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata_list, f, ensure_ascii=False, indent=4)

print("✅ 임베딩 & 메타데이터 저장 완료!")

✅ 임베딩 & 메타데이터 저장 완료!


### 파인콘에 적재

In [59]:
# JSON 파일 경로 (파일이 있는 위치에 맞게 수정!)
embeddings_file = "embeddings.json"
metadata_file = "metadata.json"

# JSON 파일 로드
with open(embeddings_file, "r", encoding="utf-8") as f:
    embeddings_data = json.load(f)

with open(metadata_file, "r", encoding="utf-8") as f:
    metadata_data = json.load(f)

print(f"✅ 임베딩 {len(embeddings_data)}개, 메타데이터 {len(metadata_data)}개 로드 완료!")

✅ 임베딩 71개, 메타데이터 71개 로드 완료!


In [60]:
print("🔍 샘플 임베딩 데이터:", embeddings_data[0])  # 첫 번째 벡터 확인
print("🔍 샘플 메타데이터 데이터:", metadata_data[0])  # 첫 번째 메타데이터 확인

🔍 샘플 임베딩 데이터: {'id': 'chunk_0', 'embedding': [0.019461218267679214, 0.019984591752290726, 0.02458476647734642, 0.022697867825627327, -0.03911525756120682, -0.011479777283966541, 0.0013299532001838088, 0.017836006358265877, 0.00010889260738622397, -0.007953895255923271, -0.025948291644454002, -0.01992950029671192, 0.005120104644447565, 0.02972208708524704, 0.05437571927905083, -0.004266180098056793, 0.046828124672174454, -0.001011453103274107, 0.03677385300397873, 0.014792178757488728, 0.022573910653591156, 0.008277559652924538, -0.0370493121445179, -0.0036567256320267916, -0.022339770570397377, -0.03379888832569122, -0.005822526756674051, -0.004789553117007017, 0.00017528266471344978, 0.000170225408510305, 0.02654052898287773, -0.036718759685754776, 0.04178721457719803, 0.01757431961596012, -0.01700962893664837, 0.023772159591317177, -0.007409862242639065, 0.03801342099905014, -0.011307614855468273, -0.06285987049341202, -0.02136188931763172, -0.01570119522511959, 0.01717490330338478, 

In [61]:
import pinecone
#apm-20099_processing.json은 baegjiyeon 파인콘에 적재함!
#전처리 전 json은 ewha 파인콘에 적재하겠음.

# Initialize clients
openai_client = OpenAI(api_key=OPENAI_API_KEY)
#pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

# Pinecone 인스턴스 생성
pc = Pinecone(api_key=PINECONE_API_KEY)

# 인덱스가 존재하는지 확인 후 연결
existing_indexes = [index["name"] for index in pc.list_indexes()]
if PINECONE_INDEX_NAME not in existing_indexes:
    raise ValueError(f"❌ '{PINECONE_INDEX_NAME}' 인덱스가 존재하지 않습니다. Pinecone에서 생성해주세요!")

# 인덱스 연결
index = pc.Index(PINECONE_INDEX_NAME)

In [62]:
# Pinecone에 벡터 업로드
vectors_to_upsert = []

for embedding_item, metadata_item in zip(embeddings_data, metadata_data):
    vectors_to_upsert.append({
        "id": embedding_item["id"],  # 고유 ID (chunk ID)
        "values": embedding_item["embedding"],  # 임베딩 벡터 값
        "metadata": metadata_item["metadata"]  # 문서 메타데이터
    })

# Pinecone에 벡터 한 번에 업로드
index.upsert(vectors=vectors_to_upsert)
print(f"✅ {len(vectors_to_upsert)}개 벡터 한 번에 업로드 완료!")

✅ 71개 벡터 한 번에 업로드 완료!


### 리트리버 생성

In [146]:
! pip install transformers



In [147]:
! pip install transformers pinecone-client sentence-transformers



ERROR: Could not install packages due to an OSError: [WinError 5] 액세스가 거부되었습니다: 'C:\\Users\\USER\\anaconda3\\anaconda\\Lib\\site-packages\\~-mpy.libs\\libopenblas64__v0.3.23-293-gc2f4bdbb-gcc_10_3_0-2bde3a66a51006b2b53eb373ff767a3f.dll'
Consider using the `--user` option or check the permissions.




Collecting numpy>=1.17
  Downloading numpy-1.24.4-cp39-cp39-win_amd64.whl (14.9 MB)
     --------------------------------------- 14.9/14.9 MB 16.4 MB/s eta 0:00:00
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4


In [148]:
! pip install pinecone-client transformers sentence-transformers




In [None]:
from dotenv import load_dotenv
from pinecone import Pinecone
import os
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone as LangchainPinecone
from langchain_community.vectorstores import Pinecone
import pinecone
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_openai import OpenAIEmbeddings

# # OpenAI API 키를 직접 전달
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# index_name지정
index_name = 'boazadv1'

#벡터스토어 
vectorstore = LangchainPinecone(index,embeddings, text_key="text")  # 'text' 필드 지정

#리트리버 생성 
retriever = vectorstore.as_retriever(search_kwargs = {"k":10})


In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate

#쿼리 실행
query = "During post-insertion chest X-ray evaluation, what radiographic clue strongly suggests that a PICC has entered the internal jugular vein instead of the central circulation?"
relevant_docs = retriever.invoke(query)

#결과
print("Retrieved Douments:")
for doc in relevant_docs:
    print(doc.page_content)

# OpenAI LLM 설정
llm = ChatOpenAI(model_name="gpt-4", openai_api_key=OPENAI_API_KEY)

from langchain_core.prompts import PromptTemplate
# 🔹 프롬프트 정의

custom_prompt = PromptTemplate.from_template(
"""
You are an AI assistant specializing in pediatric anesthesia. 
Your primary task is to provide medically relevant, research-backed responses to assist anesthesiologists.
When responding, follow these guidelines:

1. **Keyword Extraction & Search**:
   - Identify key medical terms in the query.
   - Retrieve relevant data from your knowledge base or provided documents.

2. **Context-Aware Answering**:
   - Provide clear, structured responses based on scientific literature and clinical guidelines.
   - If necessary, list potential differential diagnoses, risk factors, or treatment considerations.

3. **Reference-Based Validation**:
   - Cite relevant sources whenever possible.
   - If the answer is based on general medical knowledge, explicitly state that.

4. **Clinically Usable Information**:
   - Keep responses concise yet informative.
   - Offer actionable insights that a medical professional can apply in practice.

### Example Query:  
*"What are the most common complications of pediatric anesthesia
"*  

### Example Response:  
- **Common Complications**: Postoperative nausea and vomiting (PONV), respiratory depression, bradycardia.  
- **Risk Factors**: Age < 3 years, pre-existing respiratory conditions, opioid use.  
- **Recommended Management**: Preoperative fasting adherence, appropriate opioid titration, intraoperative antiemetic administration.  
- **References**: [Study XYZ, 2023], [Pediatric Anesthesia Guidelines, ASA].  

Now, process the following query and generate a structured response.

Query: {query}
Answer:
""")

# 2. RAG 체인 생성 (stuff 체인에 프롬프트 삽입)
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": custom_prompt}  # 💡 여기에 프롬프트 삽입
)

query = "During post-insertion chest X-ray evaluation, what radiographic clue strongly suggests that a PICC has entered the internal jugular vein instead of the central circulation?"

response = rag_chain.invoke({"query": query})
print("Generated Answer:")
print(response)


Retrieved Douments:
Conceptualization: Hee-Soo Kim. Data curation: Sang-
Hwan Ji, Sol Ji Yoo, Hee-Soo Kim. Formal analysis: Sol Ji Yoo,
Eun-Hee Kim, Ji-Hyun Lee. Methodology: Young-Eun Jang,
Jin-Tae Kim. Investigation: Sang-Hwan Ji, Sung-Ae Cho, Eun-
Hee Kim, Hee-Soo Kim. Software: Sung-Ae Cho, Young-Eun
Jang, Eun-Hee Kim. Writing - original draft: Sang-Hwan Ji,
Sol Ji Yoo, Sung-Ae Cho, Hee-Soo Kim. Writing - review &
editing: Sang-Hwan Ji, Young-Eun Jang, Eun-Hee Kim, Ji-
Hyun Lee, Jin-Tae Kim, Hee-Soo Kim.
Subject Value
 Catheter insertion length (cm) 28 (26, 32)
 Number of attempts to success 
 1 19 (63.3)
 2 6 (20.0)
 3 2 (6.7)
 Reason for failed attempts (n = 17) 
 Venipuncture failure 3 (17.6)
 Guidewire insertion failure 2 (11.8)
 Dilation failure 3 (17.6)
 Catheter advancement failure 9 (52.9)
 Catheter tip position 
 Optimal* 14 (46.7)
 Suboptimal† 8 (26.7)
 Malpositioning‡ 5 (16.7)
 Failure§ 3 (10.0)
Catheter insertion technique
Among the 30 patients, the procedure was comple

ValidationError: 1 validation error for StuffDocumentsChain
  Value error, document_variable_name context was not found in llm_chain input_variables: ['query'] [type=value_error, input_value={'llm_chain': LLMChain(ve...None, 'callbacks': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/value_error

In [26]:
# 쿼리 실행
query ="How many cases experienced failure of PICC insertion due to both veins being unsuitable?"
relevant_docs = retriever.invoke(query)

# 결과 출력
print("Retrieved Documents:")
for doc in relevant_docs:
    print(doc.page_content)

# OpenAI LLM 설정
llm = ChatOpenAI(model_name="gpt-4", openai_api_key=OPENAI_API_KEY)

# RAG (Retrieval-Augmented Generation) 모델 생성
rag_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

# 질의 실행
response = rag_chain.invoke({"query": query})
print("Generated Answer:")
print(response)

Retrieved Documents:
Conceptualization: Hee-Soo Kim. Data curation: Sang-
Hwan Ji, Sol Ji Yoo, Hee-Soo Kim. Formal analysis: Sol Ji Yoo,
Eun-Hee Kim, Ji-Hyun Lee. Methodology: Young-Eun Jang,
Jin-Tae Kim. Investigation: Sang-Hwan Ji, Sung-Ae Cho, Eun-
Hee Kim, Hee-Soo Kim. Software: Sung-Ae Cho, Young-Eun
Jang, Eun-Hee Kim. Writing - original draft: Sang-Hwan Ji,
Sol Ji Yoo, Sung-Ae Cho, Hee-Soo Kim. Writing - review &
editing: Sang-Hwan Ji, Young-Eun Jang, Eun-Hee Kim, Ji-
Hyun Lee, Jin-Tae Kim, Hee-Soo Kim.
Although some reports have described landmark-based
determination of PICC insertion length, they are limited to
adults [7]. Since there is no standardized method to deter-
mine the appropriate length of PICC insertion in children
and no fluoroscopy was available, we simply measured the
distance between the targeted insertion site and the sternal
notch [7]. The insertion length was optimal in approximately
half of the cases and was acceptable (optimal and subopti-
mal combined) in 

In [33]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

# 프롬프트 정의: 문서와 질문을 기반으로 명확한 답변을 유도
prompt_template = """
You are a medical assistant helping to analyze clinical research data.

Use the following context extracted from a study to answer the question as precisely as possible.
If the answer is not contained in the context, say "The context does not provide an answer."

--------------------
Context:
{context}
--------------------

Question: {query}

Answer in one sentence:
"""

custom_prompt = PromptTemplate(
    input_variables=["context", "query"],
    template=prompt_template,
)

# OpenAI 모델 설정
llm = ChatOpenAI(model_name="gpt-4", openai_api_key=OPENAI_API_KEY)

# Retrieval QA 체인 구성
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": custom_prompt},
    return_source_documents=True  # 어디서 답을 가져왔는지도 확인할 수 있게 설정
)

# 질의 실행
query = "How many cases experienced failure of PICC insertion due to both veins being unsuitable?"
response = rag_chain.invoke({"query": query})

# 결과 출력
print("Generated Answer:")
print(response["result"])  # 결과만 보기

print("\nSource Document(s):")
for doc in response["source_documents"]:
    print(doc.page_content)


ValueError: Missing some input keys: {'query'}