In [1]:
# %pip install python-dotenv pandas pinecone langchain langchain-openai langchain-pinecone scikit-learn matplotlib


In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_LLM_MODEL = os.getenv("OPENAI_LLM_MODEL")  # 'gpt-4o-mini'
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_REGION = os.getenv("PINECONE_INDEX_REGION")
PINECONE_INDEX_CLOUD = os.getenv("PINECONE_INDEX_CLOUD")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")  # 'ir'
PINECONE_INDEX_METRIC = os.getenv("PINECONE_INDEX_METRIC")
PINECONE_INDEX_DIMENSION = int(os.getenv("PINECONE_INDEX_DIMENSION"))

# 압축 인덱스 이름
COMPRESSED_INDEX_NAME = f"{PINECONE_INDEX_NAME}-compressed"

print(f"환경 변수 로딩 완료 : {PINECONE_INDEX_NAME}, {PINECONE_INDEX_REGION}, {PINECONE_INDEX_CLOUD}")

환경 변수 로딩 완료 : ir-embeddings, us-east-1, aws


In [3]:
import pandas as pd

documents_df = pd.read_csv("../../datas/documents.csv")
queries_df = pd.read_csv("../../datas/queries.csv")

print(f"문서 수: {len(documents_df)}")
print(f"질의 수: {len(queries_df)}")

문서 수: 30
질의 수: 30


In [4]:
import time
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# ChatOpenAI 인스턴스
chat_model = ChatOpenAI(
    model_name=OPENAI_LLM_MODEL,
    openai_api_key=OPENAI_API_KEY,
    temperature=0.3
)

# PromptTemplate 설정
summarize_prompt = PromptTemplate(
    input_variables=["text"],
    template="""
아래 문서를 읽고, 핵심 내용을 짧고 간결하게 요약하세요:

{text}

요약:
"""
)

# StrOutputParser 설정
output_parser = StrOutputParser()

# 체인 구성
summarization_chain = summarize_prompt | chat_model | output_parser
print("요약용 LangChain 체인 구성 완료")

# 문서별 요약 생성
compressed_texts = []
for idx, row in documents_df.iterrows():
    doc_id = row['doc_id']
    content = row['content']
    summary = summarization_chain.invoke({"text": content})
    compressed_texts.append({'doc_id': doc_id, 'content': summary})
    time.sleep(1)  # 호출 제한 관리

compressed_df = pd.DataFrame(compressed_texts)
print("GPT-4o-mini 기반 문서 압축 완료")

요약용 LangChain 체인 구성 완료
GPT-4o-mini 기반 문서 압축 완료


In [5]:
compressed_df.to_csv("../../datas/compressed_documents.csv", index=True)

In [6]:
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

# Pinecone 클라이언트 연결
pc = Pinecone(api_key=PINECONE_API_KEY)

# 압축 인덱스 없으면 생성
if COMPRESSED_INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=COMPRESSED_INDEX_NAME,
        dimension=PINECONE_INDEX_DIMENSION,
        metric=PINECONE_INDEX_METRIC,
        spec=ServerlessSpec(region=PINECONE_INDEX_REGION, cloud=PINECONE_INDEX_CLOUD)
    )
compressed_index = pc.Index(COMPRESSED_INDEX_NAME)

# 임베딩 모델 생성
embedding_model = OpenAIEmbeddings(model=OPENAI_EMBEDDING_MODEL, openai_api_key=OPENAI_API_KEY)
# 원본 벡터 스토어 (기존 ir)
vector_store = PineconeVectorStore(index_name=PINECONE_INDEX_NAME, embedding=embedding_model)
# 압축 벡터 스토어
compressed_vector_store = PineconeVectorStore(index_name=COMPRESSED_INDEX_NAME, embedding=embedding_model)

print("Pinecone 연결 및 벡터 스토어 설정 완료")

  from .autonotebook import tqdm as notebook_tqdm


Pinecone 연결 및 벡터 스토어 설정 완료


In [7]:
from langchain.schema import Document

# 압축 문서를 compressed index에 업서트
docs_to_upsert = []
for idx, row in compressed_df.iterrows():
    docs_to_upsert.append(Document(page_content=row['content'], metadata={'doc_id': row['doc_id']}))
compressed_vector_store.add_documents(docs_to_upsert)
print("압축 문서 업서트 완료")

압축 문서 업서트 완료


In [8]:
import numpy as np

def parse_relevant(relevant_str):
    pairs = relevant_str.split(';')
    rel_dict = {}
    for pair in pairs:
        doc_id, grade = pair.split('=')
        rel_dict[doc_id] = int(grade)
    return rel_dict

def compute_metrics(predicted, relevant_dict, k=5):
    hits = sum(1 for doc in predicted[:k] if doc in relevant_dict)
    precision = hits / k
    total_relevant = len(relevant_dict)
    recall = hits / total_relevant if total_relevant > 0 else 0
    rr = 0
    for idx, doc in enumerate(predicted):
        if doc in relevant_dict:
            rr = 1 / (idx + 1)
            break
    num_correct = 0
    precisions = []
    for i, doc in enumerate(predicted[:k]):
        if doc in relevant_dict:
            num_correct += 1
            precisions.append(num_correct / (i + 1))
    ap = np.mean(precisions) if precisions else 0
    return precision, recall, rr, ap

def evaluate_all(results_dict, queries_df, k=5):
    prec_list, rec_list, rr_list, ap_list = [], [], [], []
    for idx, row in queries_df.iterrows():
        qid = row['query_id']
        relevant = parse_relevant(row['relevant_doc_ids'])
        predicted = results_dict[qid]
        p, r, rr, ap = compute_metrics(predicted, relevant, k)
        prec_list.append(p)
        rec_list.append(r)
        rr_list.append(rr)
        ap_list.append(ap)
    return {
        'P@5': np.mean(prec_list),
        'R@5': np.mean(rec_list),
        'MRR': np.mean(rr_list),
        'MAP': np.mean(ap_list)
    }


In [9]:
# 원본(index 'ir')과 압축(index 'ir-compressed')에서 검색 수행
orig_results = {}
comp_results = {}
for idx, row in queries_df.iterrows():
    qid = row['query_id']
    query_text = row['query_text']
    # 원본 검색
    docs_orig = vector_store.similarity_search(query_text, k=5)
    orig_results[qid] = [doc.metadata['doc_id'] for doc in docs_orig]
    # 압축 검색
    docs_comp = compressed_vector_store.similarity_search(query_text, k=5)
    comp_results[qid] = [doc.metadata['doc_id'] for doc in docs_comp]

print("검색 결과 수집 완료")
# 평가
orig_metrics = evaluate_all(orig_results, queries_df, k=5)
comp_metrics = evaluate_all(comp_results, queries_df, k=5)

import pandas as pd
df_metrics = pd.DataFrame({
    'Metric': ['P@5', 'R@5', 'MRR', 'MAP'],
    'Original': [orig_metrics['P@5'], orig_metrics['R@5'], orig_metrics['MRR'], orig_metrics['MAP']],
    'Compressed': [comp_metrics['P@5'], comp_metrics['R@5'], comp_metrics['MRR'], comp_metrics['MAP']]
})
df_metrics

검색 결과 수집 완료


Unnamed: 0,Metric,Original,Compressed
0,P@5,0.466667,0.24
1,R@5,1.716667,0.816667
2,MRR,0.977778,0.833333
3,MAP,0.978148,0.818519
