In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [2]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small", 
    api_key=OPENAI_API_KEY
)

In [3]:
from pinecone import Pinecone, ServerlessSpec

# API 키 설정 (본인의 API 키로 교체하세요)
pcone = Pinecone(api_key=PINECONE_API_KEY)   # Pinecone API 키 입력

# Pinecone 인덱스 생성
index_name = "wiki-vector-index"  # 인덱스 이름 설정
if not pcone.has_index(index_name):
    pcone.create_index(
        name=index_name,
        dimension=1536,            # 임베딩 벡터 차원 (모델 output 크기와 맞춤)
        metric="cosine",           # 유사도 metric 선택
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
# 생성된 인덱스에 연결
wiki_index = pcone.Index(index_name)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from datasets import load_dataset
data = load_dataset("wikipedia", "20220301.simple", split="train[:100]", trust_remote_code=True)

In [12]:
for record in data[:3]:
    print(record)

id
url
title
text


In [7]:
len(data)

100

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)

In [9]:
data[0].keys()

dict_keys(['id', 'url', 'title', 'text'])

In [10]:
texts = []
metas = []
batch_size = 100
count = 0

for i, sample in enumerate(data):
    text = sample["text"]
    metadata = {
        "title": sample["title"],
        "wiki_id": sample["id"],
        "url": sample["url"]
    }

    chunks = splitter.split_text(text)
    for i, chunk in enumerate(chunks):
        record = {
            "chunk_id": i,
            "text": text,
            **metadata
        }

        texts.append(chunk)
        metas.append(record)
        count += 1

        if count % batch_size == 0:
            vectors = embeddings.embed_documents(texts)
            ids = [f"{record['wiki_id']}_{record['chunk_id']}" for record in metas]
            wiki_index.upsert(zip(ids, vectors, metas))
            texts = []
            metas = []
            print(f"Upserted {count} records")

            




Upserted 100 records
Upserted 200 records
Upserted 300 records
Upserted 400 records
Upserted 500 records
Upserted 600 records
Upserted 700 records
Upserted 800 records
Upserted 900 records
Upserted 1000 records
Upserted 1100 records
Upserted 1200 records
Upserted 1300 records
Upserted 1400 records
Upserted 1500 records


In [11]:
from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore(
    index=wiki_index,
    embedding=embeddings,
    text_key="text"
)

question = "벨기에(Belgium)는 어디 있나요?"

docs = vector_store.similarity_search(query=question, k=5)
for doc in docs:
    print(doc.metadata)

{'chunk_id': 0.0, 'title': 'Belgium', 'url': 'https://simple.wikipedia.org/wiki/Belgium', 'wiki_id': '103'}
{'chunk_id': 20.0, 'title': 'Belgium', 'url': 'https://simple.wikipedia.org/wiki/Belgium', 'wiki_id': '103'}
{'chunk_id': 2.0, 'title': 'Belgium', 'url': 'https://simple.wikipedia.org/wiki/Belgium', 'wiki_id': '103'}
{'chunk_id': 19.0, 'title': 'Belgium', 'url': 'https://simple.wikipedia.org/wiki/Belgium', 'wiki_id': '103'}
{'chunk_id': 4.0, 'title': 'Belgium', 'url': 'https://simple.wikipedia.org/wiki/Belgium', 'wiki_id': '103'}
