In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so the
# os.getenv lookups in the configuration cell can read them.
load_dotenv()

In [None]:
import os

# OpenAI settings. os.getenv returns None for any variable that is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_LLM_MODEL = os.getenv("OPENAI_LLM_MODEL")
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL")

# Pinecone settings (credentials plus the parameters of the index).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
PINECONE_INDEX_REGION = os.getenv("PINECONE_INDEX_REGION")
PINECONE_INDEX_CLOUD = os.getenv("PINECONE_INDEX_CLOUD")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_INDEX_METRIC = os.getenv("PINECONE_INDEX_METRIC")

# The dimension is the only value converted here, so validate it explicitly:
# a missing or malformed variable should fail with a message that names it
# rather than with int()'s generic complaint about NoneType.
_raw_index_dimension = os.getenv("PINECONE_INDEX_DIMENSION")
if _raw_index_dimension is None:
    raise ValueError(
        "PINECONE_INDEX_DIMENSION is not set; define it in the environment or the .env file"
    )
try:
    PINECONE_INDEX_DIMENSION = int(_raw_index_dimension)
except ValueError as exc:
    raise ValueError(
        f"PINECONE_INDEX_DIMENSION must be an integer, got {_raw_index_dimension!r}"
    ) from exc


In [None]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client used for the index management calls in this notebook.
pc = Pinecone(
    api_key=PINECONE_API_KEY
)

# Create the serverless index only when it does not exist yet. create_index
# is rejected for a name that is already taken, so without this guard the
# cell fails on every run after the first (breaking Restart & Run All).
if PINECONE_INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=PINECONE_INDEX_NAME,
        dimension=PINECONE_INDEX_DIMENSION,
        metric=PINECONE_INDEX_METRIC,
        spec=ServerlessSpec(
            region=PINECONE_INDEX_REGION,
            cloud=PINECONE_INDEX_CLOUD
        )
    )

In [None]:
# Open a handle to the index named by PINECONE_INDEX_NAME.
wine_index = pc.Index(PINECONE_INDEX_NAME)
# Last expression of the cell, so the notebook displays the returned index statistics.
wine_index.describe_index_stats()

In [None]:
from langchain_community.document_loaders import CSVLoader

# Path of the source CSV, relative to the notebook's working directory.
WINE_CSV_PATH = "../../datas/winemag-data-130k-v2.csv"

# Read the CSV as UTF-8 and load it into a list of documents.
loader = CSVLoader(file_path=WINE_CSV_PATH, encoding="utf-8")
docs = loader.load()

# Display the first document as a quick sanity check of the loaded content.
docs[0]

In [None]:
# Number of documents produced by the loader.
print(len(docs))
# Length in characters of the longest page_content. default=0 keeps the cell
# from raising ValueError when no documents were loaded.
print(max((len(doc.page_content) for doc in docs), default=0))

In [7]:
from langchain_openai import OpenAIEmbeddings

# Embedding client configured from the environment-provided model name and API key.
embedding = OpenAIEmbeddings(
    model=OPENAI_EMBEDDING_MODEL,
    openai_api_key=OPENAI_API_KEY,
)

In [None]:
from langchain_pinecone import PineconeVectorStore

In [None]:
# Number of documents sent to the vector store per call.
BATCH_SIZE = 300

# Build the vector store client once and reuse it for every batch instead of
# constructing a fresh one per batch through from_documents.
indexing_store = PineconeVectorStore(
    index_name=PINECONE_INDEX_NAME,
    embedding=embedding
)

# (first position, last position, exception) for every batch that failed.
failed_batches = []

for i in range(0, len(docs), BATCH_SIZE):
    batch = docs[i:i + BATCH_SIZE]
    last = i + len(batch) - 1
    # Ids derived from the document's position in `docs`: re-running the cell
    # overwrites the same vectors rather than inserting duplicates under
    # freshly generated random ids.
    batch_ids = [f"wine-{position}" for position in range(i, i + len(batch))]
    try:
        indexing_store.add_documents(documents=batch, ids=batch_ids)

        print(f"{i}~{last} documents indexed")
    except Exception as e:
        # Best effort: continue with the next batch, but remember the failure
        # so it is not lost among the progress lines.
        failed_batches.append((i, last, e))
        print(f"Error indexing documents {i}~{last}: {e}")

# Final summary so partial indexing is obvious without scrolling the log.
if failed_batches:
    failed_ranges = ", ".join(f"{first}~{end}" for first, end, _ in failed_batches)
    print(f"{len(failed_batches)} batch(es) failed and were not indexed: {failed_ranges}")


In [11]:
# Number of matches to retrieve and number of characters of each to preview.
TOP_K = 5
PREVIEW_CHARS = 100

# Vector store bound to the existing index, using the same embedding client
# for the query text.
vector_store = PineconeVectorStore(index_name=PINECONE_INDEX_NAME, embedding=embedding)

# Korean query text, roughly "wine with a sweet taste".
query = "달콤한 맛을 가진 와인"
results = vector_store.similarity_search(query, k=TOP_K)

# Print a truncated preview of each returned document.
for result in results:
    preview = result.page_content[:PREVIEW_CHARS]
    print(f"Content: {preview}...")

Content: : 11089
country: US
description: Ripe hay, vanilla, and apricot mark the nose; deep orange flavors w...
Content: : 6319
country: US
description: Aromas of candy corn, spice and almond butter are followed by full-b...
Content: : 7687
country: US
description: Smoothly seductive in bright red-berry and pomegranate fruit, this y...
Content: : 10299
country: US
description: High on the deliciousness-factor, this shows soft, seemingly sweet ...
Content: : 10062
country: US
description: This rich cream-textured blend of Chardonnay and Viognier evokes wa...
