In [1]:
from pinecone import Pinecone
from dotenv import load_dotenv
import os
# Pull settings from a local .env file into the process environment, then
# build the Pinecone client from the API key found there.
load_dotenv()
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

In [None]:
# index = pc.Index(os.getenv("PINECONE_INDEX_NAME"))
# index.describe_index_stats()

In [None]:
from pinecone import Pinecone, ServerlessSpec
index_name = "wine-embeddings"

# Create the serverless index only when it is missing. create_index() fails for
# a name that already exists, which would break re-running this cell.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # model (embedding) dimension
        metric="cosine",  # model metric
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [3]:
# Open a handle to the index identified by index_name, then show its
# statistics as the cell output.
index = pc.Index(name=index_name)
index.describe_index_stats()

  from .autonotebook import tqdm as notebook_tqdm


{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 1280}},
 'total_vector_count': 1280,
 'vector_type': 'dense'}

# 데이터 로드

In [4]:
from langchain_community.document_loaders import CSVLoader

# Read the wine-review CSV. The encoding is given explicitly so that decoding
# the non-ASCII text in the file does not depend on the platform's default codec.
loader = CSVLoader("./winemag-data-130k-v2.csv", encoding="utf-8")
docs = loader.load()

# Preview the first three documents.
for i, doc in enumerate(docs[:3]):
    print(i, doc)

0 page_content=': 0
country: Italy
description: Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.
designation: Vulkà Bianco
points: 87
price: 
province: Sicily & Sardinia
region_1: Etna
region_2: 
taster_name: Kerin O’Keefe
taster_twitter_handle: @kerinokeefe
title: Nicosia 2013 Vulkà Bianco  (Etna)
variety: White Blend
winery: Nicosia' metadata={'source': './winemag-data-130k-v2.csv', 'row': 0}
1 page_content=': 1
country: Portugal
description: This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.
designation: Avidagos
points: 87
price: 15.0
province: Douro
region_1: 
region_2: 
taster_name: Roger Voss
taster_twitter_handle: @vossroger
title: Quinta dos Avidagos 2011 Avidagos Red (Douro)
varie

In [5]:
# Show the first loaded document's instance attributes as a plain dict.
vars(docs[0])

{'id': None,
 'metadata': {'source': './winemag-data-130k-v2.csv', 'row': 0},
 'page_content': ": 0\ncountry: Italy\ndescription: Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.\ndesignation: Vulkà Bianco\npoints: 87\nprice: \nprovince: Sicily & Sardinia\nregion_1: Etna\nregion_2: \ntaster_name: Kerin O’Keefe\ntaster_twitter_handle: @kerinokeefe\ntitle: Nicosia 2013 Vulkà Bianco  (Etna)\nvariety: White Blend\nwinery: Nicosia",
 'type': 'Document'}

In [6]:
import pinecone
from langchain_community.document_loaders import CSVLoader
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm
import numpy as np
from pinecone import Pinecone
import os
from dotenv import load_dotenv

# Load environment variables (e.g. from a .env file).
load_dotenv()
# Create the Pinecone client.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Connect to the index.
index_name = "wine-embeddings"
index = pc.Index(index_name)

# Load the data with CSVLoader.
loader = CSVLoader(
    "winemag-data-130k-v2.csv",
    encoding="utf-8",
)
docs = loader.load()
print(f"총 문서 개수: {len(docs)}")

# Load the Hugging Face embedding model, on the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)

# Create the embeddings and upload them to Pinecone, one chunk at a time.
# NOTE: batch_size is the number of documents handled per encode()/upsert()
# round trip. It is not forwarded to model.encode(), so the model keeps its own
# internal batch size.
batch_size = 256
for start in tqdm(range(0, len(docs), batch_size)):
    batch_docs = docs[start:start + batch_size]
    texts = [doc.page_content for doc in batch_docs]

    # Ask encode() for a NumPy array directly instead of requesting a tensor
    # and converting it back by hand.
    embeddings = model.encode(texts, show_progress_bar=False, convert_to_numpy=True)

    # One (id, values, metadata) tuple per document. The id is the document's
    # position in docs, so re-running the upload overwrites the same records.
    to_upsert = [
        (str(row_id), vector.tolist(), {"text": text})
        for row_id, (vector, text) in enumerate(zip(embeddings, texts), start=start)
    ]
    index.upsert(vectors=to_upsert)

print("Pinecone 업로드 완료!")


총 문서 개수: 129971
Using device: cuda


100%|██████████| 508/508 [3:40:27<00:00, 26.04s/it]  

Pinecone 업로드 완료!





In [3]:
import torch

# Report whether CUDA can be used and how many devices are visible.
print("GPU 사용 가능:", torch.cuda.is_available())
print("GPU 개수:", torch.cuda.device_count())

# With CUDA present, name device 0 and report the current device index;
# otherwise report that the CPU is in use.
if torch.cuda.is_available():
    print("GPU 이름:", torch.cuda.get_device_name(0))
    print("현재 디바이스:", torch.cuda.current_device())
else:
    print("현재 디바이스:", "CPU")


GPU 사용 가능: True
GPU 개수: 1
GPU 이름: NVIDIA GeForce RTX 4060 Laptop GPU
현재 디바이스: 0
