In [6]:
from dotenv import load_dotenv
load_dotenv()
import os
from pinecone import Pinecone, ServerlessSpec
import json
import requests
import uuid

In [7]:
# NOTE: requires .env instead of .env.local (for JS)
# NOTE: pip-chill > requirements.txt to save required packages

# Initialize Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create a Pinecone index
pc.create_index(
    name="rag",
    dimension=768,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

In [3]:
data = json.load(open("reviews.json"))
processed_data = []

# https://huggingface.co/blog/getting-started-with-embeddings
# Create embeddings for each review
# Note: using retry decorator (pip install retry) in query function
model_id = "sentence-transformers/all-mpnet-base-v2"
hf_token = os.getenv("HUGGINGFACE_API_TOKEN")
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{
    model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

def query(texts):
    for review in texts["reviews"]:
        response = requests.post(api_url, headers=headers, json={
                                 "inputs": review["review"], "options": {"wait_for_model": True}})
        embedding = response.json()  # Extract embedding
        processed_data.append(
            {
                "values": embedding,  # The embeddings returned by Hugging Face
                 # Generate a UUID for each review
                "id": str(uuid.uuid4()),
                "metadata": {
                    "professor": review["professor"], # Professor's name
                    "school": review["school"], # The school the professor teaches
                    # The department the professor belongs to
                    "department": review["department"],
                    "class": review["class"],  # The class the review refers to
                    # Review difficulty rating
                    "difficulty": review["difficulty"],
                    "quality": review["quality"],  # Review quality rating
                    # The timestamp of the review
                    "timestamp": review["timestamp"],
                    "review": review["review"],  # The review content
                }
            }
        )
    return processed_data


output = query(data)

# Insert the embeddings into the Pinecone index
index = pc.Index("rag")
upsert_response = index.upsert(
    vectors=processed_data,
    namespace="ns1",
)
print(f"Upserted count: {upsert_response['upserted_count']}")

# Print index statistics
print(index.describe_index_stats())

Upserted count: 14
{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
