In [45]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
import json
import os
from dotenv import load_dotenv
import os

# Populate the process environment (python-dotenv reads a .env file if it finds one).
load_dotenv()

# Pinecone client, authenticated with the key taken from the environment.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY"),
)

In [47]:
# Create the serverless index only when it does not exist yet, so this cell is
# safe to re-run (e.g. Restart & Run All) instead of failing on an existing index.
# NOTE: the name is spelled "restuarant-rag" (sic). It is kept unchanged because
# the upsert cell opens the index under this exact name.
index_name = "restuarant-rag"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors upserted into this index;
        # 384 is expected to match the embedding model used in this notebook
        # (verify if the model is changed).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [None]:
# HuggingFace embedding model, used in place of OpenAI embeddings
# when no OpenAI account with credits is available.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
hf_embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
)

In [None]:
# Load the raw restaurant/review dataset.
# The context manager guarantees the file handle is closed, and the explicit
# UTF-8 encoding avoids depending on the platform's default text encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Show the loaded restaurant records as the cell output.
data["restaurants"]

In [50]:
processed_data = []

# Build one record (id, embedding values, metadata) per restaurant.
for restaurant in data["restaurants"]:
    # Text to embed: the descriptive fields, then every review, space-separated.
    header_text = " ".join(
        str(restaurant[field])
        for field in ("description", "cuisine", "category", "rating", "priceRange")
    )
    reviews_text = " ".join(entry["review"] for entry in restaurant["reviews"])
    combined_text = f"{header_text} {reviews_text}"

    # One embedding vector for the whole restaurant (not one per review).
    embedding = hf_embeddings.embed_query(combined_text)

    # Fields stored alongside the vector; the joined review text is reused here.
    metadata = {
        "description": restaurant["description"],
        "cuisine": restaurant["cuisine"],
        "category": restaurant["category"],
        "restaurant_rating": restaurant["rating"],
        "price_range": restaurant["priceRange"],
        "location": restaurant["location"],
        "reviews": reviews_text,
    }

    # The restaurant name doubles as the record id.
    processed_data.append(
        {
            "values": embedding,
            "id": restaurant["name"],
            "metadata": metadata,
        }
    )


In [None]:
# processed_data holds one record per restaurant (embedding plus metadata).
# Persist it as pretty-printed JSON so it can be inspected or reused later.
output_path = 'processed_data.json'

with open(output_path, 'w') as json_file:
    json.dump(processed_data, json_file, indent=4)

print("Embeddings created and saved to 'processed_data.json'.")

In [None]:
# Sanity check: display the first processed record (its embedding values, id and metadata).
processed_data[0]

In [59]:
# Insert the embeddings into the Pinecone index.
# batch_size makes the client split the upload into several requests instead of
# sending every vector at once, which keeps each request within Pinecone's
# per-request limits as the dataset grows.
index = pc.Index("restuarant-rag")
upsert_response = index.upsert(
    vectors=processed_data,
    namespace="ns1",
    batch_size=100,
)

In [None]:
# Display the index statistics as the cell output, to check the state of the index after the upsert.
index.describe_index_stats()