In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, WebBaseLoader, YoutubeLoader, DirectoryLoader, TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone
from openai import OpenAI
import numpy as np
import tiktoken
import json
import os
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

USER_AGENT environment variable not set, consider setting it to identify your requests.
  from tqdm.autonotebook import tqdm


In [14]:
# HuggingFace Embeddings
# Use this instead of OpenAI embeddings if you don't have an OpenAI account with credits

text = "This is a test document."

hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
query_result = hf_embeddings.embed_query(text)



In [None]:
data = json.load(open("reviews.json"))
data["restaurants"]

In [9]:
# Free Llama 3.1 API via OpenRouter
# Use this instead of OpenAI if you don't have an OpenAI account with credits

processed_data = []

openrouter_client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=os.getenv('OPENROUTER_API_KEY')
)

In [21]:
# Initialize the model
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load the mock data from reviews.json
with open('reviews.json', 'r') as file:
    data = json.load(file)

# Create an empty list to hold the processed data
processed_data = []

# Iterate through each restaurant and their reviews
for restaurant in data["restaurants"]:
    for review in restaurant["reviews"]:
        # Generate embedding for the review text
        #embedding = model.encode(review['review']).tolist()
        embedding = hf_embeddings.embed_query(review['review'])

        # Append processed data
        processed_data.append(
            {
                "values": embedding,
                "id": restaurant["name"],
                "metadata": {
                    "review": review["review"],
                    "cuisine": restaurant["cuisine"],
                    "category": restaurant["category"],
                    "restaurant_rating": restaurant["rating"],
                    "review_rating": review["rating"]
                }
            }
        )

# processed_data now contains the embeddings and metadata for each review
# You can save it to a file or use it further in your application
with open('processed_data.json', 'w') as outfile:
    json.dump(processed_data, outfile, indent=4)

print("Embeddings created and saved to 'processed_data.json'.")

Embeddings created and saved to 'processed_data.json'.


In [19]:
processed_data

[{'values': [-0.09749305248260498,
   0.042789291590452194,
   0.034945785999298096,
   0.07777624577283859,
   -0.00859276857227087,
   0.007957379333674908,
   0.012005404569208622,
   -0.05793491378426552,
   -0.07833213359117508,
   -0.0741262286901474,
   0.07192082703113556,
   -0.0304417721927166,
   0.022932391613721848,
   -0.04754607751965523,
   0.010347015224397182,
   0.01986876130104065,
   0.11224512755870819,
   -0.08452019095420837,
   -0.04964462295174599,
   -0.01629791222512722,
   0.021738190203905106,
   0.039976853877305984,
   -0.025767160579562187,
   0.010110616683959961,
   -0.011443429626524448,
   0.04720035567879677,
   0.022560585290193558,
   0.03910685330629349,
   0.003205608343705535,
   -0.06181204691529274,
   -0.023724541068077087,
   0.013804611749947071,
   0.008394413627684116,
   0.020885160192847252,
   0.03120485320687294,
   0.04877016320824623,
   0.05014519393444061,
   -0.15370915830135345,
   0.04893499240279198,
   0.016705377027392387,