In [8]:
from dotenv import load_dotenv
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
load_dotenv()

model_id = "sentence-transformers/all-MiniLM-L6-v2"
hf_token = os.getenv("HUGGINGFACE_TOKEN")
import requests

api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

In [15]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=384, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [16]:
import json
data = json.load(open("reviews.json"))
data['reviews']


[{'professor': 'Dr. John Smith',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Very knowledgeable, but sometimes hard to understand during lectures.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Excellent teacher, makes complex topics easy to understand.'},
 {'professor': 'Dr. Robert Brown',
  'subject': 'Chemistry',
  'stars': 3,
  'review': 'Good at explaining concepts, but the exams are really tough.'},
 {'professor': 'Dr. Linda Davis',
  'subject': 'Biology',
  'stars': 4,
  'review': 'Engaging lectures, but a lot of homework.'},
 {'professor': 'Dr. Michael Miller',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Great professor, very supportive and approachable.'},
 {'professor': 'Dr. Sarah Wilson',
  'subject': 'English Literature',
  'stars': 2,
  'review': 'Interesting material, but her grading is inconsistent.'},
 {'professor': 'Dr. David Moore',
  'subject': 'Economics',
  'stars': 4,
  'review': 'Good lectures,

In [17]:
processed_data = []

# Function to get embeddings from Hugging Face
def get_hf_embedding(text):
    payload = {"inputs": text}
    response = requests.post(api_url, headers=headers, json=payload)
    response.raise_for_status()  # Check if the request was successful
    
    # Print the raw JSON response to see its structure
    print(f"Raw JSON response: {response.json()}")
    
    # Assuming the response is a list of lists, extract the first list as the embedding vector
    embedding = response.json()
    
    
    return embedding


# Generate embeddings and prepare data for Pinecone
for review in data['reviews']:
    embedding = get_hf_embedding(review['review'])
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })

# Check the first entry in processed_data to verify its structure
print(f"First entry in processed_data: {processed_data[0]}")

Raw JSON response: [0.02574087120592594, -0.05748310685157776, -0.02932148240506649, -0.00642554322257638, -0.108961321413517, -0.01896694116294384, 0.06405319273471832, 0.05570586770772934, -0.021430162712931633, 0.04944321885704994, -0.05305688455700874, 0.08298638463020325, -0.07074841856956482, 0.06358945369720459, -0.07039649039506912, -0.08633160591125488, 0.008692940697073936, -0.05190284177660942, -0.048319824039936066, -0.031435441225767136, 0.024246666580438614, 0.07431907951831818, 0.045748598873615265, -0.005736966151744127, 0.028842616826295853, 0.009164507500827312, -0.04024055600166321, -0.04367050155997276, 0.08465614169836044, -0.06861420720815659, -0.06972469389438629, 0.06082741916179657, 0.050314273685216904, 0.03525381535291672, -0.011487606912851334, 0.06015031039714813, 0.0035057677887380123, 0.05213827267289162, 0.014040510170161724, 0.05475597083568573, -0.037656813859939575, 0.031380247324705124, 0.1009000912308693, -0.05516810715198517, 0.02334323711693287, -

In [18]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [20]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}