In [2]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [3]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag",dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [4]:
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Chris Anderson',
  'subject': 'History',
  'stars': 4,
  'review': 'Engaging lectures with a lot of historical insights.'},
 {'professor': 'Robert Harris',
  'subject': 'Math',
  'stars': 5,
  'review': 'Excellent at explaining complex concepts in simple terms.'},
 {'professor': 'James Brown',
  'subject': 'History',
  'stars': 3,
  'review': 'Good content but sometimes hard to follow.'},
 {'professor': 'Susan Smith',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Very knowledgeable and approachable.'},
 {'professor': 'Charles Taylor',
  'subject': 'Physics',
  'stars': 2,
  'review': 'Struggles to keep the class engaged.'},
 {'professor': 'Linda Thomas',
  'subject': 'Literature',
  'stars': 4,
  'review': 'Passionate about the subject and inspires students.'},
 {'professor': 'Patricia Garcia',
  'subject': 'Physics',
  'stars': 3,
  'review': 'Good explanations but assignments are tough.'},
 {'professor': 'Sarah Thompson',
  'subject': 'Physics',
  'stars': 1,
  '

In [5]:
# genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# # Initialize processed data list
# processed_data = []

# # Create embeddings for each review
# for review in data["reviews"]:
#     response = genai.embed_content(
#         model="models/text-embedding-004",
#         content=review['review'],
#         task_type="retrieval_document"
#     )
#     embedding = response['embedding']
#     processed_data.append(
#         {
#             "values": embedding,
#             "id": review["professor"],
#             "metadata": {
#                 "review": review["review"],
#                 "subject": review["subject"],
#                 "stars": review["stars"],
#             }
#         }
#     )


In [12]:
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'],
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "stars": review["stars"],
            "subject": review["subject"]
        }
    })

KeyboardInterrupt: 

In [14]:
processed_data[0]

{'values': [-0.015199002,
  0.01348227,
  0.019431636,
  0.015628185,
  0.00203307,
  0.031966742,
  -0.010551984,
  0.02159235,
  0.009989607,
  0.03939605,
  0.05682976,
  -0.0047913115,
  -0.024404239,
  -0.016826937,
  0.014074246,
  -0.02894766,
  -0.034956224,
  0.0153469965,
  0.039692037,
  0.03862648,
  0.05611939,
  0.0025510497,
  0.040994387,
  0.031641155,
  -0.0031189772,
  -0.055941798,
  -0.030102015,
  0.012601704,
  0.010611182,
  -0.0061121597,
  0.04546381,
  -0.022791103,
  -0.032173935,
  -0.017611306,
  -0.053129908,
  0.051945955,
  0.04090559,
  0.008820453,
  0.021932736,
  0.03836009,
  0.065472625,
  -0.033476282,
  -0.02181434,
  0.028133692,
  0.034926627,
  -0.03605138,
  -0.024211846,
  -0.021103969,
  0.002340158,
  0.03791611,
  -0.01473282,
  -0.00865026,
  0.037412927,
  -0.026298566,
  -0.008073082,
  -0.01626456,
  -0.03255872,
  0.052419536,
  -0.028266888,
  0.00050688005,
  0.016752942,
  -0.03252912,
  0.024182249,
  -0.0006877104,
  -0.0192244

In [15]:
index= pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 15}

In [16]:
# Query Pinecone to verify the number of vectors
response = index.describe_index_stats(namespace="ns1")
print(response)

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 15}},
 'total_vector_count': 15}


In [17]:
unique_professors = {review['professor'] for review in data['reviews']}
print(unique_professors)

{'Sarah Thompson', 'Chris Anderson', 'Michael Martin', 'Linda Thomas', 'Chris Taylor', 'Chris Jones', 'James Clark', 'Charles Taylor', 'Susan Smith', 'James Brown', 'Patricia Garcia', 'Jane Taylor', 'Emily Martin', 'Alex Martinez', 'Robert Moore', 'Jane Jones', 'Patricia Williams', 'Robert Harris', 'Laura Thomas', 'Sarah Jones'}
