In [2]:
from dotenv import load_dotenv
load_dotenv()

import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [7]:
# Build the Pinecone client using the API key read from the PINECONE_API_KEY
# environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the serverless index only when it does not exist yet, so this cell can
# be re-run (e.g. Restart Kernel -> Run All) without failing because the index
# is already there.
# NOTE(review): dimension=1536 has to match the length of the embedding
# vectors upserted into this index — confirm if the embedding model changes.
index_name = "rag"
if index_name not in pc.list_indexes().names():
  pc.create_index(
    name=index_name,
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
  )

In [3]:
import json

# Load the review records from reviews.json. The context manager closes the
# file handle as soon as parsing finishes (the previous bare open() never
# closed it), and the explicit encoding avoids depending on the platform
# default when the reviews contain non-ASCII text.
with open("reviews.json", encoding="utf-8") as reviews_file:
  data = json.load(reviews_file)

# Display the list stored under the "reviews" key as the cell output.
data["reviews"]

[{'professor': 'Dr. Emily Thompson',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Engaging lectures and clear explanations. Challenging assignments, but very rewarding.'},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Brilliant instructor! Makes complex topics easy to understand. Always available for extra help.'},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Biology',
  'stars': 3,
  'review': 'Knowledgeable, but sometimes rushes through material. Lab sessions could be better organized.'},
 {'professor': 'Prof. Robert Davis',
  'subject': 'History',
  'stars': 4,
  'review': 'Passionate about the subject. Assignments encourage critical thinking. Could improve on feedback timeliness.'},
 {'professor': 'Dr. Maria Rodriguez',
  'subject': 'Psychology',
  'stars': 5,
  'review': 'Exceptional teacher! Brings real-world examples into lectures. Fair grading and helpful office hours.'},
 {'professor': 'Prof. James Wilson',
  'subje

In [4]:
# Build one vector record per review: embed the review text with OpenAI and
# keep the remaining review fields as metadata alongside the embedding.
processed_data = []
client = OpenAI()

for review in data["reviews"]:
  # One embeddings request per review; only the review text is embedded.
  response = client.embeddings.create(
    input=review["review"],
    model="text-embedding-3-small",
  )

  embedding = response.data[0].embedding
  processed_data.append({
    "values": embedding,
    # The professor name doubles as the record id.
    # NOTE(review): two reviews for the same professor would share an id —
    # confirm names are unique in reviews.json.
    "id": review["professor"],
    "metadata": {
      "review": review["review"],
      "subject": review["subject"],
      # Fixed key: was misspelled "starts", so the rating was stored under
      # the wrong metadata name.
      "stars": review["stars"]
    }
  })

In [5]:
# Display the first record in processed_data (a dict with "values", "id" and
# "metadata" keys) as the cell output, as a quick sanity check of its structure.
processed_data[0]

{'values': [-0.030247904,
  0.015799128,
  0.011707559,
  -0.0052191135,
  0.025278606,
  -0.0041354555,
  -0.007244643,
  0.004584448,
  0.008034599,
  0.033731814,
  0.05763306,
  0.024981529,
  -0.051205378,
  -0.033191673,
  0.036135443,
  0.0052562486,
  0.0031311307,
  -0.0029015706,
  0.020039236,
  0.058227215,
  0.033353716,
  -0.015042931,
  0.054284185,
  0.0138208615,
  -0.043157276,
  -0.07648399,
  -0.047181327,
  -0.010782568,
  -0.006036077,
  0.007379678,
  0.035271216,
  -0.014408265,
  -0.0048275115,
  -0.010066881,
  -0.028789522,
  0.079454765,
  0.017662615,
  0.020430839,
  0.022550892,
  0.021821702,
  0.016109709,
  0.01752758,
  -0.022591403,
  -0.03230044,
  0.0057153683,
  -0.012402991,
  -0.036027413,
  -0.019688144,
  0.054824326,
  0.028411424,
  -0.025089556,
  0.040186502,
  0.073621236,
  -0.013618308,
  -0.07281102,
  -0.0074471957,
  0.0018364799,
  0.05790313,
  -0.049152844,
  0.0057727583,
  0.024859997,
  -0.026102321,
  0.043832455,
  0.02084944

In [8]:
# Get a handle to the "rag" index and write every prepared record into the
# "ns1" namespace; the upsert response is shown as the cell output.
index = pc.Index("rag")
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [9]:
# Ask the index for its current statistics and display them as the cell
# output, to check the vector counts after the upsert.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}