In [7]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [5]:
# Connect to Pinecone using the API key read from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index whose name is already taken is rejected by the service, so
# only create "rag" when it is missing. This keeps the cell safe to re-run
# (e.g. Restart Kernel -> Run All).
if "rag" not in pc.list_indexes().names():
  pc.create_index(
    name="rag",
    dimension=1536,  # must equal the length of the embedding vectors upserted into this index
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
  )


In [6]:
import json

# Read the reviews file through a context manager so the handle is closed as
# soon as parsing finishes (a bare open() inside json.load() leaves it open).
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
  data = json.load(reviews_file)

# Display the list of review records stored under the "reviews" key.
data["reviews"]

[{'professor': 'Dr. John Smith',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Great lectures, but the exams are quite challenging.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Physics',
  'stars': 5,
  'review': 'Explains concepts very clearly. Highly recommend!'},
 {'professor': 'Dr. William Brown',
  'subject': 'Computer Science',
  'stars': 3,
  'review': 'Interesting assignments, but sometimes hard to follow in class.'},
 {'professor': 'Dr. Olivia Davis',
  'subject': 'Chemistry',
  'stars': 2,
  'review': "Material is tough, and the professor isn't very approachable."},
 {'professor': 'Dr. Michael Wilson',
  'subject': 'Biology',
  'stars': 5,
  'review': 'Passionate about the subject and always willing to help.'},
 {'professor': 'Dr. Sophia Taylor',
  'subject': 'History',
  'stars': 4,
  'review': 'Engaging lectures, but grading is strict.'},
 {'professor': 'Dr. James Lee',
  'subject': 'English',
  'stars': 3,
  'review': 'Interesting discussions, but the work

In [8]:
client = OpenAI()
reviews = data["reviews"]

# Embed every review text in a single request instead of one API call per
# review. The call is skipped for an empty list, since there is nothing to embed.
# NOTE(review): a single embeddings request has input-size limits; split the
# list into chunks if reviews.json grows large.
if reviews:
  response = client.embeddings.create(
    input=[review["review"] for review in reviews],
    model="text-embedding-3-small"
  )
  # Each returned item carries the index of the input it belongs to; order by
  # it so the embeddings line up with `reviews`.
  embeddings = [
    item.embedding
    for item in sorted(response.data, key=lambda item: item.index)
  ]
else:
  embeddings = []

# Build one upsert record per review: the professor name is the vector id, the
# embedding is the vector values, and the remaining fields become metadata.
processed_data = []
for review, embedding in zip(reviews, embeddings):
  processed_data.append({
    "values": embedding,
    "id": review["professor"],
    "metadata": {
      "review": review["review"],
      "subject": review["subject"],
      "stars": review["stars"]
    }
  })
      

In [9]:
# Spot-check the first prepared record (keys: "values", "id", "metadata").
# NOTE(review): this displays the entire embedding list, so the output is very long.
processed_data[0]

{'values': [-0.02260770834982395,
  -0.00678965263068676,
  0.00611569220200181,
  -0.04171882942318916,
  -0.007600407116115093,
  0.022928006947040558,
  0.0012219706550240517,
  0.00020633787789847702,
  0.018190262839198112,
  -0.0024022357538342476,
  -0.00827436801046133,
  -0.023168230429291725,
  -0.03507264330983162,
  -0.04139852896332741,
  -0.02363533154129982,
  -0.003363130148500204,
  -0.018950970843434334,
  -0.027812551707029343,
  -0.008621357381343842,
  0.048578545451164246,
  0.046843599528074265,
  -0.006859717890620232,
  0.05514465644955635,
  0.026558050885796547,
  -0.03608692064881325,
  -0.049405984580516815,
  0.003666746197268367,
  0.020565807819366455,
  0.03784855827689171,
  0.03563316538929939,
  0.05242212489247322,
  -0.015240851789712906,
  0.0008070010808296502,
  -0.022380830720067024,
  -0.05952206626534462,
  0.08082188665866852,
  0.004197239875793457,
  0.05674614757299423,
  0.03595346212387085,
  -0.001711593009531498,
  0.03480572625994682

In [10]:
# Open a handle to the "rag" index, then write every prepared record into the
# "ns1" namespace. The upsert call is kept as the last expression so that its
# response is what the cell displays.
index = pc.Index("rag")
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [11]:
# Display the index statistics (dimension, fullness, per-namespace vector
# counts) to confirm the upserted vectors are present in the index.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}