In [39]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [40]:
index_name = "rag"
pc = Pinecone()  # no explicit credentials passed; presumably read from the environment loaded by load_dotenv() — confirm

# Descriptions of the indexes that already exist (empty list when the
# response carries no "indexes" entry).
index_exists = pc.list_indexes().get("indexes", [])
existing_names = {entry["name"] for entry in index_exists}

# Create the index only when it is missing, so re-running this cell is harmless.
if index_name not in existing_names:
    # NOTE(review): `dimension` has to equal the length of the vectors upserted
    # later; 1536 presumably matches the embedding model used in this notebook — confirm.
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=serverless_spec,
    )


In [41]:
import json

# Load the review fixtures. The context manager guarantees the file handle is
# closed, and the explicit encoding avoids depending on the platform's default
# locale encoding when decoding the JSON text.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Show the list of review records (each has professor / subject / stars / review).
data['reviews']

[{'professor': 'Dr. Jane Smith',
  'subject': 'Computer Science',
  'stars': 4,
  'review': 'Dr. Smith is very knowledgeable and her lectures are well-organized. The coursework is challenging but fair.'},
 {'professor': 'Dr. John Doe',
  'subject': 'Mathematics',
  'stars': 3,
  'review': 'Dr. Doe knows his stuff, but his lectures can be a bit dry. Office hours are very helpful though.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Physics',
  'stars': 5,
  'review': 'Dr. Johnson is an amazing professor! She makes complex concepts easy to understand and is always willing to help.'},
 {'professor': 'Dr. Michael Brown',
  'subject': 'History',
  'stars': 2,
  'review': "Dr. Brown's lectures are hard to follow, and his grading is harsh. Be prepared to read a lot."},
 {'professor': 'Dr. Susan Lee',
  'subject': 'Biology',
  'stars': 4,
  'review': "Dr. Lee is a great professor who cares about her students. Her exams are tough, but she's fair in her grading."},
 {'professor': 'Dr. Wil

In [42]:
# Build the Pinecone-ready records: one vector per review, embedded with
# OpenAI's "text-embedding-3-small" model.
#
# The embeddings endpoint accepts a list of inputs, so reviews are sent in
# batches instead of issuing one HTTP request per review.
EMBED_BATCH_SIZE = 100  # reviews per embeddings request

processed_data = []
client = OpenAI()

reviews = data['reviews']
for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review['review'] for review in batch],
        model="text-embedding-3-small",
    )
    # Each returned item carries the position of its input; sort on it so the
    # embeddings line up with `batch` regardless of response ordering.
    ordered = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered):
        processed_data.append({
            "values": item.embedding,
            # NOTE(review): the professor name is used as the vector id, so two
            # reviews for the same professor would share an id and the later
            # one would replace the earlier one on upsert — confirm names are unique.
            "id": review['professor'],
            "metadata": {
                "review": review['review'],
                "subject": review["subject"],
                "stars": review["stars"],
            },
        })

In [43]:
# Sanity check: display the first prepared record (its "values" embedding,
# "id" and "metadata") before anything is sent to the index.
processed_data[0]

{'values': [-0.0043026027,
  -0.0003754208,
  -0.0011423199,
  0.021139015,
  0.039234012,
  0.023779765,
  -0.0049497816,
  -0.01428347,
  0.018784454,
  0.0065498427,
  0.014595677,
  -0.030544251,
  -0.030310094,
  0.004582288,
  0.03403056,
  0.030700354,
  -0.01362003,
  -0.01607866,
  0.008624718,
  0.03129875,
  0.034082595,
  -0.030570267,
  0.02638149,
  -0.031246716,
  -0.050187275,
  -0.041757684,
  0.03127273,
  0.011421572,
  0.0014228183,
  0.023727732,
  0.062389363,
  -0.012930573,
  -0.0066994415,
  0.0057433075,
  -0.024937533,
  0.056509465,
  -0.021204058,
  0.0536996,
  0.024091972,
  -0.004562775,
  -0.003061905,
  -0.0019138939,
  -0.05853881,
  -0.033978526,
  -0.00425382,
  -0.033978526,
  0.018381186,
  -0.0035318416,
  0.021828473,
  0.029997889,
  -0.051201947,
  0.04709122,
  0.045660272,
  -0.06618788,
  -0.06670823,
  0.015050978,
  0.0071482393,
  0.06400243,
  0.00097239466,
  -0.022947215,
  0.06462685,
  -0.014543642,
  -0.014192409,
  -0.031428836,
 

In [44]:
# Open a handle to the index by the same `index_name` that the setup cell
# checked/created, rather than repeating the literal, so both cells always
# target the same index.
index = pc.Index(index_name)

# Write every prepared record into namespace "ns1"; the call's return value
# (the upsert summary) is the cell output.
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [None]:
# Ask the index for its statistics as a post-upsert check.
# NOTE(review): the exact fields reported are defined by the Pinecone client;
# presumably they include per-namespace vector counts for "ns1" — confirm.
index.describe_index_stats()