In [5]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [3]:
# Connect to Pinecone using the API key taken from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index that already exists is an error, so only create "rag"
# when it is missing. This keeps the cell safe under Restart & Run All.
# The index dimension must equal the length of the vectors upserted later.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [4]:
import json

# Load the review dataset. The context manager closes the file handle as soon
# as parsing finishes instead of leaving it open for the garbage collector.
with open("reviews.json") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records stored under the 'reviews' key.
data['reviews']

[{'professor': 'Dr. Jane Smith',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Dr. Smith is an amazing professor. Her lectures are clear and she is always willing to help outside of class.'},
 {'professor': 'Dr. John Doe',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Dr. Doe explains concepts well, but his exams are very challenging.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Biology',
  'stars': 3,
  'review': "The course was informative, but Dr. Johnson's lectures were sometimes difficult to follow."},
 {'professor': 'Dr. Michael Brown',
  'subject': 'Chemistry',
  'stars': 4,
  'review': 'Dr. Brown is a good professor, but he moves through the material very quickly.'},
 {'professor': 'Dr. Sarah Davis',
  'subject': 'Physics',
  'stars': 2,
  'review': "Dr. Davis is knowledgeable, but she doesn't seem very approachable."},
 {'professor': 'Dr. David Wilson',
  'subject': 'Economics',
  'stars': 5,
  'review': "Dr. Wilson's lectures are engaging and he

In [8]:
# Build one Pinecone upsert record per review: the review text is embedded
# and stored under "values", the professor name is the record id, and the
# remaining review fields are kept as metadata.
processed_data = []
client = OpenAI()  # no key passed explicitly; the SDK uses its environment configuration

reviews = data['reviews']

# The embeddings endpoint accepts a list of inputs, so the reviews are sent in
# batches instead of one network round-trip per review. The batch size is kept
# small so each request stays well within the API's per-request limits.
EMBED_BATCH_SIZE = 100

# range() yields nothing for an empty review list, so no API call is made then.
for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[entry['review'] for entry in batch],
        model="text-embedding-3-small"
    )
    # Each returned item carries the position of its input within the batch;
    # pair on that index (and iterate in index order) so every embedding is
    # attached to the review it was computed from.
    for item in sorted(response.data, key=lambda result: result.index):
        review = batch[item.index]
        processed_data.append({
            "values": item.embedding,
            # NOTE(review): ids are professor names, so two reviews of the same
            # professor would overwrite each other on upsert — confirm names
            # are unique in reviews.json.
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"]
            }
        })

In [11]:
# Spot-check the first record built above: a dict with the embedding under
# "values", the professor name under "id", and the review fields under "metadata".
processed_data[0]

{'values': [-0.022473834,
  -0.0033288137,
  -0.019648686,
  0.04279407,
  0.011456909,
  0.0061365957,
  0.011647954,
  -0.002854096,
  -0.0018800561,
  -0.022682248,
  0.019833941,
  -0.014588889,
  -0.014739408,
  -0.006547632,
  0.011485855,
  0.029293561,
  0.00027354166,
  0.01114429,
  -0.002854096,
  0.04615183,
  0.025194779,
  -0.0076649557,
  0.0148899285,
  -0.041196242,
  -0.01656881,
  -0.033670228,
  0.015052028,
  0.039413154,
  0.01213425,
  0.012690017,
  0.0764643,
  -0.005740033,
  -0.013500511,
  -0.011399017,
  -0.032026082,
  0.036518537,
  -0.020621277,
  0.007601274,
  -0.0021897806,
  0.00188874,
  0.013778394,
  0.006043968,
  -0.0071786596,
  0.002942382,
  0.030335626,
  -0.025889488,
  -0.011028505,
  0.01160164,
  0.034689136,
  0.032466065,
  -0.05113058,
  0.015353068,
  0.037190087,
  -0.021987539,
  -0.059930228,
  0.035152275,
  0.020262346,
  0.056595623,
  0.030381938,
  -0.021721235,
  0.030613508,
  0.020598121,
  -0.014785723,
  -0.023921145,
  

In [12]:
# Open a handle to the 'rag' index; later cells reuse `index`.
index = pc.Index('rag')

# Write every prepared record into the "ns1" namespace. The call's return
# value is the cell output.
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [13]:
# Ask the index for its statistics and display them as the cell output, as a
# check on what the preceding upsert stored.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}