In [15]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec



In [5]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name='rag', dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [6]:
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Smith',
  'subject': 'Computer Science',
  'stars': 4,
  'review': 'Dr. Smith is very knowledgeable and explains concepts clearly.'},
 {'professor': 'Prof. Johnson',
  'subject': 'History',
  'stars': 5,
  'review': 'Prof. Johnson makes history come alive with engaging lectures.'},
 {'professor': 'Dr. Lee',
  'subject': 'Mathematics',
  'stars': 3,
  'review': 'Dr. Lee is helpful during office hours but lectures can be a bit fast-paced.'},
 {'professor': 'Dr. Brown',
  'subject': 'Physics',
  'stars': 2,
  'review': 'Dr. Brown is brilliant but often difficult to follow.'},
 {'professor': 'Prof. Davis',
  'subject': 'Philosophy',
  'stars': 5,
  'review': 'Prof. Davis challenges students to think deeply and critically.'},
 {'professor': 'Dr. Wilson',
  'subject': 'Chemistry',
  'stars': 3,
  'review': "Dr. Wilson's labs are well-organized, but lectures are a bit dry."},
 {'professor': 'Prof. Miller',
  'subject': 'Economics',
  'stars': 4,
  'review': 'Prof. Miller e

In [16]:
processed_data = []
client = OpenAI()
for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'],
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })

In [17]:
processed_data[0]

{'values': [-0.030663276,
  -0.0019848996,
  -0.04040982,
  0.009465918,
  0.018712811,
  0.0053729177,
  0.0040793107,
  0.028582554,
  0.007282528,
  -0.022093985,
  0.010259878,
  -0.023941996,
  -0.024845468,
  -0.0031484615,
  0.048130393,
  0.003867132,
  -0.02822664,
  -0.011505573,
  0.017918851,
  0.036248375,
  0.046022292,
  -0.00089919375,
  0.015755996,
  -0.012025754,
  -0.05119672,
  -0.031101324,
  0.037808914,
  0.04498193,
  -0.0041306443,
  -0.014332344,
  0.08196951,
  -0.01049259,
  0.0006314034,
  -0.0021645671,
  -0.043886814,
  0.021984475,
  -0.022408832,
  0.011122282,
  -0.008521379,
  -0.0044899797,
  0.01583813,
  0.035673436,
  -0.05924583,
  -0.021888651,
  0.03827434,
  -0.019123482,
  -0.028062373,
  -0.031758394,
  0.0070224376,
  0.033976004,
  -0.03457832,
  0.008466624,
  0.020615578,
  -0.04763759,
  -0.049526665,
  0.035208013,
  0.04301072,
  0.04087524,
  0.018000986,
  -0.028719444,
  0.08027208,
  -0.008569291,
  -0.016111908,
  -0.008863604,


In [18]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [19]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}