In [1]:
from dotenv import load_dotenv
load_dotenv() 
import os
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm, trange


In [2]:

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag" , dimension=384, metric="cosine", spec=ServerlessSpec(cloud="aws",region="us-east-1")
)


In [3]:
import json
data=json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Psychology',
  'stars': 5,
  'review': "Dr. Johnson's lectures are insightful and engaging. She makes complex topics easy to understand."},
 {'professor': 'Prof. Michael Lee',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Prof. Lee is very knowledgeable, but his lectures can be a bit fast-paced at times.'},
 {'professor': 'Dr. Susan Carter',
  'subject': 'Biology',
  'stars': 3,
  'review': "Dr. Carter's lectures are informative, but her grading criteria can be unclear."},
 {'professor': 'Prof. David Thompson',
  'subject': 'History',
  'stars': 5,
  'review': 'Prof. Thompson has a passion for history that makes his classes incredibly engaging.'},
 {'professor': 'Dr. Jessica White',
  'subject': 'Chemistry',
  'stars': 4,
  'review': 'Dr. White is very approachable and always willing to help with difficult concepts.'},
 {'professor': 'Prof. John Davis',
  'subject': 'Physics',
  'stars': 2,
  'review': 'Prof. Davis is knowledge

In [4]:
processed_data = []
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

for review in data['reviews']:
    embedding = model.encode(review['review'])
    processed_data.append({
        "values": embedding.tolist(),
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })



In [5]:
processed_data[0]

{'values': [-0.11260160803794861,
  -0.20677514374256134,
  -0.2015436738729477,
  0.28645315766334534,
  -0.28760042786598206,
  0.39670389890670776,
  -0.28936731815338135,
  0.12330608814954758,
  -0.04254387319087982,
  0.04124155268073082,
  -0.05252132564783096,
  0.18020989000797272,
  -0.05709284916520119,
  0.32333114743232727,
  -0.20155245065689087,
  0.412378191947937,
  0.5116987228393555,
  0.015796691179275513,
  -0.3276572525501251,
  -0.2663814425468445,
  -0.42543479800224304,
  0.2009955793619156,
  0.26713526248931885,
  0.031202001497149467,
  -0.2105790227651596,
  -0.04088139161467552,
  0.12797702848911285,
  -0.2695576250553131,
  0.22025944292545319,
  -0.207169309258461,
  -0.025581711903214455,
  0.3436863422393799,
  0.14719437062740326,
  0.1173117384314537,
  -0.22313442826271057,
  0.04950880631804466,
  0.3103867173194885,
  0.2609418034553528,
  0.34854480624198914,
  0.14668779075145721,
  -0.1375444531440735,
  -0.09912071377038956,
  0.4110374152660

In [6]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1" 
)

{'upserted_count': 20}

In [7]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}