In [2]:
import json
import os

from dotenv import load_dotenv
from huggingface_hub import InferenceClient
from pinecone import Pinecone, ServerlessSpec

In [14]:
# Load variables from a .env file into the process environment so that the
# os.getenv(...) lookups in later cells can find the API credentials.
load_dotenv()

True

In [26]:
# Pinecone client; the API key is read from the environment rather than
# being hard-coded in the notebook.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY"),
)

In [11]:
# Create the serverless index only if it does not exist yet, so that
# re-running this cell (e.g. Restart Kernel -> Run All) does not fail
# because an index with the same name was already created.
if 'rag' not in pc.list_indexes().names():
    pc.create_index(
        name='rag',
        dimension=1024,  # dimension needs to be the same as the embedding output dimension
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )

In [21]:
# Load the review dataset. The context manager closes the file handle as soon
# as parsing finishes instead of leaving it open for the life of the kernel,
# and the explicit encoding keeps the read independent of the platform default.
with open('reviews.json', encoding='utf-8') as reviews_file:
    data = json.load(reviews_file)

# Display the parsed list of review records.
data['reviews']

[{'professor': 'Dr. Sarah Johnson',
  'subject': 'Introduction to Psychology',
  'stars': 5,
  'review': 'Dr. Johnson is an amazing professor! Her lectures are engaging and she genuinely cares about her students.'},
 {'professor': 'Prof. Michael Lee',
  'subject': 'Calculus I',
  'stars': 3,
  'review': 'The material is tough, but Prof. Lee explains it well. The grading is fair, but the tests are difficult.'},
 {'professor': 'Dr. Emily Brown',
  'subject': 'Organic Chemistry',
  'stars': 4,
  'review': 'Dr. Brown is very knowledgeable, and her labs are well-organized. The course is challenging, but she offers a lot of help.'},
 {'professor': 'Prof. John Smith',
  'subject': 'World History',
  'stars': 2,
  'review': "Prof. Smith's lectures are dry, and the readings are overwhelming. The exams are also too detailed."},
 {'professor': 'Dr. Alice Wang',
  'subject': 'Data Structures',
  'stars': 5,
  'review': 'Dr. Wang is the best! Her explanations are clear, and she makes complex topics

In [18]:
# Hugging Face Inference API client bound to the embedding model; the access
# token is read from the environment.
client = InferenceClient(
    model='WhereIsAI/UAE-Large-V1',
    token=os.environ.get("HF_TOKEN"),
)

In [22]:
# Build one record per review in the {'values', 'id', 'metadata'} layout that
# is later passed to the Pinecone upsert. This layout is specific to the
# Hugging Face Inference API + Pinecone pairing and needs to be adapted when
# switching to another embedding model and/or vector database.
processed_data = []
for review in data['reviews']:
    # Embed the free-text review; the professor name serves as the vector id.
    embedding = client.feature_extraction(review['review'])
    processed_data.append(
        dict(
            values=embedding,
            id=review['professor'],
            metadata={field: review[field] for field in ('review', 'subject', 'stars')},
        )
    )

In [23]:
# Peek at the first record to check its structure: embedding values, id, metadata.
processed_data[0]

{'values': array([-0.02360744,  0.70602494, -0.06913628, ...,  0.52790743,
         0.1484503 ,  0.3966465 ], dtype=float32),
 'id': 'Dr. Sarah Johnson',
 'metadata': {'review': 'Dr. Johnson is an amazing professor! Her lectures are engaging and she genuinely cares about her students.',
  'subject': 'Introduction to Psychology',
  'stars': 5}}

In [24]:
# Length of the first embedding; it has to equal the `dimension` the index
# was created with (1024).
len(processed_data[0]['values'])

1024

In [27]:
# Firebase analogy: a Pinecone index plays the role of a collection and a
# namespace plays the role of a document.
index = pc.Index('rag')

# Write every prepared record into the 'ns1' namespace; the response
# (displayed as the cell output) reports how many vectors were upserted.
index.upsert(vectors=processed_data, namespace='ns1')

{'upserted_count': 20}

In [28]:
# Fetch the index statistics to confirm the vectors were stored under the
# expected namespace.
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}