<a href="https://colab.research.google.com/github/yunchengyang515/hybrid-toolbox-data-analysis/blob/main/hybrid_toolbox_research_document_loader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG project that loads research PDF documents into a vector database
Tech Stack:
- Supabase
- Hugging Face sentence-transformers embeddings (the OpenAI embedding import is currently unused)
- LlamaIndex

### 1️⃣ Install Required Libraries

In [6]:
!pip install llama-index llama-index-readers-smart-pdf-loader llama-index-vector-stores-supabase llmsherpa supabase
!pip install -q llama-index-llms-huggingface
!pip install -q llama-index-embeddings-langchain



In [7]:
!pip install -q transformers einops accelerate langchain bitsandbytes sentence_transformers langchain-community

### 2️⃣ Import Libraries

In [8]:
from google.colab import drive, userdata
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.supabase import SupabaseVectorStore
from llama_index.core import StorageContext

In [9]:
import os, logging, sys

# Route INFO-level logs to stdout so they show up in the cell output.
# force=True replaces any handlers already attached to the root logger; without
# it basicConfig() silently does nothing when the root logger is already
# configured (NOTE(review): hosted notebook kernels commonly pre-install one).
# A single basicConfig() call also keeps this cell idempotent: the previous
# extra addHandler(StreamHandler(stdout)) printed every record twice and
# stacked one more duplicate handler each time the cell was re-run.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

### 3️⃣ Retrieve Secrets & Mount Google Drive

In [10]:
import os
from huggingface_hub import login
# Supabase Postgres connection string, read from the Colab secrets panel.
SUPABASE_CONNECTION = userdata.get('SUPABASE_CONNECTION')

# Mount Google Drive under /content/drive (the research PDFs are read from there).
drive.mount('/content/drive')

# Expose the Hugging Face key through the process environment, then use that
# same value to log in to the Hugging Face Hub.
os.environ["HF_KEY"] = userdata.get('HF_KEY')
login(
    token=os.environ["HF_KEY"],
    add_to_git_credential=True,
)

Mounted at /content/drive


In [11]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Hugging Face Hub id of the sentence-transformers checkpoint used for embeddings.
EMBEDDING_MODEL_NAME = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"

# LangChain embedding wrapper around that checkpoint; instantiating it
# downloads the model files when they are not cached yet.
embed_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
)

  embed_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [39]:
# Vector size the Supabase collection is created with. It has to match the
# output size of the embedding model configured above (384 for the MiniLM-L6
# checkpoint) — update it whenever the embedding model changes.
EMBEDDING_DIMENSION = 384

# Supabase (pgvector) collection that will hold the document embeddings.
vector_store = SupabaseVectorStore(
    # Reuse the connection string already read from the Colab secrets instead
    # of fetching the secret a second time.
    postgres_connection_string=SUPABASE_CONNECTION,
    collection_name="documents",
    dimension=EMBEDDING_DIMENSION,
)

# Storage context that makes LlamaIndex write vectors to the Supabase store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
print(storage_context)

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x788ddc63e890>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x788ddf0f71d0>, vector_stores={'default': SupabaseVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=False), 'image': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={}))}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x788ddf21db90>, property_graph_store=None)


In [40]:
# Google Drive folder that holds the research PDFs (Drive must be mounted first).
input_dir = '/content/drive/MyDrive/hybrid_toolbox/research_documents'

# Walk the folder recursively and pick up PDF files only.
reader = SimpleDirectoryReader(
    input_dir=input_dir,
    required_exts=[".pdf"],
    recursive=True,
)
documents = reader.load_data()

In [41]:
# Display the first loaded Document to sanity-check its metadata and extracted text.
documents[0]

Document(id_='259ebc24-fee8-4b36-9d2e-d68e356d2f37', embedding=None, metadata={'page_label': '1', 'file_name': 'Elicit - Injury Prevention in Amateur Marathon Runners - Report.pdf', 'file_path': '/content/drive/MyDrive/hybrid_toolbox/research_documents/Elicit - Injury Prevention in Amateur Marathon Runners - Report.pdf', 'file_type': 'application/pdf', 'file_size': 42892, 'creation_date': '2025-03-10', 'last_modified_date': '2025-03-08'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Injury Prevention in Amateur Marathon Runners\nHip-focused strength training before running sessions was the most effective injury prevention strategy for\n

In [42]:
def process_and_store_documents(documents):
    """Embed the given documents and store them through the notebook's storage context.

    Relies on the notebook-level ``storage_context`` and ``embed_model``
    objects defined in earlier cells.

    Args:
        documents: Documents to index, e.g. the list returned by
            ``SimpleDirectoryReader.load_data()``.

    Returns:
        The ``VectorStoreIndex`` built from ``documents``, or ``None`` when
        ``documents`` is empty or ``None`` (nothing is indexed in that case).
    """
    # Guard clause: nothing to index, so skip the embedding/storage work entirely.
    if not documents:
        return None

    return VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        embed_model=embed_model,
    )

In [43]:
# Run the ingestion: embed the loaded documents and store them via the storage context.
# `index` is None when `documents` is empty.
index = process_and_store_documents(documents)