## Data Ingestion for Deep RAG

In this notebook, I'll load extracted data into Qdrant vector database:

- **Markdown**: Page-level chunks with metadata
- **Tables**: Separate documents with context and page numbers
- **Images**: Text descriptions embedded (generated in notebook 06-01b)
- **Hybrid Search**: Dense (semantic) + Sparse (keyword) embeddings

**Prerequisites:**
- Run notebook 06-01 first to extract PDFs
- Run notebook 06-01b to generate image descriptions
- Qdrant server running on localhost:6333
- Google API key set in .env file

**Output:**
- Single Qdrant collection with all content types
- Rich metadata for filtering (company, year, quarter, doc_type, page)
- Deduplication using file hashes

### 0. Qdrant API Setup

In [24]:
import os
from dotenv import load_dotenv
load_dotenv()

from qdrant_client import QdrantClient

qdrant_client = QdrantClient(
    url="https://1fe44dd3-0e21-40c8-a091-818dea1ecbb7.us-east4-0.gcp.cloud.qdrant.io:6333", 
    api_key=os.getenv("QDRANT_API_KEY"),
)

print(qdrant_client.get_collections())

collections=[CollectionDescription(name='financial_doc'), CollectionDescription(name='financial_docs')]


### 1. Setup and Imports

In [25]:
import hashlib
from pathlib import Path

from langchain_google_genai import GoogleGenerativeAIEmbeddings

from langchain_qdrant import QdrantVectorStore, RetrievalMode, FastEmbedSparse

from langchain_core.documents import Document
from qdrant_client import QdrantClient

### 2. Configuration

In [45]:
# Paths
MARKDOWN_DIR = "data/rag-data/markdown"
TABLES_DIR = "data/rag-data/tables"
IMAGES_DESC_DIR = "data/rag-data/images_desc"

# Qdrant Configuration
COLLECTION_NAME = "financial_docs"
EMBEDDING_MODEL = "models/gemini-embedding-001"

### 3. Initialize Embeddings and Client

In [27]:
# Embeddings
embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL)
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

In [28]:
result = embeddings.embed_query('anything')
result

[-0.011239917,
 -0.003467326,
 -0.01105274,
 -0.061294574,
 0.019453146,
 0.0041164085,
 0.027915798,
 0.030927422,
 0.020724343,
 -0.019478088,
 -0.010210239,
 -0.016495619,
 0.006826603,
 0.014704376,
 0.10754771,
 -0.011110156,
 -0.0062106918,
 -0.00084689894,
 -0.014423388,
 -0.012677392,
 0.010252997,
 -0.0056658206,
 0.009224655,
 0.02877456,
 -0.005810222,
 0.005776986,
 0.021359634,
 -0.010408976,
 0.014186839,
 -0.016026922,
 0.018438524,
 0.021181392,
 0.005438517,
 0.0029037222,
 0.017847326,
 0.02250698,
 -0.006939403,
 -0.006773324,
 0.0044366727,
 0.0111732595,
 -0.013067514,
 -0.007899133,
 0.005993952,
 -0.007714994,
 0.008815338,
 0.008485886,
 0.0244984,
 -0.014944867,
 0.0105981305,
 0.024794662,
 -0.016665125,
 -0.016796125,
 -0.019621175,
 -0.15991208,
 -0.022005564,
 0.024612658,
 0.012544701,
 0.00857908,
 0.017164089,
 0.00421856,
 0.0017847299,
 0.002861315,
 -0.009164036,
 -0.020286093,
 -0.012568524,
 0.001471714,
 -0.0022333832,
 0.024866171,
 -0.0022693733,

In [29]:
result = sparse_embeddings.embed_query('hi hello')
result

SparseVector(indices=[948991206, 613153351], values=[1.0, 1.0])

In [30]:
result = sparse_embeddings.embed_documents(['hi', 'hello'])
result

[SparseVector(indices=[948991206], values=[1.6877434821696136]),
 SparseVector(indices=[613153351], values=[1.6877434821696136])]

### 4. Create or Recreate Collection

In [31]:
# Create vector store at Remote location
vector_store = QdrantVectorStore.from_documents(
    documents=[],
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    url="https://1fe44dd3-0e21-40c8-a091-818dea1ecbb7.us-east4-0.gcp.cloud.qdrant.io:6333", 
    api_key = os.getenv("QDRANT_API_KEY"),
    collection_name = COLLECTION_NAME,
    retrieval_mode=RetrievalMode.HYBRID,
    force_recreate=False
)

In [32]:
vector_store.client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='financial_doc'), CollectionDescription(name='financial_docs')])

### 5. Helper Functions

In [33]:
def extract_metadata_from_filename(filename: str):
    """
    Extract metadata from filename.
    
    Expected format: CompanyName DocType [Quarter] Year.pdf
    Examples:
        - Amazon 10-Q Q1 2024.pdf
        - Microsoft 10-K 2023.pdf
    """

    filename = filename.replace('.pdf', '').replace('.md', '')
    parts = filename.split()

    return {
        'company_name': parts[0],
        'doc_type': parts[1],
        'fiscal_quarter': parts[2] if len(parts)==4 else None,
        'fiscal_year': parts[-1]
    }

extract_metadata_from_filename('apple 10-k 2023.md')

{'company_name': 'apple',
 'doc_type': '10-k',
 'fiscal_quarter': None,
 'fiscal_year': '2023'}

In [34]:
def compute_file_hash(file_path: Path):

    sha256_hash = hashlib.sha256()

    with open(file_path, 'rb') as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)

    return sha256_hash.hexdigest()


In [35]:
compute_file_hash(Path(r'data\rag-data\markdown\amazon\amazon 10-k 2023.md'))

'05f2d434b6eee52a5bbb4155a78068b2eda1eeda86b7af55335beb0634ac0398'

In [36]:
def get_processed_hashes():
    
    processed_hashes = set()
    offset = None

    while True:
        points, offset = vector_store.client.scroll(
                            collection_name=COLLECTION_NAME,
                            limit=10_000,
                            with_payload=True,
                            offset=offset
                        )

        if not points:
            break
        
        processed_hashes.update(point.payload['metadata']['file_hash'] for point in points)

        if offset is None:
            break

    return processed_hashes

In [37]:
processed_hashes = get_processed_hashes()

In [38]:
len(processed_hashes)

0

In [39]:
# extract the page number from the file path
import re

def extract_page_number(file_path: Path):
    pattern = r'page_(\d+)'
    match = re.search(pattern=pattern, string=file_path.stem)
    return int(match.group(1)) if match else None

In [40]:
file_path = Path(r'data\rag-data\images_desc\google\google 10-k 2023\page_28.md')
extract_page_number(file_path)

28

### 6. Ingestion Function

In [41]:
def ingest_file_in_db(file_path, processed_hashes):

    file_hash = compute_file_hash(file_path)
    if file_hash in processed_hashes:
        print(f"Following file has been already uploaded: {file_path}")

    path_str = str(file_path)
    if 'markdown' in path_str:
        content_type = 'text'
        doc_name = file_path.name
    elif 'tables' in path_str:
        content_type = 'tables'
        doc_name = file_path.parent.name
    elif 'images_desc' in path_str:
        content_type = 'image'
        doc_name = file_path.parent.name
    else:
        content_type = 'unknown'
        doc_name = file_path.name

    content = file_path.read_text(encoding='utf-8')

    base_metadata = extract_metadata_from_filename(doc_name)

    base_metadata.update({
        'content_type': content_type,
        'file_hash': file_hash,
        'source_file': doc_name
    })

    if content_type == 'text':
        # write method for ingesting markdown data
        pages = content.split('<!-- page break -->')
        documents = []
        for idx, page in enumerate(pages, start=1):
            metadata = base_metadata.copy()
            metadata.update({'page': idx})
            documents.append(Document(page_content=page, metadata=metadata))

        vector_store.add_documents(documents)

    else:
        # write method to ingest images desc and tables .md data
        page_num = extract_page_number(file_path)
        metadata = base_metadata.copy()
        metadata.update({'page': page_num})
        documents = [Document(page_content=content, metadata=metadata)]

        vector_store.add_documents(documents)


    processed_hashes.add(file_hash)


In [None]:
file_path = Path(r'data\rag-data\markdown\amazon\amazon 10-k 2023.md')
processed_hashes = get_processed_hashes()

ingest_file_in_db(file_path, processed_hashes)

In [None]:
from tqdm import tqdm

base_path = Path('data/rag-data')
all_md_files = list(base_path.rglob("*.md"))

for md_file in tqdm(all_md_files):
    ingest_file_in_db(md_file, processed_hashes)

### 8. Verify Ingestion

In [None]:
collection_info = vector_store.client.get_collection(COLLECTION_NAME)
collection_info



### 9. Test Search

In [None]:
query = "what is the tesla's revenue"
results = vector_store.similarity_search(query)

In [None]:
results

[Document(metadata={'company_name': 'meta', 'doc_type': '10-k', 'fiscal_quarter': None, 'fiscal_year': '2024', 'content_type': 'tables', 'file_hash': '459a6644aa4ab684fc242f5492e2438d9594865aa9a562d702d7ecebb724de03', 'source_file': 'meta 10-k 2024', 'page': 101, '_id': '79cbeed7-6a30-460d-8015-a48aa8ccc840', '_collection_name': 'financial_docs'}, page_content='**Page:** 101\n\n| Total revenue  | $ 164,501                 | $ 134,902                 | $ 116,609                 |\nRevenue disaggregated by geography, based on the addresses of our customers, consists of the following (in millions):\n\n|                              | Year Ended December 31,   | Year Ended December 31,   | Year Ended December 31,   |\n|------------------------------|---------------------------|---------------------------|---------------------------|\n|                              | 2024                      | 2023                      | 2022                      |\n| United States and Canada (1) | $ 63,20