# Add Vector Embeddings to Parsed PDF Blocks

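This notebook adds a vector embedding to each text block of the parsed PDF JSON using sentence-transformers (the `all-MiniLM-L6-v2` model, which produces 384-dimensional vectors). It reads `workspace_with_sections.json` and writes the result to `workspace_with_embeddings.json`; blocks whose text is missing or shorter than 3 characters after stripping whitespace get a `null` embedding.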

In [11]:
import json
from sentence_transformers import SentenceTransformer
from typing import List, Dict
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Initialize the sentence-transformers model.
# The checkpoint name is kept in a constant so it is easy to find and swap.
# all-MiniLM-L6-v2 is a lightweight and fast model.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)
print(f"Model loaded: {model}")

Model loaded: SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


In [13]:
# Load the parsed PDF JSON.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('workspace_with_sections.json', 'r', encoding='utf-8') as f:
    workspace_data = json.load(f)

In [14]:
# Check structure: top-level metadata first, then the first block
# pretty-printed as an example of the per-block schema.
print(f"Document ID: {workspace_data['doc_id']}")
print(f"Number of pages: {workspace_data['num_pages']}")
print(f"Number of blocks: {len(workspace_data['blocks'])}")
print("\nSample block structure:")
# Guard the preview so a document with no blocks does not raise IndexError here.
if workspace_data['blocks']:
    print(json.dumps(workspace_data['blocks'][0], indent=2))
else:
    print("(no blocks)")

Document ID: chess_pdf
Number of pages: 95
Number of blocks: 1943

Sample block structure:
{
  "page_num": 0,
  "block_idx": 0,
  "bbox": [
    33.75,
    37.4498291015625,
    456.353759765625,
    138.7994384765625
  ],
  "text": "The Project Gutenberg eBook of Chess Fundamentals      This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.",
  "font_size": 9.75,
  "font_name": "Menlo-Regular",
  "is_bold": false,
  "char_count": 497,
  "type": "skip",
  "section_path": null
}


In [15]:
def get_embedding(text: str) -> List[float]:
    """Embed a single string with the notebook-level ``model``.

    Newlines in ``text`` are replaced by spaces before encoding. The
    resulting numpy vector is converted to a plain Python list of floats.
    """
    single_line = " ".join(text.split("\n"))
    return model.encode(single_line, convert_to_numpy=True).tolist()

In [16]:
def add_embeddings_to_blocks(
    blocks: List[Dict],
    batch_size: int = 32,
    min_chars: int = 3,
    encoder=None,
) -> List[Dict]:
    """Return shallow copies of ``blocks`` with an ``'embedding'`` key added.

    All qualifying texts are encoded in a single batched ``encode`` call.

    Args:
        blocks: Block dictionaries; the text is read from the ``'text'`` key.
        batch_size: Batch size forwarded to ``encoder.encode``.
        min_chars: Minimum length of the whitespace-stripped text for a block
            to be embedded. Blocks with missing, empty or shorter text get
            ``'embedding': None``.
        encoder: Object exposing ``encode(texts, batch_size=...,
            show_progress_bar=..., convert_to_numpy=...)`` and returning one
            row per text. Defaults to the notebook-level ``model``.

    Returns:
        A new list with one shallow copy per input block, in the same order.
        Each copy has an ``'embedding'`` key holding a list of floats or
        ``None``. The input dictionaries are not modified.
    """
    if encoder is None:
        # Resolve the notebook-level model at call time so existing callers
        # that pass only ``blocks`` keep working unchanged.
        encoder = model

    # For each block: its row position in the batch, or None when skipped.
    batch_positions = []
    texts_to_embed = []
    for block in blocks:
        text = block.get('text')
        if not text or len(text.strip()) < min_chars:
            batch_positions.append(None)
        else:
            batch_positions.append(len(texts_to_embed))
            # Newlines are flattened to spaces before encoding.
            texts_to_embed.append(text.replace("\n", " "))

    # Generate embeddings in batches
    print(f"Generating embeddings for {len(texts_to_embed)} blocks...")
    if texts_to_embed:
        embeddings = encoder.encode(
            texts_to_embed,
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_numpy=True,
        )
    else:
        # Nothing qualified for embedding: skip the encoder call entirely.
        embeddings = []

    # Attach each embedding (or None) to a copy of its block.
    enhanced_blocks = []
    for block, position in zip(blocks, batch_positions):
        enhanced_block = block.copy()
        enhanced_block['embedding'] = (
            None if position is None else embeddings[position].tolist()
        )
        enhanced_blocks.append(enhanced_block)

    return enhanced_blocks

In [17]:
# Embed every block of the loaded workspace in one batched pass
enhanced_blocks = add_embeddings_to_blocks(blocks=workspace_data['blocks'])

Generating embeddings for 1941 blocks...


Batches: 100%|██████████| 61/61 [00:08<00:00,  7.00it/s]


In [18]:
# Build the enhanced workspace: same top-level fields as the input, with the
# 'blocks' entry swapped for the embedded blocks. The loaded dict is left as is.
enhanced_workspace = {**workspace_data, 'blocks': enhanced_blocks}

In [19]:
# Inspect the first block that actually received an embedding
sample_block_with_embedding = next(
    filter(lambda blk: blk['embedding'] is not None, enhanced_blocks)
)
print(f"Sample text: {sample_block_with_embedding['text'][:100]}...")
print(f"Embedding dimensions: {len(sample_block_with_embedding['embedding'])}")
print(f"First 5 values: {sample_block_with_embedding['embedding'][:5]}")

Sample text: The Project Gutenberg eBook of Chess Fundamentals      This ebook is for the use of anyone anywhere ...
Embedding dimensions: 384
First 5 values: [0.02042297087609768, -0.02423093654215336, -0.059918925166130066, -0.10673654824495316, 0.0003949103702325374]


In [20]:
# Save enhanced workspace.
# The encoding is pinned to UTF-8 so the written file does not depend on the
# platform's locale default encoding.
output_path = 'workspace_with_embeddings.json'
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(enhanced_workspace, f, indent=2)

print(f"Enhanced workspace saved to {output_path}")

Enhanced workspace saved to workspace_with_embeddings.json


In [21]:
# Statistics: how many blocks were embedded and how many were skipped
blocks_with_embeddings = len(
    [blk for blk in enhanced_blocks if blk['embedding'] is not None]
)
blocks_without_embeddings = len(enhanced_blocks) - blocks_with_embeddings

print(
    f"\nStatistics:",
    f"Total blocks: {len(enhanced_blocks)}",
    f"Blocks with embeddings: {blocks_with_embeddings}",
    f"Blocks without embeddings: {blocks_without_embeddings}",
    sep="\n",
)


Statistics:
Total blocks: 1943
Blocks with embeddings: 1941
Blocks without embeddings: 2
