In [1]:
pip install pymupdf4llm FlagEmbedding

Collecting pymupdf4llm
  Downloading pymupdf4llm-0.2.3-py3-none-any.whl.metadata (4.9 kB)
Collecting FlagEmbedding
  Downloading FlagEmbedding-1.3.5.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.9/163.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pymupdf>=1.26.6 (from pymupdf4llm)
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting ir-datasets (from FlagEmbedding)
  Downloading ir_datasets-0.5.11-py3-none-any.whl.metadata (12 kB)
Collecting inscriptis>=2.2.0 (from ir-datasets->FlagEmbedding)
  Downloading inscriptis-2.7.0-py3-none-any.whl.metadata (27 kB)
Collecting trec-car-tools>=2.5.4 (from ir-datasets->FlagEmbedding)
  Downloading trec_car_tools-2.6-py3-none-any.whl.metadata (640 bytes)
Collecting lz4>=3.1.10 (from ir-datasets->FlagEmbedding)
  Downloading lz4-4.4.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_

In [3]:
import time
import pymupdf4llm
from FlagEmbedding import BGEM3FlagModel

# CONFIGURATION
PDF_PATH = "Summer2025_erbaysal_ml.pdf"  # <--- REPLACE THIS with your file path
CHUNK_SIZE = 500  # Characters (Keep small for observation)
OVERLAP = 50      # Characters to repeat between chunks

def print_separator(title):
    print(f"\n{'='*60}\n {title} \n{'='*60}")

# ==========================================
# STEP 1: Fast Extraction (PDF -> Markdown)
# ==========================================
print_separator("STEP 1: EXTRACTION")
start_time = time.time()

# pymupdf4llm is extremely fast and preserves tables as Markdown
print(f"Processing file: {PDF_PATH}...")
markdown_text = pymupdf4llm.to_markdown(PDF_PATH)

end_time = time.time()
print(f"✅ Extracted {len(markdown_text)} characters in {end_time - start_time:.4f} seconds.")

# OBSERVATION: Show the first 500 characters to see the structure
print("\n--- [Preview of Raw Extracted Text] ---")
print(markdown_text[:500] + "...\n(truncated)")


# ==========================================
# STEP 2: Chunking (Text -> List of Strings)
# ==========================================
print_separator("STEP 2: CHUNKING")

# Simple sliding window chunker
chunks = []
start = 0
while start < len(markdown_text):
    end = start + CHUNK_SIZE
    # Create the chunk
    chunk = markdown_text[start:end]
    chunks.append(chunk)
    # Move the window forward, minus the overlap
    start += (CHUNK_SIZE - OVERLAP)

print(f"✅ Split document into {len(chunks)} chunks.")

# OBSERVATION: Print the first 2 chunks to check overlap and content
print(f"\n--- [Chunk 1 (Length: {len(chunks[0])})] ---")
print(f"'{chunks[0]}'")

if len(chunks) > 1:
    print(f"\n--- [Chunk 2 (Length: {len(chunks[1])})] ---")
    print(f"'{chunks[1]}'")
    print("\n(Notice the overlap at the beginning of Chunk 2!)")


# ==========================================
# STEP 3: Embedding (Chunks -> BGE-M3 Vectors)
# ==========================================
print_separator("STEP 3: BGE-M3 EMBEDDING")

print("Loading BGE-M3 model (this happens only once)...")
# use_fp16=True speeds this up significantly on Mac/GPU
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

print(f"Encoding {len(chunks)} chunks...")
start_time = time.time()

# We request BOTH Dense (Concept) and Sparse (Keyword) vectors
embeddings = model.encode(
    chunks,
    batch_size=12,
    max_length=8192,
    return_dense=True,
    return_sparse=True
)

end_time = time.time()
print(f"✅ Generated embeddings in {end_time - start_time:.4f} seconds.")

# OBSERVATION: Inspect the first chunk's vector
dense_vec = embeddings['dense_vecs'][0]
sparse_vec = embeddings['lexical_weights'][0]

print("\n--- [Analysis of Chunk 1 Vectors] ---")
print(f"1. Dense Vector Shape: {dense_vec.shape}")
print(f"   (This is a list of {len(dense_vec)} numbers representing the 'Concept')")
print(f"   First 5 numbers: {dense_vec[:5]}")

print(f"\n2. Sparse Vector (Keywords): Found {len(sparse_vec)} important tokens.")
print("   (These are the specific words the model flagged as important)")

# Let's see exactly WHICH words BGE-M3 thought were important
# The model returns Token IDs, so we need to map them back to words is tricky
# without the tokenizer object handy, but we can print the IDs and weights.
print("   Top 5 Token IDs and their Weights:")
sorted_sparse = sorted(sparse_vec.items(), key=lambda x: x[1], reverse=True)
for token_id, weight in sorted_sparse[:5]:
    print(f"      Token ID {token_id}: Weight {weight:.4f}")

print_separator("DONE")


 STEP 1: EXTRACTION 
Processing file: Summer2025_erbaysal_ml.pdf...
✅ Extracted 3018 characters in 0.2271 seconds.

--- [Preview of Raw Extracted Text] ---
**ERASMUS+ Blended Intensive Program (BIP)**
_Data Science for Sustainable Finance and Economics_


Dear Selection Committee,


I am writing to express my strong interest in participating in the ERASMUS+ Blended Intensive
Program (BIP) on _Data Science for Sustainable Finance and Economics_ to be held in Berlin in
September 2025. As a Turkish student currently pursuing my Master’s degree in Mathematical
Engineering at Politecnico di Milano, this program represents a unique and meaningful opportu...
(truncated)

 STEP 2: CHUNKING 
✅ Split document into 7 chunks.

--- [Chunk 1 (Length: 500)] ---
'**ERASMUS+ Blended Intensive Program (BIP)**
_Data Science for Sustainable Finance and Economics_


Dear Selection Committee,


I am writing to express my strong interest in participating in the ERASMUS+ Blended Intensive
Program (BIP) on _D

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

.gitattributes: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

colbert_linear.pt:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

bm25.jpg:   0%|          | 0.00/132k [00:00<?, ?B/s]

.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

miracl.jpg:   0%|          | 0.00/576k [00:00<?, ?B/s]

mkqa.jpg:   0%|          | 0.00/608k [00:00<?, ?B/s]

nqa.jpg:   0%|          | 0.00/158k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/127k [00:00<?, ?B/s]

others.webp:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/485k [00:00<?, ?B/s]

Constant_7_attr__value:   0%|          | 0.00/65.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

onnx/model.onnx:   0%|          | 0.00/725k [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

onnx/tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

onnx/model.onnx_data:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

sparse_linear.pt:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Encoding 7 chunks...
✅ Generated embeddings in 21.6882 seconds.

--- [Analysis of Chunk 1 Vectors] ---
1. Dense Vector Shape: (1024,)
   (This is a list of 1024 numbers representing the 'Concept')
   First 5 numbers: [-0.06410479 -0.02905737 -0.0081046   0.01326521 -0.02258414]

2. Sparse Vector (Keywords): Found 76 important tokens.
   (These are the specific words the model flagged as important)
   Top 5 Token IDs and their Weights:
      Token ID 61749: Weight 0.2880
      Token ID 10931: Weight 0.2725
      Token ID 76924: Weight 0.2680
      Token ID 10271: Weight 0.2677
      Token ID 1328: Weight 0.2676

 DONE 


In [8]:
import numpy as np

# ==========================================
# EXTENSION: Word Similarity Test
# ==========================================
print_separator("EXTENSION: SIMILARITY TEST")

# 1. Define 5 words to test against "Chunk 1"
#    (I included 'banana' as a control to see what a non-match looks like)
test_words = ["student", "Torino", "deadline", "project", "banana"]

print(f"Comparing these words against CHUNK 1:\n{test_words}")

# 2. Encode the single words
word_embeddings = model.encode(
    test_words,
    return_dense=True,
    return_sparse=True
)

# Helper function for Cosine Similarity
def cosine_sim(vec_a, vec_b):
    return np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

# 3. Loop through and compare
# We use 'dense_vec' and 'sparse_vec' from the previous step (Chunk 1)
print("\n--- [Results] ---")

for i, word in enumerate(test_words):
    # Get the word's vectors
    w_dense = word_embeddings['dense_vecs'][i]
    w_sparse = word_embeddings['lexical_weights'][i]

    # A. Calculate DENSE Similarity (Conceptual)
    # Range is usually 0.0 to 1.0 (1.0 is identical)
    d_score = cosine_sim(w_dense, dense_vec)

    # B. Check SPARSE Match (Keyword)
    # We check if the Token IDs of the word exist in the Chunk's sparse vector
    # (Note: A word might be split into multiple tokens, we check for any overlap)
    is_keyword_match = False
    match_weight = 0.0

    for token_id in w_sparse:
        if token_id in sparse_vec:
            is_keyword_match = True
            match_weight = sparse_vec[token_id]
            break # Found a match

    # PRINT REPORT
    print(f"\nWord: '{word.upper()}'")
    print(f"   ➤ Dense Score:   {d_score:.4f}  ", end="")

    if d_score > 0.5:
        print("(High conceptual relevance)")
    elif d_score < 0.4:
        print("(Low relevance)")
    else:
        print("(Moderate relevance)")

    if is_keyword_match:
        print(f"   ➤ Keyword Match: ✅ YES (It appears in the text! Weight: {match_weight:.4f})")
    else:
        print(f"   ➤ Keyword Match: ❌ NO  (Exact word not found)")

print_separator("FINISHED")


 EXTENSION: SIMILARITY TEST 
Comparing these words against CHUNK 1:
['student', 'Torino', 'deadline', 'project', 'banana']

--- [Results] ---

Word: 'STUDENT'
   ➤ Dense Score:   0.4625  (Moderate relevance)
   ➤ Keyword Match: ✅ YES (It appears in the text! Weight: 0.2059)

Word: 'TORINO'
   ➤ Dense Score:   0.3648  (Low relevance)
   ➤ Keyword Match: ❌ NO  (Exact word not found)

Word: 'DEADLINE'
   ➤ Dense Score:   0.4779  (Moderate relevance)
   ➤ Keyword Match: ❌ NO  (Exact word not found)

Word: 'PROJECT'
   ➤ Dense Score:   0.4574  (Moderate relevance)
   ➤ Keyword Match: ❌ NO  (Exact word not found)

Word: 'BANANA'
   ➤ Dense Score:   0.3284  (Low relevance)
   ➤ Keyword Match: ❌ NO  (Exact word not found)

 FINISHED 


In [14]:
import numpy as np
import pandas as pd
from FlagEmbedding import BGEM3FlagModel

# ==========================================
# CONFIGURATION
# ==========================================
words = ["queen", "king", "female", "Königin", "banana"]

print(f"Analyzing similarity between: {words}")

# ==========================================
# 1. LOAD MODEL & ENCODE
# ==========================================
# We use use_fp16=True for speed on your Mac
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

embeddings = model.encode(words, return_dense=True, return_sparse=False)
dense_vecs = embeddings['dense_vecs']

# ==========================================
# 2. CALCULATE SIMILARITY MATRIX
# ==========================================
def cosine_similarity_matrix(vectors):
    # Normalize vectors (make length = 1)
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    normalized = vectors / norms

    # Dot product of normalized vectors gives Cosine Similarity
    return np.dot(normalized, normalized.T)

sim_matrix = cosine_similarity_matrix(dense_vecs)

# ==========================================
# 3. DISPLAY RESULTS
# ==========================================
# Create a DataFrame for a nice grid view
df = pd.DataFrame(sim_matrix, index=words, columns=words)

print("\n" + "="*50)
print("      SEMANTIC SIMILARITY MATRIX (0.0 - 1.0)")
print("="*50)
print(df.round(4))

print("\n" + "="*50)
print("      KEY INSIGHTS CHECK")
print("="*50)

# Helper to print specific pairs
def print_pair(w1, w2):
    s = df.loc[w1, w2]
    print(f"'{w1}' vs '{w2}': \t{s:.4f}", end=" ")
    if s > 0.7: print("--> Very High (Synonym/Translation)")
    elif s > 0.6: print("--> High (Related Concept)")
    elif s > 0.4: print("--> Low (Unrelated)")
    else: print("--> Moderate")

print_pair("queen", "Königin")  # Testing Multilinguality
print_pair("queen", "king")     # Testing Related Concepts
print_pair("queen", "female")   # Testing Gender Property
print_pair("queen", "banana")   # Testing Control Group

Analyzing similarity between: ['queen', 'king', 'female', 'Königin', 'banana']


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



      SEMANTIC SIMILARITY MATRIX (0.0 - 1.0)
          queen    king  female  Königin  banana
queen    1.0000  0.7121  0.7065   0.6874  0.5137
king     0.7121  1.0000  0.4769   0.5495  0.4445
female   0.7065  0.4769  1.0000   0.5588  0.5729
Königin  0.6874  0.5495  0.5588   1.0000  0.3823
banana   0.5137  0.4445  0.5729   0.3823  1.0000

      KEY INSIGHTS CHECK
'queen' vs 'Königin': 	0.6874 --> High (Related Concept)
'queen' vs 'king': 	0.7121 --> Very High (Synonym/Translation)
'queen' vs 'female': 	0.7065 --> Very High (Synonym/Translation)
'queen' vs 'banana': 	0.5137 --> Low (Unrelated)
