In [4]:
#!/usr/bin/env python3
"""
Quick check: How many chunks exceed 512 tokens?
This determines if all-minilm is safe or if nomic-embed-text is needed.
"""

import json
from pathlib import Path
import sys

# Add project root to path
# project_"""  """root = Path(__file__).parent.parent.parent
# sys.path.insert(0, str(project_root))

from src.utils.config import get_config

def estimate_tokens(text):
    """
    Rough token estimation (4 chars ‚âà 1 token)
    Good enough for decision-making.
    """
    return len(text) // 4

def check_chunk_lengths():
    config = get_config()
    
    print("=" * 70)
    print("CHUNK TOKEN LENGTH ANALYSIS")
    print("=" * 70)
    
    # Check all chunk files
    chunk_files = [
        ("Books", config.FILE_BOOK_CHUNKS),
        ("Wiki Chronology", config.FILE_WIKI_CHUNKS_CHRONOLOGY),
        ("Wiki Characters", config.FILE_WIKI_CHUNKS_CHARACTER),
        ("Wiki Chapter Summaries", config.FILE_WIKI_CHUNKS_CHAPTER_SUMMARY),
        ("Wiki Concepts", config.FILE_WIKI_CHUNKS_CONCEPT),
    ]
    
    all_lengths = []
    total_chunks = 0
    over_512 = 0
    
    for name, filepath in chunk_files:
        if not filepath.exists():
            print(f"\n‚ö†Ô∏è  {name}: File not found - {filepath}")
            continue
            
        lengths = []
        with open(filepath) as f:
            for line in f:
                chunk = json.loads(line)
                token_estimate = estimate_tokens(chunk['text'])
                lengths.append(token_estimate)
                all_lengths.append(token_estimate)
                total_chunks += 1
                if token_estimate > 512:
                    over_512 += 1
        
        avg = sum(lengths) / len(lengths)
        max_len = max(lengths)
        over_512_here = sum(1 for l in lengths if l > 512)
        
        print(f"\nüìÅ {name}:")
        print(f"   Chunks: {len(lengths)}")
        print(f"   Average: {avg:.0f} tokens")
        print(f"   Max: {max_len} tokens")
        print(f"   Over 512: {over_512_here} ({over_512_here/len(lengths)*100:.1f}%)")
    
    # Overall statistics
    print("\n" + "=" * 70)
    print("OVERALL STATISTICS")
    print("=" * 70)
    
    all_lengths.sort()
    avg = sum(all_lengths) / len(all_lengths)
    median = all_lengths[len(all_lengths) // 2]
    
    print(f"\nTotal chunks: {total_chunks:,}")
    print(f"Average: {avg:.0f} tokens")
    print(f"Median: {median} tokens")
    print(f"Max: {max(all_lengths)} tokens")
    
    print(f"\nüö® CRITICAL FINDING:")
    print(f"   Chunks >512 tokens: {over_512:,} ({over_512/total_chunks*100:.1f}%)")
    
    # Distribution
    print(f"\nüìä DISTRIBUTION:")
    ranges = [
        (0, 256, "0-256 tokens"),
        (256, 512, "256-512 tokens"),
        (512, 1024, "512-1024 tokens ‚ö†Ô∏è"),
        (1024, 2000, "1024-2000 tokens ‚ö†Ô∏è‚ö†Ô∏è"),
    ]
    
    for start, end, label in ranges:
        count = sum(1 for t in all_lengths if start <= t < end)
        pct = count/total_chunks*100
        bar = "‚ñà" * int(pct / 2)
        print(f"   {label:<25} {count:>6,} ({pct:>5.1f}%) {bar}")
    
    # Recommendation
    print("\n" + "=" * 70)
    print("RECOMMENDATION")
    print("=" * 70)
    
    pct_over_512 = (over_512/total_chunks) * 100
    
    if pct_over_512 > 20:
        print("\n‚ùå DO NOT USE all-minilm")
        print(f"   Reason: {pct_over_512:.1f}% of chunks exceed 512 tokens")
        print(f"   Impact: {over_512:,} chunks will be TRUNCATED")
        print(f"   Loss: Significant WoT lore will be lost")
        print("\n‚úÖ USE nomic-embed-text instead")
        print("   - Handles chunks up to 8,192 tokens")
        print("   - No information loss")
        print("   - Worth the extra 11 hours")
        
    elif pct_over_512 > 10:
        print("\n‚ö†Ô∏è CAUTION with all-minilm")
        print(f"   Reason: {pct_over_512:.1f}% of chunks exceed 512 tokens")
        print(f"   Impact: {over_512:,} chunks will be truncated")
        print("\nüí° OPTIONS:")
        print("   A) Use nomic-embed-text (safer, no truncation)")
        print("   B) Re-chunk to keep all under 512 tokens, then use all-minilm")
        print("   C) Accept 10-20% truncation loss, use all-minilm (risky)")
        
    else:
        print("\n‚úÖ all-minilm is SAFE to use")
        print(f"   Reason: Only {pct_over_512:.1f}% exceed 512 tokens")
        print(f"   Impact: Minimal truncation ({over_512:,} chunks)")
        print(f"   Benefit: 11 hours faster than nomic-embed-text")
        print("\n   Go ahead with all-minilm!")
    
    print("\n" + "=" * 70)

if __name__ == "__main__":
    try:
        check_chunk_lengths()
    except Exception as e:
        print(f"\n‚ùå Error: {e}")
        import traceback
        traceback.print_exc()

CHUNK TOKEN LENGTH ANALYSIS

üìÅ Books:
   Chunks: 7374
   Average: 908 tokens
   Max: 1015 tokens
   Over 512: 7058 (95.7%)

üìÅ Wiki Chronology:
   Chunks: 26
   Average: 398 tokens
   Max: 1464 tokens
   Over 512: 5 (19.2%)

üìÅ Wiki Characters:
   Chunks: 10745
   Average: 56 tokens
   Max: 1985 tokens
   Over 512: 64 (0.6%)

üìÅ Wiki Chapter Summaries:
   Chunks: 756
   Average: 742 tokens
   Max: 1989 tokens
   Over 512: 516 (68.3%)

üìÅ Wiki Concepts:
   Chunks: 5872
   Average: 111 tokens
   Max: 1929 tokens
   Over 512: 167 (2.8%)

OVERALL STATISTICS

Total chunks: 24,773
Average: 344 tokens
Median: 89 tokens
Max: 1989 tokens

üö® CRITICAL FINDING:
   Chunks >512 tokens: 7,810 (31.5%)

üìä DISTRIBUTION:
   0-256 tokens              15,840 ( 63.9%) ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
   256-512 tokens             1,122 (  4.5%) ‚ñà‚ñà
   512-1024 tokens ‚ö†Ô∏è         7,606 ( 30.7%) ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚

In [None]:
from src.retrieval.pass_17_uses_this_vector_store import VectorStoreManager
from src.utils.config import get_config

m = VectorStoreManager(get_config())
r = m.query('balefire', 'magic', k=10)

for i in range(min(5, len(r))):
    print(f"{i+1}. {r[i]['text'][:100]}...")
    print()
    

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given


OperationalError: no such column: collections.topic

In [9]:
import requests

response = requests.post(
    "http://localhost:11434/api/embeddings",
    json={
        "model": "nomic-embed-text",  # Replace with actual model name
        "prompt": "test text"
    }
)

print(f"Status: {response.status_code}")
print(f"Response: {response.text}")


Status: 200
Response: {"embedding":[0.49240708351135254,-0.616692841053009,-3.710094451904297,-0.6087756156921387,1.9103617668151855,-0.8515751957893372,0.5173655152320862,-0.1970754712820053,0.280231237411499,-0.5641685724258423,-0.3114444315433502,1.672110915184021,0.8428081274032593,0.4269160032272339,-1.5409306287765503,-1.1429648399353027,1.751835823059082,-1.1847422122955322,-0.9133931398391724,0.4924255907535553,0.6768071055412292,0.2322305142879486,-1.9192509651184082,-0.21293869614601135,1.7830898761749268,-0.1136506050825119,-1.6385924816131592,0.685033917427063,-1.2440330982208252,-0.578854501247406,0.5379506945610046,-0.7773053050041199,-0.09580922871828079,-0.9656651616096497,-0.8145878314971924,-0.22032709419727325,0.062421515583992004,1.1489791870117188,-0.9540960788726807,0.707647442817688,0.9509190320968628,0.1343558430671692,-1.285065770149231,-0.8553984761238098,1.8211205005645752,-0.3725104331970215,-0.7259283661842346,1.6568032503128052,0.5253516435623169,-1.079546

In [3]:
import json
from src.utils.config import get_config

config = get_config()

with open(config.FILE_WIKI_CONCEPT, 'r', encoding='utf-8') as f:
    wiki_concepts = json.load(f)

print(f"Type: {type(wiki_concepts)}")
print(f"Keys: {list(wiki_concepts.keys())[:10]}")
print(f"\nFirst entry:")
first_key = list(wiki_concepts.keys())[0]
print(f"Key: {first_key}")
print(f"Value: {wiki_concepts[first_key]}")

Type: <class 'dict'>
Keys: ['1000_NE.txt', '978_NE.txt', '979_NE.txt', '998_NE.txt', '999_NE.txt', 'Abayan.txt', 'Abila.txt', "Abor'maseleine.txt", 'Abrishi.txt', 'Abunai.txt']

First entry:
Key: 1000_NE.txt
Value: {'filename': '1000_NE.txt', 'page_type': 'CONCEPT', 'page_name': '1000 NE', 'metadata': {'page_id': 703, 'categories': ['Articles_that_need_to_be_wikified', 'Time', 'New_Era_chronology']}, 'sections': [{'level': 2, 'title': 'Categories', 'content': 'Articles_that_need_to_be_wikified, New_Era_chronology, Time', 'subsections': []}, {'level': 2, 'title': 'Overview', 'content': 'Initial import from http://www.stevenac.net/wot/tl1000.htm and copyrighted by Steven Cooper. Used by permission, not GFDL.', 'subsections': []}], 'aliases': []}


In [7]:
import json
from src.utils.config import get_config

config = get_config()

with open(config.FILE_FILENAME_TO_CATEGORIES, 'r', encoding='utf-8') as f:
    category_mappings = json.load(f)

# Get all unique categories
all_categories = set()
for cats in category_mappings.values():
    all_categories.update(cats)

print(f"Total unique categories: {len(all_categories)}")
print("\nAll categories (sorted):")
for cat in sorted(all_categories):
    print(f"  {cat}")

Total unique categories: 650

All categories (sorted):
  A_Crown_of_Swords_chapter_summaries
  A_Memory_of_Light_chapter_summaries
  Accepted
  Administrative_redirects
  Advocates
  Aelfinn_and_Eelfinn
  Aelgar
  Aelgar_(people)
  Aes_Sedai
  Aes_Sedai_(Age_of_Legends)
  Aes_Sedai_(Free_Years)
  Aes_Sedai_after_the_Breaking
  Aes_Sedai_factions
  Aes_Sedai_positions
  Aethan_Dor
  After_the_Breaking
  Age_of_Legends
  Age_of_Legends_(people)
  Ages
  Aiel
  Aiel_(people)
  Aiel_Waste
  Aiel_clans
  Aiel_culture
  Aiel_septs
  Aiel_warrior_societies
  Ajah_Heads
  Ajahs
  Aldeshar_(people)
  Alias_redirects
  Aliases
  All_books
  Almoren
  Almoren_(people)
  Almoth_Plain
  Altara
  Altara_(people)
  Amadicia
  Amadicia_(people)
  Amayar
  Amayar_(people)
  Amyrlin_Seats
  Andor
  Andor_(people)
  Angreal
  Animals
  Antagonists
  Apprentices
  Arad_Doman
  Arad_Doman_(people)
  Arafel
  Arafel_(people)
  Aramaelle
  Aramaelle_(people)
  Aridhol
  Aridhol_(people)
  Articles_that_need_