# Wikipedia Dataset Integration Test

This notebook tests the integration of the Wikipedia dataset from Quantum Dice v2.

Run all cells in order, from a fresh kernel, to verify the integration is working correctly (later cells reuse objects created by earlier ones).

## 1. Import Required Modules

In [None]:
from core.vector_store import VectorStore
from core.embedder import EmbeddingGenerator
from core.analysis_utils import evaluate_retrieval_quality, compute_pairwise_similarities
import numpy as np

# Reaching this line means none of the imports above raised.
print("✓ All modules imported successfully!")

## 2. Load Wikipedia ChromaDB

In [None]:
# Connect to the "wiki_aspects" collection persisted under ./data/wikipedia/chroma_db.
store_options = {
    "collection_name": "wiki_aspects",
    "persist_directory": "./data/wikipedia/chroma_db",
}
vector_store = VectorStore(**store_options)

print("✓ ChromaDB loaded!")
print("  Total chunks: {:,}".format(vector_store.count))

# Report the summary fields exposed by the store's get_statistics().
stats = vector_store.get_statistics()
print("\nDataset Statistics:")
for stat_label, stat_key in (
    ("Collection", "collection_name"),
    ("Unique sources", "unique_sources"),
):
    print(f"  {stat_label}: {stats[stat_key]}")

## 3. Check Chunk Type Distribution

In [None]:
# Look up the chunks of each type by their "chunk_type" metadata value,
# keyed by the label used in the printed report.
chunk_type_labels = [
    ("Gold Base", "gold_base"),
    ("Gold Redundant", "gold_redundant"),
    ("Noise", "noise"),
    ("Prompts", "prompt"),
]
chunk_types = {
    label: vector_store.get_by_metadata({"chunk_type": type_key})
    for label, type_key in chunk_type_labels
}

print("Chunk Distribution:")
total = 0
for name, chunks in chunk_types.items():
    n_chunks = len(chunks)
    total += n_chunks  # running sum across all four categories
    print(f"  {name:20s}: {n_chunks:,}")

print(f"\n  Total: {total:,}")

## 4. Load BGE-Large Embedding Model

In [None]:
# Single source of truth for the embedder settings, so the summary printed
# below always reflects what was actually passed to EmbeddingGenerator
# (previously the model name and device were hard-coded a second time in the
# print statements and went stale as soon as the device was changed).
EMBEDDING_MODEL_NAME = "BAAI/bge-large-en-v1.5"
EMBEDDING_DEVICE = "cpu"  # Change to "cuda" if you have a GPU

print("Loading BGE-large model (this may take a moment)...")

embedder = EmbeddingGenerator(
    model_name=EMBEDDING_MODEL_NAME,
    device=EMBEDDING_DEVICE,
)

print("✓ Model loaded!")
print(f"  Model: {EMBEDDING_MODEL_NAME}")
print(f"  Dimension: {embedder.embedding_dim}")
print(f"  Device: {EMBEDDING_DEVICE}")

## 5. Test Basic Retrieval

In [None]:
# Sample question used for the unfiltered search below.
query = "What were the major achievements of ancient Rome?"
print(f"Query: '{query}'\n")

# Turn the question into a vector with the embedder loaded earlier.
query_embedding = embedder.embed_query(query)
print(f"✓ Query embedded (shape: {query_embedding.shape})\n")

# Plain search with k=5 and no metadata filtering.
results = vector_store.search(query_embedding, k=5)

separator = "=" * 80
print("Top 5 Results:\n" + separator)
for rank, hit in enumerate(results, start=1):
    meta = hit["metadata"]
    kind = meta.get("chunk_type", "unknown")
    aspect_name = meta.get("aspect_name", "N/A")

    # One header line per hit, followed by the first 200 characters of its text.
    print(f"\n[{rank}] Score: {hit['score']:.4f} | Type: {kind} | Aspect: {aspect_name}")
    print(f"    {hit['text'][:200]}...")

## 6. Test Filtered Retrieval

In [None]:
# Search again with the existing query_embedding, this time asking the store
# to exclude entries whose chunk_type is "noise".
banner = "=" * 80
print("Filtered Search (excluding noise):\n" + banner)

noise_exclusion = {"chunk_type": "noise"}
filtered_results = vector_store.search_with_filters(
    query_embedding,
    k=5,
    exclude_metadata=noise_exclusion,
)

for position, item in enumerate(filtered_results, start=1):
    item_meta = item["metadata"]
    item_type = item_meta.get("chunk_type", "unknown")
    item_aspect = item_meta.get("aspect_name", "N/A")
    item_redundancy = item_meta.get("redundancy_index", "N/A")

    # Header, metadata line, then the first 150 characters of the chunk text.
    print(f"\n[{position}] Score: {item['score']:.4f} | Type: {item_type}")
    print(f"    Aspect: {item_aspect} | Redundancy Index: {item_redundancy}")
    print(f"    {item['text'][:150]}...")

## 7. Test Base Chunks Only

In [None]:
# Restrict the search to entries whose chunk_type is "gold_base"
# (i.e. leave out the redundant variants and everything else).
print("Gold Base Chunks Only:\n" + "=" * 80)

gold_base_filter = {"chunk_type": "gold_base"}
base_results = vector_store.search_with_filters(
    query_embedding,
    k=5,
    metadata_filter=gold_base_filter,
)

for position, entry in enumerate(base_results, start=1):
    entry_meta = entry["metadata"]
    entry_aspect = entry_meta.get("aspect_name", "N/A")
    entry_article = entry_meta.get("article_title", "N/A")

    # Score, source article, aspect, then the first 150 characters of the text.
    print(f"\n[{position}] Score: {entry['score']:.4f}")
    print(f"    Article: {entry_article}")
    print(f"    Aspect: {entry_aspect}")
    print(f"    {entry['text'][:150]}...")

## 8. Load Precomputed Similarity Matrix

In [None]:
# Load the precomputed similarity matrix.
# np.load on an .npz archive returns an NpzFile that keeps the underlying file
# open until it is closed, so use it as a context manager. Indexing the key
# reads the array into memory, so it remains usable after the archive closes.
with np.load('./data/wikipedia/similarity/similarity_matrix.npz') as sim_data:
    similarity_matrix = sim_data['similarity_matrix']

print("✓ Similarity matrix loaded!")
print(f"  Shape: {similarity_matrix.shape}")
print(f"  Min similarity: {similarity_matrix.min():.4f}")
print(f"  Max similarity: {similarity_matrix.max():.4f}")
print(f"  Mean similarity: {similarity_matrix.mean():.4f}")
# nbytes is converted to MiB for readability.
print(f"  Size: {similarity_matrix.nbytes / (1024*1024):.1f} MB")

## 9. Test Analysis Utilities

In [None]:
# Synthetic index sets for exercising the metric helper: every fifth index
# below 50 is treated as gold, and the first five of those as the selection.
gold_indices = list(range(0, 50, 5))
selected_indices = gold_indices[:5]

# Only the top-left 50x50 block of the similarity matrix is used for this check.
test_sim_matrix = similarity_matrix[:50, :50]

metrics = evaluate_retrieval_quality(
    selected_indices=selected_indices,
    gold_indices=gold_indices,
    similarity_matrix=test_sim_matrix,
)

print("Retrieval Quality Metrics:")
# (label, key in `metrics`, format spec, suffix) for each reported value.
metric_lines = (
    ("Gold Recall", "gold_recall", ".1f", "%"),
    ("Average Redundancy", "avg_redundancy", ".4f", ""),
    ("Gold Percentage", "gold_percentage", ".1f", "%"),
)
for metric_label, metric_key, spec, suffix in metric_lines:
    print(f"  {metric_label}: {format(metrics[metric_key], spec)}{suffix}")

print("\n✓ Analysis utilities working!")

## 10. Sample Different Queries

In [None]:
# Test multiple queries.
# The loop uses its own names (test_query, test_results) so that it does not
# overwrite `query` / `results` from the basic-retrieval cell: the
# `query_embedding` created there would otherwise no longer correspond to
# `query` if earlier cells are re-run or inspected afterwards.
test_queries = [
    "How did ancient civilizations develop writing systems?",
    "What were the main causes of World War II?",
    "Explain the theory of relativity",
]

for test_query in test_queries:
    print(f"\nQuery: '{test_query}'")
    print("-" * 80)

    # Embed each query and search with noise chunks excluded (k=3).
    query_emb = embedder.embed_query(test_query)
    test_results = vector_store.search_with_filters(
        query_emb,
        k=3,
        exclude_metadata={"chunk_type": "noise"},
    )

    # Score plus the first 120 characters of each hit.
    for rank, hit in enumerate(test_results, start=1):
        print(f"  [{rank}] Score: {hit['score']:.4f}")
        print(f"      {hit['text'][:120]}...\n")

## ✅ Integration Test Complete!

If all cells ran successfully, the Wikipedia dataset integration is working correctly.

### What We Tested:
1. ✓ Module imports
2. ✓ ChromaDB loading (expected: 5,600 chunks — compare with the total printed in Section 2)
3. ✓ Chunk type distribution
4. ✓ BGE-large embedder
5. ✓ Basic retrieval
6. ✓ Filtered retrieval
7. ✓ Gold-base-only retrieval (metadata filter)
8. ✓ Precomputed similarity matrix
9. ✓ Analysis utilities
10. ✓ Multiple query types

### Next Steps:
- Read `WIKIPEDIA_INTEGRATION.md` for detailed usage instructions
- Run `python test_quick_demo.py` for a command-line demo
- Experiment with different retrieval strategies (QUBO, MMR)
- Try different redundancy filtering levels
- Build your own experiments!