# FinSearch-AI: RAG Pipeline Experimentation
## Example notebook using the new data science-centric structure

This notebook demonstrates how to use the simplified structure for RAG experimentation.

In [None]:
# Setup and imports
import json
import sys
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.manifold import TSNE
from tqdm import tqdm

# Add src to path (must run before the finsearch imports below)
sys.path.insert(0, str(Path.cwd() / 'src'))

# Import our modules
from finsearch.config import settings
from finsearch.data import loader, normalizer
from finsearch.features import chunker, embedder
from finsearch.models import retriever, reranker, generator
from finsearch.evaluation import metrics, visualize

%matplotlib inline

## 1. Load and Explore Data

In [None]:
# Read the experiment configuration from the default YAML file
config = settings.load_config('configs/default.yaml')

# Load the documents for ticker 'AAPL' through the project's DataLoader,
# rooted at the raw-data path given in the configuration
data_loader = loader.DataLoader(config.data.raw_path)
documents = data_loader.load_company_documents('AAPL')

# Report how many documents came back and which doc types occur among them
print(f"Loaded {len(documents)} documents for AAPL")
doc_types = {doc.metadata['doc_type'] for doc in documents}
print(f"Document types: {doc_types}")

In [None]:
# Build a metadata table (one row per document) and derive the year from 'date'
df = pd.DataFrame([doc.metadata for doc in documents]).assign(
    year=lambda meta: pd.to_datetime(meta['date']).dt.year
)

# Two side-by-side panels: documents per type (bars) and per year (line)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

type_counts = df['doc_type'].value_counts()
type_counts.plot(kind='bar', ax=ax1)
ax1.set_title('Documents by Type')

docs_per_year = df.groupby('year')['doc_type'].count()
docs_per_year.plot(ax=ax2)
ax2.set_title('Documents by Year')

plt.tight_layout()
plt.show()

## 2. Test Different Chunking Strategies

In [None]:
# Chunk one sample document with each strategy and collect summary statistics
strategies = ['semantic', 'window', 'sentence']
sample_doc = documents[0]  # the first loaded document serves as the example
chunk_results = {}

for strategy in strategies:
    # Same size/overlap settings for every strategy; only the strategy varies
    chunker_instance = chunker.DocumentChunker(
        strategy=strategy,
        chunk_size=config.data.chunk_size,
        overlap=config.data.chunk_overlap,
    )
    chunks = chunker_instance.chunk(sample_doc.content)

    chunk_lengths = [len(piece.text) for piece in chunks]
    chunk_results[strategy] = dict(
        count=len(chunks),
        avg_size=np.mean(chunk_lengths),
        chunks=chunks,  # kept so later cells can reuse the chunks of a strategy
    )

# One row per strategy; only the scalar statistics are printed
chunk_df = pd.DataFrame(chunk_results).transpose()
print("Chunking Strategy Comparison:")
print(chunk_df[['count', 'avg_size']])

## 3. Embedding Generation and Analysis

In [None]:
# Build the embedder from the model name given in the configuration
embedding_model = embedder.Embedder(model_name=config.embeddings.model)

# Work with the chunks of the strategy picked from the comparison above
best_strategy = 'semantic'
chunks = chunk_results[best_strategy]['chunks']

# Only the first 100 chunk texts are embedded for this demo
demo_texts = [chunk.text for chunk in chunks[:100]]
embeddings = embedding_model.embed_batch(
    demo_texts,
    batch_size=config.embeddings.batch_size,
)

print(f"Generated {len(embeddings)} embeddings")
print(f"Embedding dimension: {embeddings[0].shape}")

In [None]:
# Visualize embedding space (t-SNE)
# Stack the embeddings into one 2-D array so the sample count is explicit.
embedding_matrix = np.asarray(embeddings)
n_points = embedding_matrix.shape[0]

if n_points < 3:
    # t-SNE needs perplexity < n_samples; with this few points there is
    # nothing useful to project, so skip instead of raising.
    print(f"Skipping t-SNE: only {n_points} embedding(s) available")
else:
    # scikit-learn rejects perplexity >= n_samples and its default is 30,
    # so cap it for documents that produce only a handful of chunks.
    tsne = TSNE(
        n_components=2,
        perplexity=min(30, n_points - 1),
        random_state=42,  # fixed seed so the layout is reproducible
    )
    embeddings_2d = tsne.fit_transform(embedding_matrix)

    plt.figure(figsize=(10, 8))
    plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.5)
    plt.title('Document Chunk Embeddings (t-SNE)')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.show()

## 4. Retrieval Experiments

In [None]:
# Setup hybrid retriever using the dense/sparse weighting from the configuration
hybrid_retriever = retriever.HybridRetriever(
    embedder=embedding_model,
    use_hybrid=config.retrieval.use_hybrid,
    dense_weight=config.retrieval.dense_weight,
    sparse_weight=config.retrieval.sparse_weight
)

# Build the index over the chunks of the selected chunking strategy
hybrid_retriever.build_index(chunks)

# Test queries
test_queries = [
    "What was Apple's revenue growth in the last quarter?",
    "Describe the company's AI initiatives",
    "What are the main risk factors?"
]

# Retrieve the top-k chunks per query; results are kept keyed by query so the
# reranking step can consume them.
retrieval_results = {}
for query in test_queries:
    query_hits = hybrid_retriever.retrieve(query, k=config.retrieval.top_k)
    retrieval_results[query] = query_hits
    print(f"\nQuery: {query}")
    print(f"Retrieved {len(query_hits)} documents")
    if len(query_hits) > 0:
        print(f"Top result: {query_hits[0].text[:200]}...")
    else:
        # An empty hit list would otherwise raise IndexError and abort the
        # remaining queries.
        print("Top result: <none>")

## 5. Reranking Experiments

In [None]:
# Initialize reranker
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
reranker_model = reranker.Reranker(
    model_name=config.reranking.model,
    device=device,
)

# Rerank the retrieved candidates of every query and keep the reranked lists
# keyed by query.
reranked_results = {}
for query, initial_results in retrieval_results.items():
    reranked = reranker_model.rerank(
        query=query,
        documents=initial_results,
        top_k=config.reranking.top_k
    )
    reranked_results[query] = reranked

    print(f"\nQuery: {query}")
    if len(reranked) > 0:
        print(f"Top reranked result: {reranked[0].text[:200]}...")
    else:
        # Nothing retrieved/reranked for this query: report it rather than
        # failing with IndexError.
        print("Top reranked result: <none>")

## 6. Evaluation

In [None]:
# Read the benchmark file as line-delimited JSON (one record per line)
eval_dataset = pd.read_json('data/processed/benchmark.json', lines=True)

# Wire the retriever and reranker into the project's evaluator
evaluator = metrics.RAGEvaluator(
    retriever=hybrid_retriever,
    reranker=reranker_model,
    metrics=config.evaluation.metrics,
)

# Evaluate every benchmark query at each configured cutoff k
eval_queries = eval_dataset['query'].tolist()
eval_relevant = eval_dataset['relevant_docs'].tolist()
results = evaluator.evaluate(
    queries=eval_queries,
    relevant_docs=eval_relevant,
    k_values=config.evaluation.k_values,
)

# Tabulate the results and print the mean of each metric per k
results_df = pd.DataFrame(results)
metric_columns = ['precision', 'recall', 'mrr', 'ndcg']
print("\nEvaluation Results:")
mean_by_k = results_df.groupby('k')[metric_columns].mean()
print(mean_by_k)

In [None]:
# One panel per metric: mean score as a function of the cutoff k
metrics_to_plot = ['precision', 'recall', 'mrr', 'ndcg']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

for panel_idx, metric in enumerate(metrics_to_plot):
    ax = axes.flat[panel_idx]
    mean_per_k = results_df.groupby('k')[metric].mean()
    mean_per_k.plot(ax=ax, marker='o')
    ax.set(
        title=f'{metric.upper()} @ K',
        xlabel='K',
        ylabel=metric.capitalize(),
    )
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Generation with RAG

In [None]:
# Build the answer generator from the configured model name and temperature
generation_cfg = config.generation
llm = generator.LLMGenerator(
    model_name=generation_cfg.model,
    temperature=generation_cfg.temperature,
)

# Full RAG pipeline
def rag_pipeline(query: str) -> str:
    """Answer ``query`` by chaining retrieval, reranking and generation.

    Relies on the notebook-level ``hybrid_retriever``, ``reranker_model``,
    ``llm`` and ``config`` objects, so the cells defining them must have run.

    Args:
        query: The question to answer.

    Returns:
        Whatever ``llm.generate`` returns for the query and assembled context.
    """
    # Stage 1: candidate retrieval, limited to the configured top-k
    candidates = hybrid_retriever.retrieve(query, k=config.retrieval.top_k)

    # Stage 2: rerank the candidates and keep the configured number of them
    top_docs = reranker_model.rerank(query, candidates, top_k=config.reranking.top_k)

    # Stage 3: the context is the reranked texts separated by blank lines
    context = "\n\n".join(doc.text for doc in top_docs)
    return llm.generate(
        query=query,
        context=context,
        max_tokens=config.generation.max_tokens,
    )

# Smoke-test the full pipeline on one question and show question and answer
test_query = "What are Apple's main revenue streams and their growth rates?"
response = rag_pipeline(query=test_query)

print(
    f"Query: {test_query}\n",
    f"Response:\n{response}",
    sep="\n",
)

## 8. Experiment Tracking

In [None]:
# Log experiment results
SAVE_RESULTS = False  # set to True to also write the record under experiments/results/

# Take the timestamp once so the record and its file name always agree
run_time = datetime.now()

# Everything needed to identify this run: the component choices plus the
# per-k mean of each plotted metric.
experiment_results = {
    'timestamp': run_time.isoformat(),
    'config': {
        'chunking': best_strategy,
        'embedding_model': config.embeddings.model,
        'retrieval': {
            'hybrid': config.retrieval.use_hybrid,
            'weights': [config.retrieval.dense_weight, config.retrieval.sparse_weight]
        },
        'reranking_model': config.reranking.model
    },
    'metrics': results_df.groupby('k')[metrics_to_plot].mean().to_dict()
}

# Save to experiments folder (opt-in, so a plain re-run never writes files)
exp_file = f"experiments/results/exp_{run_time.strftime('%Y%m%d_%H%M%S')}.json"
if SAVE_RESULTS:
    exp_path = Path(exp_file)
    # Create the results directory on first use instead of failing on open()
    exp_path.parent.mkdir(parents=True, exist_ok=True)
    with exp_path.open('w', encoding='utf-8') as f:
        json.dump(experiment_results, f, indent=2)
    print(f"Experiment saved to {exp_file}")

print("Experiment logged:")
print(json.dumps(experiment_results, indent=2))

## 9. Compare Multiple Configurations

In [None]:
# Hyperparameter sweep over the dense/sparse mixing weights
configs_to_test = [
    {'dense_weight': 0.5, 'sparse_weight': 0.5},
    {'dense_weight': 0.7, 'sparse_weight': 0.3},
    {'dense_weight': 0.3, 'sparse_weight': 0.7},
    {'dense_weight': 1.0, 'sparse_weight': 0.0},  # Dense only
    {'dense_weight': 0.0, 'sparse_weight': 1.0},  # Sparse only
]

SWEEP_K = 5  # cutoff at which MRR is reported for each configuration

# Benchmark inputs do not change between configurations, so extract them once
sweep_queries = eval_dataset['query'].tolist()
sweep_relevant = eval_dataset['relevant_docs'].tolist()

sweep_results = []

try:
    for cfg in configs_to_test:
        # Update retriever config
        hybrid_retriever.dense_weight = cfg['dense_weight']
        hybrid_retriever.sparse_weight = cfg['sparse_weight']

        # Measure the real MRR for this weighting instead of a made-up number.
        # NOTE(review): assumes `evaluator` uses this same retriever object and
        # that the weights are read at query time — confirm.
        sweep_eval = evaluator.evaluate(
            queries=sweep_queries,
            relevant_docs=sweep_relevant,
            k_values=[SWEEP_K],
        )
        mrr = float(pd.DataFrame(sweep_eval)['mrr'].mean())

        sweep_results.append({**cfg, f'mrr@{SWEEP_K}': mrr})
        print(f"Config: {cfg} - MRR@{SWEEP_K}: {mrr:.3f}")
finally:
    # Put back the configured weights so re-running other cells afterwards
    # does not silently use the last swept setting.
    hybrid_retriever.dense_weight = config.retrieval.dense_weight
    hybrid_retriever.sparse_weight = config.retrieval.sparse_weight

## Conclusions

This notebook demonstrates the simplified workflow with the new structure:

1. **Clear imports** from organized modules
2. **Easy experimentation** with different components
3. **Integrated evaluation** pipeline
4. **Experiment tracking** built-in
5. **Visualization** of results

The data science-centric structure makes it much easier to:
- Test different configurations
- Track experiments
- Share reproducible research
- Iterate quickly on improvements