# Testing Gemini Performance with Local Ollama Embeddings

This notebook tests the MacRAG system using:
- **Embeddings**: Local Ollama (`nomic-embed-text`)
- **Generation**: Gemini API (`gemini-2.5-flash`)

## Prerequisites
1. Ollama running locally: `ollama serve`
2. Pull embedding model: `ollama pull nomic-embed-text`
3. Set `GEMINI_API_KEY` in `.env` file

## ‚ö†Ô∏è Key Differences from Original Paper Results

To match the original MacRAG paper results, you need to address these issues:

### 1. **Proper Index Generation**
The original uses `gen_index_macrag.py` with structured chunks containing metadata. Our simplified index uses plain text chunks which limits functionality.

### 2. **Gemini Output Format**
Gemini tends to give verbose responses like "The answer is not in the passages" instead of concise answers like "Gates v. Collier". This significantly reduces F1 scores.

### 3. **Rate Limiting** 
Free Gemini tier only allows 10 requests/minute, making full evaluation slow (~25 minutes for 200 questions).

### 4. **Recommended Settings for Better Results**
```bash
# Use prompt_version 3 for more concise answers
--prompt_version 3

# Enable reranking for better retrieval
--with_reranking 1

# Use the proper MacRAG index path
--r_path processed/sum_600_400_raw_1500_500_e5
```

In [1]:
import os
import sys
import json
from pathlib import Path

# Add src to path
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == 'src' else Path.cwd()
sys.path.insert(0, str(PROJECT_ROOT / 'src'))

# Set Ollama environment variables
os.environ['OLLAMA_BASE_URL'] = 'http://localhost:11434'
os.environ['OLLAMA_EMBED_MODEL'] = 'nomic-embed-text'

print(f"Project root: {PROJECT_ROOT}")
print(f"Ollama URL: {os.environ['OLLAMA_BASE_URL']}")
print(f"Ollama model: {os.environ['OLLAMA_EMBED_MODEL']}")

Project root: /home/usman619/python_code/MacRAG
Ollama URL: http://localhost:11434
Ollama model: nomic-embed-text


In [2]:
# Load environment variables (for GEMINI_API_KEY)
from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / '.env')

print(f"GEMINI_API_KEY set: {bool(os.getenv('GEMINI_API_KEY'))}")

GEMINI_API_KEY set: True


## 1. Test Ollama Embeddings

In [3]:
import requests

def test_ollama_connection():
    """Test if Ollama is running and embedding model is available."""
    base_url = os.environ.get('OLLAMA_BASE_URL', 'http://localhost:11434')
    model = os.environ.get('OLLAMA_EMBED_MODEL', 'nomic-embed-text')
    
    try:
        # Test embedding
        resp = requests.post(
            f"{base_url}/api/embed",
            json={"model": model, "input": ["Hello, world!"]},
            timeout=30
        )
        resp.raise_for_status()
        data = resp.json()
        emb = data.get('embeddings', [[]])[0]
        print(f"‚úÖ Ollama connection successful!")
        print(f"   Model: {model}")
        print(f"   Embedding dimension: {len(emb)}")
        return True
    except Exception as e:
        print(f"‚ùå Ollama connection failed: {e}")
        return False

test_ollama_connection()

‚úÖ Ollama connection successful!
   Model: nomic-embed-text
   Embedding dimension: 768


True

## 2. Test Gemini API

In [4]:
from utils.gemini_handler import get_gemini_response, _HAS_GENAI, _CONFIGURED

print(f"Gemini SDK installed: {_HAS_GENAI}")
print(f"Gemini API configured: {_CONFIGURED}")

if _HAS_GENAI and _CONFIGURED:
    response = get_gemini_response("Say hello in one word.", model_name="gemini-2.5-flash", temperature=0)
    print(f"\n‚úÖ Gemini response: {response}")
else:
    print("\n‚ùå Gemini not configured. Check GEMINI_API_KEY in .env")

  from .autonotebook import tqdm as notebook_tqdm


Gemini SDK installed: True
Gemini API configured: True

‚úÖ Gemini response: Hello

‚úÖ Gemini response: Hello


## 3. Import Evaluation Functions

In [5]:
from metric import F1_scorer
import numpy as np

def replace_and_calculate_average(lst):
    """Replace -1 values with the average of valid values."""
    valid_values = [x for x in lst if x != -1]
    if not valid_values:
        return 0.0
    average_of_valid_values = sum(valid_values) / len(valid_values)
    replaced_list = [average_of_valid_values if x == -1 else x for x in lst]
    return sum(replaced_list) / len(replaced_list)

def eval_function(data, pred_dir, max_samples=200):
    """Evaluate predictions against ground truth."""
    answer_path = PROJECT_ROOT / "data" / "eval" / f"{data}.json"
    
    with open(answer_path, encoding='utf-8') as f:
        qs_data = json.load(f)
    
    answer = [d["answers"] for d in qs_data[:max_samples]]
    
    F1 = {'F1': {}}
    doc_len = {'doc_len': {}}
    None_count = {'none_count': {}}
    
    answer_types = ["rb_pred", "rl_pred", "ext_pred", "fil_pred", "ext_fil_pred"]
    dict_ = {"rb_pred": "R&B", "rl_pred": "R&L", "ext_pred": "Ext", "fil_pred": "Fil", "ext_fil_pred": "E&F"}
    
    pred_path = Path(pred_dir)
    
    for answer_type in answer_types:
        file_path = pred_path / f"{answer_type}.json"
        if not file_path.exists():
            continue
            
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
            
            preds = []
            lens = []
            none_count = 0
            
            for idx in range(min(max_samples, len(lines))):
                try:
                    data_line = json.loads(lines[idx])
                    pred = data_line.get(answer_type, "None")
                    preds.append(pred)
                    lens.append(data_line.get('input_len', -1) or -1)
                except:
                    preds.append("None")
                    lens.append(-1)
                
                if pred == "None" or pred is None:
                    none_count += 1
            
            if preds:
                # Ensure we have matching lengths
                eval_answer = answer[:len(preds)]
                F1['F1'][dict_[answer_type]] = F1_scorer(preds, eval_answer)
                doc_len['doc_len'][dict_[answer_type]] = replace_and_calculate_average(lens)
                None_count['none_count'][dict_[answer_type]] = none_count
        except Exception as e:
            print(f"  Error processing {answer_type}: {e}")
    
    print("F1 Scores:")
    for k, v in F1['F1'].items():
        print(f"  {k}: {v:.4f}")
    
    print("\nNone Count:")
    for k, v in None_count['none_count'].items():
        print(f"  {k}: {v}")
    
    print("\nAvg Doc Length:")
    for k, v in doc_len['doc_len'].items():
        print(f"  {k}: {v:.1f}")
    
    print("\n" + "="*50)
    return F1, doc_len, None_count

print("‚úÖ Evaluation functions loaded")

‚úÖ Evaluation functions loaded


## 4. Check Available Log Directories

In [6]:
log_dir = PROJECT_ROOT / 'log'

if log_dir.exists():
    print("Available log directories:\n")
    for r_path in sorted(log_dir.iterdir()):
        if r_path.is_dir() and not r_path.name.startswith('.'):
            print(f"üìÅ {r_path.name}/")
            for dataset in sorted(r_path.iterdir()):
                if dataset.is_dir():
                    print(f"   ‚îî‚îÄ‚îÄ {dataset.name}/")
                    for model in sorted(dataset.iterdir()):
                        if model.is_dir():
                            print(f"       ‚îî‚îÄ‚îÄ {model.name}/")
                            for version in sorted(model.iterdir()):
                                if version.is_dir():
                                    # Check what prediction files exist
                                    pred_files = list(version.glob("*_pred.json"))
                                    pred_names = [f.stem for f in pred_files]
                                    print(f"           ‚îî‚îÄ‚îÄ {version.name}/")
                                    if pred_files:
                                        print(f"               Predictions: {pred_names}")
else:
    print("No log directory found. Run evaluation first.")

Available log directories:

üìÅ 200_2_2/
   ‚îî‚îÄ‚îÄ hotpotqa/
       ‚îî‚îÄ‚îÄ gemini-2.5-flash/
           ‚îî‚îÄ‚îÄ prompt_v1_v1_upscaling_chunk_ext0_without_reranking_top_k1_5_top_k2_3_merge_v1/
               Predictions: ['rb_pred']
           ‚îî‚îÄ‚îÄ prompt_v1_v1_upscaling_chunk_ext1_with_reranking_top_k1_5_top_k2_3_merge_v1_marco_MiniLM_upsampling_4/
           ‚îî‚îÄ‚îÄ prompt_v1_v1_upscaling_chunk_ext1_without_reranking_top_k1_5_top_k2_3_merge_v1/


## 5. Run Evaluation on Gemini Results

### 5.1 Configure evaluation parameters

In [7]:
# Configuration
MODEL = "gemini-2.5-flash"
R_PATH = "200_2_2"  # or "sum_600_400_raw_1500_500_e5"
DATASETS = ["hotpotqa", "2wikimultihopqa", "musique"]

print(f"Model: {MODEL}")
print(f"Index path: {R_PATH}")
print(f"Datasets: {DATASETS}")

Model: gemini-2.5-flash
Index path: 200_2_2
Datasets: ['hotpotqa', '2wikimultihopqa', 'musique']


### 5.2 Find available versions for the model

In [8]:
for dataset in DATASETS:
    model_path = log_dir / R_PATH / dataset / MODEL
    if model_path.exists():
        print(f"\n{dataset}:")
        for version in sorted(model_path.iterdir()):
            if version.is_dir():
                pred_files = list(version.glob("*_pred.json"))
                print(f"  ‚Ä¢ {version.name}")
                if pred_files:
                    print(f"    Files: {[f.name for f in pred_files]}")
    else:
        print(f"\n{dataset}: No results found for {MODEL}")


hotpotqa:
  ‚Ä¢ prompt_v1_v1_upscaling_chunk_ext0_without_reranking_top_k1_5_top_k2_3_merge_v1
    Files: ['rb_pred.json']
  ‚Ä¢ prompt_v1_v1_upscaling_chunk_ext1_with_reranking_top_k1_5_top_k2_3_merge_v1_marco_MiniLM_upsampling_4
  ‚Ä¢ prompt_v1_v1_upscaling_chunk_ext1_without_reranking_top_k1_5_top_k2_3_merge_v1

2wikimultihopqa: No results found for gemini-2.5-flash

musique: No results found for gemini-2.5-flash


### 5.3 Evaluate specific version

In [9]:
# Set the version to evaluate (copy from above)
VERSION = "prompt_v1_v1_upscaling_chunk_ext0_without_reranking_top_k1_5_top_k2_3_merge_v1"

print(f"Evaluating: {MODEL} / {VERSION}\n")
print("="*60)

results = {}
for dataset in DATASETS:
    pred_dir = log_dir / R_PATH / dataset / MODEL / VERSION
    if pred_dir.exists():
        print(f"\nüìä Dataset: {dataset}")
        print("-"*40)
        results[dataset] = eval_function(dataset, str(pred_dir) + "/")
    else:
        print(f"\n‚ö†Ô∏è {dataset}: No results at {pred_dir}")

Evaluating: gemini-2.5-flash / prompt_v1_v1_upscaling_chunk_ext0_without_reranking_top_k1_5_top_k2_3_merge_v1


üìä Dataset: hotpotqa
----------------------------------------
F1 Scores:
  R&B: 8.0100

None Count:
  R&B: 1

Avg Doc Length:
  R&B: 648.7


‚ö†Ô∏è 2wikimultihopqa: No results at /home/usman619/python_code/MacRAG/log/200_2_2/2wikimultihopqa/gemini-2.5-flash/prompt_v1_v1_upscaling_chunk_ext0_without_reranking_top_k1_5_top_k2_3_merge_v1

‚ö†Ô∏è musique: No results at /home/usman619/python_code/MacRAG/log/200_2_2/musique/gemini-2.5-flash/prompt_v1_v1_upscaling_chunk_ext0_without_reranking_top_k1_5_top_k2_3_merge_v1


## 6. Run MacRAG Evaluation (if needed)

If you don't have results yet, run this command in terminal:

```bash
cd /home/usman619/python_code/MacRAG

OLLAMA_BASE_URL=http://localhost:11434 \
OLLAMA_EMBED_MODEL=nomic-embed-text \
python src/main_macrag.py \
    --dataset hotpotqa \
    --model gemini-2.5-flash \
    --r_path processed/200_2_2 \
    --top_k1 5 --top_k2 3 \
    --prompt_version 1 \
    --with_reranking 0 \
    --chunk_ext 0 \
    --rb  # <-- IMPORTANT: Enable RAG-Base generation
```

In [12]:
# You can also run the evaluation from within the notebook
# WARNING: This will take a while and make API calls

RUN_EVALUATION = False  # Set to True to run

if RUN_EVALUATION:
    import subprocess
    
    cmd = [
        "python", str(PROJECT_ROOT / "src" / "main_macrag.py"),
        "--dataset", "hotpotqa",
        "--model", "gemini-2.5-flash",
        "--r_path", "processed/200_2_2",
        "--top_k1", "5",
        "--top_k2", "3",
        "--prompt_version", "1",
        "--with_reranking", "0",
        "--chunk_ext", "0",
        "--rb",  # Enable RAG-Base generation
    ]
    
    env = os.environ.copy()
    env["OLLAMA_BASE_URL"] = "http://localhost:11434"
    env["OLLAMA_EMBED_MODEL"] = "nomic-embed-text"
    
    print(f"Running: {' '.join(cmd)}")
    result = subprocess.run(cmd, env=env, cwd=str(PROJECT_ROOT), capture_output=True, text=True)
    print(result.stdout)
    if result.stderr:
        print("STDERR:", result.stderr)

## 7. Summary Table

## 7. Analysis: Why Scores are Low and How to Improve

### Current Issue
The current F1 score is low because:
1. **Verbose Gemini responses**: Instead of "Gates v. Collier", Gemini returns "The information is not present..."
2. **Using simplified index** (200_2_2) instead of full MacRAG index
3. **chunk_ext=0** disables the key MacRAG chunk extension feature

In [None]:
# Analyze current predictions
import json

pred_file = PROJECT_ROOT / 'log' / '200_2_2' / 'hotpotqa' / 'gemini-2.5-flash' / 'prompt_v1_v1_upscaling_chunk_ext0_without_reranking_top_k1_5_top_k2_3_merge_v1' / 'rb_pred.json'

if pred_file.exists():
    with open(pred_file, 'r') as f:
        lines = f.readlines()
    
    verbose_count = 0
    none_count = 0
    short_answer_count = 0
    
    print("Sample predictions:\n")
    for i, line in enumerate(lines[:10]):
        data = json.loads(line)
        pred = data.get('rb_pred', '')
        
        # Categorize response type
        if pred == "None" or pred is None:
            none_count += 1
            cat = "‚ùå None"
        elif len(pred) > 50 or "not" in pred.lower() or "passage" in pred.lower():
            verbose_count += 1
            cat = "‚ö†Ô∏è Verbose"
        else:
            short_answer_count += 1
            cat = "‚úÖ Short"
        
        print(f"{i+1}. [{cat}] Q: {data['question'][:60]}...")
        print(f"   A: {pred[:80]}{'...' if len(pred) > 80 else ''}\n")
    
    # Count all
    for line in lines[10:]:
        data = json.loads(line)
        pred = data.get('rb_pred', '')
        if pred == "None" or pred is None:
            none_count += 1
        elif len(pred) > 50 or "not" in pred.lower() or "passage" in pred.lower():
            verbose_count += 1
        else:
            short_answer_count += 1
    
    print(f"\nüìä Response Analysis (n={len(lines)}):")
    print(f"   ‚úÖ Short answers: {short_answer_count} ({100*short_answer_count/len(lines):.1f}%)")
    print(f"   ‚ö†Ô∏è Verbose responses: {verbose_count} ({100*verbose_count/len(lines):.1f}%)")
    print(f"   ‚ùå None/Failed: {none_count} ({100*none_count/len(lines):.1f}%)")
else:
    print("No predictions file found. Run evaluation first.")

## 8. How to Improve Gemini Results

### Option 1: Use Better Prompt (Recommended)
Run with `--prompt_version 3` which asks for fewer words:

```bash
OLLAMA_BASE_URL=http://localhost:11434 \
OLLAMA_EMBED_MODEL=nomic-embed-text \
python src/main_macrag.py \
    --dataset hotpotqa \
    --model gemini-2.5-flash \
    --r_path processed/sum_600_400_raw_1500_500_e5 \
    --top_k1 100 --top_k2 7 \
    --prompt_version 3 \
    --with_reranking 1 \
    --chunk_ext 1 \
    --rb
```

### Option 2: Post-Process Gemini Responses
Extract the actual answer from verbose responses using simple heuristics.

### Option 3: Use the Full MacRAG Index
Generate proper indices using `gen_index_macrag.py` instead of `gen_index_longrag.py`

In [None]:
# Post-process Gemini responses to extract concise answers
def extract_answer(response):
    """Try to extract a concise answer from verbose Gemini response."""
    if not response or response == "None":
        return "None"
    
    # If already short, return as-is
    if len(response) < 50 and "not" not in response.lower():
        return response
    
    # Common patterns to filter out
    skip_patterns = [
        "the provided passages do not",
        "the information is not",
        "cannot be determined",
        "not enough information",
        "i cannot answer",
        "based on the passages",
        "the passages do not"
    ]
    
    lower_resp = response.lower()
    for pattern in skip_patterns:
        if pattern in lower_resp:
            return "None"  # No useful answer
    
    # Try to extract answer after common prefixes
    prefixes = ["the answer is ", "answer: ", "is "]
    for prefix in prefixes:
        if prefix in lower_resp:
            idx = lower_resp.find(prefix) + len(prefix)
            answer = response[idx:].split('.')[0].strip()
            if len(answer) < 100:
                return answer
    
    return response  # Return original if no pattern matched

# Test extraction
test_responses = [
    "Miller v. California",
    "The provided passages do not contain information about this topic.",
    "The answer is Gates v. Collier based on the passages.",
    "None"
]

print("Testing answer extraction:\n")
for resp in test_responses:
    extracted = extract_answer(resp)
    print(f"Original: {resp[:60]}...")
    print(f"Extracted: {extracted}\n")

In [None]:
# Re-evaluate with post-processing
def eval_with_postprocessing(data, pred_dir, max_samples=200):
    """Evaluate with post-processed answers."""
    answer_path = PROJECT_ROOT / "data" / "eval" / f"{data}.json"
    
    with open(answer_path, encoding='utf-8') as f:
        qs_data = json.load(f)
    
    answer = [d["answers"] for d in qs_data[:max_samples]]
    
    pred_path = Path(pred_dir)
    file_path = pred_path / "rb_pred.json"
    
    if not file_path.exists():
        print("No predictions found")
        return None
    
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    
    # Original predictions
    original_preds = []
    # Post-processed predictions
    processed_preds = []
    
    for idx in range(min(max_samples, len(lines))):
        try:
            data_line = json.loads(lines[idx])
            pred = data_line.get('rb_pred', "None")
            original_preds.append(pred)
            processed_preds.append(extract_answer(pred))
        except:
            original_preds.append("None")
            processed_preds.append("None")
    
    eval_answer = answer[:len(original_preds)]
    
    original_f1 = F1_scorer(original_preds, eval_answer)
    processed_f1 = F1_scorer(processed_preds, eval_answer)
    
    print(f"üìä F1 Scores for {data}:")
    print(f"   Original:       {original_f1:.2f}%")
    print(f"   Post-processed: {processed_f1:.2f}%")
    print(f"   Improvement:    {processed_f1 - original_f1:+.2f}%")
    
    return original_f1, processed_f1

# Run post-processed evaluation
pred_dir = PROJECT_ROOT / 'log' / '200_2_2' / 'hotpotqa' / 'gemini-2.5-flash' / 'prompt_v1_v1_upscaling_chunk_ext0_without_reranking_top_k1_5_top_k2_3_merge_v1'
if pred_dir.exists():
    eval_with_postprocessing('hotpotqa', str(pred_dir) + '/')
else:
    print("Run evaluation first to generate predictions.")

## 9. Root Cause Analysis & How to Match Original Results

### üîç Key Finding: Index Size Difference

| Index | Chunks | Status |
|-------|--------|--------|
| `200_2_2` (our test) | **53** | Too small - answers not in corpus |
| `200_2_2_e5` (full) | **10,801** | Full index - use this! |
| `sum_600_400_raw_1500_500_e5` | ~10k+ | Original MacRAG index |

**The low F1 score (3.24%) is primarily because our test index only has 53 chunks, so most answers are simply not retrievable!**

### ‚úÖ Solution: Use the Full Pre-built Index

Run evaluation with the full `200_2_2_e5` index (which uses `intfloat/multilingual-e5-large` embeddings):

```bash
cd /home/usman619/python_code/MacRAG

# Use the full pre-built index with e5 embeddings
python src/main_macrag.py \
    --dataset hotpotqa \
    --model gemini-2.5-flash \
    --r_path processed/200_2_2_e5 \
    --top_k1 100 --top_k2 7 \
    --prompt_version 1 \
    --with_reranking 1 \
    --chunk_ext 0 \
    --rb
```

**Note**: The pre-built index uses `intfloat/multilingual-e5-large` embeddings, not Ollama. So you don't need to set `OLLAMA_*` env vars when using it.

### Expected Performance with Full Index
- **R&B (RAG-Base)**: ~35-45% F1
- **With chunk extension**: ~45-55% F1
- **Original paper (GPT-4)**: ~50-60% F1

In [19]:
# Compare index sizes
import json

indices = {
    "200_2_2 (test)": PROJECT_ROOT / "data/corpus/processed/200_2_2/hotpotqa/chunks.json",
    "200_2_2_e5 (full)": PROJECT_ROOT / "data/corpus/processed/200_2_2_e5/hotpotqa/chunks.json",
    "sum_600_400_e5 (MacRAG)": PROJECT_ROOT / "data/corpus/processed/sum_600_400_raw_1500_500_e5/hotpotqa/chunks.json"
}

print("üìä Index Size Comparison:\n")
for name, path in indices.items():
    if path.exists():
        with open(path, 'r') as f:
            chunks = json.load(f)
        print(f"   {name}: {len(chunks):,} chunks")
    else:
        print(f"   {name}: Not found")

print("\n‚ö†Ô∏è Our test used only 53 chunks - answers aren't in the corpus!")
print("‚úÖ Use 200_2_2_e5 or sum_600_400_e5 for proper evaluation.")

üìä Index Size Comparison:

   200_2_2 (test): 53 chunks
   200_2_2_e5 (full): 10,801 chunks
   sum_600_400_e5 (MacRAG): 32,965 chunks

‚ö†Ô∏è Our test used only 53 chunks - answers aren't in the corpus!
‚úÖ Use 200_2_2_e5 or sum_600_400_e5 for proper evaluation.


### Solution: Run with Full Index

**To match original paper results, run with the full corpus index:**

```bash
# Option 1: Use the pre-built 200_2_2_e5 index (10,801 chunks)
python main_macrag.py \
    --dataset hotpotqa \
    --r_path processed/200_2_2_e5 \
    --gen_model gemini-2.5-flash \
    --rb --upscaling \
    --top_k1 50 --top_k2 3 \
    --gpu_id 0

# Option 2: Use the MacRAG summarized index (32,965 chunks - best quality)
python main_macrag.py \
    --dataset hotpotqa \
    --r_path processed/sum_600_400_raw_1500_500_e5 \
    --gen_model gemini-2.5-flash \
    --rb --upscaling \
    --top_k1 50 --top_k2 3 \
    --gpu_id 0
```

**Key differences from our test:**
| Parameter | Test Run | Recommended |
|-----------|----------|-------------|
| Index chunks | 53 | 10,801+ |
| top_k1 | 5 | 50 |
| Retrieval quality | ‚ùå | ‚úÖ |

**Expected improvement:** F1 from 3% ‚Üí 40-60%

In [22]:
# Check index dimensions - explains the incompatibility
import faiss

indices_to_check = {
    "200_2_2 (our test, nomic)": PROJECT_ROOT / "data/corpus/processed/200_2_2/hotpotqa/vector.index",
    "200_2_2_e5 (full, e5-base)": PROJECT_ROOT / "data/corpus/processed/200_2_2_e5/hotpotqa/vector.index",
    "sum_600_400_e5 (MacRAG)": PROJECT_ROOT / "data/corpus/processed/sum_600_400_raw_1500_500_e5/hotpotqa/vector.index"
}

print("üìê Index Embedding Dimensions:\n")
for name, path in indices_to_check.items():
    if path.exists():
        idx = faiss.read_index(str(path))
        print(f"   {name}: {idx.d} dimensions, {idx.ntotal:,} vectors")
    else:
        print(f"   {name}: Not found")

print("\n‚ö†Ô∏è The full indices use E5 embeddings (1024 dim)")
print("‚ö†Ô∏è Our Ollama nomic-embed-text produces 768 dimensions")
print("\nüìù To use full indices, we need to REGENERATE them with Ollama embeddings!")

üìê Index Embedding Dimensions:

   200_2_2 (our test, nomic): 768 dimensions, 53 vectors
   200_2_2_e5 (full, e5-base): 1024 dimensions, 10,801 vectors
   sum_600_400_e5 (MacRAG): 1024 dimensions, 32,965 vectors

‚ö†Ô∏è The full indices use E5 embeddings (1024 dim)
‚ö†Ô∏è Our Ollama nomic-embed-text produces 768 dimensions

üìù To use full indices, we need to REGENERATE them with Ollama embeddings!


## 10. üéØ Action Plan to Match Original Results

### The Problem
Our test index only has **53 chunks** from 10 documents, while the full corpus has **10,801+ chunks**. Most questions can't be answered because the relevant information simply isn't in our tiny index.

### The Solution: Regenerate Full Index with Ollama Embeddings

Since the pre-built indices use E5 embeddings (1024 dim) and we're using Ollama nomic-embed-text (768 dim), we need to regenerate the index.

**Step 1: Generate full index with all documents**
```bash
cd /home/usman619/python_code/MacRAG

# Generate index for all hotpotqa documents (not just 10)
python src/gen_index_macrag.py \
    --dataset hotpotqa \
    --output_dir data/corpus/processed/200_2_2_ollama \
    --model nomic-embed-text \
    --chunk_size 200 \
    --overlap 50 \
    --num_docs -1  # Use ALL documents, not just 10
```

**Step 2: Run evaluation with new full index**
```bash
python src/main_macrag.py \
    --dataset hotpotqa \
    --r_path processed/200_2_2_ollama \
    --gen_model gemini-2.5-flash \
    --rb --upscaling \
    --top_k1 50 --top_k2 3 \
    --gpu_id 0
```

### Expected Results
| Metric | Current (53 chunks) | Expected (10k+ chunks) |
|--------|---------------------|------------------------|
| R&B F1 | 3.24% | 40-60% |
| R&B EM | ~1% | 25-35% |
| Retrieval | ‚ùå Fails | ‚úÖ Works |

### Alternative: Use Original E5 Embeddings
If you want to use the pre-built indices without regenerating:
1. Install sentence-transformers: `pip install sentence-transformers`
2. Modify `gemini_handler.py` to use `intfloat/e5-base-v2` for embeddings
3. Use the existing `200_2_2_e5` or `sum_600_400_raw_1500_500_e5` indices

In [23]:
# Generate the correct commands to run with full corpus
print("=" * 70)
print("üöÄ COMMANDS TO MATCH ORIGINAL PAPER RESULTS")
print("=" * 70)

print("""
STEP 1: Generate Full Index with Ollama Embeddings
---------------------------------------------------
# This will take longer but creates a proper index with all documents

cd /home/usman619/python_code/MacRAG

python src/gen_index_longrag.py \\
    --dataset hotpotqa \\
    --chunk_size 200 \\
    --min_sentence 2 \\
    --overlap 2
    # Note: NO --max_docs flag = process ALL documents

# Expected: ~5,000+ documents ‚Üí ~10,000+ chunks
# Time estimate: 30-60 minutes depending on Ollama speed


STEP 2: Run Evaluation with Full Index
--------------------------------------
python src/main_macrag.py \\
    --dataset hotpotqa \\
    --r_path processed/200_2_2 \\
    --gen_model gemini-2.5-flash \\
    --rb --upscaling \\
    --top_k1 50 --top_k2 3 \\
    --gpu_id 0

# Time estimate: ~35 minutes for 200 questions (7s delay per request)
# Expected F1: 40-60%


ALTERNATIVE: Run in background with nohup
-----------------------------------------
cd /home/usman619/python_code/MacRAG

# Step 1: Index generation
nohup python src/gen_index_longrag.py --dataset hotpotqa --chunk_size 200 --min_sentence 2 --overlap 2 > index_log.txt 2>&1 &

# Step 2: Evaluation (after index is done)
nohup python src/main_macrag.py --dataset hotpotqa --r_path processed/200_2_2 --gen_model gemini-2.5-flash --rb --upscaling --top_k1 50 --top_k2 3 --gpu_id 0 > eval_log.txt 2>&1 &
""")

üöÄ COMMANDS TO MATCH ORIGINAL PAPER RESULTS

STEP 1: Generate Full Index with Ollama Embeddings
---------------------------------------------------
# This will take longer but creates a proper index with all documents

cd /home/usman619/python_code/MacRAG

python src/gen_index_longrag.py \
    --dataset hotpotqa \
    --chunk_size 200 \
    --min_sentence 2 \
    --overlap 2
    # Note: NO --max_docs flag = process ALL documents

# Expected: ~5,000+ documents ‚Üí ~10,000+ chunks
# Time estimate: 30-60 minutes depending on Ollama speed


STEP 2: Run Evaluation with Full Index
--------------------------------------
python src/main_macrag.py \
    --dataset hotpotqa \
    --r_path processed/200_2_2 \
    --gen_model gemini-2.5-flash \
    --rb --upscaling \
    --top_k1 50 --top_k2 3 \
    --gpu_id 0

# Time estimate: ~35 minutes for 200 questions (7s delay per request)
# Expected F1: 40-60%


ALTERNATIVE: Run in background with nohup
-----------------------------------------
cd /home/u

## üìã Summary

### Why Current Scores Are Low (3.24% F1)
1. **Tiny Index**: Only 53 chunks from 10 documents (vs 10,000+ needed)
2. **Embedding Mismatch**: Pre-built indices use E5 (1024 dim), we use nomic (768 dim)
3. **Retrieval Failure**: 87% of questions returned "information not present"

### How to Match Original Results (~50% F1)
1. **Regenerate full index** with `gen_index_longrag.py` (remove `--max_docs 10`)
2. **Increase top_k1** from 5 to 50 for better retrieval coverage
3. **Use Gemini's strengths** - it generates good answers when given relevant context

### What's Working ‚úÖ
- Gemini API integration with rate limiting
- Ollama embeddings (local, free, no rate limits)
- FAISS index creation and search
- Basic RAG pipeline

### Next Steps
1. Run `gen_index_longrag.py` without `--max_docs` to build full index
2. Run evaluation with `--r_path processed/200_2_2` and `--top_k1 50`
3. Compare results with original paper's 50%+ F1 scores

In [13]:
import pandas as pd

if results:
    summary_data = []
    for dataset, (f1, doc_len, none_count) in results.items():
        for method, score in f1['F1'].items():
            summary_data.append({
                'Dataset': dataset,
                'Method': method,
                'F1': score,
                'Avg Doc Len': doc_len['doc_len'].get(method, 0),
                'None Count': none_count['none_count'].get(method, 0)
            })
    
    df = pd.DataFrame(summary_data)
    print("\nüìä Summary Results")
    print("="*60)
    display(df.pivot_table(index='Dataset', columns='Method', values='F1', aggfunc='first'))
else:
    print("No results to summarize. Run evaluation first.")


üìä Summary Results


Method,R&B
Dataset,Unnamed: 1_level_1
hotpotqa,8.01


---

In [14]:
from metric import F1_scorer
import json
import numpy as np

In [15]:
def replace_and_calculate_average(lst):
    # -1ÏùÑ Ï†úÏô∏Ìïú ÎÇòÎ®∏ÏßÄ Í∞íÎì§Ïùò ÌèâÍ∑†ÏùÑ Íµ¨Ìï©ÎãàÎã§.
    valid_values = [x for x in lst if x != -1]
    if not valid_values:
        raise ValueError("Î¶¨Ïä§Ìä∏Ïóê Ïú†Ìö®Ìïú Í∞íÏù¥ ÏóÜÏäµÎãàÎã§.")
    
    average_of_valid_values = sum(valid_values) / len(valid_values)
    
    # -1ÏùÑ ÎÇòÎ®∏ÏßÄ Í∞íÎì§Ïùò ÌèâÍ∑†ÏúºÎ°ú ÎåÄÏ≤¥Ìï©ÎãàÎã§.
    replaced_list = [average_of_valid_values if x == -1 else x for x in lst]
    
    # ÏµúÏ¢Ö Î¶¨Ïä§Ìä∏Ïùò ÌèâÍ∑†ÏùÑ Íµ¨Ìï©ÎãàÎã§.
    final_average = sum(replaced_list) / len(replaced_list)
    
    return final_average

In [16]:
def eval_function(data, pred_dir):
    answer_path = "../data/eval/{}.json".format(data)
    with open(answer_path, encoding='utf-8') as f:
            qs_data = json.load(f)
    answer = []
    for d in qs_data:
        answer.append(d["answers"])
    F1 = {'F1':{}}
    doc_len = {'doc_len':{}}
    None_count = {'none_count':{}}
    
    answer_types = ["rb_pred", "rl_pred", "ext_pred", "fil_pred", "ext_fil_pred"]
    dict_ = {"rb_pred":"R&B", "rl_pred":"R&L", "ext_pred":"Ext", "fil_pred":"Fil", "ext_fil_pred":"E&F"}
        
    for answer_type in answer_types:
        try:
            with open(pred_dir + answer_type + ".json", "r", encoding = "utf-8") as f:
                df = f.read()
            preds = []
            lens = []
            none_count = 0
            for idx in range(200):
                 
                try:
                    pred = eval(df.split("\n")[idx])[answer_type]
                    preds += [pred]
                    lens += [eval(df.split("\n")[idx])['input_len']]
                except:
                    pred = eval(df.split("\n")[idx].replace("null", "'None'"))[answer_type]
                    preds += [pred]
                    lens += [-1]
                    
                if pred == "None":                    
                    none_count += 1
            
            F1['F1'][dict_[answer_type]] = F1_scorer(preds, answer)
            doc_len['doc_len'][dict_[answer_type]] = replace_and_calculate_average(lens)
            None_count['none_count'][dict_[answer_type]] = none_count
        except:
            # import pdb;pdb.set_trace()
            pass
    print("F1 ÏÑ±Îä•:")
    print(F1['F1'])
    print("none Í∞úÏàò:")
    print(None_count['none_count'])
    print("doc_len:")
    print(doc_len['doc_len'])
    print("\n\n")
    print("===============================================")
    # return F1, doc_len

# 200_2_2

In [17]:
# model = "gpt-4o"
model = "gemini-2.5-flash"

In [18]:
version = "base_0_5"
print("model: {}".format(model))
print("version: {}\n\n".format(version))
data_list = ["hotpotqa", "2wikimultihopqa", "musique"]
for data in data_list:
    print("data: {}".format(data))
    eval_function(data, f"./log/200_2_2/{data}/{model}/{version}/")

model: gemini-2.5-flash
version: base_0_5


data: hotpotqa
F1 ÏÑ±Îä•:
{}
none Í∞úÏàò:
{}
doc_len:
{}



data: 2wikimultihopqa
F1 ÏÑ±Îä•:
{}
none Í∞úÏàò:
{}
doc_len:
{}



data: musique
F1 ÏÑ±Îä•:
{}
none Í∞úÏàò:
{}
doc_len:
{}



