In [20]:
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import json
import numpy as np
from src.pipeline import (
    run_ingestion, build_retriever,
    run_evaluation, QA_PATH
)

## 1 · Configuration

In [21]:
# Experiment grid. TOP_K is forwarded as `top_k` to every evaluation run;
# the two mode lists are crossed to form the ablation experiments.
TOP_K = 4
CHUNK_MODES = ['structural', 'semantic']
RETRIEVAL_MODES = ['bm25', 'dense', 'rrf']

# Read the QA file once, only to report how many pairs it contains.
with open(QA_PATH, encoding='utf-8') as qa_file:
    qa_pairs = json.loads(qa_file.read())
print('{} QA pairs loaded'.format(len(qa_pairs)))

20 QA pairs loaded


### Ingestion: extract PDF text, chunk it, embed the chunks, and store everything in SQLite + FAISS

In [22]:
# Run one ingestion pass per chunking strategy so each has its own stored chunks.
for chunk_mode in CHUNK_MODES:
    run_ingestion(chunk_mode=chunk_mode)


  INGESTION  –  chunk_mode = structural

[1/4] extracting PDF text …
  extracting: cricket-rules.pdf
    -> 44 pages
  total docs: 1
[2/4] setting up database …
[3/4] chunking (structural) …
   chunks created: 594
[4/4] embedding chunks …
   done – 594 chunks stored


  INGESTION  –  chunk_mode = semantic

[1/4] extracting PDF text …
  extracting: cricket-rules.pdf
    -> 44 pages
  total docs: 1
[2/4] setting up database …
[3/4] chunking (semantic) …
   chunks created: 120
[4/4] embedding chunks …
   done – 120 chunks stored



In [23]:
# One retriever per chunking mode, keyed by mode name so the experiment
# loop can look it up later.
retrievers = dict()
for chunk_mode in CHUNK_MODES:
    print('retriever for:', chunk_mode)
    retrievers[chunk_mode] = build_retriever(chunk_mode=chunk_mode)

print('all retrievers ready')

retriever for: structural
  BM25 index built – 594 chunks
  FAISS index built – 594 vectors, dim=1536
retriever for: semantic
  BM25 index built – 120 chunks
  FAISS index built – 120 vectors, dim=1536
all retrievers ready


### Ablation experiments

| Exp | Chunking   | Retrieval |
|-----|------------|-----------|
| A1  | structural | bm25      |
| A2  | structural | dense     |
| A3  | structural | rrf       |
| B1  | semantic   | bm25      |
| B2  | semantic   | dense     |
| B3  | semantic   | rrf       |

In [24]:
# Fresh containers on every run of this cell, so re-running does not
# accumulate duplicate results.
all_summaries, all_details = [], {}

# Full grid: chunking mode is the outer axis, retrieval mode the inner one.
experiment_grid = [(c, r) for c in CHUNK_MODES for r in RETRIEVAL_MODES]

for cm, rm in experiment_grid:
    label = '+'.join((cm, rm))
    print('EXPERIMENT:', label)
    summary, details = run_evaluation(
        qa_path=QA_PATH,
        retriever=retrievers[cm],
        chunk_mode=cm,
        retrieval_method=rm,
        top_k=TOP_K,
    )
    # Tag the summary in place so later cells can look it up by label.
    summary['label'] = label
    all_details[label] = details
    all_summaries.append(summary)

print('experiments done')

EXPERIMENT: structural+bm25
  Q1/20: What is the purpose of the ICC Code of Conduct?…
    EM=0  F1=0.265  grounding=100%  time=2.51s
  Q2/20: What does Article 2.3 of the Code of Conduct cover?…
    EM=0  F1=0.051  grounding=0%  time=0.96s
  Q3/20: What are the offence levels defined in the Code of Cond…
    EM=0  F1=0.712  grounding=100%  time=1.66s
  Q4/20: What does Article 2.5 prohibit regarding a dismissed ba…
    EM=0  F1=0.474  grounding=100%  time=2.75s
  Q5/20: What is the sanction for changing the condition of the …
    EM=0  F1=0.143  grounding=0%  time=1.05s
  Q6/20: Who can report an alleged offence under the Code of Con…
    EM=0  F1=0.250  grounding=100%  time=2.30s
  Q7/20: What does Article 2.12 cover regarding physical contact…
    EM=0  F1=0.312  grounding=100%  time=2.69s
  Q8/20: Are social media posts considered public under the Code…
    EM=0  F1=0.600  grounding=100%  time=1.84s
  Q9/20: What is the reporting deadline for Level 1 and 2 on-fie…
    EM=0  F1=0.053

## 5 · Results comparison table

In [25]:
# Fixed-width plain-text table: one header line, a rule of matching width,
# then one row per experiment summary.
header = '{:<25} {:>6} {:>6} {:>8} {:>7}'.format(
    'Experiment', 'EM', 'F1', 'Ground%', 'p95(s)'
)
print(header)
print('-' * len(header))

# Field names in the template are the summary keys read for each row.
row_template = (
    '{label:<25} {avg_em:>6.3f} {avg_f1:>6.3f} '
    '{avg_grounding:>7.1f}% {p95_latency:>7.2f}'
)
for s in all_summaries:
    print(row_template.format_map(s))

Experiment                    EM     F1  Ground%  p95(s)
--------------------------------------------------------
structural+bm25            0.000  0.403    70.0%    3.10
structural+dense           0.000  0.348    60.0%    2.92
structural+rrf             0.000  0.453    75.0%    3.38
semantic+bm25              0.000  0.518    70.0%    4.91
semantic+dense             0.000  0.454    80.0%    5.98
semantic+rrf               0.000  0.513    80.0%    8.51


In [26]:
def get_summary(label, summaries=None):
    """Return the first summary whose ``'label'`` equals ``label``.

    Parameters
    ----------
    label : str
        Experiment label to look up (e.g. ``'structural+rrf'``).
    summaries : iterable of mapping, optional
        Summaries to search. Defaults to the notebook-level
        ``all_summaries`` list, so existing ``get_summary(label)`` calls
        behave as before. Passing it explicitly removes the dependency on
        notebook state.

    Returns
    -------
    mapping or None
        The matching summary object itself (not a copy), or ``None`` when
        no summary carries that label.
    """
    if summaries is None:
        # Resolved at call time, so the global only has to exist when the
        # default is actually used.
        summaries = all_summaries
    return next((s for s in summaries if s['label'] == label), None)

def _print_ablation(title, labels):
    """Print ``title`` followed by F1 and grounding for each label that has a summary."""
    print(title)
    for lbl in labels:
        s = get_summary(lbl)
        if not s:
            # No summary recorded for this label: skip the row.
            continue
        print('  {:<25}  F1={:.3f}  Grounding={:.1f}%'.format(
            lbl, s['avg_f1'], s['avg_grounding']))


_print_ablation(
    'Chunking ablation  (RRF held constant)',
    ['structural+rrf', 'semantic+rrf'],
)

print()
_print_ablation(
    'Retrieval ablation  (structural chunks held constant)',
    ['structural+bm25', 'structural+dense', 'structural+rrf'],
)

print()
print('Fusion value')
# Compare RRF against the better of the two single retrievers on structural chunks.
a1, a2, a3 = (
    get_summary(name)
    for name in ('structural+bm25', 'structural+dense', 'structural+rrf')
)
if all((a1, a2, a3)):
    best_single = max(a1['avg_f1'], a2['avg_f1'])
    fused_f1 = a3['avg_f1']
    print('  best single retriever F1 = {:.3f}'.format(best_single))
    print('  RRF fusion F1            = {:.3f}'.format(fused_f1))
    print('  delta                    = {:+.3f}'.format(fused_f1 - best_single))

Chunking ablation  (RRF held constant)
  structural+rrf             F1=0.453  Grounding=75.0%
  semantic+rrf               F1=0.513  Grounding=80.0%

Retrieval ablation  (structural chunks held constant)
  structural+bm25            F1=0.403  Grounding=70.0%
  structural+dense           F1=0.348  Grounding=60.0%
  structural+rrf             F1=0.453  Grounding=75.0%

Fusion value
  best single retriever F1 = 0.403
  RRF fusion F1            = 0.453
  delta                    = +0.051


### Latency

Measure the p50, p95, and p99 latency of each experiment.

**Acceptance target:** p95 ≤ 2.5 s at top-k = 4 for answers of approximately 512 tokens.

In [27]:
# Per-experiment latency distribution, taken from the 'latency' field of
# every per-question detail record.
column_titles = ('Experiment', 'p50(s)', 'p95(s)', 'p99(s)', 'mean(s)')
print('{:<25} {:>7} {:>7} {:>7} {:>7}'.format(*column_titles))

for label, details in all_details.items():
    lats = [d['latency'] for d in details]
    # One call computes all three percentiles.
    p50, p95, p99 = np.percentile(lats, [50, 95, 99])
    mean = np.mean(lats)
    print('{:<25} {:>7.2f} {:>7.2f} {:>7.2f} {:>7.2f}'.format(label, p50, p95, p99, mean))

Experiment                 p50(s)  p95(s)  p99(s) mean(s)
structural+bm25              1.79    3.10    3.92    1.87
structural+dense             1.70    2.92    3.03    1.74
structural+rrf               1.80    3.38    3.96    1.96
semantic+bm25                2.59    4.91    7.79    2.79
semantic+dense               2.83    5.98    7.19    3.09
semantic+rrf                 2.80    8.51    8.56    3.17
