In [25]:
# Install required libraries (if not already installed)
!pip install transformers faiss-cpu rank-bm25 scikit-learn numpy pandas torch sentence-transformers PyPDF2 nltk gensim
import json
import re
import time
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch
import faiss
from collections import defaultdict
import warnings
import PyPDF2
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [26]:
import os  # Added import for os module

CONTRACT_FOLDER = "contracts"  # Fixed folder name to match workspace directory
QA_DATASET_FILE = "dataset.json"
DATASET_PATH = os.path.abspath(QA_DATASET_FILE)

print(f"Current working directory: {os.getcwd()}")
if not os.path.isfile(DATASET_PATH):
    print(f"[ERROR] Dataset file not found at: {DATASET_PATH}")
else:
    print(f"[OK] Dataset file found: {DATASET_PATH}")

if not os.path.isdir(CONTRACT_FOLDER):
    print(f"[WARNING] Contract folder not found: {os.path.abspath(CONTRACT_FOLDER)}")


def extract_source_filename(raw):
    """Extract a PDF filename from the raw 'source document' field."""
    if not raw:
        return None
    # Split by spaces and find token ending with .pdf
    parts = raw.split()
    for p in parts:
        if p.lower().endswith('.pdf'):
            return p
    return None


def load_data(contract_folder_path, qa_dataset_file_path):
    """Loads contract text from multiple PDFs in a folder and Q/A dataset with source attribution.

    Returns:
        corpus_df: DataFrame with columns ['chunk', 'source_pdf']
        qa_dataset: list of dicts with normalized keys: question, answer_snippet, context_chunk, source_document
    """
    rows = []
    if os.path.isdir(contract_folder_path):
        for filename in os.listdir(contract_folder_path):
            if filename.endswith(".pdf"):
                filepath = os.path.join(contract_folder_path, filename)
                try:
                    with open(filepath, 'rb') as f:
                        reader = PyPDF2.PdfReader(f)
                        for page_num in range(len(reader.pages)):
                            extracted_text = reader.pages[page_num].extract_text()
                            if extracted_text:
                                # Split page text into candidate lines/chunks
                                for line in extracted_text.split('\n'):
                                    line = line.strip()
                                    if line and len(line.split()) > 5:
                                        rows.append({'chunk': line, 'source_pdf': filename})
                except Exception as e:
                    print(f"Error reading {filepath}: {e}")
    else:
        print(f"Contract folder does not exist: {contract_folder_path}")

    qa_dataset = []
    if os.path.isfile(qa_dataset_file_path):
        try:
            with open(qa_dataset_file_path, 'r', encoding='utf-8') as f:
                qa_dataset = json.load(f)
                if isinstance(qa_dataset, dict):
                    qa_dataset = list(qa_dataset.values())
                if not isinstance(qa_dataset, list):
                    print("[ERROR] Dataset JSON is not a list; wrapping into a single-item list.")
                    qa_dataset = [qa_dataset]
        except FileNotFoundError:
            print(f"QA dataset file not found: {qa_dataset_file_path}")
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from {qa_dataset_file_path}: {e}")
            try:
                with open(qa_dataset_file_path, 'r', encoding='utf-8') as f:
                    lines = [l.strip() for l in f if l.strip()]
                recovered = []
                for line in lines:
                    try:
                        recovered.append(json.loads(line))
                    except Exception:
                        pass
                if recovered:
                    print(f"Recovered {len(recovered)} items using line-delimited fallback.")
                    qa_dataset = recovered
            except Exception as e2:
                print(f"Recovery attempt failed: {e2}")
    else:
        print(f"Dataset file does not exist at path: {qa_dataset_file_path}")

    # Normalize QA fields
    normalized = []
    for item in qa_dataset:
        if not isinstance(item, dict):
            continue
        norm = {
            'question': item.get('question') or item.get('query') or '',
            'answer_snippet': item.get('answer_snippet') or item.get('answer') or '',
            'context_chunk': item.get('context_chunk') or item.get('context') or '',
            'source_document_raw': item.get('source document') or item.get('source_document') or ''
        }
        norm['source_document'] = extract_source_filename(norm['source_document_raw'])
        normalized.append(norm)

    corpus_df = pd.DataFrame(rows)
    print(f"Loaded {len(normalized)} QA pairs. Corpus chunks: {len(corpus_df)}")
    missing_source_docs = sum(1 for q in normalized if not q['source_document'])
    if missing_source_docs:
        print(f"[INFO] {missing_source_docs} QA items lack a parsed source_document filename.")
    return corpus_df, normalized


def chunk_text_placeholder(_):
    # No longer used; corpus already chunked with source attribution.
    pass

corpus_df, qa_dataset = load_data(CONTRACT_FOLDER, DATASET_PATH)  # Updated loader returns attributed corpus

print(f"Corpus size: {len(corpus_df)} chunks")
print(f"Evaluation dataset size: {len(qa_dataset)} Q/A pairs")
if len(corpus_df):
    display(corpus_df.head())
else:
    print("No contract text extracted; PDF files may be missing or unreadable.")

Current working directory: /workspaces/Splade
[OK] Dataset file found: /workspaces/Splade/dataset.json
Loaded 92 QA pairs. Corpus chunks: 3247
Corpus size: 3247 chunks
Evaluation dataset size: 92 Q/A pairs
Loaded 92 QA pairs. Corpus chunks: 3247
Corpus size: 3247 chunks
Evaluation dataset size: 92 Q/A pairs


Unnamed: 0,chunk,source_pdf
0,Affiliate Program / Premium Affiliate Management General,SouthernStarEnergyInc_20051202_SB-2A_EX-9_801890_EX-9_Affiliate Agreement.pdf
1,The following General Terms and Conditions are intended for (i) Web site,SouthernStarEnergyInc_20051202_SB-2A_EX-9_801890_EX-9_Affiliate Agreement.pdf
2,"owners (hereafter, ""Affiliates"") who wish to participate as Affiliates in",SouthernStarEnergyInc_20051202_SB-2A_EX-9_801890_EX-9_Affiliate Agreement.pdf
3,the Affiliate Program provided by element 5 (governed by II. and IV. in,SouthernStarEnergyInc_20051202_SB-2A_EX-9_801890_EX-9_Affiliate Agreement.pdf
4,these General Terms and Conditions) on the basis of these General Terms and,SouthernStarEnergyInc_20051202_SB-2A_EX-9_801890_EX-9_Affiliate Agreement.pdf


In [27]:
# Install rapidfuzz for fuzzy matching (only runs if not already installed)
%pip install -q rapidfuzz
from rapidfuzz import fuzz
print("[OK] rapidfuzz available for fuzzy similarity scoring.")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
[OK] rapidfuzz available for fuzzy similarity scoring.
Note: you may need to restart the kernel to use updated packages.
[OK] rapidfuzz available for fuzzy similarity scoring.


In [28]:
# BM25 setup: tokenize corpus and build index
import re
from rank_bm25 import BM25Okapi

FUZZY_THRESHOLD = 70  # similarity threshold for context match

def tokenize(text: str):
    if not isinstance(text, str):
        return []
    return re.findall(r"\w+", text.lower())

if 'corpus_df' not in globals() or corpus_df is None or len(corpus_df) == 0:
    raise ValueError("corpus_df is empty; ensure contracts were loaded successfully before running BM25 setup.")

# Tokenize corpus chunks
corpus_tokens = [tokenize(c) for c in corpus_df['chunk'].tolist()]
print(f"Tokenized {len(corpus_tokens)} corpus chunks.")

# Build BM25 index
bm25 = BM25Okapi(corpus_tokens)
print("[OK] BM25 index built.")

Tokenized 3247 corpus chunks.
[OK] BM25 index built.
[OK] BM25 index built.


In [29]:
# Convert normalized QA dataset list to DataFrame and add tokenized question
if 'qa_dataset' not in globals() or qa_dataset is None:
    raise ValueError("qa_dataset variable not found. Run the loading cell first.")

qa_df = pd.DataFrame(qa_dataset)
if qa_df.empty:
    raise ValueError("qa_df is empty; dataset.json may not have loaded correctly.")

qa_df['tokenized_question'] = qa_df['question'].apply(lambda q: tokenize(q))
print(f"QA DataFrame created with {len(qa_df)} rows.")
qa_df.head()

QA DataFrame created with 92 rows.


Unnamed: 0,question,answer_snippet,context_chunk,source_document_raw,source_document,tokenized_question
0,On what date was the Marketing Affiliate Agreement between Equidata and National Credit Report made effective?,This Agreement is made this 1st day of October 2008,"MARKETING AFFILIATE AGREEMENT This Agreement is made this 1st day of October 2008, (the “Effective Date”), by and between Equidata, Inc.… and National Credit Report.com, LLC",SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:0]{index=0},Agreement.pdf,"[on, what, date, was, the, marketing, affiliate, agreement, between, equidata, and, national, credit, report, made, effective]"
1,Which state’s law governs the Equidata Marketing Affiliate Agreement?,This Agreement is governed by and construed in accordance with the laws of the State of Virginia.,12. Miscellaneous… This Agreement is governed by and construed in accordance with the laws of the State of Virginia.,SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:1]{index=1},Agreement.pdf,"[which, state, s, law, governs, the, equidata, marketing, affiliate, agreement]"
2,Where must disputes under Equidata’s billing disputes clause be arbitrated?,"Such disputes shall be settled by arbitration in the City of Newport News, Virginia.","2. Disputes… Such disputes shall be settled by arbitration in the City of Newport News, Virginia.",SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:2]{index=2},Agreement.pdf,"[where, must, disputes, under, equidata, s, billing, disputes, clause, be, arbitrated]"
3,How long before a price increase must Equidata notify the Marketing Affiliate?,Notice will be given… no less than 30 days prior to such increase taking affect.,Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing no less than 30 days prior…,SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:3]{index=3},Agreement.pdf,"[how, long, before, a, price, increase, must, equidata, notify, the, marketing, affiliate]"
4,Who bears responsibility for collecting consumer payments under the Equidata agreement?,Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer,1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and shall bear sole responsibility for non-payment…,SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:4]{index=4},Agreement.pdf,"[who, bears, responsibility, for, collecting, consumer, payments, under, the, equidata, agreement]"


In [30]:
# Run BM25 searches for each question and compute match columns
from rapidfuzz import fuzz

bm25_results = []
corpus_chunks = corpus_df['chunk'].tolist()

for i, row in qa_df.iterrows():
    tokens = row['tokenized_question']
    if not tokens:
        bm25_results.append({
            'bm25_top_chunks': [],
            'bm25_top_scores': [],
            'bm25_best_chunk': None,
            'bm25_best_score': None,
            'bm25_context_match': False,
            'bm25_first_match_rank': None
        })
        continue
    scores = bm25.get_scores(tokens)
    # Get indices sorted by score descending
    ranked_idx = np.argsort(scores)[::-1]
    top_n = 5
    top_indices = ranked_idx[:top_n]
    top_chunks = [corpus_chunks[j] for j in top_indices]
    top_scores = [float(scores[j]) for j in top_indices]
    best_chunk = top_chunks[0] if top_chunks else None
    best_score = top_scores[0] if top_scores else None

    gold_context = row.get('context_chunk', '') or ''
    match_found = False
    first_match_rank = None
    for rank_pos, candidate in enumerate(top_chunks, start=1):
        sim = fuzz.token_set_ratio(gold_context, candidate)
        if sim >= FUZZY_THRESHOLD:
            match_found = True
            first_match_rank = rank_pos
            break

    bm25_results.append({
        'bm25_top_chunks': top_chunks,
        'bm25_top_scores': top_scores,
        'bm25_best_chunk': best_chunk,
        'bm25_best_score': best_score,
        'bm25_context_match': match_found,
        'bm25_first_match_rank': first_match_rank
    })

# Merge results into qa_df
res_df = pd.DataFrame(bm25_results)
qa_df = pd.concat([qa_df, res_df], axis=1)
print("[OK] BM25 search completed for all questions.")
qa_df.head()

[OK] BM25 search completed for all questions.


Unnamed: 0,question,answer_snippet,context_chunk,source_document_raw,source_document,tokenized_question,bm25_top_chunks,bm25_top_scores,bm25_best_chunk,bm25_best_score,bm25_context_match,bm25_first_match_rank
0,On what date was the Marketing Affiliate Agreement between Equidata and National Credit Report made effective?,This Agreement is made this 1st day of October 2008,"MARKETING AFFILIATE AGREEMENT This Agreement is made this 1st day of October 2008, (the “Effective Date”), by and between Equidata, Inc.… and National Credit Report.com, LLC",SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:0]{index=0},Agreement.pdf,"[on, what, date, was, the, marketing, affiliate, agreement, between, equidata, and, national, credit, report, made, effective]","[Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC, This Agreement is made this 1 st day of October 2008, (the “Effective Date ”), by and between Equidata, Inc., a corporation, 23606 (“Equidata”), and National Credit Report.com, LLC a Corporation organized under the laws of Florida, with its principal, THIS AGREEMENT (the “ Agreement ”), made as of the 6th day of March, 2006 (the “ Effective Date”), is by and between The, This Agreement entered into as of the Effective Date by and between]","[28.018947103860445, 21.583592363855995, 19.978388027175402, 19.93245742816062, 18.207043473159885]","Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC",28.018947,True,2.0
1,Which state’s law governs the Equidata Marketing Affiliate Agreement?,This Agreement is governed by and construed in accordance with the laws of the State of Virginia.,12. Miscellaneous… This Agreement is governed by and construed in accordance with the laws of the State of Virginia.,SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:1]{index=1},Agreement.pdf,"[which, state, s, law, governs, the, equidata, marketing, affiliate, agreement]","[6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the, 2. Marketing Affiliate and Equidata wish to enter into an agreement under which Marketing Affiliate may market the Services., to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s, purpose of assuring compliance with this Agreement. Equidata reserves the right to site inspect Marketing Affiliate ’s, given in writing by Marketing Affiliate to Equidata, Equidata or Marketing Affiliate may choose arbitration and]","[16.385950871974746, 15.917270098572338, 14.14409584663738, 13.759856505867411, 12.73983416309363]","6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the",16.385951,False,
2,Where must disputes under Equidata’s billing disputes clause be arbitrated?,"Such disputes shall be settled by arbitration in the City of Newport News, Virginia.","2. Disputes… Such disputes shall be settled by arbitration in the City of Newport News, Virginia.",SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:2]{index=2},Agreement.pdf,"[where, must, disputes, under, equidata, s, billing, disputes, clause, be, arbitrated]","[8.06 ARBITRATION. All disputes or claims hereunder shall be resolved by, rules of the American Arbitration Association. All disputes or claims by NETWORK, 8.05 GOVERNING LAW. All disputes or claims by Payment Data Systems, OF WARRANTY UNDER SUBPARAGRAPH (A) HEREOF MUST BE MADE, obligation to promptly pay for undisputed charges in accordance with the terms of this Agreement. Such disputes shall]","[15.908745770940799, 13.849880565743023, 13.849880565743023, 12.11352264532109, 11.605025288556485]",8.06 ARBITRATION. All disputes or claims hereunder shall be resolved by,15.908746,False,
3,How long before a price increase must Equidata notify the Marketing Affiliate?,Notice will be given… no less than 30 days prior to such increase taking affect.,Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing no less than 30 days prior…,SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:3]{index=3},Agreement.pdf,"[how, long, before, a, price, increase, must, equidata, notify, the, marketing, affiliate]","[Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing, 6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the, absolute discretion. In the event of a price and/or fee change for Technology, Company shall notify MA, given in writing by Marketing Affiliate to Equidata, Equidata or Marketing Affiliate may choose arbitration and, to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s]","[16.23449909635954, 13.74680913892679, 13.111962477879251, 12.73983416309363, 12.244527307826083]",Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing,16.234499,True,1.0
4,Who bears responsibility for collecting consumer payments under the Equidata agreement?,Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer,1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and shall bear sole responsibility for non-payment…,SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:4]{index=4},Agreement.pdf,"[who, bears, responsibility, for, collecting, consumer, payments, under, the, equidata, agreement]","[1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and, shall bear sole responsibility for non -payment of any fees charged to the Consumer. Marketing Affiliate shall pay to, PERMISSION PROVIDED BY EQUIDATA UNDER THIS AGREEMENT IS PROVIDED ON AN “AS IS” BASIS. EQUIDATA, due immediately upon receipt. Marketing Affiliate agrees to reimburse Equidata all costs of collecting any past due, shall be the sole responsibility of the Affiliate.]","[16.79946189890967, 15.221391973029311, 11.12044130608493, 10.732216231320576, 10.61953434224972]",1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and,16.799462,True,1.0


In [31]:
# Compute evaluation metrics: accuracy, MRR, rank distribution
match_series = qa_df['bm25_context_match']
accuracy = match_series.mean() if len(match_series) else 0.0

# MRR: reciprocal of rank where first match occurred
ranks = qa_df['bm25_first_match_rank']
reciprocals = [1.0/r for r in ranks if isinstance(r, int) and r > 0]
mrr = sum(reciprocals)/len(ranks) if len(ranks) else 0.0

rank_counts = ranks.value_counts(dropna=False).to_dict()
metrics = {
    'num_questions': int(len(qa_df)),
    'accuracy': float(accuracy),
    'mrr': float(mrr),
    'rank_counts': rank_counts,
    'fuzzy_threshold': FUZZY_THRESHOLD
}
print("BM25 Evaluation Metrics:")
for k,v in metrics.items():
    print(f"  {k}: {v}")

bm25_metrics = metrics

BM25 Evaluation Metrics:
  num_questions: 92
  accuracy: 0.358695652173913
  mrr: 0.0
  rank_counts: {nan: 59, 1.0: 25, 2.0: 3, 3.0: 2, 5.0: 2, 4.0: 1}
  fuzzy_threshold: 70


In [32]:
# Display sample enriched QA rows
cols_to_show = [
    'question','context_chunk','bm25_context_match','bm25_first_match_rank',
    'bm25_best_score','bm25_best_chunk','bm25_top_scores','bm25_top_chunks'
]
print("Sample BM25 evaluation rows:")
display(qa_df[cols_to_show].head(10))

Sample BM25 evaluation rows:


Unnamed: 0,question,context_chunk,bm25_context_match,bm25_first_match_rank,bm25_best_score,bm25_best_chunk,bm25_top_scores,bm25_top_chunks
0,On what date was the Marketing Affiliate Agreement between Equidata and National Credit Report made effective?,"MARKETING AFFILIATE AGREEMENT This Agreement is made this 1st day of October 2008, (the “Effective Date”), by and between Equidata, Inc.… and National Credit Report.com, LLC",True,2.0,28.018947,"Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC","[28.018947103860445, 21.583592363855995, 19.978388027175402, 19.93245742816062, 18.207043473159885]","[Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC, This Agreement is made this 1 st day of October 2008, (the “Effective Date ”), by and between Equidata, Inc., a corporation, 23606 (“Equidata”), and National Credit Report.com, LLC a Corporation organized under the laws of Florida, with its principal, THIS AGREEMENT (the “ Agreement ”), made as of the 6th day of March, 2006 (the “ Effective Date”), is by and between The, This Agreement entered into as of the Effective Date by and between]"
1,Which state’s law governs the Equidata Marketing Affiliate Agreement?,12. Miscellaneous… This Agreement is governed by and construed in accordance with the laws of the State of Virginia.,False,,16.385951,"6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the","[16.385950871974746, 15.917270098572338, 14.14409584663738, 13.759856505867411, 12.73983416309363]","[6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the, 2. Marketing Affiliate and Equidata wish to enter into an agreement under which Marketing Affiliate may market the Services., to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s, purpose of assuring compliance with this Agreement. Equidata reserves the right to site inspect Marketing Affiliate ’s, given in writing by Marketing Affiliate to Equidata, Equidata or Marketing Affiliate may choose arbitration and]"
2,Where must disputes under Equidata’s billing disputes clause be arbitrated?,"2. Disputes… Such disputes shall be settled by arbitration in the City of Newport News, Virginia.",False,,15.908746,8.06 ARBITRATION. All disputes or claims hereunder shall be resolved by,"[15.908745770940799, 13.849880565743023, 13.849880565743023, 12.11352264532109, 11.605025288556485]","[8.06 ARBITRATION. All disputes or claims hereunder shall be resolved by, rules of the American Arbitration Association. All disputes or claims by NETWORK, 8.05 GOVERNING LAW. All disputes or claims by Payment Data Systems, OF WARRANTY UNDER SUBPARAGRAPH (A) HEREOF MUST BE MADE, obligation to promptly pay for undisputed charges in accordance with the terms of this Agreement. Such disputes shall]"
3,How long before a price increase must Equidata notify the Marketing Affiliate?,Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing no less than 30 days prior…,True,1.0,16.234499,Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing,"[16.23449909635954, 13.74680913892679, 13.111962477879251, 12.73983416309363, 12.244527307826083]","[Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing, 6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the, absolute discretion. In the event of a price and/or fee change for Technology, Company shall notify MA, given in writing by Marketing Affiliate to Equidata, Equidata or Marketing Affiliate may choose arbitration and, to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s]"
4,Who bears responsibility for collecting consumer payments under the Equidata agreement?,1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and shall bear sole responsibility for non-payment…,True,1.0,16.799462,1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and,"[16.79946189890967, 15.221391973029311, 11.12044130608493, 10.732216231320576, 10.61953434224972]","[1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and, shall bear sole responsibility for non -payment of any fees charged to the Consumer. Marketing Affiliate shall pay to, PERMISSION PROVIDED BY EQUIDATA UNDER THIS AGREEMENT IS PROVIDED ON AN “AS IS” BASIS. EQUIDATA, due immediately upon receipt. Marketing Affiliate agrees to reimburse Equidata all costs of collecting any past due, shall be the sole responsibility of the Affiliate.]"
5,What late charge applies to past-due balances in the Equidata agreement?,"…the undersigned principal… agrees to pay a late charge of 1 l/2% per month on the unpaid, past-due amount…",True,1.0,21.614569,"on any invoice. The Marketing Affiliate agrees to pay a late charge of 1 l/2% per month on the unpaid, past -due amount","[21.61456948144895, 16.54189900332682, 14.012888323952698, 12.132495108069905, 11.184384044714243]","[on any invoice. The Marketing Affiliate agrees to pay a late charge of 1 l/2% per month on the unpaid, past -due amount, due immediately upon receipt. Marketing Affiliate agrees to reimburse Equidata all costs of collecting any past due, utilization of time and charges. A LATE PAYMENT CHARGE of one and one-half, the company, and the undersigned principal, partner or owner will become responsible for any unpaid balance past due, confirmed them in writing. The same applies to the warranting of]"
6,Name three statutes the parties agree to comply with under the Compliance clause.,"5. Compliance. Marketing Affiliate nor Equidata, shall engage in any practice… not in compliance with the Fair Credit Reporting Act (FCRA), the Fair Debt Collection Practices Act (FDCPA) and the Health Insurance Portability and Accountability Act (HIPAA)…",False,,14.810004,"acknowledged, the Parties hereto agree as follows:","[14.810003858935273, 14.535317123978684, 14.535317123978684, 14.530013682380591, 14.29276024358694]","[acknowledged, the Parties hereto agree as follows:, any failure to comply with the preceding sentence., to comply with the terms of this Agreement., compliance with the terms of this Agreement;, Section 14.10 Third Party Beneficiary . The parties hereto do not intend, nor shall any clause be interpreted, to create under this]"
7,How far in advance must Marketing Materials requiring approval be reviewed?,"5.4. Uses Marketing Materials, media or methods that are not approved… Such approval shall not be unreasonably withheld and shall be completed within 48 hours of receipt…",False,,15.135404,"Promotional Spots and other promotional materials, which, if using any programming from the Service, must be approved in advance by Network,","[15.135403629361194, 12.419909757884232, 12.14520823021859, 11.556879748374792, 11.112256173917338]","[Promotional Spots and other promotional materials, which, if using any programming from the Service, must be approved in advance by Network,, Affiliate in writing in advance, and (ii) to use the Network Affiliate Marks in Marketing Materials that have been approved by Network Affiliate, exhibition of the Service, as approved by NCM in writing in advance, and (ii) to use the NCM Marks in marketing or advertising materials, The approval or disapproval of such materials will be in Chase’s sole discretion. Any materials not receiving Chase’s specific written preliminary, until NCM has approved it in writing. Upon receipt of such approval from NCM for a particular Marketing Materials or other material, Network]"
8,For how long is the Equidata Marketing Affiliate Agreement’s initial term?,"7.1. This Agreement shall be for the term of one year; thereafter, the Agreement shall renew automatically under these same terms…",False,,18.035168,"6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the","[18.035168145685873, 17.374493988306266, 15.884891384800468, 15.703508149835816, 15.226293417645431]","[6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the, Equidata during the term and condition of this Agreement other than for joint marketing purposes. Further, Marketing, to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s, or Marketing Affiliate Web link as long as this Agreement is in effect., obligation to extend this Agreement beyond the Initial Term.]"
9,Can Equidata terminate immediately if CRAs decline to render services to the Marketing Affiliate?,"7.4. Equidata reserves the right to terminate this Agreement immediately for cause if Experian, Equifax and/or TransUnion… decline to render Services…",True,3.0,35.045253,(Credit Reporting Agencies — CRAs) decline to render Services to Marketing Affiliate for any reason or if Equidata is,"[35.0452534838911, 19.641965574704525, 17.48264803284833, 15.643268043947392, 15.14875402526036]","[(Credit Reporting Agencies — CRAs) decline to render Services to Marketing Affiliate for any reason or if Equidata is, notified by any of the CRAs to cease rendering Services to Marketing Affiliate., 7.4. Equidata reserves the right to terminate this Agreement immediately for cause if Experian, Equifax and/or TransUnion, 2. Marketing Affiliate and Equidata wish to enter into an agreement under which Marketing Affiliate may market the Services., access to the services including the Equidata Web Site. Termination does not release Marketing Affiliate from paying]"


In [33]:
# Load embedding model for semantic search
import time
from typing import List

EMBED_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
try:
    from sentence_transformers import SentenceTransformer
    t0 = time.time()
    embed_model = SentenceTransformer(EMBED_MODEL_NAME)
    model_load_time = time.time() - t0
    print(f"[OK] Loaded SentenceTransformer model '{EMBED_MODEL_NAME}' in {model_load_time:.2f}s")
except Exception as e:
    print(f"[WARN] SentenceTransformer load failed: {e}\nFalling back to transformers AutoModel.")
    from transformers import AutoTokenizer, AutoModel
    import torch
    t0 = time.time()
    auto_tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_NAME)
    auto_model = AutoModel.from_pretrained(EMBED_MODEL_NAME)
    model_load_time = time.time() - t0
    print(f"[OK] Loaded fallback transformers model in {model_load_time:.2f}s")

# Simple encode wrapper supporting both backends
def encode_texts(texts: List[str], batch_size: int = 64):
    if 'embed_model' in globals():  # SentenceTransformer path
        return embed_model.encode(texts, batch_size=batch_size, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True)
    # Fallback manual pooling
    all_vecs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = auto_tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            outputs = auto_model(**inputs)
            # Mean pool
            embeddings = outputs.last_hidden_state.mean(dim=1)
            # L2 normalize
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
            all_vecs.append(embeddings.cpu().numpy())
    import numpy as np
    return np.vstack(all_vecs)


[OK] Loaded SentenceTransformer model 'sentence-transformers/all-MiniLM-L6-v2' in 1.26s


In [34]:
# Encode corpus chunks into semantic embeddings
if 'corpus_df' not in globals() or corpus_df is None or len(corpus_df) == 0:
    raise ValueError("corpus_df is empty; cannot build semantic embeddings.")

corpus_texts = corpus_df['chunk'].tolist()
emb_t0 = time.time()
semantic_corpus_embeddings = encode_texts(corpus_texts)
emb_time = time.time() - emb_t0
print(f"[OK] Encoded {len(corpus_texts)} corpus chunks into embeddings shape {semantic_corpus_embeddings.shape} in {emb_time:.2f}s")


[OK] Encoded 3247 corpus chunks into embeddings shape (3247, 384) in 15.19s


In [35]:
# Encode question texts into semantic embeddings
if 'qa_df' not in globals() or qa_df is None or qa_df.empty:
    raise ValueError("qa_df is empty; load and build QA DataFrame first.")

question_texts = qa_df['question'].fillna('').tolist()
q_emb_t0 = time.time()
semantic_question_embeddings = encode_texts(question_texts)
q_emb_time = time.time() - q_emb_t0
print(f"[OK] Encoded {len(question_texts)} questions into embeddings shape {semantic_question_embeddings.shape} in {q_emb_time:.2f}s")


[OK] Encoded 92 questions into embeddings shape (92, 384) in 0.29s


In [36]:
# Semantic search: compute cosine similarity top-5 per question and fuzzy match gold context
import numpy as np
from rapidfuzz import fuzz

if semantic_question_embeddings.shape[0] != len(qa_df):
    raise ValueError("Mismatch between question embeddings and qa_df length.")

# Cosine similarity matrix via dot product (embeddings already normalized)
# We'll compute per-row to keep memory manageable if very large.
semantic_results = []
corpus_chunks_local = corpus_df['chunk'].tolist()
search_t0 = time.time()
for qi in range(semantic_question_embeddings.shape[0]):
    q_vec = semantic_question_embeddings[qi]
    sims = np.dot(semantic_corpus_embeddings, q_vec)  # cosine similarities
    # Get top 5 indices
    top_n = 5
    top_idx = np.argsort(sims)[-top_n:][::-1]
    top_chunks = [corpus_chunks_local[j] for j in top_idx]
    top_scores = [float(sims[j]) for j in top_idx]
    best_chunk = top_chunks[0] if top_chunks else None
    best_score = top_scores[0] if top_scores else None

    gold_context = qa_df.iloc[qi].get('context_chunk', '') or ''
    match_found = False
    first_match_rank = None
    for rank_pos, candidate in enumerate(top_chunks, start=1):
        sim_fuzzy = fuzz.token_set_ratio(gold_context, candidate)
        if sim_fuzzy >= FUZZY_THRESHOLD:
            match_found = True
            first_match_rank = rank_pos
            break

    semantic_results.append({
        'semantic_top_chunks': top_chunks,
        'semantic_top_scores': top_scores,
        'semantic_best_chunk': best_chunk,
        'semantic_best_score': best_score,
        'semantic_context_match': match_found,
        'semantic_first_match_rank': first_match_rank
    })
search_time = time.time() - search_t0
print(f"[OK] Semantic search completed for {len(semantic_results)} questions in {search_time:.2f}s")

semantic_res_df = pd.DataFrame(semantic_results)
qa_df = pd.concat([qa_df, semantic_res_df], axis=1)
print("[OK] Merged semantic search results into qa_df.")
qa_df.head()

[OK] Semantic search completed for 92 questions in 0.04s
[OK] Merged semantic search results into qa_df.


Unnamed: 0,question,answer_snippet,context_chunk,source_document_raw,source_document,tokenized_question,bm25_top_chunks,bm25_top_scores,bm25_best_chunk,bm25_best_score,bm25_context_match,bm25_first_match_rank,semantic_top_chunks,semantic_top_scores,semantic_best_chunk,semantic_best_score,semantic_context_match,semantic_first_match_rank
0,On what date was the Marketing Affiliate Agreement between Equidata and National Credit Report made effective?,This Agreement is made this 1st day of October 2008,"MARKETING AFFILIATE AGREEMENT This Agreement is made this 1st day of October 2008, (the “Effective Date”), by and between Equidata, Inc.… and National Credit Report.com, LLC",SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:0]{index=0},Agreement.pdf,"[on, what, date, was, the, marketing, affiliate, agreement, between, equidata, and, national, credit, report, made, effective]","[Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC, This Agreement is made this 1 st day of October 2008, (the “Effective Date ”), by and between Equidata, Inc., a corporation, 23606 (“Equidata”), and National Credit Report.com, LLC a Corporation organized under the laws of Florida, with its principal, THIS AGREEMENT (the “ Agreement ”), made as of the 6th day of March, 2006 (the “ Effective Date”), is by and between The, This Agreement entered into as of the Effective Date by and between]","[28.018947103860445, 21.583592363855995, 19.978388027175402, 19.93245742816062, 18.207043473159885]","Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC",28.018947,True,2.0,"[Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC, (Credit Reporting Agencies — CRAs) decline to render Services to Marketing Affiliate for any reason or if Equidata is, purpose of assuring compliance with this Agreement. Equidata reserves the right to site inspect Marketing Affiliate ’s, 10. Proprietary Information. Marketing Affiliate and Equidata mutually acknowledge that from time to time Confidential, Reporting Agencies (CRA ’s). Said amounts charged to Equidata will be billed separately to Marketing Affiliate and are]","[0.7532575130462646, 0.7145704030990601, 0.7044329047203064, 0.677936851978302, 0.6731687784194946]","Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC",0.753258,False,
1,Which state’s law governs the Equidata Marketing Affiliate Agreement?,This Agreement is governed by and construed in accordance with the laws of the State of Virginia.,12. Miscellaneous… This Agreement is governed by and construed in accordance with the laws of the State of Virginia.,SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:1]{index=1},Agreement.pdf,"[which, state, s, law, governs, the, equidata, marketing, affiliate, agreement]","[6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the, 2. Marketing Affiliate and Equidata wish to enter into an agreement under which Marketing Affiliate may market the Services., to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s, purpose of assuring compliance with this Agreement. Equidata reserves the right to site inspect Marketing Affiliate ’s, given in writing by Marketing Affiliate to Equidata, Equidata or Marketing Affiliate may choose arbitration and]","[16.385950871974746, 15.917270098572338, 14.14409584663738, 13.759856505867411, 12.73983416309363]","6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the",16.385951,False,,"[2. Marketing Affiliate and Equidata wish to enter into an agreement under which Marketing Affiliate may market the Services., Marketing Affiliate and Equidata shall be obligated by the terms agreed upon by arbitration and all monies determined, to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s, purpose of assuring compliance with this Agreement. Equidata reserves the right to site inspect Marketing Affiliate ’s, in accordance with Equidata standard practices. Marketing Affiliate and the undersigned principal, partner or owner]","[0.7451218366622925, 0.7411805391311646, 0.721649169921875, 0.7214535474777222, 0.7004363536834717]",2. Marketing Affiliate and Equidata wish to enter into an agreement under which Marketing Affiliate may market the Services.,0.745122,False,
2,Where must disputes under Equidata’s billing disputes clause be arbitrated?,"Such disputes shall be settled by arbitration in the City of Newport News, Virginia.","2. Disputes… Such disputes shall be settled by arbitration in the City of Newport News, Virginia.",SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:2]{index=2},Agreement.pdf,"[where, must, disputes, under, equidata, s, billing, disputes, clause, be, arbitrated]","[8.06 ARBITRATION. All disputes or claims hereunder shall be resolved by, rules of the American Arbitration Association. All disputes or claims by NETWORK, 8.05 GOVERNING LAW. All disputes or claims by Payment Data Systems, OF WARRANTY UNDER SUBPARAGRAPH (A) HEREOF MUST BE MADE, obligation to promptly pay for undisputed charges in accordance with the terms of this Agreement. Such disputes shall]","[15.908745770940799, 13.849880565743023, 13.849880565743023, 12.11352264532109, 11.605025288556485]",8.06 ARBITRATION. All disputes or claims hereunder shall be resolved by,15.908746,False,,"[8.05 GOVERNING LAW. All disputes or claims by Payment Data Systems, 2. Disputes. In the case of disputed charge, defined as a non -payment of an invoice for which notice of dispute has been, demand shall set forth a statement for the nature of the dispute and the amount involved. If Equidata and Marketing, 8.06 ARBITRATION. All disputes or claims hereunder shall be resolved by, obligation to promptly pay for undisputed charges in accordance with the terms of this Agreement. Such disputes shall]","[0.6873435974121094, 0.6844565868377686, 0.6570491790771484, 0.6481903791427612, 0.6403721570968628]",8.05 GOVERNING LAW. All disputes or claims by Payment Data Systems,0.687344,False,
3,How long before a price increase must Equidata notify the Marketing Affiliate?,Notice will be given… no less than 30 days prior to such increase taking affect.,Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing no less than 30 days prior…,SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:3]{index=3},Agreement.pdf,"[how, long, before, a, price, increase, must, equidata, notify, the, marketing, affiliate]","[Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing, 6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the, absolute discretion. In the event of a price and/or fee change for Technology, Company shall notify MA, given in writing by Marketing Affiliate to Equidata, Equidata or Marketing Affiliate may choose arbitration and, to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s]","[16.23449909635954, 13.74680913892679, 13.111962477879251, 12.73983416309363, 12.244527307826083]",Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing,16.234499,True,1.0,"[no later than 25 calendar days after Equidata receives receipt of dispute from Marketing Affiliate and shall continue from day, Marketing Affiliate shall give Equidata written demand of dispute within 10 days of the due date of the invoice. The, due immediately upon receipt. Marketing Affiliate agrees to reimburse Equidata all costs of collecting any past due, Commencing on the Affiliate Launch Date and thereafter throughout the Term, Network shall pay to Affiliate the following amounts:, 2.Payment. The Affiliate Advertising Share, if any, shall be payable quarterly and shall be due no later than forty-five (45) days]","[0.661781907081604, 0.6462286710739136, 0.615919291973114, 0.6132111549377441, 0.6110621690750122]",no later than 25 calendar days after Equidata receives receipt of dispute from Marketing Affiliate and shall continue from day,0.661782,False,
4,Who bears responsibility for collecting consumer payments under the Equidata agreement?,Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer,1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and shall bear sole responsibility for non-payment…,SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:4]{index=4},Agreement.pdf,"[who, bears, responsibility, for, collecting, consumer, payments, under, the, equidata, agreement]","[1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and, shall bear sole responsibility for non -payment of any fees charged to the Consumer. Marketing Affiliate shall pay to, PERMISSION PROVIDED BY EQUIDATA UNDER THIS AGREEMENT IS PROVIDED ON AN “AS IS” BASIS. EQUIDATA, due immediately upon receipt. Marketing Affiliate agrees to reimburse Equidata all costs of collecting any past due, shall be the sole responsibility of the Affiliate.]","[16.79946189890967, 15.221391973029311, 11.12044130608493, 10.732216231320576, 10.61953434224972]",1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and,16.799462,True,1.0,"[Equidata, as compensation for its providing of Services under this agreement, such amounts as outlined and detailed in, PERMISSION PROVIDED BY EQUIDATA UNDER THIS AGREEMENT IS PROVIDED ON AN “AS IS” BASIS. EQUIDATA, Agreement; and if data is not voluntarily removed, Equidata reserves the right to use all available legal resources to, Marketing Affiliate and Equidata shall be obligated by the terms agreed upon by arbitration and all monies determined, Reporting Agencies (CRA ’s). Said amounts charged to Equidata will be billed separately to Marketing Affiliate and are]","[0.7076253890991211, 0.6280709505081177, 0.5974897146224976, 0.5903031826019287, 0.5890427231788635]","Equidata, as compensation for its providing of Services under this agreement, such amounts as outlined and detailed in",0.707625,False,


In [37]:
# Build FAISS HNSW index for corpus embeddings with Inner Product (optimized for normalized vectors)
import faiss, numpy as np, time

if 'semantic_corpus_embeddings' not in globals():
    raise ValueError('semantic_corpus_embeddings missing; run embedding cells first.')

# Ensure float32 for FAISS
corp_emb = semantic_corpus_embeddings.astype('float32')
vec_dim = corp_emb.shape[1]
HNSW_M = 32  # connectivity parameter
index_build_t0 = time.time()
# Use Inner Product for normalized vectors (equivalent to cosine similarity)
hnsw_index = faiss.IndexHNSWFlat(vec_dim, HNSW_M, faiss.METRIC_INNER_PRODUCT)
# Set HNSW runtime params - increased efSearch for better recall
eh_search = 128  # efSearch - higher = better recall at cost of speed
hnsw_index.hnsw.efConstruction = 200
hnsw_index.hnsw.efSearch = eh_search
# Add vectors
hnsw_index.add(corp_emb)
index_build_time = time.time() - index_build_t0
print(f"[OK] Built HNSW index with {hnsw_index.ntotal} vectors (dim={vec_dim}) in {index_build_time:.2f}s (M={HNSW_M}, efSearch={eh_search}, metric=INNER_PRODUCT).")

[OK] Built HNSW index with 3247 vectors (dim=384) in 0.28s (M=32, efSearch=128, metric=INNER_PRODUCT).


In [38]:
# Run HNSW approximate nearest neighbor search for all questions
if 'semantic_question_embeddings' not in globals():
    raise ValueError('semantic_question_embeddings missing; encode questions first.')

query_emb = semantic_question_embeddings.astype('float32')
hnsw_search_t0 = time.time()
# FAISS IndexHNSWFlat with METRIC_INNER_PRODUCT returns negative inner products (distance)
# We negate them to get similarity scores (higher is better)
D, I = hnsw_index.search(query_emb, 5)  # shape (n_questions, 5)
hnsw_search_time = time.time() - hnsw_search_t0

corpus_chunks_local = corpus_df['chunk'].tolist()
from rapidfuzz import fuzz

hnsw_results = []
for qi in range(I.shape[0]):
    idxs = I[qi]
    dists = D[qi]
    # With METRIC_INNER_PRODUCT, higher values = better matches (already similarity scores)
    hnsw_scores = [float(dist) for dist in dists]
    hnsw_chunks = [corpus_chunks_local[j] if j >=0 else None for j in idxs]
    best_chunk = hnsw_chunks[0]
    best_score = hnsw_scores[0]
    gold_context = qa_df.iloc[qi].get('context_chunk', '') or ''
    match_found = False
    first_match_rank = None
    for rank_pos, candidate in enumerate(hnsw_chunks, start=1):
        if candidate is None:
            continue
        sim_fuzzy = fuzz.token_set_ratio(gold_context, candidate)
        if sim_fuzzy >= FUZZY_THRESHOLD:
            match_found = True
            first_match_rank = rank_pos
            break
    hnsw_results.append({
        'hnsw_top_chunks': hnsw_chunks,
        'hnsw_top_scores': hnsw_scores,
        'hnsw_best_chunk': best_chunk,
        'hnsw_best_score': best_score,
        'hnsw_context_match': match_found,
        'hnsw_first_match_rank': first_match_rank
    })

print(f"[OK] HNSW search completed in {hnsw_search_time:.2f}s for {len(hnsw_results)} questions.")

hnsw_res_df = pd.DataFrame(hnsw_results)
qa_df = pd.concat([qa_df, hnsw_res_df], axis=1)
qa_df.head()

[OK] HNSW search completed in 0.01s for 92 questions.


Unnamed: 0,question,answer_snippet,context_chunk,source_document_raw,source_document,tokenized_question,bm25_top_chunks,bm25_top_scores,bm25_best_chunk,bm25_best_score,...,semantic_best_chunk,semantic_best_score,semantic_context_match,semantic_first_match_rank,hnsw_top_chunks,hnsw_top_scores,hnsw_best_chunk,hnsw_best_score,hnsw_context_match,hnsw_first_match_rank
0,On what date was the Marketing Affiliate Agreement between Equidata and National Credit Report made effective?,This Agreement is made this 1st day of October 2008,"MARKETING AFFILIATE AGREEMENT This Agreement is made this 1st day of October 2008, (the “Effective Date”), by and between Equidata, Inc.… and National Credit Report.com, LLC",SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:0]{index=0},Agreement.pdf,"[on, what, date, was, the, marketing, affiliate, agreement, between, equidata, and, national, credit, report, made, effective]","[Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC, This Agreement is made this 1 st day of October 2008, (the “Effective Date ”), by and between Equidata, Inc., a corporation, 23606 (“Equidata”), and National Credit Report.com, LLC a Corporation organized under the laws of Florida, with its principal, THIS AGREEMENT (the “ Agreement ”), made as of the 6th day of March, 2006 (the “ Effective Date”), is by and between The, This Agreement entered into as of the Effective Date by and between]","[28.018947103860445, 21.583592363855995, 19.978388027175402, 19.93245742816062, 18.207043473159885]","Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC",28.018947,...,"Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC",0.753258,False,,"[Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC, (Credit Reporting Agencies — CRAs) decline to render Services to Marketing Affiliate for any reason or if Equidata is, purpose of assuring compliance with this Agreement. Equidata reserves the right to site inspect Marketing Affiliate ’s, 10. Proprietary Information. Marketing Affiliate and Equidata mutually acknowledge that from time to time Confidential, Reporting Agencies (CRA ’s). Said amounts charged to Equidata will be billed separately to Marketing Affiliate and are]","[0.7532575130462646, 0.7145703434944153, 0.7044329643249512, 0.677936851978302, 0.6731687188148499]","Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC",0.753258,False,
1,Which state’s law governs the Equidata Marketing Affiliate Agreement?,This Agreement is governed by and construed in accordance with the laws of the State of Virginia.,12. Miscellaneous… This Agreement is governed by and construed in accordance with the laws of the State of Virginia.,SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:1]{index=1},Agreement.pdf,"[which, state, s, law, governs, the, equidata, marketing, affiliate, agreement]","[6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the, 2. Marketing Affiliate and Equidata wish to enter into an agreement under which Marketing Affiliate may market the Services., to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s, purpose of assuring compliance with this Agreement. Equidata reserves the right to site inspect Marketing Affiliate ’s, given in writing by Marketing Affiliate to Equidata, Equidata or Marketing Affiliate may choose arbitration and]","[16.385950871974746, 15.917270098572338, 14.14409584663738, 13.759856505867411, 12.73983416309363]","6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the",16.385951,...,2. Marketing Affiliate and Equidata wish to enter into an agreement under which Marketing Affiliate may market the Services.,0.745122,False,,"[2. Marketing Affiliate and Equidata wish to enter into an agreement under which Marketing Affiliate may market the Services., Marketing Affiliate and Equidata shall be obligated by the terms agreed upon by arbitration and all monies determined, to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s, purpose of assuring compliance with this Agreement. Equidata reserves the right to site inspect Marketing Affiliate ’s, in accordance with Equidata standard practices. Marketing Affiliate and the undersigned principal, partner or owner]","[0.7451218366622925, 0.7411805391311646, 0.721649169921875, 0.7214535474777222, 0.7004363536834717]",2. Marketing Affiliate and Equidata wish to enter into an agreement under which Marketing Affiliate may market the Services.,0.745122,False,
2,Where must disputes under Equidata’s billing disputes clause be arbitrated?,"Such disputes shall be settled by arbitration in the City of Newport News, Virginia.","2. Disputes… Such disputes shall be settled by arbitration in the City of Newport News, Virginia.",SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:2]{index=2},Agreement.pdf,"[where, must, disputes, under, equidata, s, billing, disputes, clause, be, arbitrated]","[8.06 ARBITRATION. All disputes or claims hereunder shall be resolved by, rules of the American Arbitration Association. All disputes or claims by NETWORK, 8.05 GOVERNING LAW. All disputes or claims by Payment Data Systems, OF WARRANTY UNDER SUBPARAGRAPH (A) HEREOF MUST BE MADE, obligation to promptly pay for undisputed charges in accordance with the terms of this Agreement. Such disputes shall]","[15.908745770940799, 13.849880565743023, 13.849880565743023, 12.11352264532109, 11.605025288556485]",8.06 ARBITRATION. All disputes or claims hereunder shall be resolved by,15.908746,...,8.05 GOVERNING LAW. All disputes or claims by Payment Data Systems,0.687344,False,,"[8.05 GOVERNING LAW. All disputes or claims by Payment Data Systems, 2. Disputes. In the case of disputed charge, defined as a non -payment of an invoice for which notice of dispute has been, demand shall set forth a statement for the nature of the dispute and the amount involved. If Equidata and Marketing, 8.06 ARBITRATION. All disputes or claims hereunder shall be resolved by, obligation to promptly pay for undisputed charges in accordance with the terms of this Agreement. Such disputes shall]","[0.6873435974121094, 0.6844565868377686, 0.6570491790771484, 0.6481903791427612, 0.6403721570968628]",8.05 GOVERNING LAW. All disputes or claims by Payment Data Systems,0.687344,False,
3,How long before a price increase must Equidata notify the Marketing Affiliate?,Notice will be given… no less than 30 days prior to such increase taking affect.,Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing no less than 30 days prior…,SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:3]{index=3},Agreement.pdf,"[how, long, before, a, price, increase, must, equidata, notify, the, marketing, affiliate]","[Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing, 6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the, absolute discretion. In the event of a price and/or fee change for Technology, Company shall notify MA, given in writing by Marketing Affiliate to Equidata, Equidata or Marketing Affiliate may choose arbitration and, to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s]","[16.23449909635954, 13.74680913892679, 13.111962477879251, 12.73983416309363, 12.244527307826083]",Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing,16.234499,...,no later than 25 calendar days after Equidata receives receipt of dispute from Marketing Affiliate and shall continue from day,0.661782,False,,"[no later than 25 calendar days after Equidata receives receipt of dispute from Marketing Affiliate and shall continue from day, Marketing Affiliate shall give Equidata written demand of dispute within 10 days of the due date of the invoice. The, due immediately upon receipt. Marketing Affiliate agrees to reimburse Equidata all costs of collecting any past due, Commencing on the Affiliate Launch Date and thereafter throughout the Term, Network shall pay to Affiliate the following amounts:, 2.Payment. The Affiliate Advertising Share, if any, shall be payable quarterly and shall be due no later than forty-five (45) days]","[0.661781907081604, 0.6462286710739136, 0.615919291973114, 0.6132111549377441, 0.6110621690750122]",no later than 25 calendar days after Equidata receives receipt of dispute from Marketing Affiliate and shall continue from day,0.661782,False,
4,Who bears responsibility for collecting consumer payments under the Equidata agreement?,Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer,1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and shall bear sole responsibility for non-payment…,SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:4]{index=4},Agreement.pdf,"[who, bears, responsibility, for, collecting, consumer, payments, under, the, equidata, agreement]","[1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and, shall bear sole responsibility for non -payment of any fees charged to the Consumer. Marketing Affiliate shall pay to, PERMISSION PROVIDED BY EQUIDATA UNDER THIS AGREEMENT IS PROVIDED ON AN “AS IS” BASIS. EQUIDATA, due immediately upon receipt. Marketing Affiliate agrees to reimburse Equidata all costs of collecting any past due, shall be the sole responsibility of the Affiliate.]","[16.79946189890967, 15.221391973029311, 11.12044130608493, 10.732216231320576, 10.61953434224972]",1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and,16.799462,...,"Equidata, as compensation for its providing of Services under this agreement, such amounts as outlined and detailed in",0.707625,False,,"[Equidata, as compensation for its providing of Services under this agreement, such amounts as outlined and detailed in, PERMISSION PROVIDED BY EQUIDATA UNDER THIS AGREEMENT IS PROVIDED ON AN “AS IS” BASIS. EQUIDATA, Agreement; and if data is not voluntarily removed, Equidata reserves the right to use all available legal resources to, Marketing Affiliate and Equidata shall be obligated by the terms agreed upon by arbitration and all monies determined, Reporting Agencies (CRA ’s). Said amounts charged to Equidata will be billed separately to Marketing Affiliate and are]","[0.7076253890991211, 0.6280709505081177, 0.5974897146224976, 0.5903031826019287, 0.5890427231788635]","Equidata, as compensation for its providing of Services under this agreement, such amounts as outlined and detailed in",0.707625,False,


In [39]:
# HNSW metrics computation
hnsw_match_series = qa_df['hnsw_context_match']
hnsw_accuracy = hnsw_match_series.mean() if len(hnsw_match_series) else 0.0
hnsw_ranks = qa_df['hnsw_first_match_rank']
hnsw_reciprocals = [1.0/r for r in hnsw_ranks if isinstance(r, int) and r > 0]
hnsw_mrr = (sum(hnsw_reciprocals)/len(hnsw_ranks)) if len(hnsw_ranks) else 0.0
hnsw_rank_counts = hnsw_ranks.value_counts(dropna=False).to_dict()

hnsw_metrics = {
    'num_questions': int(len(qa_df)),
    'accuracy': float(hnsw_accuracy),
    'mrr': float(hnsw_mrr),
    'rank_counts': hnsw_rank_counts,
    'fuzzy_threshold': FUZZY_THRESHOLD,
    'index_type': 'HNSW',
    'hnsw_M': HNSW_M,
    'efSearch': eh_search,
    'index_build_time_sec': index_build_time,
    'hnsw_search_time_sec': hnsw_search_time
}
print('HNSW Evaluation Metrics:')
for k,v in hnsw_metrics.items():
    print(f'  {k}: {v}')

HNSW Evaluation Metrics:
  num_questions: 92
  accuracy: 0.2608695652173913
  mrr: 0.0
  rank_counts: {nan: 68, 1.0: 16, 2.0: 3, 3.0: 3, 5.0: 1, 4.0: 1}
  fuzzy_threshold: 70
  index_type: HNSW
  hnsw_M: 32
  efSearch: 128
  index_build_time_sec: 0.2836039066314697
  hnsw_search_time_sec: 0.006025791168212891


In [40]:
# Compare BM25 vs Semantic(flat) vs HNSW metrics
import pandas as pd
comparison_rows = []
if 'bm25_metrics' in globals():
    comparison_rows.append({'method':'BM25', 'accuracy': bm25_metrics.get('accuracy'), 'mrr': bm25_metrics.get('mrr'), 'search_time_sec': None})
if 'semantic_metrics' in globals():
    comparison_rows.append({'method':'Semantic-Flat', 'accuracy': semantic_metrics.get('accuracy'), 'mrr': semantic_metrics.get('mrr'), 'search_time_sec': semantic_metrics.get('search_time_sec')})
comparison_rows.append({'method':'Semantic-HNSW', 'accuracy': hnsw_metrics.get('accuracy'), 'mrr': hnsw_metrics.get('mrr'), 'search_time_sec': hnsw_metrics.get('hnsw_search_time_sec')})

compare_df = pd.DataFrame(comparison_rows)
compare_df['accuracy_delta_vs_BM25'] = compare_df['accuracy'] - compare_df.loc[compare_df['method']=='BM25','accuracy'].values[0] if 'bm25_metrics' in globals() else None
compare_df['mrr_delta_vs_flat'] = compare_df['mrr'] - compare_df.loc[compare_df['method']=='Semantic-Flat','mrr'].values[0] if 'semantic_metrics' in globals() else None
print('Retrieval Method Comparison:')
display(compare_df)

# Quick win-rate: does HNSW best score approximate flat best score within tolerance?
if 'semantic_res_df' in globals():
    # Compute overlap of top-1 chunk between flat and HNSW
    flat_best = qa_df['semantic_best_chunk']
    hnsw_best = qa_df['hnsw_best_chunk']
    overlap_rate = (flat_best == hnsw_best).mean()
    print(f"Top-1 overlap between Semantic-Flat and Semantic-HNSW: {overlap_rate:.2%}")


Retrieval Method Comparison:


Unnamed: 0,method,accuracy,mrr,search_time_sec,accuracy_delta_vs_BM25,mrr_delta_vs_flat
0,BM25,0.358696,0.0,,0.0,
1,Semantic-HNSW,0.26087,0.0,0.006026,-0.097826,


Top-1 overlap between Semantic-Flat and Semantic-HNSW: 100.00%


In [41]:
# Display sample rows with HNSW search results
hnsw_cols = [
    'question','context_chunk','hnsw_context_match','hnsw_first_match_rank',
    'hnsw_best_score','hnsw_best_chunk','hnsw_top_scores','hnsw_top_chunks'
]
print("Sample HNSW semantic ANN rows:")
display(qa_df[hnsw_cols].head(10))

Sample HNSW semantic ANN rows:


Unnamed: 0,question,context_chunk,hnsw_context_match,hnsw_first_match_rank,hnsw_best_score,hnsw_best_chunk,hnsw_top_scores,hnsw_top_chunks
0,On what date was the Marketing Affiliate Agreement between Equidata and National Credit Report made effective?,"MARKETING AFFILIATE AGREEMENT This Agreement is made this 1st day of October 2008, (the “Effective Date”), by and between Equidata, Inc.… and National Credit Report.com, LLC",False,,0.753258,"Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC","[0.7532575130462646, 0.7145703434944153, 0.7044329643249512, 0.677936851978302, 0.6731687188148499]","[Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC, (Credit Reporting Agencies — CRAs) decline to render Services to Marketing Affiliate for any reason or if Equidata is, purpose of assuring compliance with this Agreement. Equidata reserves the right to site inspect Marketing Affiliate ’s, 10. Proprietary Information. Marketing Affiliate and Equidata mutually acknowledge that from time to time Confidential, Reporting Agencies (CRA ’s). Said amounts charged to Equidata will be billed separately to Marketing Affiliate and are]"
1,Which state’s law governs the Equidata Marketing Affiliate Agreement?,12. Miscellaneous… This Agreement is governed by and construed in accordance with the laws of the State of Virginia.,False,,0.745122,2. Marketing Affiliate and Equidata wish to enter into an agreement under which Marketing Affiliate may market the Services.,"[0.7451218366622925, 0.7411805391311646, 0.721649169921875, 0.7214535474777222, 0.7004363536834717]","[2. Marketing Affiliate and Equidata wish to enter into an agreement under which Marketing Affiliate may market the Services., Marketing Affiliate and Equidata shall be obligated by the terms agreed upon by arbitration and all monies determined, to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s, purpose of assuring compliance with this Agreement. Equidata reserves the right to site inspect Marketing Affiliate ’s, in accordance with Equidata standard practices. Marketing Affiliate and the undersigned principal, partner or owner]"
2,Where must disputes under Equidata’s billing disputes clause be arbitrated?,"2. Disputes… Such disputes shall be settled by arbitration in the City of Newport News, Virginia.",False,,0.687344,8.05 GOVERNING LAW. All disputes or claims by Payment Data Systems,"[0.6873435974121094, 0.6844565868377686, 0.6570491790771484, 0.6481903791427612, 0.6403721570968628]","[8.05 GOVERNING LAW. All disputes or claims by Payment Data Systems, 2. Disputes. In the case of disputed charge, defined as a non -payment of an invoice for which notice of dispute has been, demand shall set forth a statement for the nature of the dispute and the amount involved. If Equidata and Marketing, 8.06 ARBITRATION. All disputes or claims hereunder shall be resolved by, obligation to promptly pay for undisputed charges in accordance with the terms of this Agreement. Such disputes shall]"
3,How long before a price increase must Equidata notify the Marketing Affiliate?,Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing no less than 30 days prior…,False,,0.661782,no later than 25 calendar days after Equidata receives receipt of dispute from Marketing Affiliate and shall continue from day,"[0.661781907081604, 0.6462286710739136, 0.615919291973114, 0.6132111549377441, 0.6110621690750122]","[no later than 25 calendar days after Equidata receives receipt of dispute from Marketing Affiliate and shall continue from day, Marketing Affiliate shall give Equidata written demand of dispute within 10 days of the due date of the invoice. The, due immediately upon receipt. Marketing Affiliate agrees to reimburse Equidata all costs of collecting any past due, Commencing on the Affiliate Launch Date and thereafter throughout the Term, Network shall pay to Affiliate the following amounts:, 2.Payment. The Affiliate Advertising Share, if any, shall be payable quarterly and shall be due no later than forty-five (45) days]"
4,Who bears responsibility for collecting consumer payments under the Equidata agreement?,1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and shall bear sole responsibility for non-payment…,False,,0.707625,"Equidata, as compensation for its providing of Services under this agreement, such amounts as outlined and detailed in","[0.7076253890991211, 0.6280709505081177, 0.5974897146224976, 0.5903031826019287, 0.5890427231788635]","[Equidata, as compensation for its providing of Services under this agreement, such amounts as outlined and detailed in, PERMISSION PROVIDED BY EQUIDATA UNDER THIS AGREEMENT IS PROVIDED ON AN “AS IS” BASIS. EQUIDATA, Agreement; and if data is not voluntarily removed, Equidata reserves the right to use all available legal resources to, Marketing Affiliate and Equidata shall be obligated by the terms agreed upon by arbitration and all monies determined, Reporting Agencies (CRA ’s). Said amounts charged to Equidata will be billed separately to Marketing Affiliate and are]"
5,What late charge applies to past-due balances in the Equidata agreement?,"…the undersigned principal… agrees to pay a late charge of 1 l/2% per month on the unpaid, past-due amount…",False,,0.610039,due immediately upon receipt. Marketing Affiliate agrees to reimburse Equidata all costs of collecting any past due,"[0.6100389361381531, 0.5688557624816895, 0.5657926797866821, 0.5600687265396118, 0.5506561994552612]","[due immediately upon receipt. Marketing Affiliate agrees to reimburse Equidata all costs of collecting any past due, utilization of time and charges. A LATE PAYMENT CHARGE of one and one-half, the company, and the undersigned principal, partner or owner will become responsible for any unpaid balance past due, obligation to promptly pay for undisputed charges in accordance with the terms of this Agreement. Such disputes shall, Section 7.3 Payment . Except as otherwise specifically provided in this Agreement, all amounts due by one Party to the other under this]"
6,Name three statutes the parties agree to comply with under the Compliance clause.,"5. Compliance. Marketing Affiliate nor Equidata, shall engage in any practice… not in compliance with the Fair Credit Reporting Act (FCRA), the Fair Debt Collection Practices Act (FDCPA) and the Health Insurance Portability and Accountability Act (HIPAA)…",False,,0.658908,"Section 3.4 Content Standards . The Parties agree that (unless mutually agreed by the Parties with respect to clauses (i), (iii), (iv), (v) or","[0.6589080095291138, 0.6456112861633301, 0.6214019656181335, 0.619791567325592, 0.6174004077911377]","[Section 3.4 Content Standards . The Parties agree that (unless mutually agreed by the Parties with respect to clauses (i), (iii), (iv), (v) or, 7.3 If suit or action is instituted to enforce any of the terms of, compliance with the terms of this Agreement;, Agreement, insofar as they purport to be binding on it, constitute legal, valid and binding obligations enforceable in accordance with their terms., this Agreement and are under legal obligation to comply fully with all]"
7,How far in advance must Marketing Materials requiring approval be reviewed?,"5.4. Uses Marketing Materials, media or methods that are not approved… Such approval shall not be unreasonably withheld and shall be completed within 48 hours of receipt…",True,2.0,0.721847,commercially reasonable efforts to approve or reject any such Marketing Materials or other material submitted to it for review within thirty (30),"[0.721847414970398, 0.6773769855499268, 0.6484478712081909, 0.5440471172332764, 0.5294286012649536]","[commercially reasonable efforts to approve or reject any such Marketing Materials or other material submitted to it for review within thirty (30), Such approval shall not be unreasonably withheld and shall be completed within 48 hours of receipt of Marketing, until NCM has approved it in writing. Upon receipt of such approval from NCM for a particular Marketing Materials or other material, Network, Promotional Spots and other promotional materials, which, if using any programming from the Service, must be approved in advance by Network,, for review within thirty (30) days from the date of receipt by Network Affiliate. NCM shall not use, publish, or distribute any Marketing Material or]"
8,For how long is the Equidata Marketing Affiliate Agreement’s initial term?,"7.1. This Agreement shall be for the term of one year; thereafter, the Agreement shall renew automatically under these same terms…",False,,0.761612,The term of this Agreement will begin upon acceptance of Affiliate's,"[0.7616115808486938, 0.735694408416748, 0.732671856880188, 0.7294594645500183, 0.7246686220169067]","[The term of this Agreement will begin upon acceptance of Affiliate's, 2. Marketing Affiliate and Equidata wish to enter into an agreement under which Marketing Affiliate may market the Services., Marketing Affiliate and Equidata shall be obligated by the terms agreed upon by arbitration and all monies determined, no later than 25 calendar days after Equidata receives receipt of dispute from Marketing Affiliate and shall continue from day, Affiliate’s default in payment or other breach of this Agreement, Equidata may terminate this Agreement without notice]"
9,Can Equidata terminate immediately if CRAs decline to render services to the Marketing Affiliate?,"7.4. Equidata reserves the right to terminate this Agreement immediately for cause if Experian, Equifax and/or TransUnion… decline to render Services…",False,,0.790776,"to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s","[0.7907757759094238, 0.7399990558624268, 0.7367603778839111, 0.7187219858169556, 0.6799298524856567]","[to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s, access to the services including the Equidata Web Site. Termination does not release Marketing Affiliate from paying, (Credit Reporting Agencies — CRAs) decline to render Services to Marketing Affiliate for any reason or if Equidata is, notified by any of the CRAs to cease rendering Services to Marketing Affiliate., Affiliate’s default in payment or other breach of this Agreement, Equidata may terminate this Agreement without notice]"


In [None]:
# SPLADE model load and encoding function
import torch, time, numpy as np
from transformers import AutoTokenizer, AutoModel
SPLADE_MODEL_NAME = "naver/efficient-splade-VI-BT-large-doc"

splade_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"[SPLADE] Loading model '{SPLADE_MODEL_NAME}' on {splade_device} ...")
sl_t0 = time.time()
splade_tokenizer = AutoTokenizer.from_pretrained(SPLADE_MODEL_NAME)
splade_model = AutoModel.from_pretrained(SPLADE_MODEL_NAME).to(splade_device)
splade_model.eval()
splade_model_load_time = time.time() - sl_t0
print(f"[SPLADE] Model loaded in {splade_model_load_time:.2f}s")

# Encoding function following SPLADE formulation with max-pooling
def encode_splade(texts, batch_size=16):
    """Encode texts using SPLADE model with proper max-pooling."""
    with torch.no_grad():
        inputs = splade_tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to(splade_device)
        output = splade_model(**inputs)
        # Max-pooling over the token dimension: log(1 + relu(x))
        vec = torch.max(torch.log(1 + torch.relu(output.last_hidden_state)) * inputs.attention_mask.unsqueeze(-1), dim=1)[0]
        return vec.cpu().numpy()

print("[OK] SPLADE encode function defined.")

[SPLADE] Loading model 'naver/efficient-splade-VI-BT-large-doc' on cpu ...


[SPLADE] Model loaded in 0.61s


In [None]:
# SPLADE corpus encoding & FAISS index build
if 'corpus_df' not in globals() or corpus_df is None or len(corpus_df)==0:
    raise ValueError('corpus_df empty; cannot SPLADE encode.')

print("[SPLADE] Encoding corpus...")
splade_corpus_texts = corpus_df['chunk'].tolist()
sc_t0 = time.time()

# Batch encoding for efficiency
corpus_embeddings_list = []
batch_size = 16
for i in range(0, len(splade_corpus_texts), batch_size):
    batch = splade_corpus_texts[i:i+batch_size]
    corpus_embeddings_list.append(encode_splade(batch))
splade_corpus_embeddings = np.concatenate(corpus_embeddings_list, axis=0).astype('float32')
splade_corpus_encode_time = time.time() - sc_t0
print(f"[SPLADE] Encoded {len(splade_corpus_texts)} chunks into embeddings shape {splade_corpus_embeddings.shape} in {splade_corpus_encode_time:.2f}s")

# Build FAISS Index using IndexFlatL2 for SPLADE vectors
import faiss
splade_dim = splade_corpus_embeddings.shape[1]
si_t0 = time.time()
splade_index = faiss.IndexFlatL2(splade_dim)
splade_index.add(splade_corpus_embeddings)
splade_index_build_time = time.time() - si_t0
print(f"[SPLADE] FAISS IndexFlatL2 built in {splade_index_build_time:.2f}s (dim={splade_dim}, ntotal={splade_index.ntotal})")

[SPLADE] Encoded 3247 chunks into embeddings shape (3247, 768) in 50.21s
[SPLADE] FAISS IndexHNSWFlat built in 0.44s (dim=768, ntotal=3247, M=32, efSearch=128)
[SPLADE] FAISS IndexHNSWFlat built in 0.44s (dim=768, ntotal=3247, M=32, efSearch=128)


In [None]:
# SPLADE retrieval for each question
from rapidfuzz import fuzz
if 'qa_df' not in globals() or qa_df is None or qa_df.empty:
    raise ValueError('qa_df missing; cannot run SPLADE retrieval.')
if 'splade_index' not in globals():
    raise ValueError('splade_index missing; build index first.')

splade_results = []
retrieval_t0 = time.time()

for i, row in qa_df.iterrows():
    q_text = row.get('question','') or ''
    
    # Encode query
    query_embedding = encode_splade([q_text])
    
    # Ensure query_embedding is a 2D numpy array
    if query_embedding.ndim == 1:
        query_embedding = np.expand_dims(query_embedding, axis=0)
    query_embedding = query_embedding.astype('float32')
    
    # Search with IndexFlatL2 (lower distance = better match)
    distances, idxs = splade_index.search(query_embedding, 5)
    dists = distances[0]
    idxs = idxs[0]
    
    # Convert L2 distances to similarity scores (negative distance, higher is better)
    splade_scores = [-float(d) for d in dists]
    splade_chunks = [corpus_df.iloc[j]['chunk'] if j >=0 and j < len(corpus_df) else None for j in idxs]
    best_chunk = splade_chunks[0] if splade_chunks[0] is not None else None
    best_score = splade_scores[0] if splade_chunks[0] is not None else None
    
    # Evaluate fuzzy match with gold context
    gold_context = row.get('context_chunk','') or ''
    match_found = False
    first_match_rank = None
    for rank_pos, candidate in enumerate(splade_chunks, start=1):
        if candidate is None:
            continue
        sim_fuzzy = fuzz.token_set_ratio(gold_context, candidate)
        if sim_fuzzy >= FUZZY_THRESHOLD:
            match_found = True
            first_match_rank = rank_pos
            break
    
    splade_results.append({
        'splade_top_chunks': splade_chunks,
        'splade_top_scores': splade_scores,
        'splade_best_chunk': best_chunk,
        'splade_best_score': best_score,
        'splade_context_match': match_found,
        'splade_first_match_rank': first_match_rank
    })

retrieval_time_total = time.time() - retrieval_t0
print(f"[SPLADE] Retrieval complete for {len(splade_results)} questions in {retrieval_time_total:.2f}s")

splade_res_df = pd.DataFrame(splade_results)
# Remove old splade columns if they existed to avoid duplicates
for col in ['splade_top_chunks','splade_top_scores','splade_best_chunk','splade_best_score','splade_context_match','splade_first_match_rank']:
    if col in qa_df.columns:
        qa_df.drop(columns=[col], inplace=True)
qa_df = pd.concat([qa_df, splade_res_df], axis=1)
qa_df.head()

[SPLADE] HNSW retrieval complete for 92 questions in 2.36s


Unnamed: 0,question,answer_snippet,context_chunk,source_document_raw,source_document,tokenized_question,bm25_top_chunks,bm25_top_scores,bm25_best_chunk,bm25_best_score,...,hnsw_best_chunk,hnsw_best_score,hnsw_context_match,hnsw_first_match_rank,splade_top_chunks,splade_top_scores,splade_best_chunk,splade_best_score,splade_context_match,splade_first_match_rank
0,On what date was the Marketing Affiliate Agreement between Equidata and National Credit Report made effective?,This Agreement is made this 1st day of October 2008,"MARKETING AFFILIATE AGREEMENT This Agreement is made this 1st day of October 2008, (the “Effective Date”), by and between Equidata, Inc.… and National Credit Report.com, LLC",SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:0]{index=0},Agreement.pdf,"[on, what, date, was, the, marketing, affiliate, agreement, between, equidata, and, national, credit, report, made, effective]","[Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC, This Agreement is made this 1 st day of October 2008, (the “Effective Date ”), by and between Equidata, Inc., a corporation, 23606 (“Equidata”), and National Credit Report.com, LLC a Corporation organized under the laws of Florida, with its principal, THIS AGREEMENT (the “ Agreement ”), made as of the 6th day of March, 2006 (the “ Effective Date”), is by and between The, This Agreement entered into as of the Effective Date by and between]","[28.018947103860445, 21.583592363855995, 19.978388027175402, 19.93245742816062, 18.207043473159885]","Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC",28.018947,...,"Company Name: Equidata, Inc. Marketing Affiliate Name: National Credit Report LLC",0.753258,False,,"[Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06, Dallas-Fort Worth KDAF 8001 John Carpenter Fwy., Dallas, TX 75247 7/1/06, Grand-Rapids-Kalamazoo-Battle Creek WXMI 3117 Plaza Dr. N.E., Grand Rapids, MI 49525 &bbsp; 6/15/06, KTWB 1813 Westlake Ave. N., Seattle, WA 98109 7/15/06, place of business at 7700 N. Congress Ave, Suite 3113, Boca Raton FL33487 ( “Marketing Affiliate ”).]","[108.24852752685547, 106.02227783203125, 105.82774353027344, 105.638916015625, 105.63409423828125]","Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06",108.248528,False,
1,Which state’s law governs the Equidata Marketing Affiliate Agreement?,This Agreement is governed by and construed in accordance with the laws of the State of Virginia.,12. Miscellaneous… This Agreement is governed by and construed in accordance with the laws of the State of Virginia.,SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:1]{index=1},Agreement.pdf,"[which, state, s, law, governs, the, equidata, marketing, affiliate, agreement]","[6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the, 2. Marketing Affiliate and Equidata wish to enter into an agreement under which Marketing Affiliate may market the Services., to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s, purpose of assuring compliance with this Agreement. Equidata reserves the right to site inspect Marketing Affiliate ’s, given in writing by Marketing Affiliate to Equidata, Equidata or Marketing Affiliate may choose arbitration and]","[16.385950871974746, 15.917270098572338, 14.14409584663738, 13.759856505867411, 12.73983416309363]","6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the",16.385951,...,2. Marketing Affiliate and Equidata wish to enter into an agreement under which Marketing Affiliate may market the Services.,0.745122,False,,"[Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06, Dallas-Fort Worth KDAF 8001 John Carpenter Fwy., Dallas, TX 75247 7/1/06, Section 13.1 (Confidential Treatment). Network Affiliate agrees to be included in any compliance reporting NCM provides to its advertisers and, KTWB 1813 Westlake Ave. N., Seattle, WA 98109 7/15/06, Grand-Rapids-Kalamazoo-Battle Creek WXMI 3117 Plaza Dr. N.E., Grand Rapids, MI 49525 &bbsp; 6/15/06]","[111.06587219238281, 107.91841125488281, 107.479248046875, 107.41693115234375, 107.36601257324219]","Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06",111.065872,False,
2,Where must disputes under Equidata’s billing disputes clause be arbitrated?,"Such disputes shall be settled by arbitration in the City of Newport News, Virginia.","2. Disputes… Such disputes shall be settled by arbitration in the City of Newport News, Virginia.",SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:2]{index=2},Agreement.pdf,"[where, must, disputes, under, equidata, s, billing, disputes, clause, be, arbitrated]","[8.06 ARBITRATION. All disputes or claims hereunder shall be resolved by, rules of the American Arbitration Association. All disputes or claims by NETWORK, 8.05 GOVERNING LAW. All disputes or claims by Payment Data Systems, OF WARRANTY UNDER SUBPARAGRAPH (A) HEREOF MUST BE MADE, obligation to promptly pay for undisputed charges in accordance with the terms of this Agreement. Such disputes shall]","[15.908745770940799, 13.849880565743023, 13.849880565743023, 12.11352264532109, 11.605025288556485]",8.06 ARBITRATION. All disputes or claims hereunder shall be resolved by,15.908746,...,8.05 GOVERNING LAW. All disputes or claims by Payment Data Systems,0.687344,False,,"[Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06, Grand-Rapids-Kalamazoo-Battle Creek WXMI 3117 Plaza Dr. N.E., Grand Rapids, MI 49525 &bbsp; 6/15/06, place of business at 7700 N. Congress Ave, Suite 3113, Boca Raton FL33487 ( “Marketing Affiliate ”)., Dallas-Fort Worth KDAF 8001 John Carpenter Fwy., Dallas, TX 75247 7/1/06, KTWB 1813 Westlake Ave. N., Seattle, WA 98109 7/15/06]","[110.13136291503906, 107.834228515625, 107.4998779296875, 107.4792251586914, 107.15617370605469]","Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06",110.131363,False,
3,How long before a price increase must Equidata notify the Marketing Affiliate?,Notice will be given… no less than 30 days prior to such increase taking affect.,Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing no less than 30 days prior…,SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:3]{index=3},Agreement.pdf,"[how, long, before, a, price, increase, must, equidata, notify, the, marketing, affiliate]","[Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing, 6. Audit. Equidata may audit, at Equidata ’s expense, the Marketing Affiliate ’s marketing, practices and activities for the, absolute discretion. In the event of a price and/or fee change for Technology, Company shall notify MA, given in writing by Marketing Affiliate to Equidata, Equidata or Marketing Affiliate may choose arbitration and, to Marketing Affiliate. Upon termination for any reason, Equidata reserves the right to deactivate Marketing Affiliate’s]","[16.23449909635954, 13.74680913892679, 13.111962477879251, 12.73983416309363, 12.244527307826083]",Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing,16.234499,...,no later than 25 calendar days after Equidata receives receipt of dispute from Marketing Affiliate and shall continue from day,0.661782,False,,"[Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06, KTWB 1813 Westlake Ave. N., Seattle, WA 98109 7/15/06, Grand-Rapids-Kalamazoo-Battle Creek WXMI 3117 Plaza Dr. N.E., Grand Rapids, MI 49525 &bbsp; 6/15/06, Dallas-Fort Worth KDAF 8001 John Carpenter Fwy., Dallas, TX 75247 7/1/06, Albany-Schenectady-Troy WEWB 14 Corporate Woods Blvd., Albany, NY 12211 8/1/06]","[106.84475708007812, 104.8255615234375, 104.29595947265625, 104.17488098144531, 103.99790954589844]","Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06",106.844757,False,
4,Who bears responsibility for collecting consumer payments under the Equidata agreement?,Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer,1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and shall bear sole responsibility for non-payment…,SteelVaultCorp_20081224_10-K_EX-10.16_Affiliate Agreement.pdf :contentReference[oaicite:4]{index=4},Agreement.pdf,"[who, bears, responsibility, for, collecting, consumer, payments, under, the, equidata, agreement]","[1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and, shall bear sole responsibility for non -payment of any fees charged to the Consumer. Marketing Affiliate shall pay to, PERMISSION PROVIDED BY EQUIDATA UNDER THIS AGREEMENT IS PROVIDED ON AN “AS IS” BASIS. EQUIDATA, due immediately upon receipt. Marketing Affiliate agrees to reimburse Equidata all costs of collecting any past due, shall be the sole responsibility of the Affiliate.]","[16.79946189890967, 15.221391973029311, 11.12044130608493, 10.732216231320576, 10.61953434224972]",1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and,16.799462,...,"Equidata, as compensation for its providing of Services under this agreement, such amounts as outlined and detailed in",0.707625,False,,"[Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06, Grand-Rapids-Kalamazoo-Battle Creek WXMI 3117 Plaza Dr. N.E., Grand Rapids, MI 49525 &bbsp; 6/15/06, KTWB 1813 Westlake Ave. N., Seattle, WA 98109 7/15/06, Dallas-Fort Worth KDAF 8001 John Carpenter Fwy., Dallas, TX 75247 7/1/06, Philadelphia WPHL 5001 Wynnefield Ave., Philadelphia, PA 19131 7/1/06]","[106.8209228515625, 104.69346618652344, 104.67500305175781, 104.25961303710938, 103.91937255859375]","Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06",106.820923,False,


In [45]:
# SPLADE metrics
splade_match_series = qa_df['splade_context_match']
splade_accuracy = splade_match_series.mean() if len(splade_match_series) else 0.0
splade_ranks = qa_df['splade_first_match_rank']
splade_reciprocals = [1.0/r for r in splade_ranks if isinstance(r, int) and r > 0]
splade_mrr = (sum(splade_reciprocals)/len(splade_ranks)) if len(splade_ranks) else 0.0
splade_rank_counts = splade_ranks.value_counts(dropna=False).to_dict()

splade_metrics = {
    'num_questions': int(len(qa_df)),
    'accuracy': float(splade_accuracy),
    'mrr': float(splade_mrr),
    'rank_counts': splade_rank_counts,
    'fuzzy_threshold': FUZZY_THRESHOLD,
    'model_name': SPLADE_MODEL_NAME,
    'model_load_time_sec': splade_model_load_time,
    'corpus_encode_time_sec': splade_corpus_encode_time,
    'index_build_time_sec': splade_index_build_time,
    'retrieval_time_total_sec': retrieval_time_total
}
print('[SPLADE] Evaluation Metrics:')
for k,v in splade_metrics.items():
    print(f'  {k}: {v}')

[SPLADE] Evaluation Metrics:
  num_questions: 92
  accuracy: 0.0
  mrr: 0.0
  rank_counts: {None: 92}
  fuzzy_threshold: 70
  model_name: naver/efficient-splade-VI-BT-large-doc
  model_load_time_sec: 0.6099753379821777
  corpus_encode_time_sec: 50.20890140533447
  index_build_time_sec: 0.43523263931274414
  retrieval_time_total_sec: 2.363184928894043


In [46]:
# Extend comparison table with SPLADE
compare_rows_extended = []
# Recreate to avoid chained updates issues
if 'bm25_metrics' in globals():
    compare_rows_extended.append({'method':'BM25','accuracy':bm25_metrics.get('accuracy'), 'mrr':bm25_metrics.get('mrr'), 'search_time_sec': None})
if 'semantic_metrics' in globals():
    compare_rows_extended.append({'method':'Semantic-Flat','accuracy':semantic_metrics.get('accuracy'), 'mrr':semantic_metrics.get('mrr'), 'search_time_sec': semantic_metrics.get('search_time_sec')})
if 'hnsw_metrics' in globals():
    compare_rows_extended.append({'method':'Semantic-HNSW','accuracy':hnsw_metrics.get('accuracy'), 'mrr':hnsw_metrics.get('mrr'), 'search_time_sec': hnsw_metrics.get('hnsw_search_time_sec')})
compare_rows_extended.append({'method':'SPLADE-Flat','accuracy':splade_metrics.get('accuracy'), 'mrr':splade_metrics.get('mrr'), 'search_time_sec': splade_metrics.get('retrieval_time_total_sec')})

compare_df2 = pd.DataFrame(compare_rows_extended)
base_acc = compare_df2.loc[compare_df2['method']=='BM25','accuracy'].values[0] if 'bm25_metrics' in globals() else compare_df2['accuracy'].iloc[0]
base_mrr = compare_df2.loc[compare_df2['method']=='BM25','mrr'].values[0] if 'bm25_metrics' in globals() else compare_df2['mrr'].iloc[0]
compare_df2['acc_delta_vs_BM25'] = compare_df2['accuracy'] - base_acc
compare_df2['mrr_delta_vs_BM25'] = compare_df2['mrr'] - base_mrr
print('Updated Retrieval Comparison (including SPLADE):')
display(compare_df2)


Updated Retrieval Comparison (including SPLADE):


Unnamed: 0,method,accuracy,mrr,search_time_sec,acc_delta_vs_BM25,mrr_delta_vs_BM25
0,BM25,0.358696,0.0,,0.0,0.0
1,Semantic-HNSW,0.26087,0.0,0.006026,-0.097826,0.0
2,SPLADE-Flat,0.0,0.0,2.363185,-0.358696,0.0


In [47]:
# Display sample SPLADE rows
splade_cols = [
    'question','context_chunk','splade_context_match','splade_first_match_rank',
    'splade_best_score','splade_best_chunk','splade_top_scores','splade_top_chunks'
]
print("Sample SPLADE retrieval rows:")
display(qa_df[splade_cols].head(10))

Sample SPLADE retrieval rows:


Unnamed: 0,question,context_chunk,splade_context_match,splade_first_match_rank,splade_best_score,splade_best_chunk,splade_top_scores,splade_top_chunks
0,On what date was the Marketing Affiliate Agreement between Equidata and National Credit Report made effective?,"MARKETING AFFILIATE AGREEMENT This Agreement is made this 1st day of October 2008, (the “Effective Date”), by and between Equidata, Inc.… and National Credit Report.com, LLC",False,,108.248528,"Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06","[108.24852752685547, 106.02227783203125, 105.82774353027344, 105.638916015625, 105.63409423828125]","[Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06, Dallas-Fort Worth KDAF 8001 John Carpenter Fwy., Dallas, TX 75247 7/1/06, Grand-Rapids-Kalamazoo-Battle Creek WXMI 3117 Plaza Dr. N.E., Grand Rapids, MI 49525 &bbsp; 6/15/06, KTWB 1813 Westlake Ave. N., Seattle, WA 98109 7/15/06, place of business at 7700 N. Congress Ave, Suite 3113, Boca Raton FL33487 ( “Marketing Affiliate ”).]"
1,Which state’s law governs the Equidata Marketing Affiliate Agreement?,12. Miscellaneous… This Agreement is governed by and construed in accordance with the laws of the State of Virginia.,False,,111.065872,"Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06","[111.06587219238281, 107.91841125488281, 107.479248046875, 107.41693115234375, 107.36601257324219]","[Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06, Dallas-Fort Worth KDAF 8001 John Carpenter Fwy., Dallas, TX 75247 7/1/06, Section 13.1 (Confidential Treatment). Network Affiliate agrees to be included in any compliance reporting NCM provides to its advertisers and, KTWB 1813 Westlake Ave. N., Seattle, WA 98109 7/15/06, Grand-Rapids-Kalamazoo-Battle Creek WXMI 3117 Plaza Dr. N.E., Grand Rapids, MI 49525 &bbsp; 6/15/06]"
2,Where must disputes under Equidata’s billing disputes clause be arbitrated?,"2. Disputes… Such disputes shall be settled by arbitration in the City of Newport News, Virginia.",False,,110.131363,"Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06","[110.13136291503906, 107.834228515625, 107.4998779296875, 107.4792251586914, 107.15617370605469]","[Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06, Grand-Rapids-Kalamazoo-Battle Creek WXMI 3117 Plaza Dr. N.E., Grand Rapids, MI 49525 &bbsp; 6/15/06, place of business at 7700 N. Congress Ave, Suite 3113, Boca Raton FL33487 ( “Marketing Affiliate ”)., Dallas-Fort Worth KDAF 8001 John Carpenter Fwy., Dallas, TX 75247 7/1/06, KTWB 1813 Westlake Ave. N., Seattle, WA 98109 7/15/06]"
3,How long before a price increase must Equidata notify the Marketing Affiliate?,Equidata reserves the right to increase the base cost of Services. Notice will be given to Marketing Affiliate in writing no less than 30 days prior…,False,,106.844757,"Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06","[106.84475708007812, 104.8255615234375, 104.29595947265625, 104.17488098144531, 103.99790954589844]","[Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06, KTWB 1813 Westlake Ave. N., Seattle, WA 98109 7/15/06, Grand-Rapids-Kalamazoo-Battle Creek WXMI 3117 Plaza Dr. N.E., Grand Rapids, MI 49525 &bbsp; 6/15/06, Dallas-Fort Worth KDAF 8001 John Carpenter Fwy., Dallas, TX 75247 7/1/06, Albany-Schenectady-Troy WEWB 14 Corporate Woods Blvd., Albany, NY 12211 8/1/06]"
4,Who bears responsibility for collecting consumer payments under the Equidata agreement?,1. Compensation. Marketing Affiliate shall be responsible for collecting all amounts due directly from the Consumer and shall bear sole responsibility for non-payment…,False,,106.820923,"Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06","[106.8209228515625, 104.69346618652344, 104.67500305175781, 104.25961303710938, 103.91937255859375]","[Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06, Grand-Rapids-Kalamazoo-Battle Creek WXMI 3117 Plaza Dr. N.E., Grand Rapids, MI 49525 &bbsp; 6/15/06, KTWB 1813 Westlake Ave. N., Seattle, WA 98109 7/15/06, Dallas-Fort Worth KDAF 8001 John Carpenter Fwy., Dallas, TX 75247 7/1/06, Philadelphia WPHL 5001 Wynnefield Ave., Philadelphia, PA 19131 7/1/06]"
5,What late charge applies to past-due balances in the Equidata agreement?,"…the undersigned principal… agrees to pay a late charge of 1 l/2% per month on the unpaid, past-due amount…",False,,106.548157,"Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06","[106.54815673828125, 104.41300964355469, 104.2381820678711, 104.17224884033203, 103.87337493896484]","[Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06, Grand-Rapids-Kalamazoo-Battle Creek WXMI 3117 Plaza Dr. N.E., Grand Rapids, MI 49525 &bbsp; 6/15/06, Dallas-Fort Worth KDAF 8001 John Carpenter Fwy., Dallas, TX 75247 7/1/06, Albany-Schenectady-Troy WEWB 14 Corporate Woods Blvd., Albany, NY 12211 8/1/06, KTWB 1813 Westlake Ave. N., Seattle, WA 98109 7/15/06]"
6,Name three statutes the parties agree to comply with under the Compliance clause.,"5. Compliance. Marketing Affiliate nor Equidata, shall engage in any practice… not in compliance with the Fair Credit Reporting Act (FCRA), the Fair Debt Collection Practices Act (FDCPA) and the Health Insurance Portability and Accountability Act (HIPAA)…",False,,115.060028,"Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06","[115.06002807617188, 112.89047241210938, 112.84878540039062, 112.58411407470703, 112.5838623046875]","[Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06, Dallas-Fort Worth KDAF 8001 John Carpenter Fwy., Dallas, TX 75247 7/1/06, Section 13.1 (Confidential Treatment). Network Affiliate agrees to be included in any compliance reporting NCM provides to its advertisers and, Philadelphia WPHL 5001 Wynnefield Ave., Philadelphia, PA 19131 7/1/06, KTWB 1813 Westlake Ave. N., Seattle, WA 98109 7/15/06]"
7,How far in advance must Marketing Materials requiring approval be reviewed?,"5.4. Uses Marketing Materials, media or methods that are not approved… Such approval shall not be unreasonably withheld and shall be completed within 48 hours of receipt…",False,,104.071655,"Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06","[104.0716552734375, 103.58369445800781, 103.15916442871094, 102.50176239013672, 102.2437744140625]","[Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06, KTWB 1813 Westlake Ave. N., Seattle, WA 98109 7/15/06, Grand-Rapids-Kalamazoo-Battle Creek WXMI 3117 Plaza Dr. N.E., Grand Rapids, MI 49525 &bbsp; 6/15/06, Dallas-Fort Worth KDAF 8001 John Carpenter Fwy., Dallas, TX 75247 7/1/06, Albany-Schenectady-Troy WEWB 14 Corporate Woods Blvd., Albany, NY 12211 8/1/06]"
8,For how long is the Equidata Marketing Affiliate Agreement’s initial term?,"7.1. This Agreement shall be for the term of one year; thereafter, the Agreement shall renew automatically under these same terms…",False,,104.054657,"Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06","[104.05465698242188, 101.9293212890625, 101.67037963867188, 101.253662109375, 101.1185302734375]","[Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06, Dallas-Fort Worth KDAF 8001 John Carpenter Fwy., Dallas, TX 75247 7/1/06, Grand-Rapids-Kalamazoo-Battle Creek WXMI 3117 Plaza Dr. N.E., Grand Rapids, MI 49525 &bbsp; 6/15/06, KTWB 1813 Westlake Ave. N., Seattle, WA 98109 7/15/06, place of business at 7700 N. Congress Ave, Suite 3113, Boca Raton FL33487 ( “Marketing Affiliate ”).]"
9,Can Equidata terminate immediately if CRAs decline to render services to the Marketing Affiliate?,"7.4. Equidata reserves the right to terminate this Agreement immediately for cause if Experian, Equifax and/or TransUnion… decline to render Services…",False,,122.641373,"Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06","[122.64137268066406, 119.84959411621094, 118.89183044433594, 118.86871337890625, 118.51557922363281]","[Denver KWGN 6160 S. Wabash Way, Greenwood Village, CO 80111 6/1/06, place of business at 7700 N. Congress Ave, Suite 3113, Boca Raton FL33487 ( “Marketing Affiliate ”)., Dallas-Fort Worth KDAF 8001 John Carpenter Fwy., Dallas, TX 75247 7/1/06, Grand-Rapids-Kalamazoo-Battle Creek WXMI 3117 Plaza Dr. N.E., Grand Rapids, MI 49525 &bbsp; 6/15/06, Philadelphia WPHL 5001 Wynnefield Ave., Philadelphia, PA 19131 7/1/06]"


In [49]:
# Diagnostic: Check SPLADE retrieval sample
print("=== SPLADE Diagnostic ===")
print(f"SPLADE index type: {type(splade_index)}")
print(f"SPLADE index ntotal: {splade_index.ntotal}")
print(f"SPLADE corpus embeddings shape: {splade_corpus_embeddings.shape}")
print(f"SPLADE embeddings min/max: {splade_corpus_embeddings.min():.4f} / {splade_corpus_embeddings.max():.4f}")
print(f"SPLADE embeddings mean: {splade_corpus_embeddings.mean():.4f}")

# Test a single query
test_q = qa_df.iloc[0]['question']
print(f"\nTest question: {test_q}")
print(f"Gold context: {qa_df.iloc[0]['context_chunk'][:100]}...")

q_vec = splade_encode([test_q])[0].astype('float32')
print(f"Query vec shape: {q_vec.shape}, min/max: {q_vec.min():.4f} / {q_vec.max():.4f}, mean: {q_vec.mean():.4f}")

scores, idxs = splade_index.search(np.expand_dims(q_vec, axis=0), 5)
print(f"\nTop 5 scores: {scores[0]}")
print(f"Top 5 indices: {idxs[0]}")
print("\nTop 5 chunks:")
for i, idx in enumerate(idxs[0]):
    if idx >= 0:
        chunk = corpus_df.iloc[idx]['chunk']
        print(f"{i+1}. Score: {scores[0][i]:.4f} - {chunk[:100]}...")
    else:
        print(f"{i+1}. Invalid index: {idx}")

=== SPLADE Diagnostic ===
SPLADE index type: <class 'faiss.swigfaiss_avx2.IndexHNSWFlat'>
SPLADE index ntotal: 3247
SPLADE corpus embeddings shape: (3247, 768)
SPLADE embeddings min/max: 0.0000 / 1.3478
SPLADE embeddings mean: 0.2886

Test question: On what date was the Marketing Affiliate Agreement between Equidata and National Credit Report made effective?
Gold context: MARKETING AFFILIATE AGREEMENT This Agreement is made this 1st day of October 2008, (the “Effective D...
Query vec shape: (768,), min/max: 0.0000 / 0.9537, mean: 0.2562

Top 5 scores: [108.24853  106.02228  105.82774  105.638916 105.634094]
Top 5 indices: [1632 1626 1639 1630 2184]

Top 5 chunks:
1. Score: 108.2485 - Denver   KWGN   6160 S. Wabash Way, Greenwood Village, CO 80111    6/1/06...
2. Score: 106.0223 - Dallas-Fort  Worth   KDAF   8001 John Carpenter Fwy., Dallas, TX 75247    7/1/06...
3. Score: 105.8277 - Grand-Rapids-Kalamazoo-Battle  Creek   WXMI   3117 Plaza Dr. N.E., Grand Rapids, MI 49525 &bbsp;  6/...


In [48]:
# Unified evaluation: compute hit@k for all methods (if available) and rebuild comparison
import pandas as pd

EVAL_METHODS = []
if 'bm25_metrics' in globals():
    EVAL_METHODS.append(('bm25','bm25_top_chunks','bm25_context_match','bm25_first_match_rank'))
if 'semantic_metrics' in globals():
    EVAL_METHODS.append(('semantic_flat','semantic_top_chunks','semantic_context_match','semantic_first_match_rank'))
if 'hnsw_metrics' in globals():
    EVAL_METHODS.append(('semantic_hnsw','hnsw_top_chunks','hnsw_context_match','hnsw_first_match_rank'))
if 'splade_metrics' in globals():
    EVAL_METHODS.append(('splade_flat','splade_top_chunks','splade_context_match','splade_first_match_rank'))

hit_rows = []
for name, top_col, match_col, rank_col in EVAL_METHODS:
    if top_col not in qa_df.columns:
        continue
    # Compute hits@k from rank column
    ranks = qa_df[rank_col]
    total = len(ranks)
    def hit_at(k):
        return float(sum((ranks<=k) & (ranks.notna())))/total if total else 0.0
    metrics_local = {
        'method': name,
        'hit@1': hit_at(1),
        'hit@3': hit_at(3),
        'hit@5': hit_at(5),
        'mrr': float(sum([1.0/r for r in ranks if isinstance(r,int) and r>0])/total) if total else 0.0,
        'accuracy_bool': float(qa_df[match_col].mean()) if match_col in qa_df else None
    }
    hit_rows.append(metrics_local)

hit_df = pd.DataFrame(hit_rows)
print('Hit@k and MRR comparison:')
display(hit_df)

# Merge with previous compare_df2 if exists
if 'compare_df2' in globals():
    merged = compare_df2.merge(hit_df, how='left', left_on='method', right_on='method')
    print('Augmented comparison table:')
    display(merged)
else:
    print('No previous compare_df2 found; displaying only hit_df.')


Hit@k and MRR comparison:


Unnamed: 0,method,hit@1,hit@3,hit@5,mrr,accuracy_bool
0,bm25,0.271739,0.326087,0.358696,0.0,0.358696
1,semantic_hnsw,0.173913,0.23913,0.26087,0.0,0.26087
2,splade_flat,0.0,0.0,0.0,0.0,0.0


Augmented comparison table:


Unnamed: 0,method,accuracy,mrr_x,search_time_sec,acc_delta_vs_BM25,mrr_delta_vs_BM25,hit@1,hit@3,hit@5,mrr_y,accuracy_bool
0,BM25,0.358696,0.0,,0.0,0.0,,,,,
1,Semantic-HNSW,0.26087,0.0,0.006026,-0.097826,0.0,,,,,
2,SPLADE-Flat,0.0,0.0,2.363185,-0.358696,0.0,,,,,


## Retrieval Performance Analysis

### Why BM25 was initially higher
1. **Lexical alignment**: BM25 directly leverages exact token overlap between short QA context snippets and corpus lines; dense and SPLADE variants were penalized by using fuzzy matching on very short spans.
2. **Incorrect SPLADE scoring**: Earlier implementation used L2 distance with sign inversion. SPLADE should use inner product on sparsified expansion weights; switching to `IndexFlatIP` fixes this.
3. **Normalization differences**: SentenceTransformer embeddings were normalized (cosine), SPLADE vectors were not. L2 ranking distorted relative magnitudes; inner product preserves term-weight contributions.
4. **Evaluation matching threshold**: A uniform fuzzy threshold (70) favors exact lexical retrievers; dense retrievers surface semantically similar but lexically different chunks that may score below the token-set threshold.
5. **Chunk granularity**: The corpus is line-based; lexical methods benefit from narrower, specific matches. Dense methods typically perform better on paragraph‑level representations; here they had fewer semantic signals per chunk.

### Fixes applied
- Changed SPLADE index to `IndexFlatIP` and used direct inner product scores.
- Rebuilt SPLADE retrieval loop to remove inverted L2 distance logic.
- Added unified evaluation cell computing Hit@1/3/5 + MRR across all methods for consistent comparison.

### Recommended next adjustments
- Re-evaluate fuzzy matching using both `context_chunk` and `answer_snippet`; consider lowering threshold (e.g., 60) for dense methods or using semantic similarity rather than fuzzy lexical ratio.
- Aggregate adjacent lines into larger chunks (50–120 tokens) to improve semantic model signal.
- Add query / chunk text normalization (lowercase, strip punctuation) before fuzzy matching for consistency.
- For SPLADE, optionally apply vocabulary pruning or top-k term retention to reduce noise and memory.

### Interpreting results
- If BM25 still leads: Corpus likely dominated by exact contractual phrasing; leverage hybrid (BM25 + dense score) reranking.
- If SPLADE improves after changes: Lexical expansion captured synonyms / morphological variants missed by raw BM25.

### Next Steps
Run all modified cells, inspect the updated comparison and consider implementing a reranker combining BM25 top-50 followed by dense or SPLADE scoring to raise semantic recall without losing precision.