In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import uuid
import os
import json
import datasets
from sklearn.metrics.pairwise import cosine_similarity
from itertools import islice

  from .autonotebook import tqdm as notebook_tqdm


# Load data

In [2]:
# Initialize
all_dfs = []

# List of years to process
years = ["year_2018", "year_2019", "year_2020"]

for year in years:
    # Load dataset for the current year
    dataset = datasets.load_dataset("eloukas/edgar-corpus", year, split="train")
    
    # Convert to pandas DataFrame and take top 10 rows, mostl
    df = dataset.to_pandas().head(10)
    
    # Handle missing values
    df = df.fillna(pd.NA).replace(pd.NA, None)
    
    # Append to the list of DataFrames
    all_dfs.append(df)

# Concatenate all DataFrames
final_df = pd.concat(all_dfs, ignore_index=True)

In [3]:
final_df.to_csv("results/test_three_years.csv", index=False)

In [3]:
final_df.head()

Unnamed: 0,filename,cik,year,section_1,section_1A,section_1B,section_2,section_3,section_4,section_5,...,section_8,section_9,section_9A,section_9B,section_10,section_11,section_12,section_13,section_14,section_15
0,1566373_2018.htm,1566373,2018,Item 1. Business\nOverview\nWe are a clinical-...,Item 1A. Risk Factors.\nInvesting in our commo...,Item 1B. Unresolved Staff Comments.\nNot appli...,Item 2. Properties.\nOur current operations ar...,"Item 3. Legal Proceedings.\nFrom time to time,...",Item 4. Mine Safety Disclosures.\nNot applicab...,"Item 5. Market for Registrant’s Common Equity,...",...,Item 8. Financial Statements and Supplementary...,Item 9. Changes in and Disagreements With Acco...,Item 9A. Controls and Procedures.\nEvaluation ...,Item 9B. Other Information.\nNone.\nPART III\n...,"Item 10. Directors, Executive Officers and Cor...",Item 11. Executive Compensation.\nThe response...,Item 12. Security Ownership of Certain Benefic...,Item 13. Certain Relationships and Related Tra...,Item 14. Principal Accounting Fees and Service...,"Item 15. Exhibits, Financial Statement Schedul..."
1,1263364_2018.htm,1263364,2018,Item 1. BUSINESS.\nOverview\nThrough our PRC O...,Item 1A. RISK FACTORS.\nAn investment in our c...,,Item 2. PROPERTIES.\nThere is no private land ...,Item 3. LEGAL PROCEEDINGS.\nWe have no knowled...,Item 4. MINE SAFETY DISCLOSURES.\nNot applicab...,"Item 5. MARKET FOR REGISTRANT’S COMMON EQUITY,...",...,Item 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...,Item 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,Item 9A. CONTROLS AND PROCEDURES.\nDisclosure ...,Item 9B. OTHER INFORMATION.\nNone.\nPART III\n...,"Item 10. DIRECTORS, EXECUTIVE OFFICERS AND COR...",Item 11. EXECUTIVE COMPENSATION.\nExecutive Of...,Item 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,"Item 13. CERTAIN RELATIONSHIPS, RELATED TRANSA...",Item 14. PRINCIPAL ACCOUNTING FEES AND SERVICE...,"Item 15. EXHIBITS, FINANCIAL STATEMENT SCHEDUL..."
2,1168165_2018.htm,1168165,2018,,,ITEM 1B.\nUNRESOLVED STAFF COMMENTS\nAll Regis...,,,,,...,,ITEM 9.\nCHANGES IN AND DISAGREEMENTS WITH ACC...,ITEM 9A.\nCONTROLS AND PROCEDURES\nAll Registr...,ITEM 9B.\nOTHER INFORMATION\nAll Registrants\n...,"ITEM 10.\nDIRECTORS, EXECUTIVE OFFICERS AND CO...",,ITEM 12.\nSECURITY OWNERSHIP OF CERTAIN BENEFI...,ITEM 13.\nCERTAIN RELATIONSHIPS AND RELATED TR...,ITEM 14.\nPRINCIPAL ACCOUNTING FEES AND SERVIC...,"ITEM 15.\nEXHIBITS, FINANCIAL STATEMENT SCHEDU..."
3,1518171_2018.htm,1518171,2018,Item 1. Business.\nForward-Looking Statements\...,Item 1A. Risk Factors.\nAs a “smaller reportin...,Item 1B. Unresolved Staff Comments.\nAs a “sma...,Item 2. Properties.\nOur company does not own ...,Item 3. Legal Proceedings.\nWe know of no mate...,Item 4. Mine Safety Disclosures.\nNot applicab...,"Item 5. Market for Registrant’s Common Equity,...",...,Item 8. Financial Statements and Supplementary...,Item 9. Changes in and Disagreements with Acco...,Item 9A. Controls and Procedures.\nEvaluation ...,Item 9B. Other Information.\nNone.\nPART III\n...,"Item 10. Directors, Executive Officers and Cor...",Item 11. Executive Compensation.\nThe followin...,Item 12. Security Ownership of Certain Benefic...,Item 13. Certain Relationships and Related Tra...,Item 14. Principal Accounting Fees and Service...,"Item 15. Exhibits, Financial Statement Schedul..."
4,1431567_2018.htm,1431567,2018,ITEM 1. BUSINESS OF OAK VALLEY BANCORP\nOvervi...,ITEM 1A. RISK FACTORS\nAn investment in our se...,ITEM 1B. UNRESOLVED STAFF COMMENTS\nNone.\nITE...,ITEM 2. PROPERTIES\nOur main branch office is ...,"ITEM 3. LEGAL PROCEEDINGS\nFrom time to time, ...",ITEM 4. MINE SAFETY DISCLOSURES\nNot applicabl...,"ITEM 5. MARKET FOR REGISTRANT’S COMMON EQUITY,...",...,ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,ITEM 9A. CONTROLS AND PROCEDURES\nEvaluation o...,ITEM 9B. OTHER INFORMATION\nNone.\nPART III\nI...,"ITEM 10.\nDIRECTORS, EXECUTIVE OFFICERS AND CO...",ITEM 11.\nEXECUTIVE COMPENSATION\nThe informat...,ITEM 12.\nSECURITY OWNERSHIP OF CERTAIN BENEFI...,ITEM 13.\nCERTAIN RELATIONSHIPS AND RELATED TR...,ITEM 14.\nPRINCIPAL ACCOUNTANT FEES AND SERVIC...,"ITEM 15.\nEXHIBITS, FINANCIAL STATEMENT SCHEDU..."


In [4]:
sections = list(final_df.columns[3:])

In [5]:
# Chunking function
def chunk_text(text, max_length=1000):
    if not text or not isinstance(text, str):
        return []
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        current_length += len(word) + 1
        if current_length > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word) + 1
        else:
            current_chunk.append(word)
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Apply chunking to each section and create chunked columns
for section in sections:
    final_df[f'{section}_chunks'] = final_df[section].apply(chunk_text)

In [6]:
len(final_df['section_1_chunks'][0])

133

In [7]:
# List to hold individual DataFrames
dfs = []

# Explode each section's chunks individually
for section in sections:
    # Select relevant columns and explode the chunks column
    section_df = final_df[['filename', 'cik', 'year', f'{section}_chunks']].copy()
    section_df = section_df.explode(f'{section}_chunks').reset_index(drop=True)
    # Rename the exploded column to 'chunk_text' and add 'section' column
    section_df = section_df.rename(columns={f'{section}_chunks': 'chunk_text'})
    section_df['section'] = section
    # Filter out null or empty chunks
    section_df = section_df[section_df['chunk_text'].str.strip().ne('')]
    dfs.append(section_df)

# Union all section DataFrames using concat
final_chunks = pd.concat(dfs, ignore_index=True)

# Display result
print(final_chunks.head())
print(f"Total chunks: {len(final_chunks)}")

           filename      cik  year  \
0  1566373_2018.htm  1566373  2018   
1  1566373_2018.htm  1566373  2018   
2  1566373_2018.htm  1566373  2018   
3  1566373_2018.htm  1566373  2018   
4  1566373_2018.htm  1566373  2018   

                                          chunk_text    section  
0  Item 1. Business Overview We are a clinical-st...  section_1  
1  mechanism of action that is designed to select...  section_1  
2  standard of care therapies for the treatment o...  section_1  
3  have demonstrated that they potently suppress ...  section_1  
4  substantially increase chronic HBV functional ...  section_1  
Total chunks: 9876


## Generate validation dataset

final_chunks

In [9]:
final_chunks.dropna(inplace=True)

In [17]:
embed_text = [f"company:{row['cik']} year: {row['year']} content: {row['chunk_text']}" for _, row in final_chunks.iterrows()]

In [23]:
final_chunks['embed_text'] = embed_text

In [19]:
# Step 2: Generate embeddings for chunks

embedding_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
embeddings = embedding_model.encode(embed_text, show_progress_bar=True)  # Drop NaN before encoding


valid_texts = embed_text
valid_embeddings = embeddings
text_embedding_pairs = list(zip(valid_texts, valid_embeddings))

Batches: 100%|██████████| 309/309 [00:26<00:00, 11.73it/s]


## Retrival with hybird search

In [20]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
import logging

# Set up logging for debugging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

In [21]:
embeddings_obj = HuggingFaceEmbeddings(model_name='multi-qa-MiniLM-L6-cos-v1')
vector_store = FAISS.from_embeddings(text_embedding_pairs, embeddings_obj)

# vector_store = FAISS.from_embeddings(text_embedding_pairs, embedding_model)


  embeddings_obj = HuggingFaceEmbeddings(model_name='multi-qa-MiniLM-L6-cos-v1')
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: multi-qa-MiniLM-L6-cos-v1
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/multi-qa-MiniLM-L6-cos-v1/resolve/main/modules.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/multi-qa-MiniLM-L6-cos-v1/resolve/main/config_sentence_transformers.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/multi-qa-MiniLM-L6-cos-v1/resolve/main/README.md HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/multi-qa-MiniLM-L6-cos-v1/resolve/main/modules.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/multi-qa-MiniLM-L6

In [22]:

# Initialize BM25 for keyword search
tokenized_corpus = [doc.lower().split() for doc in embed_text]
bm25 = BM25Okapi(tokenized_corpus)

# Initialize cross-encoder for reranking
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /cross-encoder/ms-marco-MiniLM-L-6-v2/resolve/main/config.json HTTP/1.1" 307 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /cross-encoder/ms-marco-MiniLM-L6-v2/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /cross-encoder/ms-marco-MiniLM-L-6-v2/resolve/main/tokenizer_config.json HTTP/1.1" 307 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /cross-encoder/ms-marco-MiniLM-L6-v2/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/models/cross-encoder/ms-marco-MiniLM-L-6-v2/tree/main/additional_chat_templates?recursive=False&expand=False HTTP/1.1" 307 147
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/models/cross-encoder/ms-marco-MiniLM-L6-v2/tree/main/additional_chat_templates?recursive=False&expand=False HTTP/1.1" 404 64
DEBUG:urllib3.connectionpool:https

In [69]:


# Hybrid search function with separate scores and search mode option
def hybrid_search1(final_chunks,query, k=5, search_mode='hybrid'):
    logger.debug(f"Processing query: {query} with mode: {search_mode}")
    results = []

    # Semantic search (Vector search)
    if search_mode in ['hybrid', 'vector']:
        try:
            semantic_results = vector_store.similarity_search_with_score(query, k=k)
            semantic_docs = [result[0].page_content for result in semantic_results]
            semantic_scores = [result[1] for result in semantic_results]
        except Exception as e:
            logger.error(f"Semantic search failed: {e}")
            semantic_docs, semantic_scores = [], []

    # Keyword search (BM25)
    if search_mode in ['hybrid', 'keyword']:
        tokenized_query = query.lower().split()
        bm25_scores = bm25.get_scores(tokenized_query)
        bm25_top_k = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:k]
        bm25_docs = [final_chunks.iloc[i]['embed_text'] for i in bm25_top_k if pd.notna(final_chunks.iloc[i]['embed_text'])]
        bm25_scores = [bm25_scores[i] for i in bm25_top_k if pd.notna(final_chunks.iloc[i]['embed_text'])]

    # Align lengths for hybrid combination
    if search_mode == 'hybrid':
        min_len = min(len(semantic_docs), len(bm25_docs), k)
        semantic_docs = semantic_docs[:min_len] 
        semantic_scores = semantic_scores[:min_len]
        bm25_docs = bm25_docs[:min_len]
        bm25_scores = bm25_scores[:min_len]

        # Combine results
        combined_scores = {}
        for i in range(min_len):
            doc = semantic_docs[i] if semantic_docs else bm25_docs[i]
            sem_score = semantic_scores[i] if semantic_scores else 0.0
            bm25_score = bm25_scores[i] if bm25_scores else 0.0
            norm_sem_score = sem_score / max(semantic_scores) if semantic_scores and max(semantic_scores) > 0 else 0
            norm_bm25_score = bm25_score / max(bm25_scores) if bm25_scores and max(bm25_scores) > 0 else 0
            combined_score = (norm_sem_score + norm_bm25_score) / 2 if search_mode == 'hybrid' else 0.0
            combined_scores[doc] = combined_score

        # Get top-k combined results
        top_k_docs = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:k]
        top_k_texts = [doc for doc, _ in top_k_docs]
        top_k_combined_scores = [score for _, score in top_k_docs]

    elif search_mode == 'vector':
        top_k_texts = semantic_docs
        top_k_combined_scores = semantic_scores
    elif search_mode == 'keyword':
        top_k_texts = bm25_docs
        top_k_combined_scores = bm25_scores
    else:
        raise ValueError(f"Unsupported search_mode: {search_mode}. Use 'hybrid', 'vector', or 'keyword'.")

    # Rerank with cross-encoder (optional for all modes)
    pairs = [[query, doc] for doc in top_k_texts]
    rerank_scores = reranker.predict(pairs) if pairs else [0.0] * k
    reranked_indices = np.argsort(rerank_scores)[::-1]
    reranked_docs = [top_k_texts[i] if i < len(top_k_texts) else '' for i in reranked_indices]
    reranked_scores = [rerank_scores[i] if i < len(rerank_scores) else 0.0 for i in reranked_indices]

    # Map back to original DataFrame rows and save separate scores
    for doc, combined_score, sem_score, bm25_score, rerank_score in zip(
        reranked_docs,
        top_k_combined_scores if search_mode == 'hybrid' else [0.0] * k,
        semantic_scores if search_mode in ['hybrid', 'vector'] else [0.0] * k,
        bm25_scores if search_mode in ['hybrid', 'keyword'] else [0.0] * k,
        reranked_scores
    ):
        idx = final_chunks.index[final_chunks['embed_text'] == doc][0] if doc in final_chunks['embed_text'].values else 0
        results.append({
            'cik': final_chunks.loc[idx, 'cik'],
            'year': final_chunks.loc[idx, 'year'],
            'section': final_chunks.loc[idx, 'section'],
            'embed_text': doc,
            'vector_score': sem_score if search_mode in ['hybrid', 'vector'] else 0.0,
            'keyword_score': bm25_score if search_mode in ['hybrid', 'keyword'] else 0.0,
            'hybrid_score': combined_score if search_mode == 'hybrid' else 0.0,
            'rerank_score': rerank_score
        })

    return results



In [94]:
import re
import logging
logger = logging.getLogger(__name__)

def hybrid_search(final_chunks, query, k=5, search_mode='hybrid'):
    logger.debug(f"Processing query: {query} with mode: {search_mode}")
    results = []

    # Extract cik and year from query
    cik_match = re.search(r'CIK\s+(\d+)', query)
    year_match = re.search(r'in\s+(\d{4})', query)
    cik = cik_match.group(1) if cik_match else None
    year = year_match.group(1) if year_match else None

    # Filter chunks based on cik and year
    sub_df = final_chunks.copy()
    if cik and year:
        sub_df = final_chunks[(final_chunks['cik'] == cik) & (final_chunks['year'] == year)]
    elif cik:
        sub_df = final_chunks[final_chunks['cik'] == cik]
    elif year:
        sub_df = final_chunks[final_chunks['year'] == year]

    if sub_df.empty:
        logger.warning(f"No data found for cik={cik}, year={year}")
        return results
    print(sub_df.shape)
    # Semantic search (Vector search) with filtered data
    if search_mode in ['hybrid', 'vector']:
        try:
            # Rebuild vector store with filtered subset if needed
            valid_texts = sub_df['embed_text'].dropna().tolist()
            if not valid_texts:
                raise ValueError("No valid texts for vector search")
            valid_embeddings = embedding_model.encode(valid_texts, show_progress_bar=False)
            text_embedding_pairs = list(zip(valid_texts, valid_embeddings))
            vector_store = FAISS.from_embeddings(text_embedding_pairs, embeddings_obj)
            semantic_results = vector_store.similarity_search_with_score(query, k=k)
            semantic_docs = [result[0].page_content for result in semantic_results]
            semantic_scores = [result[1] for result in semantic_results]
        except Exception as e:
            logger.error(f"Semantic search failed: {e}")
            semantic_docs, semantic_scores = [], []

    # Keyword search (BM25) with filtered data
    if search_mode in ['hybrid', 'keyword']:
        tokenized_corpus = [doc.lower().split() for doc in sub_df['embed_text'].dropna()]
        if not tokenized_corpus:
            raise ValueError("No valid texts for keyword search")
        bm25 = BM25Okapi(tokenized_corpus)
        tokenized_query = query.lower().split()
        bm25_scores = bm25.get_scores(tokenized_query)
        bm25_top_k = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:k]
        bm25_docs = [sub_df.iloc[i]['embed_text'] for i in bm25_top_k if pd.notna(sub_df.iloc[i]['embed_text'])]
        bm25_scores = [bm25_scores[i] for i in bm25_top_k if pd.notna(sub_df.iloc[i]['embed_text'])]

    # Align lengths for hybrid combination
    if search_mode == 'hybrid':
        min_len = min(len(semantic_docs), len(bm25_docs), k)
        semantic_docs = semantic_docs[:min_len] 
        semantic_scores = semantic_scores[:min_len]
        bm25_docs = bm25_docs[:min_len]
        bm25_scores = bm25_scores[:min_len]

        # Combine results
        combined_scores = {}
        for i in range(min_len):
            doc = semantic_docs[i] if semantic_docs else bm25_docs[i]
            sem_score = semantic_scores[i] if semantic_scores else 0.0
            bm25_score = bm25_scores[i] if bm25_scores else 0.0
            norm_sem_score = sem_score / max(semantic_scores) if semantic_scores and max(semantic_scores) > 0 else 0
            norm_bm25_score = bm25_score / max(bm25_scores) if bm25_scores and max(bm25_scores) > 0 else 0
            combined_score = (norm_sem_score + norm_bm25_score) / 2 if search_mode == 'hybrid' else 0.0
            combined_scores[doc] = combined_score

        # Get top-k combined results
        top_k_docs = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:k]
        top_k_texts = [doc for doc, _ in top_k_docs]
        top_k_combined_scores = [score for _, score in top_k_docs]

    elif search_mode == 'vector':
        top_k_texts = semantic_docs
        top_k_combined_scores = semantic_scores
    elif search_mode == 'keyword':
        top_k_texts = bm25_docs
        top_k_combined_scores = bm25_scores
    else:
        raise ValueError(f"Unsupported search_mode: {search_mode}. Use 'hybrid', 'vector', or 'keyword'.")

    # Rerank with cross-encoder
    pairs = [[query, doc] for doc in top_k_texts]
    rerank_scores = reranker.predict(pairs) if pairs else [0.0] * k
    reranked_indices = np.argsort(rerank_scores)[::-1]
    reranked_docs = [top_k_texts[i] if i < len(top_k_texts) else '' for i in reranked_indices]
    reranked_scores = [rerank_scores[i] if i < len(rerank_scores) else 0.0 for i in reranked_indices]

    # Map back to original DataFrame rows and save separate scores
    for doc, combined_score, sem_score, bm25_score, rerank_score in zip(
        reranked_docs,
        top_k_combined_scores if search_mode == 'hybrid' else [0.0] * k,
        semantic_scores if search_mode in ['hybrid', 'vector'] else [0.0] * k,
        bm25_scores if search_mode in ['hybrid', 'keyword'] else [0.0] * k,
        reranked_scores
    ):
        idx = sub_df.index[sub_df['embed_text'] == doc][0] if doc in sub_df['embed_text'].values else 0
        results.append({
            'cik': sub_df.loc[idx, 'cik'],
            'year': sub_df.loc[idx, 'year'],
            'section': sub_df.loc[idx, 'section'],
            'embed_text': doc,
            'vector_score': sem_score if search_mode in ['hybrid', 'vector'] else 0.0,
            'keyword_score': bm25_score if search_mode in ['hybrid', 'keyword'] else 0.0,
            'hybrid_score': combined_score if search_mode == 'hybrid' else 0.0,
            'rerank_score': rerank_score
        })

    return results

In [95]:
# Example query with different search modes
query = "What is the details overview and overview history of the company with CIK 1168165 in 2018?"
# we have a function which can idnetify which company and year
mode = 'hybrid'

retrieved_results = hybrid_search(final_chunks,query, k=10, search_mode=mode)


# sub_df =  final_chunks[(final_chunks['cik']== '1046025')
#             &(final_chunks['year']== '2018')]
# retrieved_results = hybrid_search(sub_df,query, k=10, search_mode=mode)



DEBUG:__main__:Processing query: What is the details overview and overview history of the company with CIK 1168165 in 2018? with mode: hybrid


(504, 6)


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.24it/s]


## Debug

In [86]:
# Extract cik and year from query
cik_match = re.search(r'CIK\s+(\d+)', query)
year_match = re.search(r'in\s+(\d{4})', query)
cik = cik_match.group(1) if cik_match else None
year = year_match.group(1) if year_match else None

# Filter chunks based on cik and year
sub_df = final_chunks.copy()
if cik and year:
    sub_df = final_chunks[(final_chunks['cik'] == cik) & (final_chunks['year'] == year)]
elif cik:
    sub_df = final_chunks[final_chunks['cik'] == cik]
elif year:
    sub_df = final_chunks[final_chunks['year'] == year]

In [87]:
sub_df = final_chunks[(final_chunks['cik'] == cik) 
& (final_chunks['year'] == year)
]

In [88]:
sub_df.shape

(504, 6)

In [89]:
# Rebuild vector store with filtered subset if needed
valid_texts = sub_df['embed_text'].dropna().tolist()
if not valid_texts:
    raise ValueError("No valid texts for vector search")
valid_embeddings = embedding_model.encode(valid_texts, show_progress_bar=False)
text_embedding_pairs = list(zip(valid_texts, valid_embeddings))
vector_store = FAISS.from_embeddings(text_embedding_pairs, embeddings_obj)
semantic_results = vector_store.similarity_search_with_score(query, k=k)
semantic_docs = [result[0].page_content for result in semantic_results]
semantic_scores = [result[1] for result in semantic_results]

In [90]:
semantic_docs

["company:1168165 year: 2018 content: Financial Statements and ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS. __________ (a) The 2016 financial results include the activity of PHI from the merger effective date of March 24, 2016 through December 31, 2016. (b) On April 1, 2014, Generation assumed operational control of CENG’s nuclear fleet. As a result, the 2014 financial results include CENG’s results of operations on a fully consolidated basis. (c) Amounts have been recasted to reflect the Improving the Presentation of Net Periodic Pension Cost and Net Periodic Postretirement Benefit Cost guidance adopted as of January 1, 2018. See Note 1 - Significant Accounting Policies of the Combined Notes to Consolidated Financial Statements for additional information. (d) Amounts for 2017 and 2016 have been recasted to reflect the Revenue from Contracts with Customers guidance adopted as of January 1, 2018. See Note 1 - Significant Accounting Polic

In [91]:
# Keyword search (BM25) with filtered data
# if search_mode in ['hybrid', 'keyword']:
tokenized_corpus = [doc.lower().split() for doc in sub_df['embed_text'].dropna()]
if not tokenized_corpus:
    raise ValueError("No valid texts for keyword search")
bm25 = BM25Okapi(tokenized_corpus)
tokenized_query = query.lower().split()
bm25_scores = bm25.get_scores(tokenized_query)
bm25_top_k = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:k]
bm25_docs = [sub_df.iloc[i]['embed_text'] for i in bm25_top_k if pd.notna(sub_df.iloc[i]['embed_text'])]
bm25_scores = [bm25_scores[i] for i in bm25_top_k if pd.notna(sub_df.iloc[i]['embed_text'])]


In [93]:
bm25_docs, bm25_scores

(['company:1168165 year: 2018 content: is exposed to market risks associated with credit and interest rates. These risks are described above under Quantitative and Qualitative Disclosures about Market Risk-Exelon. ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS BGE General BGE operates in a single business segment and its operations consist of the purchase and regulated retail sale of electricity and the provision of distribution and transmission services in central Maryland, including the City of Baltimore, and the purchase and regulated retail sale of natural gas and the provision of distribution service in central Maryland, including the City of Baltimore. This segment is discussed in further detail in ITEM 1. BUSINESS-BGE of this Form 10-K. Executive Overview A discussion of items pertinent to BGE’s executive overview is set forth under EXELON CORPORATION - Executive Overview of this Form 10-K. Results of Operations Year Ended December 

In [67]:
results = []

semantic_results = vector_store.similarity_search_with_score(query, k=k)
semantic_docs = [result[0].page_content for result in semantic_results]
semantic_scores = [result[1] for result in semantic_results]

In [None]:
final_chunks

In [None]:



tokenized_query = query.lower().split()
bm25_scores = bm25.get_scores(tokenized_query)
bm25_top_k = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:k]
bm25_docs = [final_chunks.iloc[i]['embed_text'] for i in bm25_top_k if pd.notna(final_chunks.iloc[i]['embed_text'])]
bm25_scores = [bm25_scores[i] for i in bm25_top_k if pd.notna(final_chunks.iloc[i]['embed_text'])]

In [None]:
bm

In [46]:
min_len

0

# Get all results

In [105]:
query = "What is the details overview and overview history of the company with CIK 1168165 in 2018?"

results = []
for mode in ['hybrid', 'vector', 'keyword']:
    print(f"\nResults for {mode} search:")
    retrieved_results = hybrid_search(final_chunks,query, k=10, search_mode=mode)
    retrieved_df = pd.DataFrame(retrieved_results)
    retrieved_df['method'] = mode
    results.append(retrieved_df)
    print(retrieved_df)

DEBUG:__main__:Processing query: What is the details overview and overview history of the company with CIK 1168165 in 2018? with mode: hybrid



Results for hybrid search:
(504, 6)


Batches: 100%|██████████| 1/1 [00:00<00:00, 95.65it/s]
DEBUG:__main__:Processing query: What is the details overview and overview history of the company with CIK 1168165 in 2018? with mode: vector


       cik  year     section  \
0  1168165  2018  section_1B   
1  1168165  2018  section_1B   
2  1168165  2018  section_1B   
3  1168165  2018  section_1B   
4  1168165  2018  section_1B   
5  1168165  2018  section_1B   
6  1168165  2018  section_1B   
7  1168165  2018  section_15   
8  1168165  2018  section_15   

                                          embed_text  vector_score  \
0  company:1168165 year: 2018 content: and the fi...      0.868922   
1  company:1168165 year: 2018 content: audited th...      0.887480   
2  company:1168165 year: 2018 content: Financial ...      0.904708   
3  company:1168165 year: 2018 content: financial ...      0.910113   
4  company:1168165 year: 2018 content: appearing ...      0.941592   
5  company:1168165 year: 2018 content: statement ...      0.941592   
6  company:1168165 year: 2018 content: Consolidat...      0.946575   
7  company:1168165 year: 2018 content: and Direct...      0.953680   
8  company:1168165 year: 2018 content: M. VELAZQU

Batches: 100%|██████████| 1/1 [00:00<00:00,  8.67it/s]
DEBUG:__main__:Processing query: What is the details overview and overview history of the company with CIK 1168165 in 2018? with mode: keyword


       cik  year     section  \
0  1168165  2018  section_1B   
1  1168165  2018  section_1B   
2  1168165  2018  section_1B   
3  1168165  2018  section_1B   
4  1168165  2018  section_1B   
5  1168165  2018  section_1B   
6  1168165  2018  section_1B   
7  1168165  2018  section_1B   
8  1168165  2018  section_15   
9  1168165  2018  section_15   

                                          embed_text  vector_score  \
0  company:1168165 year: 2018 content: and the fi...      0.868922   
1  company:1168165 year: 2018 content: audited th...      0.887480   
2  company:1168165 year: 2018 content: Financial ...      0.904708   
3  company:1168165 year: 2018 content: financial ...      0.910113   
4  company:1168165 year: 2018 content: financial ...      0.941592   
5  company:1168165 year: 2018 content: appearing ...      0.941592   
6  company:1168165 year: 2018 content: statement ...      0.946575   
7  company:1168165 year: 2018 content: Consolidat...      0.953680   
8  company:116816

Batches: 100%|██████████| 1/1 [00:00<00:00, 22.52it/s]


       cik  year     section  \
0  1168165  2018  section_1B   
1  1168165  2018  section_1B   
2  1168165  2018  section_1B   
3  1168165  2018  section_1B   
4  1168165  2018  section_1B   
5  1168165  2018  section_1B   
6  1168165  2018  section_1B   
7  1168165  2018  section_1B   
8  1168165  2018  section_1B   
9  1168165  2018  section_1B   

                                          embed_text  vector_score  \
0  company:1168165 year: 2018 content: Mid-Atlant...           0.0   
1  company:1168165 year: 2018 content: and to a l...           0.0   
2  company:1168165 year: 2018 content: single bus...           0.0   
3  company:1168165 year: 2018 content: customers ...           0.0   
4  company:1168165 year: 2018 content: distributi...           0.0   
5  company:1168165 year: 2018 content: ABOUT MARK...           0.0   
6  company:1168165 year: 2018 content: is exposed...           0.0   
7  company:1168165 year: 2018 content: QUALITATIV...           0.0   
8  company:116816

In [127]:
result_df = pd.concat([sub for sub in results], axis=0)
result_df['chunk_text'] = result_df['embed_text'].apply(lambda x: x.split('content: ')[1])

In [None]:
# result_df.to_csv("results/retrieval_result_v2.csv")

## Evaluation

In [262]:
def cal_recall_precision_f1(reference_df, retrieved_df,col = 'chunk_text'):
    common = set(reference_df[col]) & set(retrieved_df[col])
    num_correct = len(common)

    recall = num_correct / len(reference_df)

    precision = num_correct / len(retrieved_df)
    f1 = 2 * (precision * recall) / (precision + recall+1e-5)

    return [recall, precision, f1]

def cal_mmr(reference_df, retrieved_df,col = 'chunk_text'):
    common = list(set(reference_df[col]) & set(retrieved_df[col]))[0]
    #find the first match and see their rank locaion
    rank = list(retrieved_df[col]).index(common) + 1

    mrr = rank/len(retrieved_df)    

    return mrr

In [252]:
# Load validation dataset
with open("results/val_question_retrieval_pairs_v2.json", "r") as f:
    val_data = json.load(f)

In [203]:
query

'What is the details overview and overview history of the company with CIK 1168165 in 2018?'

In [216]:
ground_truth_df = pd.DataFrame(val_data[query])

In [260]:
methods = list(result_df['method'].unique())
cols = ['section','chunk_text']
top_k = 5

In [265]:
eval_res = []

for method in methods:
    sub_df = result_df[result_df["method"] == method]

    top_k_df = sub_df.head(top_k)
    for col in cols:

        tem_res = cal_recall_precision_f1(reference_df= sub_df,
                            retrieved_df= top_k_df,
                            col = col
                            )
        mrr = cal_mmr(reference_df= sub_df,
                            retrieved_df= top_k_df,
                            col = col
                            )
        print(f"{method} - {col}: recall: {res[0]}, precision: {res[1]}, f1: {res[2]}, mrr: {mrr}")

        res.append([method, col, mrr]+tem_res)

hybrid - section: recall: 0.3333333333333333, precision: 0.2, f1: 0.25, mrr: 0.2
hybrid - chunk_text: recall: 0.3333333333333333, precision: 0.2, f1: 0.25, mrr: 0.4
vector - section: recall: 0.3333333333333333, precision: 0.2, f1: 0.25, mrr: 0.2
vector - chunk_text: recall: 0.3333333333333333, precision: 0.2, f1: 0.25, mrr: 0.4
keyword - section: recall: 0.3333333333333333, precision: 0.2, f1: 0.25, mrr: 0.2
keyword - chunk_text: recall: 0.3333333333333333, precision: 0.2, f1: 0.25, mrr: 0.8
