In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, struct, udf, lit, concat_ws,pandas_udf
from pyspark.sql.types import ArrayType, StringType, FloatType, StructType, StructField
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import uuid
import os
import json
import datasets
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set Java Home
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk@17"
os.environ["SPARK_HOME"] = "/Users/tankwin08/Documents/spark"
os.environ["PYSPARK_PYTHON"] = 'python3'

# Initialize Spark Session
# Initialize Spark session with optimized configurations
spark = SparkSession.builder \
    .appName("SEC10KAnalysis") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.memory.offHeap.enabled", "true") \
    .config("spark.memory.offHeap.size", "4g") \
    .config("spark.default.parallelism", "200") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.task.maxDirectMemory", "2g") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/18 15:58:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [19]:
# Function to chunk text
def chunk_text(text, max_length=1000):
    if not text or not isinstance(text, str):
        return []
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        current_length += len(word) + 1
        if current_length > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word) + 1
        else:
            current_chunk.append(word)
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

chunk_text_udf = udf(chunk_text, ArrayType(StringType()))

# Overview

(1) load data

(2) preprocessing of data: chunk  + embedding

(3) created a validation dataset: 

(4) retrival using vector similarity

(5) evaluate the performance of retrival

## Load three data

I will only select top 10 company from each file, it may need more time to get understand what will be better way to select data. ideally these data should overlap which maybe better for performance.

In [20]:
# Initialize
all_dfs = []

# List of years to process
years = ["year_2018", "year_2019", "year_2020"]

for year in years:
    # Load dataset for the current year
    dataset = datasets.load_dataset("eloukas/edgar-corpus", year, split="train")
    
    # Convert to pandas DataFrame and take top 10 rows, mostl
    df = dataset.to_pandas().head(10)
    
    # Handle missing values
    df = df.fillna(pd.NA).replace(pd.NA, None)
    
    # Append to the list of DataFrames
    all_dfs.append(df)

# Concatenate all DataFrames
final_df = pd.concat(all_dfs, ignore_index=True)

In [21]:
data = spark.createDataFrame(final_df)

In [22]:
data.show(2)

+----------------+-------+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        filename|    cik|year|           section_1|          section_1A|          section_1B|           section_2|           section_3|           section_4|           section_5|           section_6|           section_7|          section_7A|           section_8|           section_9|          section_9A|          section_9B|          section_10|          section_11|          section_12|          section_13|          section_14|          section_15|
+----------------+-------+----+--------------------+--------------------+--------------------+--

In [23]:
## get section names
sections = data.columns[3:]
sections

['section_1',
 'section_1A',
 'section_1B',
 'section_2',
 'section_3',
 'section_4',
 'section_5',
 'section_6',
 'section_7',
 'section_7A',
 'section_8',
 'section_9',
 'section_9A',
 'section_9B',
 'section_10',
 'section_11',
 'section_12',
 'section_13',
 'section_14',
 'section_15']

## Chunks

In [24]:
# Convert sections to chunks
chunked_data = data.select(
    "cik", "year",
    *[chunk_text_udf(col(section)).alias(f"{section}_chunks") for section in sections]
).repartition(1000)  # Increase partitions to reduce task size

In [25]:
# Explode each section's chunks individually and union
dfs = []
for section in sections:
    section_df = chunked_data.select(
        "cik",
        "year",
        lit(section).alias("section"),
        explode(col(f"{section}_chunks")).alias("chunk_text")
    ).filter(col("chunk_text").isNotNull())
    dfs.append(section_df)

# Union all section DataFrames
final_chunks = dfs[0]
for df in dfs[1:]:
    final_chunks = final_chunks.union(df)

In [26]:
# Convert to Pandas
chunks_df = final_chunks.select("cik", "year", "section", "chunk_text").toPandas()

                                                                                

In [27]:
len(chunks_df)

9857

In [30]:
print(chunks_df['chunk_text'][2])

Canadian corporation, (4) Pacific Green Marine Technologies Limited, a United Kingdom company, (5) Pacific Green Technologies Asia Limited, a Hong Kong company, (6) Pacific Green Technologies China Limited, a Hong Kong company, (7) Pacific Green Marine Technologies Trading Limited, a United Kingdom company, (8) Pacific Green Marine Technologies (Norway) AS, (9) Pacific Green Marine Technologies (USA) Inc., a Delaware Corporation (inactive), (10) Pacific Green Environmental Technologies Ltd. (11) Shanghai Engin Digital Technology Co. Ltd., a Chinese company, and (12) Guangdong Northeast Power Engineering Design Co Ltd., a Chinese company, unless otherwise indicated. Strategy The Company is the proprietary owner of emission control technologies with three distinct applications: ●ENVI-MarineTM, for the marine industry; ●ENVI-PureTM, for the waste to energy and biomass industries; and ●ENVI-CleanTM, designed for coal fired power electricity generation and industrial plants involved in


## embedding with open source model


In [13]:
# # Initialize model and PCA
# model = SentenceTransformer('paraphrase-MiniLM-L3-v2')  # Smallest model

# Load the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the chunk_text column
embeddings = model.encode(chunks_df['chunk_text'].tolist(), show_progress_bar=True)

# Add embeddings as a new column
chunks_df['embedding'] = embeddings.tolist()

# Save to CSV
chunks_df.to_csv("results/three_year_chunk_embeddings.csv", index=False)

## to save memory
spark.stop()

Batches: 100%|██████████| 606/606 [00:22<00:00, 26.48it/s]


In [14]:
chunks_df.head(2)

Unnamed: 0,cik,year,section,chunk_text,embedding
0,1553404,2020,section_1,Item 1. Business This annual report contains f...,"[-0.0021327303256839514, 0.06389391422271729, ..."
1,1553404,2020,section_1,"uncertainties and other factors, including the...","[0.002712106565013528, 0.020565859973430634, 0..."


In [31]:
chunks_df.head()

Unnamed: 0,cik,year,section,chunk_text
0,1553404,2020,section_1,Item 1. Business This annual report contains f...
1,1553404,2020,section_1,"performance, or achievements. Except as requir..."
2,1553404,2020,section_1,"Canadian corporation, (4) Pacific Green Marine..."
3,1553404,2020,section_1,steel generation. The first of the Company’s t...
4,1553404,2020,section_1,marine division has been driving the Company’s...


# Query

This is generated based on looking at one pdf's first section. there is a lot of data understanding need to be done for a better query.
so far, this is mainly for demo purpose.

In [17]:

# Step 3: Create the prompt with CIK
def create_query(question, cik, year):
    prompt = f"""
    Given the following query: "{question}"
    Retrieve the most relevant text chunks from the dataset for the company with CIK {cik} and the year {year}. Return only the chunks that match the specified CIK and year and are semantically similar to the query. Provide the chunk text, cik, year, and section for each retrieved chunk.
    """
    return prompt

In [15]:
question= "What is the organizational history and structure of Community Bancorp and its subsidiary bank?"
cik = '718413'
year = '2020'


In [18]:
create_query(question, cik = cik, year = year)

'\n    Given the following query: "What is the organizational history and structure of Community Bancorp and its subsidiary bank?"\n    Retrieve the most relevant text chunks from the dataset for the company with CIK 718413 and the year 2020. Return only the chunks that match the specified CIK and year and are semantically similar to the query. Provide the chunk text, cik, year, and section for each retrieved chunk.\n    '

## create validation based on file understanding

in this stage, i mainly based on reading to give possible questions.

we can used llm model to generate question and correspdonging retrivals based on the input text and meta data.


In [19]:
all_dfs[-1].head()

Unnamed: 0,filename,cik,year,section_1,section_1A,section_1B,section_2,section_3,section_4,section_5,...,section_8,section_9,section_9A,section_9B,section_10,section_11,section_12,section_13,section_14,section_15
0,718413_2020.htm,718413,2020,Item 1. The Business\nOrganization and Operati...,Item 1A. Risk Factors\nBefore deciding to inve...,Item 1B. Unresolved Staff Comments\nNot Applic...,Item 2. Properties\nAlthough the Company does ...,Item 3. Legal Proceedings\nThere are no pendin...,Item 4. Mine Safety Disclosures\nNot Applicabl...,"Item 5. Market for Registrant’s Common Equity,...",...,Item 8. Financial Statements and Supplementary...,Item 9. Changes in and Disagreements with Acco...,Item 9A. Controls and Procedures\nDisclosure C...,Item 9B. Other Information\nNone\nPART III.\nI...,"Item 10. Directors, Executive Officers and Cor...",Item 11. Executive Compensation\nThe following...,Item 12. Security Ownership of Certain Benefic...,Item 13. Certain Relationships and Related Tra...,Item 14. Principal Accounting Fees and Service...,Item 15. Exhibits and Financial Statement Sche...
1,931059_2020.htm,931059,2020,"Item 1. Business\nRennova Health, Inc. (“Renno...",Item 1A. Risk Factors\nAn investment in our se...,Item 1B. Unresolved Staff Comments\nNot applic...,Item 2. Properties\nThe table below summarizes...,"Item 3. Legal Proceedings\nFrom time to time, ...",Item 4. Mine Safety Disclosures\nNot applicabl...,"Item 5. Market for Registrant’s Common Equity,...",...,Item 8. Financial Statements and Supplementary...,Item 9. Changes in and Disagreements With Acco...,Item 9A. Controls and Procedures.\nEvaluation ...,Item 9B. Other Information.\nNone\nPART III\nI...,"Item 10. Directors, Executive Officers and Cor...",Item 11. Executive Compensation.\nThe followin...,Item 12. Security Ownership of Certain Benefic...,Item 13. Certain Relationships and Related Tra...,Item 14. Principal Accounting Fees and Service...,"Item 15. Exhibits, Financial Statement Schedul..."
2,1282224_2020.htm,1282224,2020,ITEM 1. BUSINESS\nOverview\nWe are a leading i...,ITEM 1A. RISK FACTORS\nRisks Related to our Bu...,ITEM 1B. UNRESOLVED STAFF COMMENTS\nNone.\nITE...,ITEM 2. PROPERTIES\nAs of the date of this rep...,ITEM 3. LEGAL PROCEEDINGS\nWe currently do not...,ITEM 4. MINE SAFETY DISCLOSURES\nNot applicabl...,"ITEM 5. MARKET FOR REGISTRANT’S COMMON EQUITY,...",...,ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,ITEM 9A. CONTROLS AND PROCEDURES\nManagement’s...,ITEM 9B. OTHER INFORMATION\nItem 3.02 Unregist...,"ITEM 10. DIRECTORS, EXECUTIVE OFFICERS, AND CO...",ITEM 11. EXECUTIVE COMPENSATION\nThe informati...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,ITEM 14. PRINCIPAL ACCOUNTING FEES AND SERVICE...,"ITEM 15. EXHIBITS, FINANCIAL STATEMENT SCHEDUL..."
3,723531_2020.htm,723531,2020,Item 1. Business\nUnless we state otherwise or...,Item 1A. Risk Factors\nOur future results of o...,Item 1B. Unresolved Staff Comments\nNone.\nIte...,Item 2. Properties\nWe owned and leased the fo...,Item 3. Legal Proceedings\nWe are subject to v...,Item 4. Mine Safety Disclosures\nNot applicabl...,"Item 5. Market for Registrant’s Common Equity,...",...,Item 8. Financial Statements and Supplementary...,Item 9. Changes in and Disagreements with Acco...,Item 9A. Controls and Procedures\nDisclosure C...,Item 9B. Other Information\nNone.\nPART III\nI...,"Item 10. Directors, Executive Officers and Cor...",Item 11. Executive Compensation\nThe informati...,Item 12. Security Ownership of Certain Benefic...,Item 13. Certain Relationships and Related Tra...,Item 14. Principal Accounting Fees and Service...,Item 15. Exhibits and Financial Statement Sche...
4,1490873_2020.htm,1490873,2020,ITEM 1.\nBUSINESS\nCorporate Overview\nThe Com...,ITEM 1A.\nRISK FACTORS\nThe Company is a small...,ITEM 1B.\nUNRESOLVED STAFF COMMENTS\nNone.\nIT...,ITEM 2.\nPROPERTIES\nThe Company's corporate h...,"ITEM 3.\nLEGAL PROCEEDINGS\n(1) On July 7, 201...",ITEM 4.\nMINE SAFETY DISCLOSURES\nNot applicab...,ITEM 5.\nMARKET FOR COMMON EQUITY AND RELATED ...,...,ITEM 8.\nFINANCIAL STATEMENTS AND SUPPLEMENTAR...,ITEM 9.\nCHANGES IN AND DISAGREEMENTS WITH ACC...,ITEM 9A. CONTROLS AND PROCEDURES\nEvaluation o...,ITEM 9B. OTHER INFORMATION\nNone.\nPART III\nI...,"ITEM 10.\nDIRECTORS, EXECUTIVE OFFICERS AND CO...",ITEM 11.\nEXECUTIVE COMPENSATION\nSummary Comp...,ITEM 12.\nSECURITY OWNERSHIP OF CERTAIN BENEFI...,ITEM 13.\nCERTAIN RELATIONSHIPS AND RELATED TR...,ITEM 14.\nPRINCIPAL ACCOUNTANTS FEES AND SERVI...,"ITEM 15.\nEXHIBITS, FINANCIAL STATEMENT SCHEDU..."


In [20]:
all_dfs[-1].head()

Unnamed: 0,filename,cik,year,section_1,section_1A,section_1B,section_2,section_3,section_4,section_5,...,section_8,section_9,section_9A,section_9B,section_10,section_11,section_12,section_13,section_14,section_15
0,718413_2020.htm,718413,2020,Item 1. The Business\nOrganization and Operati...,Item 1A. Risk Factors\nBefore deciding to inve...,Item 1B. Unresolved Staff Comments\nNot Applic...,Item 2. Properties\nAlthough the Company does ...,Item 3. Legal Proceedings\nThere are no pendin...,Item 4. Mine Safety Disclosures\nNot Applicabl...,"Item 5. Market for Registrant’s Common Equity,...",...,Item 8. Financial Statements and Supplementary...,Item 9. Changes in and Disagreements with Acco...,Item 9A. Controls and Procedures\nDisclosure C...,Item 9B. Other Information\nNone\nPART III.\nI...,"Item 10. Directors, Executive Officers and Cor...",Item 11. Executive Compensation\nThe following...,Item 12. Security Ownership of Certain Benefic...,Item 13. Certain Relationships and Related Tra...,Item 14. Principal Accounting Fees and Service...,Item 15. Exhibits and Financial Statement Sche...
1,931059_2020.htm,931059,2020,"Item 1. Business\nRennova Health, Inc. (“Renno...",Item 1A. Risk Factors\nAn investment in our se...,Item 1B. Unresolved Staff Comments\nNot applic...,Item 2. Properties\nThe table below summarizes...,"Item 3. Legal Proceedings\nFrom time to time, ...",Item 4. Mine Safety Disclosures\nNot applicabl...,"Item 5. Market for Registrant’s Common Equity,...",...,Item 8. Financial Statements and Supplementary...,Item 9. Changes in and Disagreements With Acco...,Item 9A. Controls and Procedures.\nEvaluation ...,Item 9B. Other Information.\nNone\nPART III\nI...,"Item 10. Directors, Executive Officers and Cor...",Item 11. Executive Compensation.\nThe followin...,Item 12. Security Ownership of Certain Benefic...,Item 13. Certain Relationships and Related Tra...,Item 14. Principal Accounting Fees and Service...,"Item 15. Exhibits, Financial Statement Schedul..."
2,1282224_2020.htm,1282224,2020,ITEM 1. BUSINESS\nOverview\nWe are a leading i...,ITEM 1A. RISK FACTORS\nRisks Related to our Bu...,ITEM 1B. UNRESOLVED STAFF COMMENTS\nNone.\nITE...,ITEM 2. PROPERTIES\nAs of the date of this rep...,ITEM 3. LEGAL PROCEEDINGS\nWe currently do not...,ITEM 4. MINE SAFETY DISCLOSURES\nNot applicabl...,"ITEM 5. MARKET FOR REGISTRANT’S COMMON EQUITY,...",...,ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,ITEM 9A. CONTROLS AND PROCEDURES\nManagement’s...,ITEM 9B. OTHER INFORMATION\nItem 3.02 Unregist...,"ITEM 10. DIRECTORS, EXECUTIVE OFFICERS, AND CO...",ITEM 11. EXECUTIVE COMPENSATION\nThe informati...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,ITEM 14. PRINCIPAL ACCOUNTING FEES AND SERVICE...,"ITEM 15. EXHIBITS, FINANCIAL STATEMENT SCHEDUL..."
3,723531_2020.htm,723531,2020,Item 1. Business\nUnless we state otherwise or...,Item 1A. Risk Factors\nOur future results of o...,Item 1B. Unresolved Staff Comments\nNone.\nIte...,Item 2. Properties\nWe owned and leased the fo...,Item 3. Legal Proceedings\nWe are subject to v...,Item 4. Mine Safety Disclosures\nNot applicabl...,"Item 5. Market for Registrant’s Common Equity,...",...,Item 8. Financial Statements and Supplementary...,Item 9. Changes in and Disagreements with Acco...,Item 9A. Controls and Procedures\nDisclosure C...,Item 9B. Other Information\nNone.\nPART III\nI...,"Item 10. Directors, Executive Officers and Cor...",Item 11. Executive Compensation\nThe informati...,Item 12. Security Ownership of Certain Benefic...,Item 13. Certain Relationships and Related Tra...,Item 14. Principal Accounting Fees and Service...,Item 15. Exhibits and Financial Statement Sche...
4,1490873_2020.htm,1490873,2020,ITEM 1.\nBUSINESS\nCorporate Overview\nThe Com...,ITEM 1A.\nRISK FACTORS\nThe Company is a small...,ITEM 1B.\nUNRESOLVED STAFF COMMENTS\nNone.\nIT...,ITEM 2.\nPROPERTIES\nThe Company's corporate h...,"ITEM 3.\nLEGAL PROCEEDINGS\n(1) On July 7, 201...",ITEM 4.\nMINE SAFETY DISCLOSURES\nNot applicab...,ITEM 5.\nMARKET FOR COMMON EQUITY AND RELATED ...,...,ITEM 8.\nFINANCIAL STATEMENTS AND SUPPLEMENTAR...,ITEM 9.\nCHANGES IN AND DISAGREEMENTS WITH ACC...,ITEM 9A. CONTROLS AND PROCEDURES\nEvaluation o...,ITEM 9B. OTHER INFORMATION\nNone.\nPART III\nI...,"ITEM 10.\nDIRECTORS, EXECUTIVE OFFICERS AND CO...",ITEM 11.\nEXECUTIVE COMPENSATION\nSummary Comp...,ITEM 12.\nSECURITY OWNERSHIP OF CERTAIN BENEFI...,ITEM 13.\nCERTAIN RELATIONSHIPS AND RELATED TR...,ITEM 14.\nPRINCIPAL ACCOUNTANTS FEES AND SERVI...,"ITEM 15.\nEXHIBITS, FINANCIAL STATEMENT SCHEDU..."


In [21]:
print(all_dfs[-1]['section_1'][0])

Item 1. The Business
Organization and Operation
The Company. The Company was organized under the laws of the State of Vermont in 1982 and became a registered bank holding company under the Bank Holding Company Act of 1956, as amended, in October 1983 when it acquired all of the voting shares of the Bank, headquartered in Derby, Vermont. The Bank is the only subsidiary of the Company and principally all of the Company’s business operations are presently conducted through it. Therefore, the following narrative and the other information about the Company contained in this report are based primarily on the Bank’s operations.
The Bank; Banking Services. Community National Bank was organized in 1851 as the Peoples Bank, and was subsequently reorganized as the National Bank of Derby Line in 1865. In 1975, after 110 continuous years of operation as the National Bank of Derby Line, the Bank acquired the Island Pond National Bank and changed its name to “Community National Bank.” On December 31,

In [22]:
## this will be our validation dataset with five questions
cik = '718413'
year = '2020'

In [23]:
questions = ["What is the organizational history and structure of Community Bancorp and its subsidiary bank?",
"What types of banking services does Community National Bank offer to its customers?",
"Which geographic markets does Community Bancorp target for growth and expansion?",
"Who are the main competitors of Community National Bank in its market areas?",
"How do regulatory policies, particularly from the Federal Reserve Board, impact Community Bancorp’s operations?"
]

In [24]:
cik = '718413'
year = '2020'
question = questions[0]

query = create_query(question,cik, year)

In [25]:
query

'\n    Given the following query: "What is the organizational history and structure of Community Bancorp and its subsidiary bank?"\n    Retrieve the most relevant text chunks from the dataset for the company with CIK 718413 and the year 2020. Return only the chunks that match the specified CIK and year and are semantically similar to the query. Provide the chunk text, cik, year, and section for each retrieved chunk.\n    '

## Retrival model

Demonstrate LLM retrieval using embeddings

If we have more time, we can do some comparions and used commerical embeeding, generally commerical embedding is more powerful than open source embedding. 

I will just showed one example, we can have more examples to explore but need more time to explore.

this one is manily to show the model is working.


In [30]:
query

'\n    Given the following query: "What is the organizational history and structure of Community Bancorp and its subsidiary bank?"\n    Retrieve the most relevant text chunks from the dataset for the company with CIK 718413 and the year 2020. Return only the chunks that match the specified CIK and year and are semantically similar to the query. Provide the chunk text, cik, year, and section for each retrieved chunk.\n    '

In [31]:
question

'What is the organizational history and structure of Community Bancorp and its subsidiary bank?'

In [27]:

# Load the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the query
query_embedding = model.encode([query])[0]

# Filter DataFrame for the specified CIK and year using meta data
filtered_df = chunks_df[(chunks_df['cik'] == cik) & (chunks_df['year'] == year)].copy()

In [28]:

# Compute cosine similarity between query and chunk embeddings, we also can improve this part
## we can add hybird search and reranking here to improve the performance

embeddings = np.array(filtered_df['embedding'].tolist())
similarities = cosine_similarity([query_embedding], embeddings)[0]

# Add similarity scores to the DataFrame
filtered_df['similarity'] = similarities

# Sort by similarity and get top 5 most relevant chunks
top_k = 5
retrieved_df = filtered_df.sort_values(by='similarity', ascending=False).head(top_k)

  ret = a @ b
  ret = a @ b
  ret = a @ b


In [29]:
retrieved_df.head()

Unnamed: 0,cik,year,section,chunk_text,embedding,similarity
11804,718413,2020,section_8,Item 8. Financial Statements and Supplementary...,"[-0.04734430089592934, -0.0024288450367748737,...",0.332971
180,718413,2020,section_1,Company’s website at www.communitybancorpvt.co...,"[-0.051769010722637177, -0.012434111908078194,...",0.279183
96,718413,2020,section_1,"incur, costs in connection with its on-going c...","[-0.06663941591978073, -0.05993643030524254, -...",0.258534
159,718413,2020,section_1,TRID rules has increased the Company’s complia...,"[0.0031465329229831696, -0.06038631871342659, ...",0.252277
8343,718413,2020,section_5,The balance of the information required by ite...,"[-0.0326150618493557, 0.015050563029944897, -0...",0.247863


## evaluation of recall

we know all section should come from section 1, it is around 60% for retrival 

if we have dataset like this, then we can evaulate the performance of retrival



In [35]:
# Load validation dataset
with open("results/val_question_retrieval_pairs.json", "r") as f:
    val_data = json.load(f)

In [36]:
question = "What is the organizational history and structure of Community Bancorp and its subsidiary bank?"

In [37]:

ground_truth = val_data[question][0]  # Take the first (and only) relevant chunk
ground_truth_dict = {
    "cik": ground_truth["cik"],
    "year": ground_truth["year"],
    "section": ground_truth["section"],
    "chunk_text": ground_truth["chunk_text"]
}

In [42]:
# Function to check if retrieved chunk matches ground truth
def is_match(retrieved, ground_truth):
    return (retrieved["cik"] == ground_truth["cik"] and
            retrieved["year"] == ground_truth["year"] and
            retrieved["section"] == ground_truth["section"] and
            retrieved["chunk_text"] == ground_truth["chunk_text"])

# Evaluation function with top_k parameter
def evaluate_retrieval(retrieved_df, ground_truth_dict, top_k=5):
    # Take the top-k rows
    top_k_df = retrieved_df.head(top_k)
    
    # Check which of the top-k chunks match the ground truth
    matches = [is_match(row, ground_truth_dict) for _, row in top_k_df.iterrows()]
    num_correct = sum(matches)
    
    # Calculate metrics
    precision = num_correct / top_k if top_k > 0 else 0.0  # Precision@k
    recall = num_correct / 1.0 if num_correct > 0 else 0.0  # Recall@k (1 ground truth chunk)
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0  # F1@k
    
    # Get the highest similarity score among top-k for relevance check
    max_similarity = top_k_df["similarity"].max()
    ground_truth_similarity = ground_truth["similarity"]
    
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "num_correct": num_correct,
        "max_similarity": max_similarity,
        "ground_truth_similarity": ground_truth_similarity
    }

# Evaluate with top_k=5
top_k = 5
results = evaluate_retrieval(retrieved_df, ground_truth_dict, top_k)

In [43]:
results

{'precision': 0.0,
 'recall': 0.0,
 'f1': 0.0,
 'num_correct': 0,
 'max_similarity': np.float64(0.3329705583830159),
 'ground_truth_similarity': 0.5382008455522215}