# Setup

In [11]:
import sys

# Put the parent directory on the import path so the `utils` package imported
# below can be resolved (presumably it lives one level above this notebook — confirm).
if '..' not in sys.path:
    sys.path.append('..')

import logging
import pandas as pd 
import csv
# Raise the csv module's maximum field size to sys.maxsize.
# NOTE(review): presumably required because the `context` column holds very long
# documents and the data is read with pandas' Python parser engine — confirm.
csv.field_size_limit(sys.maxsize)

from tqdm import tqdm
from statistics import mean
from utils.embeddings_utils import compute_text_embedding 
# NOTE(review): `sentence_chunking` imported here is rebound later in this notebook
# by a local `def sentence_chunking`, so the imported version ends up unused.
from utils.chunking_utils import adaptive_semantic_chunking, simple_chunking, sentence_chunking

import warnings
# Suppress FutureWarning messages for the remainder of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

In [14]:
# Set up logging: records at INFO and above go to ../logs/chunking.log only.
# Note: logging.basicConfig does nothing if the root logger already has handlers,
# so re-running this cell in a live kernel will not change the configuration.
logging.basicConfig(
    level=logging.INFO,  # INFO records are kept; change to logging.WARNING to mute them
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('../logs/chunking.log')  # File handler only (no StreamHandler), so nothing is echoed to the console; the ../logs directory must already exist
    ]
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)  # Logger level is INFO as well; change to logging.WARNING to mute INFO records


# Load LongBench Data

In [13]:
# Read the filtered LongBench table; the file uses "§" as its field separator
# and is parsed with pandas' Python engine.
df_longbench_filtered = pd.read_csv(
    '../data/longbench_filtered.csv',
    sep="§",
    engine='python',
)
df_longbench_filtered.head()

Unnamed: 0,_id,domain,sub_domain,difficulty,length,question,choice_A,choice_B,choice_C,choice_D,answer,context,context_tokens,within_context_window
0,66f36490821e116aacb2cc22,Single-Document QA,Financial,easy,short,"According to the report, how to promote the co...","Through technology empowerment, change the way...","Establish new types of courts, such as intelle...",Improve the work ability of office staff and s...,Use advanced information systems to improve th...,D,Contents\nPreface.\n.............................,38133,True
1,66ebed525a08c7b9b35e1cb4,Single-Document QA,Academic,hard,short,"When Miller tried to answer the question ""shou...",Each must read for himself or herself and test...,Readers must reach a high standrad to some deg...,"It is the readers' obligation to get the ""trut...",The performative interpretation of language tr...,B,Chapter Five\nJOSEPH CONRAD:\nSHOULD WE READ\n...,24007,True
2,671b3cabbb02136c067d5252,Long-dialogue History Understanding,Agent history QA,hard,short,Which player got the least utility in the game?,player_1,player_3,player_5,player_7,B,"{\n ""meta"": {\n ""name_exp"": ""gemini-1.0-pr...",43168,True
3,66ec0c4c821e116aacb1994a,Multi-Document QA,Academic,easy,medium,Which of the following statements is correct?,Both contractor data and data crawled from the...,All machine learning methods involved in the t...,Both voyager and VPT control Minecraft agents ...,VPT's modeling of action space is approximate ...,D,Video PreTraining (VPT): Learning to Act by\nW...,67185,True
4,66f920d8bb02136c067c4b81,Single-Document QA,Literary,hard,medium,What is mainly symbolized by the frequent chol...,Confusion of The Times,The impermanence of the character's fate,Love is dangerous and uncontrollable,Social indifference,C,Chapter 1\nIT WAS INEVITABLE: the scent of bit...,85218,True


In [4]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df_longbench_filtered.shape

(303, 14)

# Apply Adaptive Semantic Chunking for filtered LongBench

In [6]:
# Run adaptive semantic chunking over every LongBench context and collect one
# record per chunk: the source document id, the chunk's position within that
# document, the chunk text, and the embedding returned for it.
#
# Records are accumulated in a plain list and converted to a DataFrame once at
# the end. Concatenating onto a growing DataFrame for every single chunk copies
# all previously collected rows each time, which is quadratic in the chunk count.
asc_chunk_records = []

for _, row in tqdm(df_longbench_filtered.iterrows(), total=len(df_longbench_filtered)):
    doc_id = row["_id"]  # deliberately not named `id`, which would shadow the builtin
    logger.info(f"Processing row {doc_id} #########################################################")

    chunk_list, embeddings_list, cosine_similarity_list = adaptive_semantic_chunking(
        row["context"], similarity_threshold=0.75
    )

    # statistics.mean raises StatisticsError on an empty sequence; do not let a
    # missing diagnostic abort a multi-hour run.
    if len(cosine_similarity_list) > 0:
        logger.info(f"Mean cosine similarity: {mean(cosine_similarity_list)}")
    else:
        logger.info("Mean cosine similarity: n/a (no similarity scores returned)")

    # embeddings_list is indexed in parallel with chunk_list; a length mismatch
    # still surfaces as an IndexError rather than being silently truncated.
    for i, chunk_text in enumerate(chunk_list):
        asc_chunk_records.append(
            {"_id": doc_id, "chunk_id": i, "chunk_text": chunk_text, "embeddings": embeddings_list[i]}
        )

# Passing `columns` keeps the expected layout even when no chunk was produced.
df_longbench_context_chunked_asc = pd.DataFrame(
    asc_chunk_records, columns=["_id", "chunk_id", "chunk_text", "embeddings"]
)

100%|██████████| 303/303 [7:48:47<00:00, 92.83s/it]   


In [7]:
df_longbench_context_chunked_asc.head()

Unnamed: 0,_id,chunk_id,chunk_text,embeddings
0,66f36490821e116aacb2cc22,0,Contents\nPreface.\n.............................,"[[-0.0290374755859375, -0.0050048828125, -0.03..."
1,66f36490821e116aacb2cc22,1,Advancing the Construction of Intelligent Cour...,"[[-0.0261077880859375, 0.0311126708984375, 0.0..."
2,66f36490821e116aacb2cc22,2,67\n-\n-\nJudicial Reform of Chinese Courts（20...,"[[-0.009765625, 0.031280517578125, -0.04086303..."
3,66f36490821e116aacb2cc22,3,The said three Programs served as the basis of...,"[[0.00789642333984375, 0.01081085205078125, -0..."
4,66f36490821e116aacb2cc22,4,In consideration that the improvement of class...,"[[-0.045562744140625, 0.043731689453125, -0.01..."


In [8]:
# df_longbench_context_chunked_asc.to_csv("../data/longbench_context_chunked_asc.csv", index=False, sep="§")

# Apply Simple Chunking for filtered LongBench

In [None]:
def apply_simple_chunking(df, chunk_size=256, chunk_overlap=20):
    """
    Simple fixed-size chunking with optional overlap for LongBench dataset.

    Every row's 'context' is split with `simple_chunking`, and each resulting
    chunk is embedded with `compute_text_embedding`.

    Parameters:
    df: Input DataFrame containing '_id' and 'context'
    chunk_size: Size of each chunk, forwarded to `simple_chunking`
        (presumably measured in tokens — confirm against `simple_chunking`)
    chunk_overlap: Overlap between consecutive chunks, forwarded to
        `simple_chunking` (same unit as chunk_size)

    Returns:
    pd.DataFrame: Chunked dataset with columns [_id, chunk_id, chunk_text, embeddings].
    There is one row per chunk and chunk_id restarts at 0 for every document.
    """
    # Seed the list with an empty, correctly-labelled frame so that an empty
    # input still yields a frame with the expected columns.
    per_document_frames = [pd.DataFrame(columns=["_id", "chunk_id", "chunk_text", "embeddings"])]

    for _, row in tqdm(df.iterrows(), total=len(df)):
        chunks = simple_chunking(text=row["context"], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        embeddings = [compute_text_embedding(chunk) for chunk in chunks]

        per_document_frames.append(pd.DataFrame({
            "_id": [row["_id"]] * len(chunks),
            "chunk_id": range(len(chunks)),
            "chunk_text": chunks,
            "embeddings": embeddings
        }))

    # Concatenate once at the end: concatenating onto a growing frame inside the
    # loop re-copies every previously collected row on each iteration.
    return pd.concat(per_document_frames, ignore_index=True)

In [8]:
# Fixed-size chunking of every LongBench context: chunk_size=512, chunk_overlap=64.
df_longbench_context_chunked_simple = apply_simple_chunking(df=df_longbench_filtered, chunk_size=512, chunk_overlap=64)

100%|██████████| 5/5 [03:53<00:00, 46.67s/it]


In [None]:
# df_longbench_context_chunked_simple.to_csv('../data/longbench_context_chunked_simple.csv', sep="§", index=False)

# Apply Sentence Chunking for filtered LongBench

In [21]:
import re

def sentence_chunking(text):
    """
    Split text into sentences at runs of spaces that follow '.', '!' or '?'.

    Only space characters act as split points; a newline or tab after the
    punctuation does not start a new sentence. Each sentence keeps its
    terminating punctuation.

    NOTE(review): this definition rebinds the name `sentence_chunking` that the
    setup cell imports from utils.chunking_utils, so later cells use this version.

    Parameters:
    text: Input string

    Returns:
    list[str]: Sentences in their original order. Empty and whitespace-only
    pieces (e.g. from an empty input or trailing spaces after the final
    punctuation mark) are dropped so they are never passed on for embedding.
    """
    sentence_endings = re.compile(r'(?<=[.!?]) +')
    # Filter on the stripped value but return the piece unmodified.
    return [piece for piece in sentence_endings.split(text) if piece.strip()]

In [22]:
def apply_sentence_chunking(df):
    """
    Sentence-based chunking for LongBench dataset.

    Every row's 'context' is split with `sentence_chunking`, and each resulting
    sentence is embedded with `compute_text_embedding`.

    Parameters:
    df: Input DataFrame containing '_id' and 'context'

    Returns:
    pd.DataFrame: Chunked dataset with columns [_id, chunk_id, chunk_text, embeddings].
    There is one row per sentence and chunk_id restarts at 0 for every document.
    """
    # Seed the list with an empty, correctly-labelled frame so that an empty
    # input still yields a frame with the expected columns.
    per_document_frames = [pd.DataFrame(columns=["_id", "chunk_id", "chunk_text", "embeddings"])]

    for _, row in tqdm(df.iterrows(), total=len(df)):
        chunks = sentence_chunking(text=row["context"])
        embeddings = [compute_text_embedding(chunk) for chunk in chunks]

        per_document_frames.append(pd.DataFrame({
            "_id": [row["_id"]] * len(chunks),
            "chunk_id": range(len(chunks)),
            "chunk_text": chunks,
            "embeddings": embeddings
        }))

    # Concatenate once at the end: concatenating onto a growing frame inside the
    # loop re-copies every previously collected row on each iteration.
    return pd.concat(per_document_frames, ignore_index=True)

# Sentence-level chunking (plus one embedding per sentence) of every LongBench context.
df_longbench_context_chunked_sentence = apply_sentence_chunking(df=df_longbench_filtered)

  0%|          | 0/303 [02:46<?, ?it/s]


KeyboardInterrupt: 