## 1- Set up

In [None]:
import sys
from transformers import AutoTokenizer
if '..' not in sys.path:
    sys.path.append('..')
import pandas as pd 
import csv
csv.field_size_limit(sys.maxsize)

from tqdm import tqdm
from statistics import mean
from utils.embeddings_utils import compute_text_embedding 
from utils.chunking_utils import adaptive_semantic_chunking, simple_chunking, sentence_chunking

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [4]:
# Load the filtered LongBench table; the file is delimited with "§".
# NOTE(review): engine='python' is presumably required for this non-ASCII
# delimiter, and the csv field size limit raised in the setup cell presumably
# exists so the very long `context` fields can be parsed — confirm.
df_longbench_filtered = pd.read_csv('../data/longbench_filtered.csv', delimiter="§", engine='python')
# Preview the first three rows.
df_longbench_filtered.head(3)

Unnamed: 0,_id,domain,sub_domain,difficulty,length,question,choice_A,choice_B,choice_C,choice_D,answer,context,context_tokens,within_context_window
0,66f36490821e116aacb2cc22,Single-Document QA,Financial,easy,short,"According to the report, how to promote the co...","Through technology empowerment, change the way...","Establish new types of courts, such as intelle...",Improve the work ability of office staff and s...,Use advanced information systems to improve th...,D,Contents\nPreface.\n.............................,38133,True
1,66ebed525a08c7b9b35e1cb4,Single-Document QA,Academic,hard,short,"When Miller tried to answer the question ""shou...",Each must read for himself or herself and test...,Readers must reach a high standrad to some deg...,"It is the readers' obligation to get the ""trut...",The performative interpretation of language tr...,B,Chapter Five\nJOSEPH CONRAD:\nSHOULD WE READ\n...,24007,True
2,671b3cabbb02136c067d5252,Long-dialogue History Understanding,Agent history QA,hard,short,Which player got the least utility in the game?,player_1,player_3,player_5,player_7,B,"{\n ""meta"": {\n ""name_exp"": ""gemini-1.0-pr...",43168,True


In [5]:
# Inspect the raw `context` column, i.e. the text that gets chunked below.
df_longbench_filtered['context']

0      Contents\nPreface.\n.............................
1      Chapter Five\nJOSEPH CONRAD:\nSHOULD WE READ\n...
2      {\n  "meta": {\n    "name_exp": "gemini-1.0-pr...
3      Video PreTraining (VPT): Learning to Act by\nW...
4      Chapter 1\nIT WAS INEVITABLE: the scent of bit...
                             ...                        
298    WebGPT: Browser-assisted question-answering wi...
299    [\n    [\n        {\n            "role": "user...
300    Preprint. Under review.\nAutonomous Evaluation...
301    CROUCH END, LONDON A bottle of wine. A family-...
302    Article\nAccurate structure prediction of \nbi...
Name: context, Length: 303, dtype: object

## 2- Simple Chunking Function
- `chunk_size` is set to 512 tokens, a common practice for LLM pipelines
- `overlap` is set to 64 tokens so that consecutive chunks share context, which reduces information loss at chunk boundaries
- Key reference: https://www.geeksforgeeks.org/how-to-chunk-text-data-a-comparative-analysis/

In [None]:
# Module-level tokenizer used by simple_chunking() both to split text into
# tokens and to join each token window back into a string.
# The recorded outputs show lowercased text plus "[UNK]" and "##" pieces in
# chunk_text, so chunks are not verbatim slices of the source document.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def simple_chunking(text, chunk_size=512, overlap=64):
    """
    Split ``text`` into fixed-size, overlapping windows of tokens.

    The text is tokenized with the module-level ``tokenizer``. Each window of
    up to ``chunk_size`` tokens is converted back into a string, and the window
    start advances by ``chunk_size - overlap`` tokens per step.

    Note: this definition shadows the ``simple_chunking`` imported from
    ``utils.chunking_utils`` in the setup cell.

    Parameters:
    text: String to split.
    chunk_size: Maximum number of tokens per chunk; must be > 0.
    overlap: Number of tokens shared by consecutive chunks; must satisfy
        0 <= overlap < chunk_size.

    Returns:
    list[str]: Chunk strings in document order (empty if tokenization yields
    no tokens).

    Raises:
    ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    # Validate before tokenizing: a stride of zero or less would make the
    # window never advance (endless loop), and a negative overlap would
    # silently skip tokens between chunks.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    tokens = tokenizer.tokenize(text)
    stride = chunk_size - overlap  # step forward, retaining `overlap` tokens

    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), stride)
    ]



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [26]:
def apply_simple_chunking(df, chunk_size=512, overlap=64):
    """
    Simple fixed-size chunking with optional overlap for LongBench dataset.

    Every row's ``context`` is split with ``simple_chunking`` and each chunk is
    embedded with ``compute_text_embedding``. Results are accumulated in plain
    lists and the output frame is built once at the end, instead of
    re-concatenating a growing DataFrame for every document.

    Parameters:
    df: Input DataFrame containing '_id' and 'context'
    chunk_size: Number of tokens per chunk
    overlap: Token overlap between consecutive chunks

    Returns:
    pd.DataFrame: Chunked dataset with columns [_id, chunk_id, chunk_text, embeddings];
    one row per chunk, with chunk_id restarting at 0 for each document. An
    empty input yields an empty frame with the same columns.
    """
    doc_ids, chunk_ids, chunk_texts, embeddings = [], [], [], []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        chunks = simple_chunking(text=row["context"], chunk_size=chunk_size, overlap=overlap)

        doc_ids.extend([row["_id"]] * len(chunks))
        chunk_ids.extend(range(len(chunks)))  # position of the chunk within its document
        chunk_texts.extend(chunks)
        embeddings.extend(compute_text_embedding(chunk) for chunk in chunks)

    # `columns=` pins the column order and keeps the schema for empty results.
    return pd.DataFrame(
        {
            "_id": doc_ids,
            "chunk_id": chunk_ids,
            "chunk_text": chunk_texts,
            "embeddings": embeddings,
        },
        columns=["_id", "chunk_id", "chunk_text", "embeddings"],
    )

## 3- Test 

In [28]:
# Smoke test: chunk and embed only the first five documents before
# committing to the full run.
df_longbench_context_chunked_simple = apply_simple_chunking(
    df=df_longbench_filtered.head(5),
    chunk_size=512,
)


100%|██████████| 5/5 [05:02<00:00, 60.58s/it]


In [29]:
# Inspect the chunked result of the five-document test run.
df_longbench_context_chunked_simple


Unnamed: 0,_id,chunk_id,chunk_text,embeddings
0,66f36490821e116aacb2cc22,0,contents preface.................................,"[[-0.01959228515625, -0.00420379638671875, -0...."
1,66f36490821e116aacb2cc22,1,system and mechanism of judicial service and s...,"[[-0.0224609375, 0.044097900390625, -0.0298461..."
2,66f36490821e116aacb2cc22,2,"2019 / 03 / 01, 星 [UNK] 五 17 : 42 : 01 中 国 法 [...","[[-0.01340484619140625, 0.0162353515625, -0.04..."
3,66f36490821e116aacb2cc22,3,"assessors, defense, and judgment of the second...","[[0.00482940673828125, 0.020172119140625, -0.0..."
4,66f36490821e116aacb2cc22,4,areas. the judicial reform has become an impor...,"[[-0.0039520263671875, -0.00231170654296875, -..."
...,...,...,...,...
558,66f920d8bb02136c067c4b81,191,in the closets : until the nexttime. she would...,"[[-0.0099639892578125, 0.017486572265625, -0.0..."
559,66f920d8bb02136c067c4b81,192,"arthritis and repenting herwayward life, in th...","[[-0.0004036426544189453, 0.033294677734375, -..."
560,66f920d8bb02136c067c4b81,193,along the docks. fermina daza had heard that s...,"[[0.0285491943359375, 0.024078369140625, -0.02..."
561,66f920d8bb02136c067c4b81,194,shadows of the drawing room he did not have ti...,"[[0.0195465087890625, 0.026275634765625, -0.03..."


In [30]:
# Chunk and embed the full dataset. This is expensive: the recorded run took
# about 7h44m for 303 documents.
df_longbench_context_chunked_simple = apply_simple_chunking(
    df=df_longbench_filtered,
    chunk_size=512,
    overlap=64
)
# Persist with the "§" delimiter: this notebook reads the file back with
# sep='§', so a default comma-delimited write would not round-trip.
df_longbench_context_chunked_simple.to_csv('../data/longbench_context_chunked_simple.csv', sep="§", index=False)



100%|██████████| 303/303 [7:43:51<00:00, 91.85s/it]   


In [34]:
# Inspect the chunked result of the full run.
df_longbench_context_chunked_simple

Unnamed: 0,_id,chunk_id,chunk_text,embeddings
0,66f36490821e116aacb2cc22,0,contents preface.................................,"[[-0.01959228515625, -0.00420379638671875, -0...."
1,66f36490821e116aacb2cc22,1,system and mechanism of judicial service and s...,"[[-0.0224609375, 0.044097900390625, -0.0298461..."
2,66f36490821e116aacb2cc22,2,"2019 / 03 / 01, 星 [UNK] 五 17 : 42 : 01 中 国 法 [...","[[-0.01340484619140625, 0.0162353515625, -0.04..."
3,66f36490821e116aacb2cc22,3,"assessors, defense, and judgment of the second...","[[0.00482940673828125, 0.020172119140625, -0.0..."
4,66f36490821e116aacb2cc22,4,areas. the judicial reform has become an impor...,"[[-0.0039520263671875, -0.00231170654296875, -..."
...,...,...,...,...
37969,66f2a7a9821e116aacb2a721,191,##lter term list is available upon request. 2....,"[[-0.032470703125, -0.0443115234375, 0.0051574..."
37970,66f2a7a9821e116aacb2a721,192,"), or to a compute - matched version of esm3 -...","[[-0.0238800048828125, -0.035552978515625, -0...."
37971,66f2a7a9821e116aacb2a721,193,##3 - open achieves a mean average precision f...,"[[-0.0280914306640625, -0.0279388427734375, -0..."
37972,66f2a7a9821e116aacb2a721,194,##1 the esm3 architecture........... 22 s2 geo...,"[[-0.0116119384765625, -0.01470947265625, -0.0..."


In [32]:
# Save the chunked dataset using "§" as the delimiter (the same delimiter the
# notebook uses when reading this file back).
# NOTE(review): CSV stores the `embeddings` cells as text; if they are NumPy
# arrays, their string form may be abbreviated for large vectors — confirm the
# stored values are complete, or consider a binary format such as parquet.
df_longbench_context_chunked_simple.to_csv('../data/longbench_context_chunked_simple.csv', sep = "§", index=False)


In [41]:
# Round-trip check: reload the saved chunk file and preview it.
# `sys` and `csv` are already imported in the setup cell; they are repeated
# here so this cell can run on its own.
import sys
import csv

# Raise the csv module's field size limit before parsing with engine='python'.
# NOTE(review): presumably needed because the chunk_text / embeddings fields
# are very long — confirm.
csv.field_size_limit(sys.maxsize)
df_test = pd.read_csv(
    '../data/longbench_context_chunked_simple.csv',
    sep='§',              
    engine='python'       
)
# NOTE(review): after a CSV round trip the `embeddings` column presumably
# holds strings rather than numeric arrays and must be parsed before use — confirm.
df_test.head()


Unnamed: 0,_id,chunk_id,chunk_text,embeddings
0,66f36490821e116aacb2cc22,0,contents preface.................................,"[[-0.01959228515625, -0.00420379638671875, -0...."
1,66f36490821e116aacb2cc22,1,system and mechanism of judicial service and s...,"[[-0.0224609375, 0.044097900390625, -0.0298461..."
2,66f36490821e116aacb2cc22,2,"2019 / 03 / 01, 星 [UNK] 五 17 : 42 : 01 中 国 法 [...","[[-0.01340484619140625, 0.0162353515625, -0.04..."
3,66f36490821e116aacb2cc22,3,"assessors, defense, and judgment of the second...","[[0.00482940673828125, 0.020172119140625, -0.0..."
4,66f36490821e116aacb2cc22,4,areas. the judicial reform has become an impor...,"[[-0.0039520263671875, -0.00231170654296875, -..."
