# Setup

In [1]:
import sys

# Make the parent directory importable so the `utils` package imported below
# resolves (presumably this notebook lives one level below the project root —
# confirm). The membership check keeps re-running the cell from appending
# duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

import pandas as pd 
import csv

# Raise the csv module's per-field size cap as high as the platform allows.
# The LongBench `context` fields are far larger than the default limit.
# `sys.maxsize` does not fit in a C long on every platform (e.g. 64-bit
# Windows), where `csv.field_size_limit` raises OverflowError, so halve the
# candidate until one is accepted. Where `sys.maxsize` fits, the first attempt
# succeeds and the result is the same as calling it directly.
_csv_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_csv_field_limit)
        break
    except OverflowError:
        _csv_field_limit //= 2

from tqdm import tqdm
from statistics import mean
from utils.embeddings_utils import compute_text_embedding 
from utils.chunking_utils import adaptive_semantic_chunking, simple_chunking, sentence_chunking

import warnings
# Hide FutureWarning messages for the rest of the session so they do not
# clutter cell output. Only the FutureWarning category is silenced; other
# warning categories are still shown.
warnings.filterwarnings("ignore", category=FutureWarning)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

# Load LongBench Data

In [2]:
# Read the pre-filtered LongBench export. The file uses "§" as its column
# separator and is parsed with pandas' Python engine (presumably because of
# the non-ASCII separator — confirm before switching engines).
df_longbench_filtered = pd.read_csv(
    "../data/longbench_filtered.csv",
    sep="§",
    engine="python",
)

# Preview the first rows as the cell's rich output.
df_longbench_filtered.head()

Unnamed: 0,_id,domain,sub_domain,difficulty,length,question,choice_A,choice_B,choice_C,choice_D,answer,context,context_tokens,within_context_window
0,66f36490821e116aacb2cc22,Single-Document QA,Financial,easy,short,"According to the report, how to promote the co...","Through technology empowerment, change the way...","Establish new types of courts, such as intelle...",Improve the work ability of office staff and s...,Use advanced information systems to improve th...,D,Contents\nPreface.\n.............................,38133,True
1,66ebed525a08c7b9b35e1cb4,Single-Document QA,Academic,hard,short,"When Miller tried to answer the question ""shou...",Each must read for himself or herself and test...,Readers must reach a high standrad to some deg...,"It is the readers' obligation to get the ""trut...",The performative interpretation of language tr...,B,Chapter Five\nJOSEPH CONRAD:\nSHOULD WE READ\n...,24007,True
2,671b3cabbb02136c067d5252,Long-dialogue History Understanding,Agent history QA,hard,short,Which player got the least utility in the game?,player_1,player_3,player_5,player_7,B,"{\n ""meta"": {\n ""name_exp"": ""gemini-1.0-pr...",43168,True
3,66ec0c4c821e116aacb1994a,Multi-Document QA,Academic,easy,medium,Which of the following statements is correct?,Both contractor data and data crawled from the...,All machine learning methods involved in the t...,Both voyager and VPT control Minecraft agents ...,VPT's modeling of action space is approximate ...,D,Video PreTraining (VPT): Learning to Act by\nW...,67185,True
4,66f920d8bb02136c067c4b81,Single-Document QA,Literary,hard,medium,What is mainly symbolized by the frequent chol...,Confusion of The Times,The impermanence of the character's fate,Love is dangerous and uncontrollable,Social indifference,C,Chapter 1\nIT WAS INEVITABLE: the scent of bit...,85218,True


In [3]:
# Sanity check: (rows, columns) of the filtered LongBench frame.
df_longbench_filtered.shape

(303, 14)

# Apply Adaptive Semantic Chunking for filtered LongBench

In [4]:
# Chunk each document's context with adaptive semantic chunking and build a
# frame with one row per chunk: (_id, chunk_id, chunk_text, embeddings).
#
# Only the first 5 documents are processed; the recorded run took about
# 9 minutes for those 5. Take the slice once so the loop and the progress-bar
# total cannot drift apart.
docs_to_chunk = df_longbench_filtered[:5]

# Collect plain dict records and build the DataFrame once at the end.
# Concatenating a one-row frame per chunk copies the whole frame each time,
# which is quadratic in the number of chunks.
chunk_records = []

for _, row in tqdm(docs_to_chunk.iterrows(), total=len(docs_to_chunk)):
    doc_id = row["_id"]  # named `doc_id` so the builtin `id` is not shadowed
    chunk_list, embeddings_list, cosine_similarity_list = adaptive_semantic_chunking(
        row["context"], similarity_threshold=0.75
    )

    # Report the document's mean similarity. `statistics.mean` raises
    # StatisticsError on empty input, so skip the report when there is
    # nothing to average.
    if len(cosine_similarity_list) > 0:
        print(mean(cosine_similarity_list))

    # chunk_list and embeddings_list are indexed in parallel: embedding i
    # belongs to chunk i.
    for chunk_id, chunk_text in enumerate(chunk_list):
        chunk_records.append(
            {
                "_id": doc_id,
                "chunk_id": chunk_id,
                "chunk_text": chunk_text,
                "embeddings": embeddings_list[chunk_id],
            }
        )

# Passing `columns` fixes the column order and yields an empty frame with the
# expected columns if no chunks were produced.
df_longbench_context_chunked_asc = pd.DataFrame(
    chunk_records, columns=["_id", "chunk_id", "chunk_text", "embeddings"]
)

  0%|          | 0/5 [00:00<?, ?it/s]

text_chunks length 169


 20%|██        | 1/5 [01:32<06:09, 92.44s/it]

output_chunks_list length 27
0.7060866950432272
text_chunks length 105


 40%|████      | 2/5 [02:13<03:06, 62.10s/it]

output_chunks_list length 55
0.6470268667754439
text_chunks length 228


 60%|██████    | 3/5 [04:12<02:55, 88.00s/it]

output_chunks_list length 11
0.8457554592341351
text_chunks length 287


 80%|████████  | 4/5 [06:17<01:42, 102.78s/it]

output_chunks_list length 104
0.6713605300739648
text_chunks length 376


100%|██████████| 5/5 [09:00<00:00, 108.07s/it]

output_chunks_list length 183
0.6500917495535873





In [5]:
# Preview the first rows of the per-chunk frame (one row per chunk with its embedding).
df_longbench_context_chunked_asc.head()

Unnamed: 0,_id,chunk_id,chunk_text,embeddings
0,66f36490821e116aacb2cc22,0,Contents\nPreface.\n.............................,"[[-0.0235137939453125, 0.033905029296875, -0.0..."
1,66f36490821e116aacb2cc22,1,By following the \nidea of separating complica...,"[[-0.072509765625, -0.0204620361328125, -0.010..."
2,66f36490821e116aacb2cc22,2,Shenzhen Intermediate \nPeople’s Court in Guan...,"[[-0.0416259765625, -0.0021457672119140625, -0..."
3,66f36490821e116aacb2cc22,3,Siming District People’s Court and Lujiang Not...,"[[-0.06927490234375, -0.0158233642578125, -0.0..."
4,66f36490821e116aacb2cc22,4,The \nSupreme People’s Court has issued the re...,"[[-0.0762939453125, -0.00177764892578125, -0.0..."


# Apply Simple Chunking for filtered LongBench

In [None]:
# TODO (issue #1): apply `simple_chunking` to the filtered LongBench contexts here.

# Apply Sentence Chunking for filtered LongBench

In [None]:
# TODO (issue #2): apply `sentence_chunking` to the filtered LongBench contexts here.