CHUNK: guardian_articles

In [4]:
# !pip install transformers nltk pandas tqdm
import nltk

# Fetch the sentence-tokenizer data that nltk.sent_tokenize relies on.
# nltk.download() normally reports a failure by returning False (after printing
# its own error text) instead of raising, so the return value is checked as well.
# Each resource gets its own try block so that a problem with one does not
# prevent the other from being downloaded.
for resource in ('punkt', 'punkt_tab'):  # !!! punkt_tab
    try:
        if not nltk.download(resource):
            print(f"NLTK resource '{resource}' could not be downloaded")
    except Exception as e:
        # Best effort: report and keep the notebook running.
        print(e)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/wenhaozhou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/wenhaozhou/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Fixes and upgrades:
1. With "chunk_size = 512" in a character-based splitter, each chunk holds 512 characters, not 512 tokens. We therefore measure length with "tokenizer.encode", so that the 512 limit is counted in real model tokens.
2. "RecursiveCharacterTextSplitter" might cut a sentence in the middle. We use "nltk.sent_tokenize" to split on sentence boundaries, so each chunk is built from whole sentences and less information is lost.

In [5]:
# 0. import
import pandas as pd
import nltk
from transformers import AutoTokenizer
from tqdm import tqdm

# 1. get ready
# Make sure the sentence-tokenizer data is available locally. Both 'punkt' and
# 'punkt_tab' are checked, matching the resources downloaded in the first cell,
# so this cell also works when it is run on its own.
for resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{resource}')
    except LookupError:
        nltk.download(resource)

# Flan-T5 Tokenizer (used by chunk_text to count tokens)
model_checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def chunk_text(text, max_length=512):
    """
    Split long texts into segments of no more than ``max_length`` tokens,
    keeping sentences intact wherever possible.

    Sentences come from ``nltk.tokenize.sent_tokenize`` and are measured with
    the module-level ``tokenizer`` (special tokens are not counted). Consecutive
    sentences are packed greedily into a chunk and joined with single spaces.
    The budget is enforced on the sum of the per-sentence token counts.

    A sentence that is longer than ``max_length`` tokens on its own cannot be
    kept intact. It is split at whitespace into word-aligned pieces so that it
    does not produce an oversized chunk (whitespace inside such a sentence is
    normalised to single spaces). Only a single whitespace-free word longer
    than ``max_length`` tokens can still exceed the budget.

    Args:
        text: The text to split. Non-string or blank input yields ``[]``.
        max_length: Token budget per chunk (default 512).

    Returns:
        list[str]: The chunks, in the order they appear in ``text``.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    def count_tokens(piece):
        # The leading space mirrors how the piece sits after " ".join(...).
        return len(tokenizer.encode(" " + piece, add_special_tokens=False))

    def split_long_sentence(sentence):
        """Break one over-long sentence into (piece, token_count) pairs within the budget."""
        pieces = []
        words = []
        length = 0
        for word in sentence.split():
            word_tokens = count_tokens(word)
            if words and length + word_tokens > max_length:
                pieces.append((" ".join(words), length))
                words, length = [], 0
            words.append(word)
            length += word_tokens
        if words:
            pieces.append((" ".join(words), length))
        return pieces

    chunks = []
    current_chunk = []
    current_length = 0

    # A. split sentences
    for sentence in nltk.tokenize.sent_tokenize(text):
        # B. calculate the token numbers
        token_count = count_tokens(sentence)

        # A sentence that alone exceeds the budget is broken into smaller units;
        # every other sentence is a single unit.
        if token_count > max_length:
            units = split_long_sentence(sentence)
        else:
            units = [(sentence, token_count)]

        for unit, unit_tokens in units:
            # C. would this unit push the current chunk over the threshold?
            if current_chunk and current_length + unit_tokens > max_length:
                chunks.append(" ".join(current_chunk))
                current_chunk, current_length = [], 0
            current_chunk.append(unit)
            current_length += unit_tokens

    # save the trailing chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# 2. read our guardian_articles
input_file = "guardian_articles.csv"
df = pd.read_csv(input_file)

# len(df) is the number of rows (articles) in the CSV, not a token count.
print(f"original articles amount: {len(df)}")

# 3. start splitting
new_rows = []

print("cutting...")
# Only the 'text' column is used, so iterate over it directly instead of
# building a full row Series per article with iterrows().
for index, full_text in tqdm(df['text'].items(), total=len(df)):
    # Skip missing (NaN) and empty articles.
    if pd.isna(full_text) or not full_text:
        continue

    # make chunks
    chunked_texts = chunk_text(full_text, max_length=512)

    # save our results
    for i, chunk in enumerate(chunked_texts):
        new_rows.append({
            "original_index": index,   # row label of the source article in df
            "chunk_id": f"{index}_{i}",
            "chunk_index": i,          # chunk sequence within the article
            "text": chunk              # text of this chunk
        })

# 4. generate our new file
# Explicit columns keep the CSV header stable even when no chunk was produced.
df_chunked = pd.DataFrame(
    new_rows,
    columns=["original_index", "chunk_id", "chunk_index", "text"],
)
output_file = "guardian_articles_chunked.csv"
df_chunked.to_csv(output_file, index=False)

print("\n✅ mission accomplished！")
print(f"count our chunks: {len(df_chunked)}")
print(f"save as: {output_file}")

# small check
df_chunked.head()

original tokens amount: 360
cutting...


100%|████████████████████████████████████████| 360/360 [00:00<00:00, 614.36it/s]


✅ mission accomplished！
count our chunks: 1138
save as: guardian_articles_chunked.csv





Unnamed: 0,original_index,chunk_id,chunk_index,text
0,0,0_0,0,Celso Amorim joined Brazil’s foreign service n...
1,0,0_1,1,“Things which have been building up for years ...
2,0,0_2,2,Mass demonstrations rocked Mexico in 2014 afte...
3,0,0_3,3,“There hasn’t yet been a popular reaction [aga...
4,1,1_0,0,Protesters in Iraq have dealt a symbolic blow ...
