In [16]:
from datasets import load_dataset
import pandas as pd
import re
import string
# Import tokenizer
from transformers import AutoTokenizer

# Load the tokenizer for FLAN-T5 (same model we're using)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
print("✅ Tokenizer loaded!")

✅ Tokenizer loaded!


In [17]:
ag_news = load_dataset("ag_news")
train_df = pd.DataFrame(ag_news['train'])
test_df = pd.DataFrame(ag_news['test'])

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (120000, 2)
Test shape: (7600, 2)


In [18]:
train_df.head()

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


In [19]:
sst2 = load_dataset("glue", "sst2")
print("SST-2 loaded!")
print(f"Train: {len(sst2['train'])} samples")
print(f"Test: {len(sst2['test'])} samples")

SST-2 loaded!
Train: 67349 samples
Test: 1821 samples


In [20]:
# Convert SST-2 to DataFrame
sst2_train_df = pd.DataFrame(sst2['train'])
sst2_test_df = pd.DataFrame(sst2['test'])

print(f"Columns: {sst2_train_df.columns.tolist()}")
sst2_test_df.head()

Columns: ['sentence', 'label', 'idx']


Unnamed: 0,sentence,label,idx
0,uneasy mishmash of styles and genres .,-1,0
1,this film 's relationship to actual tension is...,-1,1
2,"by the end of no such thing the audience , lik...",-1,2
3,director rob marshall went out gunning to make...,-1,3
4,lathan and diggs have considerable personal ch...,-1,4


In [22]:
# Test tokenization on a few samples
sample_texts = train_df['text'].head(3).tolist()

print("Testing tokenization on 3 samples:")
for i, text in enumerate(sample_texts):
    # Tokenize the text
    tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    
    print(f"\nSample {i+1}:")
    print(f"Original text: {text[:100]}...")
    print(f"Token IDs shape: {tokens['input_ids'].shape}")
    print(f"Number of tokens: {tokens['input_ids'].shape[1]}")
    print(f"First 10 tokens: {tokens['input_ids'][0][:10]}")

Testing tokenization on 3 samples:

Sample 1:
Original text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\b...
Token IDs shape: torch.Size([1, 49])
Number of tokens: 49
First 10 tokens: tensor([ 3556,   472,     5,  9034,     7,   205,  4207,  3195,     3, 27201])

Sample 2:
Original text: Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,...
Token IDs shape: torch.Size([1, 66])
Number of tokens: 66
First 10 tokens: tensor([ 1184,   120,   109,  3568,     7,     3, 28318,  9747, 12727,  6633])

Sample 3:
Original text: Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about th...
Token IDs shape: torch.Size([1, 55])
Number of tokens: 55
First 10 tokens: tensor([ 6067,    11, 22077,  5713,  6394,     7,    31, 19269,    41, 18844])


In [23]:
# Test batch tokenization for efficiency
print("Testing batch tokenization...")

# Take a small batch of texts
batch_texts = train_df['text'].head(10).tolist()

# Tokenize the whole batch at once
batch_tokens = tokenizer(batch_texts, 
                        return_tensors="pt", 
                        truncation=True, 
                        max_length=512,
                        padding=True)

print(f"Batch shape: {batch_tokens['input_ids'].shape}")
print(f"Processed {len(batch_texts)} texts at once!")
print("✅ Tokenization testing completed!")

Testing batch tokenization...
Batch shape: torch.Size([10, 104])
Processed 10 texts at once!
✅ Tokenization testing completed!
