# Check 1: Verify Variable Sequence Lengths

In [11]:
from datasets import load_from_disk

import numpy as np


# Read the three pre-processed splits back from disk.
train = load_from_disk("data/processed/train")
val = load_from_disk("data/processed/val")
test = load_from_disk("data/processed/test")

# Token count (length of input_ids) for every example, one list per split.
train_lengths = [len(row['input_ids']) for row in train]
val_lengths = [len(row['input_ids']) for row in val]
test_lengths = [len(row['input_ids']) for row in test]

rule = "=" * 50
print(rule)
print("üìä SEQUENCE LENGTH STATISTICS")
print(rule)

# The training split gets the full summary; its extremes are reused in the
# verdict at the bottom of the cell.
print("\nüîπ TRAIN SET:")
shortest, longest = min(train_lengths), max(train_lengths)
print(f"  Min length: {shortest}")
print(f"  Max length: {longest}")
print(f"  Average: {np.mean(train_lengths):.1f}")
print(f"  Median: {np.median(train_lengths):.1f}")
print(f"  Std Dev: {np.std(train_lengths):.1f}")

# Validation and test only report min / max / mean.
for heading, split_lengths in (
    ("\nüîπ VAL SET:", val_lengths),
    ("\nüîπ TEST SET:", test_lengths),
):
    print(heading)
    print(f"  Min length: {min(split_lengths)}")
    print(f"  Max length: {max(split_lengths)}")
    print(f"  Average: {np.mean(split_lengths):.1f}")

# Verdict (training split only): a single distinct length is reported as a
# warning, any variation is reported as good.
distinct_lengths = set(train_lengths)
if len(distinct_lengths) != 1:
    print(f"\n‚úÖ GOOD: Found {len(distinct_lengths)} different sequence lengths")
    print(f"   Range: {shortest} to {longest} tokens")
else:
    print("\n‚ùå WARNING: All sequences are the SAME length!")
    print(f"   All sequences are {train_lengths[0]} tokens")

print(rule)

üìä SEQUENCE LENGTH STATISTICS

üîπ TRAIN SET:
  Min length: 5
  Max length: 223
  Average: 54.0
  Median: 31.0
  Std Dev: 47.0

üîπ VAL SET:
  Min length: 6
  Max length: 223
  Average: 54.1

üîπ TEST SET:
  Min length: 6
  Max length: 213
  Average: 53.8

‚úÖ GOOD: Found 216 different sequence lengths
   Range: 5 to 223 tokens


# Check 2: Analyze Length Distribution

In [15]:
import numpy as np
from collections import Counter

print("\n" + "=" * 60)
print("üìä DETAILED SEQUENCE LENGTH ANALYSIS")
print("=" * 60)

# Get lengths (recomputed so this cell only needs `train` to be loaded)
train_lengths = [len(ex['input_ids']) for ex in train]
total = len(train_lengths)

# Basic stats
print("\nüìà Basic Statistics:")
print(f"   Total examples: {total}")
print(f"   Min length: {min(train_lengths)}")
print(f"   Max length: {max(train_lengths)}")
print(f"   Average: {np.mean(train_lengths):.1f}")
print(f"   Median: {np.median(train_lengths):.1f}")
print(f"   Std Dev: {np.std(train_lengths):.1f}")
print(f"   Unique lengths: {len(set(train_lengths))}")

# Distribution buckets: each entry is (inclusive lower bound, exclusive upper
# bound, label), i.e. a length L lands in the bucket where lower <= L < upper.
print("\nüìä Length Distribution:")
buckets = [
    (0, 50, "Very Short"),
    (50, 100, "Short"),
    (100, 150, "Medium"),
    (150, 200, "Long"),
    (200, 300, "Very Long"),
    (300, 600, "Extra Long")
]

bucketed = 0  # running total of examples that landed in some bucket
for min_len, max_len, label in buckets:
    count = sum(1 for length in train_lengths if min_len <= length < max_len)
    bucketed += count
    pct = count / total * 100
    bar = "‚ñà" * int(pct / 2)  # Visual bar
    print(f"   {label:12} ({min_len:3}-{max_len:3}): {count:6} ({pct:5.1f}%) {bar}")

# Anything at or beyond the last upper bound matches no bucket above. Report it
# explicitly instead of dropping it silently, so the percentages always add up.
# The row is only printed when it is non-empty.
overflow = total - bucketed
if overflow:
    pct = overflow / total * 100
    bar = "‚ñà" * int(pct / 2)
    print(f"   {'Out of range':12} ({buckets[-1][1]:3}+   ): {overflow:6} ({pct:5.1f}%) {bar}")

# Most common lengths
print("\nüî¢ Top 10 Most Common Lengths:")
length_counts = Counter(train_lengths)
for length, count in length_counts.most_common(10):
    pct = count / total * 100
    print(f"   {length:3} tokens: {count:5} times ({pct:5.2f}%)")

print("=" * 60)


üìä DETAILED SEQUENCE LENGTH ANALYSIS

üìà Basic Statistics:
   Total examples: 96000
   Min length: 5
   Max length: 223
   Average: 54.0
   Median: 31.0
   Std Dev: 47.0
   Unique lengths: 216

üìä Length Distribution:
   Very Short   (  0- 50):  59663 ( 62.1%) ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
   Short        ( 50-100):  18885 ( 19.7%) ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
   Medium       (100-150):  11204 ( 11.7%) ‚ñà‚ñà‚ñà‚ñà‚ñà
   Long         (150-200):   6122 (  6.4%) ‚ñà‚ñà‚ñà
   Very Long    (200-300):    126 (  0.1%) 
   Extra Long   (300-600):      0 (  0.0%) 

üî¢ Top 10 Most Common Lengths:
    25 tokens:  2502 times ( 2.61%)
    22 tokens:  2487 times ( 2.59%)
    24 tokens:  2445 times ( 2.55%)
    23 tokens:  2438 times ( 2.54%)
    21 tokens:  2330 times ( 2.43%)
    20 tokens:  2290 times ( 2.39%)
    12 tokens:  2289 times ( 2.38%)
    27 tokens:  2198 times ( 2.29%)
    26 tokens:  2197 times ( 2.29%)
    19 to

# Check 3: Sample Examples with Different Lengths

In [13]:
# Print the shortest, the median-position and the longest training example.
banner = "=" * 50
print("\n" + banner)
print("üìù SAMPLE EXAMPLES (Different Lengths)")
print(banner)

# Pair every example with its dataset index, then order the pairs by token
# count. list.sort is stable, so ties keep their dataset order.
sorted_train = list(enumerate(train))
sorted_train.sort(key=lambda pair: len(pair[1]['input_ids']))

# Short example: first entry after sorting.
short_idx, short_ex = sorted_train[0]
print(
    f"\nüîπ SHORT Example (Index {short_idx}):",
    f"   Length: {len(short_ex['input_ids'])} tokens",
    f"   Text: {short_ex['text'][:200]}...",
    sep="\n",
)

# Medium example: the entry at the middle position of the sorted list.
mid_idx, mid_ex = sorted_train[len(sorted_train) // 2]
print(
    f"\nüîπ MEDIUM Example (Index {mid_idx}):",
    f"   Length: {len(mid_ex['input_ids'])} tokens",
    f"   Text: {mid_ex['text'][:200]}...",
    sep="\n",
)

# Long example: last entry after sorting.
long_idx, long_ex = sorted_train[-1]
print(
    f"\nüîπ LONG Example (Index {long_idx}):",
    f"   Length: {len(long_ex['input_ids'])} tokens",
    f"   Text: {long_ex['text'][:200]}...",
    sep="\n",
)

print(banner)


üìù SAMPLE EXAMPLES (Different Lengths)

üîπ SHORT Example (Index 632):
   Length: 5 tokens
   Text: Ref 317299...

üîπ MEDIUM Example (Index 47015):
   Length: 31 tokens
   Text: Ship no Unit 0131 Box 8486, DPO AE 09091 for Michael Brooks from Wilson PLC by 1980-11-26O...

üîπ LONG Example (Index 52168):
   Length: 223 tokens
   Text: SSN: 308-28-3803; Phone:Y9 4 1 3 5 1 6 5 8 5 3 5 0g Email: jeffreywillis@exampoe.lom. SSn: 669-51-8018; Phone: 1 9 8 6 8 9 3 0 7 1 1 0 2 6 5; Email: nicholas29(at)example.com. yO snyone know how to co...


# Check 4: Verify No Padding Tokens

In [21]:
# Check if there are padding tokens in the data
print("\n" + "=" * 100)
print("üîç CHECKING FOR PADDING TOKENS")
print("=" * 100)

# ID of the padding token for the tokenizer that produced this dataset.
# NOTE(review): 0 is NOT RoBERTa's pad id (RoBERTa uses <s>=0, <pad>=1). The
# sample row printed later in this notebook shows [CLS]=1 and [SEP]=2, which
# is consistent with a DeBERTa-style vocabulary where [PAD]=0.
# TODO: confirm against tokenizer.pad_token_id instead of hard-coding.
pad_token_id = 0

has_padding = False
for i, ex in enumerate(train):
    # Two independent signals: the pad id appearing among the input ids, or a
    # zero in the attention mask (which does not depend on the tokenizer's
    # vocabulary layout).
    if pad_token_id in ex['input_ids'] or 0 in ex['attention_mask']:
        has_padding = True
        print(f"‚ùå Found padding in example {i}")
        print(f"   Input IDs: {ex['input_ids']}")
        break  # one offending example is enough to fail the check

if not has_padding:
    print("‚úÖ GOOD: No padding tokens found in the dataset!")
    print("   Padding will be handled dynamically by DataCollator during training.")

print("=" * 100)




üîç CHECKING FOR PADDING TOKENS
‚úÖ GOOD: No padding tokens found in the dataset!
   Padding will be handled dynamically by DataCollator during training.


In [None]:
# Spot-check the first training row: the three per-token columns must all have
# the same length. The full row is then printed for manual inspection.
ex = train[0]
n_tokens = len(ex["input_ids"])
mask_len = len(ex["attention_mask"])
assert n_tokens == mask_len and mask_len == len(ex["labels"])
print(ex)

{'text': 'contact Info: Olivia Hernandez, jcannon@example.org, 001 278 67080566809', 'spans': [{'end': 30, 'label': 'PERSON', 'start': 14}, {'end': 51, 'label': 'EMAIL', 'start': 32}, {'end': 72, 'label': 'PHONE', 'start': 53}], 'input_ids': [1, 32233, 17883, 35, 11924, 7816, 6, 1236, 438, 17375, 1039, 46781, 4, 1957, 6, 16273, 134, 37481, 38679, 34249, 4280, 36621, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, 0, 17, 19, 0, 1, 2, 2, 2, 2, 2, 3, 0, 5, 6, 6, 6, 6, 6, 7, -100], 'tokens': ['[CLS]', 'contact', 'ƒ†Info', ':', 'ƒ†Olivia', 'ƒ†Hernandez', ',', 'ƒ†j', 'c', 'annon', '@', 'example', '.', 'org', ',', 'ƒ†00', '1', 'ƒ†278', 'ƒ†670', '805', '66', '809', '[SEP]']}


In [None]:
import json

# Load the label -> id mapping stored next to the processed splits.
with open("data/processed/label2id.json") as f:
    label2id = json.load(f)
max_label_id = max(label2id.values())

# Every label must be an id from label2id, or the ignore value -100 that the
# sample row in this notebook carries on its [CLS]/[SEP] positions.
valid_label_ids = set(label2id.values())
ignore_index = -100

# Validate every row of every split, not just a single example, so that the
# final message is actually backed by the whole dataset.
for split_name, split in (("train", train), ("val", val), ("test", test)):
    for row_idx, row in enumerate(split):
        unknown = sorted(
            {lab for lab in row["labels"] if lab != ignore_index and lab not in valid_label_ids}
        )
        assert not unknown, (
            f"{split_name}[{row_idx}] has label ids not in label2id: {unknown} "
            f"(max known id is {max_label_id})"
        )
print("All checks passed!")

All checks passed!
