# In [None]:
# data_exploration.ipynb

# 1. Imports
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
import numpy as np
import random
from transformers import AutoTokenizer

# 2. Load datasets
# The raw corpus is requested by name through `load_dataset`; the filtered and
# tokenized variants are read from directories under ../data.
print("Loading raw dataset...")
raw_dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")
print("Loading filtered dataset...")
filtered_dataset = load_from_disk(dataset_path="../data/filtered_wikitext2")
print("Loading tokenized dataset...")
tokenized_dataset = load_from_disk(dataset_path="../data/tokenized_wikitext2")

# 3. Basic info
# Print each loaded object as-is to get an overview of what it contains.
print(f"Raw splits: {raw_dataset}")
print(f"Filtered splits: {filtered_dataset}")
print(f"Tokenized splits: {tokenized_dataset}")

# 4. Show sample texts
def show_sample(dataset, split, n=3):
    """Print the first ``n`` texts of one split, numbered from 1.

    Args:
        dataset: Mapping from split name to an iterable of rows, where each
            row exposes a ``'text'`` entry.
        split: Name of the split to display; it is upper-cased in the header.
        n: Maximum number of rows to print. When the split holds fewer rows,
            only the available ones are printed.
    """
    print(f"\n--- {split.upper()} SAMPLES ---")
    # zip() stops at the shorter input, so asking for more rows than the
    # split contains prints what is there instead of raising IndexError.
    for position, row in zip(range(1, n + 1), dataset[split]):
        print(f"{position}: {row['text']}")

# Preview the head of the train split for the raw and the filtered dataset.
show_sample(dataset=raw_dataset, split='train')
show_sample(dataset=filtered_dataset, split='train')

# 5. Text length distribution (filtered)
# `str.split()` with no arguments splits on runs of whitespace, so these are
# word counts rather than tokenizer token counts; the title and x-axis label
# say "words" so the chart is not mistaken for a tokenizer statistic.
lengths = [len(sample['text'].split()) for sample in filtered_dataset['train']]
plt.figure(figsize=(8, 4))
plt.hist(lengths, bins=50, color='skyblue', edgecolor='black')
plt.title("Whitespace-delimited word count per sample (Filtered Train Split)")
plt.xlabel("Words per sample")
plt.ylabel("Frequency")
plt.show()

# 6. Tokenized sequence length distribution
# One entry per train row: the number of input IDs stored for that row.
seq_lengths = list(map(len, (row['input_ids'] for row in tokenized_dataset['train'])))
plt.figure(figsize=(8, 4))
plt.hist(x=seq_lengths, bins=30, color='salmon', edgecolor='black')
# Title and axis labels are applied to the current axes in a single call.
plt.gca().set(
    title="Input ID sequence length (Tokenized Train Split)",
    xlabel="Sequence length",
    ylabel="Frequency",
)
plt.show()

# 7. Vocabulary size
# Load the "gpt2" tokenizer and report the vocabulary size it exposes.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(f"Vocabulary size (GPT-2): {tokenizer.vocab_size}")

# 8. Show tokenized sample
# Inspect the first tokenized train row: its stored fields, then the text
# obtained by decoding its input IDs with the tokenizer loaded above.
print("\n--- Tokenized train sample ---")
sample = tokenized_dataset['train'][0]
print(f"Input IDs: {sample['input_ids']}")
print(f"Attention mask: {sample['attention_mask']}")
print(f"Decoded text: {tokenizer.decode(sample['input_ids'])}")

# 9. Random qualitative samples
# `random.sample` draws distinct indices, so no row is printed twice, and
# capping k at the split size means a split with fewer than three rows
# (including an empty one) prints what is available instead of raising.
print("\n--- Random filtered samples ---")
filtered_train = filtered_dataset['train']
for idx in random.sample(range(len(filtered_train)), k=min(3, len(filtered_train))):
    print(filtered_train[idx]['text'])

print("\n--- Random tokenized samples (decoded) ---")
tokenized_train = tokenized_dataset['train']
for idx in random.sample(range(len(tokenized_train)), k=min(3, len(tokenized_train))):
    ids = tokenized_train[idx]['input_ids']
    # Decode back to text so the sample can be read directly.
    print(tokenizer.decode(ids))