In [22]:
from datasets import load_dataset
from transformers import AutoTokenizer

# 1. Load WikiText-2 dataset
wikitext = load_dataset("wikitext", "wikitext-2-raw-v1")

# 2. Filter out empty texts and take subset
def filter_empty_text(example):
    return example['text'] is not None and len(example['text'].strip()) > 0

# Filter and select subsets
train_filtered = wikitext["train"].filter(filter_empty_text)
val_filtered = wikitext["validation"].filter(filter_empty_text)

train_subset = train_filtered.select(range(min(1000, len(train_filtered))))
val_subset = val_filtered.select(range(min(200, len(val_filtered))))

# 3. Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# 4. Corrected tokenization function
def tokenize_function(examples):
    # Ensure text is a list of strings, not nested lists
    texts = examples["text"]
    if isinstance(texts, list) and len(texts) > 0 and isinstance(texts[0], list):
        # Flatten if nested
        texts = [item for sublist in texts for item in sublist]

    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=64,
        return_tensors=None  # Don't convert to tensors yet
    )

# 5. Apply tokenization with proper error handling
try:
    tokenized_train = train_subset.map(tokenize_function, batched=True, remove_columns=["text"])
    tokenized_val = val_subset.map(tokenize_function, batched=True, remove_columns=["text"])

    # Create the tokenized_datasets dictionary
    tokenized_datasets = {
        "train": tokenized_train,
        "validation": tokenized_val
    }

    print("Tokenization successful!")
    print(f"Train dataset size: {len(tokenized_datasets['train'])}")
    print(f"Validation dataset size: {len(tokenized_datasets['validation'])}")

except Exception as e:
    print(f"Tokenization error: {e}")




Filter:   0%|          | 0/36718 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenization successful!
Train dataset size: 1000
Validation dataset size: 200


In [23]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned-fast",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,  # Reduced to avoid memory issues
    per_device_eval_batch_size=4,
    save_steps=100,
    save_total_limit=1,
    eval_strategy="no",
    logging_steps=50,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=False,
    dataloader_num_workers=0,
    remove_unused_columns=False,
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

# Start training
trainer.train()






Step,Training Loss
50,3.7031
100,3.72


Step,Training Loss
50,3.7031
100,3.72
150,3.4655
200,3.5259
250,3.4947


TrainOutput(global_step=250, training_loss=3.5818512573242187, metrics={'train_runtime': 1480.9851, 'train_samples_per_second': 0.675, 'train_steps_per_second': 0.169, 'total_flos': 32661504000000.0, 'train_loss': 3.5818512573242187, 'epoch': 1.0})

In [24]:
import math
import torch

# Evaluate the model and calculate perplexity
def evaluate_perplexity(trainer):
    # Run evaluation on validation set
    eval_results = trainer.evaluate()

    # Calculate perplexity from evaluation loss
    perplexity = math.exp(eval_results["eval_loss"])

    print(f"Evaluation Loss: {eval_results['eval_loss']:.4f}")
    print(f"Perplexity: {perplexity:.4f}")

    return perplexity

# Run perplexity evaluation
perplexity_score = evaluate_perplexity(trainer)


Evaluation Loss: 3.4518
Perplexity: 31.5566


In [31]:
def compute_top_k_accuracy_simple(model, tokenizer, dataset, k=5, batch_size=4):
    """
    Simplified top-k accuracy with manual batch processing
    """
    model.eval()

    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    correct = 0
    total = 0

    # Process samples one by one to avoid batch issues
    for i in tqdm(range(0, len(dataset), batch_size)):
        batch_samples = []

        # Collect batch samples
        for j in range(i, min(i + batch_size, len(dataset))):
            sample = dataset[j]
            if isinstance(sample["input_ids"], list):
                input_ids = torch.tensor(sample["input_ids"], dtype=torch.long)
            else:
                input_ids = sample["input_ids"].clone().long()

            if len(input_ids) > 1:  # Need at least 2 tokens for next-word prediction
                batch_samples.append(input_ids)

        if not batch_samples:
            continue

        # Pad sequences to same length
        max_len = max(len(seq) for seq in batch_samples)
        padded_batch = torch.zeros(len(batch_samples), max_len, dtype=torch.long)

        for idx, seq in enumerate(batch_samples):
            padded_batch[idx, :len(seq)] = seq
            if len(seq) < max_len:
                padded_batch[idx, len(seq):] = tokenizer.pad_token_id

        # Move to device and get predictions
        input_ids = padded_batch.to(model.device)

        with torch.no_grad():
            outputs = model(input_ids)
            logits = outputs.logits

            # Process each sequence in the batch
            for seq_idx in range(input_ids.size(0)):
                seq_input = input_ids[seq_idx]
                seq_logits = logits[seq_idx]

                # Find actual sequence length (excluding padding)
                non_pad_mask = (seq_input != tokenizer.pad_token_id)
                seq_len = non_pad_mask.sum().item()

                if seq_len < 2:
                    continue

                # Get predictions for each position
                for pos in range(seq_len - 1):
                    target_token = seq_input[pos + 1].item()
                    pos_logits = seq_logits[pos]

                    # Get top-k predictions
                    top_k_tokens = torch.topk(pos_logits, k).indices

                    if target_token in top_k_tokens:
                        correct += 1
                    total += 1

    accuracy = correct / total if total > 0 else 0.0
    return accuracy


In [33]:
# Try the robust version first
try:
    top_1_accuracy = compute_top_k_accuracy_fixed(model, tokenizer, tokenized_datasets["validation"], k=1, batch_size=4)
    top_5_accuracy = compute_top_k_accuracy_fixed(model, tokenizer, tokenized_datasets["validation"], k=5, batch_size=4)
    top_10_accuracy = compute_top_k_accuracy_fixed(model, tokenizer, tokenized_datasets["validation"], k=10, batch_size=4)

    print(f"Top-1 Accuracy: {top_1_accuracy:.4f}")
    print(f"Top-5 Accuracy: {top_5_accuracy:.4f}")
    print(f"Top-10 Accuracy: {top_10_accuracy:.4f}")

except Exception as e:
    print(f"Robust version failed: {e}")
    print("Trying simplified version...")

    # Fallback to simplified version
    top_1_accuracy = compute_top_k_accuracy_simple(model, tokenizer, tokenized_datasets["validation"], k=1, batch_size=2)
    top_5_accuracy = compute_top_k_accuracy_simple(model, tokenizer, tokenized_datasets["validation"], k=5, batch_size=2)

    print(f"Top-1 Accuracy: {top_1_accuracy:.4f}")
    print(f"Top-5 Accuracy: {top_5_accuracy:.4f}")



Robust version failed: name 'compute_top_k_accuracy_fixed' is not defined
Trying simplified version...


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Top-1 Accuracy: 0.3980
Top-5 Accuracy: 0.6019
