# DeepSeek V3 Mini Training Experiments

This notebook provides an interactive environment for experimenting with training the mini DeepSeek V3 model on the Tiny Shakespeare dataset.


In [None]:
# Setup and imports
import os
import sys
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import matplotlib.pyplot as plt

# Make the sibling mini_model directory importable. The path is built from the
# current working directory, so it resolves relative to wherever the kernel
# was started. The membership check keeps this cell idempotent: re-running it
# does not stack duplicate entries onto sys.path.
mini_model_dir = os.path.join(os.getcwd(), '..', 'mini_model')
if mini_model_dir not in sys.path:
    sys.path.append(mini_model_dir)

from configuration_deepseek import DeepseekV3Config
from modeling_deepseek import DeepseekV3ForCausalLM
from data_utils import create_shakespeare_dataset, calculate_dataset_statistics

# Summarize the runtime: the PyTorch version, whether CUDA is usable and,
# when it is, the name reported by torch.cuda.get_device_name().
print(f"PyTorch version: {torch.__version__}")
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name()}")


## Load Model and Tokenizer


In [None]:
# Build the tokenizer from the files in the mini_model directory.
# trust_remote_code=True allows custom tokenizer code from that directory to run.
tokenizer_path = "../mini_model"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True, trust_remote_code=True)

# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Tokenizer loaded. Vocab size: {len(tokenizer)}")

# Read the model hyperparameters from ../mini_model/config.json.
# JSON text is UTF-8, so the encoding is stated explicitly instead of being
# left to the platform's locale default.
config_path = "../mini_model/config.json"
with open(config_path, 'r', encoding='utf-8') as f:
    config_dict = json.load(f)

# Every key in the JSON file is forwarded as a keyword argument to the config class
config = DeepseekV3Config(**config_dict)

# Instantiate the model from the config alone; this cell loads no checkpoint
# (presumably the weights are freshly initialized — confirm in modeling_deepseek).
model = DeepseekV3ForCausalLM(config)

# Count every parameter element, trainable or not
total_params = sum(p.numel() for p in model.parameters())
print(f"Model initialized. Total parameters: {total_params:,}")


## Explore the Dataset


In [None]:
# Fetch the tiny Shakespeare dataset and show its split layout
dataset = load_dataset("karpathy/tiny_shakespeare")

print("Dataset structure:")
print(dataset)

# Preview the first 500 characters of the first training record
print("\nSample from training set:")
first_train_record = dataset['train']['text'][0]
print(first_train_record[:500] + "...")

# Concatenate each split's records into one newline-separated string
train_text, val_text, test_text = (
    "\n".join(dataset[split_key]['text'])
    for split_key in ('train', 'validation', 'test')
)

# Report the character count of every split
print(f"\nDataset sizes:")
for split_label, split_text in (
    ("Train", train_text),
    ("Validation", val_text),
    ("Test", test_text),
):
    print(f"{split_label}: {len(split_text):,} characters")


## Test Tokenization and Data Loading


In [None]:
# Build the training split with the project helper, using a block size of 128
block_size = 128
train_dataset = create_shakespeare_dataset(tokenizer=tokenizer, split="train", block_size=block_size)

# Compute dataset statistics; the result is kept in `stats` and not displayed here
stats = calculate_dataset_statistics(train_dataset, tokenizer)


def _decode_preview(token_ids, limit=50):
    """Decode the first `limit` entries of `token_ids` and append an ellipsis."""
    return tokenizer.decode(token_ids[:limit]) + "..."


# Inspect the first sample: tensor shapes, then a decoded preview of each field
sample = train_dataset[0]
print(f"Sample input shape: {sample['input_ids'].shape}")
print(f"Sample labels shape: {sample['labels'].shape}")

print("\nDecoded sample input:")
print(_decode_preview(sample['input_ids']))
print("\nDecoded sample labels (shifted by 1):")
print(_decode_preview(sample['labels']))


## Quick Training Test


In [None]:
# Quick training smoke test: run a handful of optimizer steps and plot the loss
from torch.utils.data import DataLoader
import torch.nn as nn

# Number of optimizer steps to run; kept small because this is only a smoke test
max_steps = 10

# Batch samples by stacking each field into one tensor. torch.stack requires
# every sample in a batch to have the same shape — assumes the dataset yields
# equal-length blocks; TODO confirm against data_utils.
train_loader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=lambda samples: {
        "input_ids": torch.stack([s["input_ids"] for s in samples]),
        "labels": torch.stack([s["labels"] for s in samples]),
    },
)

# Move the model to its device first, then build the optimizer, so the
# optimized parameters are in a consistent location between construction and use.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)

# Train for a few steps
model.train()
losses = []

print("Running quick training test...")
# zip() stops as soon as range() is exhausted, so no batch beyond max_steps is
# fetched and collated only to be thrown away.
for step, batch in zip(range(1, max_steps + 1), train_loader):
    # Move batch to device
    batch = {k: v.to(device) for k, v in batch.items()}

    # Forward pass; the model returns the loss because labels are supplied
    outputs = model(**batch)
    loss = outputs.loss

    # Backward pass and parameter update, then clear gradients for the next step
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # Read the scalar once and reuse it for both the history and the log line
    loss_value = loss.item()
    losses.append(loss_value)
    print(f"Step {step}, Loss: {loss_value:.4f}")

# Plot losses against 1-based step numbers so the axis matches the printed log
plt.figure(figsize=(8, 4))
plt.plot(list(range(1, len(losses) + 1)), losses)
plt.xlabel("Step")
plt.ylabel("Loss")
plt.title("Quick Training Test - Loss Curve")
plt.grid(True)
plt.show()

# Guard the average: an empty dataloader would otherwise divide by zero
if losses:
    print(f"\nAverage loss: {sum(losses)/len(losses):.4f}")
else:
    print("\nNo training steps were run (the dataloader produced no batches).")


## Test Generation


In [None]:
# Sample short continuations for a few prompts (note: model is barely trained)
model.eval()

prompts = [
    "To be or not to be",
    "O Romeo, Romeo",
    "All the world's a stage",
]

# Sampling settings shared by every prompt. NOTE(review): with the standard
# Hugging Face generate(), max_length bounds prompt plus generated tokens —
# confirm the custom model uses that implementation.
generation_kwargs = dict(
    max_length=50,
    temperature=0.8,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

print("Testing generation (note: model is barely trained):\n")

separator = "-" * 50
for prompt in prompts:
    print(f"Prompt: {prompt}")

    # Encode the prompt as PyTorch tensors on the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate without tracking gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Decode the first (only) returned sequence, dropping special tokens
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Generated: {generated}")
    print(separator)
