In [None]:
import json
import os

import evaluate
import torch
from datasets import Dataset, load_from_disk
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

In [None]:
# Load the processed datasets you prepared in data_preparation.ipynb.
# `load_from_disk` is provided by the `datasets` package.
PROCESSED_DIR = "../data/processed"  # single place to change the data location

train_dataset = load_from_disk(f"{PROCESSED_DIR}/train_dataset")
val_dataset = load_from_disk(f"{PROCESSED_DIR}/val_dataset")
test_dataset = load_from_disk(f"{PROCESSED_DIR}/test_dataset")

print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(val_dataset)}")
print(f"Test examples: {len(test_dataset)}")

# Fail here with a readable message rather than with a bare IndexError on the
# sample lookup below.
assert len(train_dataset) > 0, "train_dataset is empty - re-run data_preparation.ipynb"

# Examine a sample to verify data structure
print("Sample training example:", train_dataset[0])

In [None]:
# Load the tokenizer that matches the checkpoint being fine-tuned.
# NOTE(review): the id below is the DeepSeek LLM 7B *base* checkpoint, not an
# R1 model - replace it if an R1 checkpoint is what you intend to train.
model_id = "deepseek-ai/deepseek-llm-7b-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Fall back to EOS for padding only when the tokenizer ships without a pad
# token. The language-modeling collator used for training ignores label
# positions equal to the pad token id, so overwriting an existing pad token
# with EOS would also hide genuine EOS tokens from the loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
def format_instruction(example):
    """Render one record as a single ChatML-style training string.

    The record's ``content`` field becomes the user turn and its
    ``quiz_flashcard`` field becomes the assistant turn; each turn is wrapped
    in ``<|im_start|>`` / ``<|im_end|>`` markers, one marker per line.

    Args:
        example: Mapping with ``"content"`` and ``"quiz_flashcard"`` entries.
            Adjust the two lookups below if your data uses other field names.

    Returns:
        A dict with the single key ``"formatted_text"`` holding the rendered
        string.
    """
    # Pull both fields up front so a missing key fails before any formatting.
    input_text, output_text = example["content"], example["quiz_flashcard"]

    return {
        "formatted_text": f"<|im_start|>user\n{input_text}\n<|im_end|>\n<|im_start|>assistant\n{output_text}\n<|im_end|>"
    }

# Run `format_instruction` over every split, in the order train, validation,
# test, and bind the three results in one unpacking assignment.
train_formatted, val_formatted, test_formatted = (
    split.map(format_instruction)
    for split in (train_dataset, val_dataset, test_dataset)
)

def tokenize_function(examples, max_length=512):
    """Tokenize a batch of formatted examples to a fixed length.

    Written for use with ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping containing a ``"formatted_text"`` entry
            (a list of strings when called through a batched ``map``).
        max_length: Every sequence is truncated and padded to exactly this
            many tokens. Defaults to 512, the value that used to be
            hard-coded here; adjust it to your data length.

    Returns:
        The tokenizer's output for the batch, holding plain Python lists.
    """
    # No `return_tensors` here: `Dataset.map` re-serialises whatever is
    # returned into its own column storage, so building torch tensors first
    # only adds a conversion step. Batching into tensors is left to the data
    # collator at training time.
    return tokenizer(
        examples["formatted_text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Tokenize the train and validation splits, dropping every pre-existing column
# so only the tokenizer's outputs remain.
# `column_names` already contains "formatted_text", so it is passed as-is
# rather than prepending that column a second time.
train_tokenized = train_formatted.map(
    tokenize_function,
    batched=True,
    remove_columns=train_formatted.column_names,
)

val_tokenized = val_formatted.map(
    tokenize_function,
    batched=True,
    remove_columns=val_formatted.column_names,
)

# Print tokenized sample to verify.
# Indexing a Dataset that has no explicit output format returns plain Python
# lists, which have no `.shape`; fall back to a 1-tuple of the length so the
# check works for lists and tensors alike.
print(
    "Tokenized sample shape:",
    {k: tuple(getattr(v, "shape", (len(v),))) for k, v in train_tokenized[0].items()},
)

In [None]:
# Load the base model with 8-bit quantized weights for memory-efficient
# training. The quantization settings are passed through `BitsAndBytesConfig`;
# handing `load_in_8bit=True` straight to `from_pretrained` is deprecated in
# recent transformers releases.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
    device_map="auto",
)

# Configure LoRA for parameter-efficient fine-tuning.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # Attention projections only. NOTE(review): these module names may need
    # adjusting for your specific model architecture.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Prepare the quantized model for training, then wrap it with the LoRA adapter.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

# Check trainable parameters
model.print_trainable_parameters()

In [None]:
# Experiment tracking with Weights & Biases (optional).
# With `report_to="wandb"` the Trainer's wandb integration starts the run
# itself: the project comes from the WANDB_PROJECT environment variable and
# the run name from `run_name` below. That makes an explicit `wandb.init(...)`
# call unnecessary, so this notebook needs no `wandb` import of its own.
WANDB_PROJECT = "deepseek-quiz-flashcard-generator"
RUN_NAME = "lora-finetune-run1"
os.environ["WANDB_PROJECT"] = WANDB_PROJECT

# Configure training arguments
training_args = TrainingArguments(
    output_dir="./results/deepseek-quiz-flashcard",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,  # 4 * 8 = 32 sequences per optimizer step per device
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy`; switch if your installed version rejects it.
    evaluation_strategy="steps",
    eval_steps=100,
    # Checkpoints are saved on the same step schedule as evaluation, which
    # `load_best_model_at_end=True` relies on.
    save_strategy="steps",
    save_steps=100,
    learning_rate=1e-4,
    fp16=True,
    load_best_model_at_end=True,
    report_to="wandb",  # Change to "none" if not using wandb
    run_name=RUN_NAME,
    seed=42,
)

# Set up data collator (mlm=False selects the causal-LM objective).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
)

In [None]:
# Train the model.
# Runs training on the `trainer` built in the previous cell, using the
# arguments, datasets and collator it was constructed with. As the last
# expression in the cell, the value returned by `train()` is shown as the
# cell output.
trainer.train()

In [None]:
# Persist the fine-tuned artefacts to a single directory.
output_dir = "./models/deepseek-quiz-flashcard-generator"
os.makedirs(output_dir, exist_ok=True)

# Both objects expose `save_pretrained`; the (PEFT-wrapped) model is written
# first, then the tokenizer, into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

In [None]:
# Test the model with a sample input, using the same turn markers as the
# training text.
test_input = "Explain the concept of quantum entanglement in physics."
prompt = f"<|im_start|>user\n{test_input}\n<|im_end|>\n<|im_start|>assistant\n"

# Encode the input and remember how many tokens belong to the prompt.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
prompt_length = inputs["input_ids"].shape[1]

# Inference mode: the model is still in train mode after fine-tuning, which
# keeps dropout (including LoRA dropout) active; gradients are not needed.
model.eval()
with torch.no_grad():
    outputs = model.generate(
        # Keyword arguments on purpose: some PEFT wrapper versions accept only
        # keywords in `generate`. The attention mask is passed explicitly
        # because the pad token may coincide with EOS, in which case it cannot
        # be inferred reliably from the ids.
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=512,  # total length, prompt included
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

# Full decoded sequence (prompt + completion), kept for inspection.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

# Extract only the assistant's response by slicing off the prompt tokens
# rather than string-splitting the full text, so a second assistant marker in
# the completion cannot shift the cut point. Then stop at the first end marker.
completion_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=False)
response = completion_text.split("<|im_end|>")[0].strip()

print("Sample input:", test_input)
print("Generated quiz/flashcard:", response)