# DeepSeek-R1-Distill-Qwen-1.5B-SciQ
## Fine-Tuning DeepSeek-R1-Distill-Qwen-1.5B
This notebook demonstrates how to fine-tune the DeepSeek-R1-Distill-Qwen-1.5B model on the SciQ dataset for multiple-choice question answering (MCQA) tasks. The process includes setting up the environment, loading the model and tokenizer, processing the dataset, training the model, and evaluating its performance.

[![Open in Colab](https://img.shields.io/badge/Open%20in-Colab-orange?logo=google&logoColor=white)](https://colab.research.google.com/drive/1GVlPxUok2vym4Yku1-_tBmcA-STb-ouq?usp=sharing) [![Open in Kaggle](https://img.shields.io/badge/Open%20in-Kaggle-blue?logo=kaggle&logoColor=white)](https://www.kaggle.com/code/trungngthanh/deepseek-r1-distill-qwen-1-5b-mcqa)

## 1. Setting Up

In [None]:
import os
import sys
import platform

def check_environment():
    """Identify the platform this notebook is running on.

    The checks run in a fixed order and the first one that matches wins.

    Returns:
        str: "Google Colab", "Kaggle", or "Local Machine".
    """
    detectors = (
        # Colab: its helper package has already been imported
        ("Google Colab", lambda: 'google.colab' in sys.modules),
        # Kaggle: kernels expose this environment variable
        ("Kaggle", lambda: 'KAGGLE_KERNEL_RUN_TYPE' in os.environ),
        # Colab fallback (some Colab envs may not import google.colab)
        ("Google Colab", lambda: os.path.exists('/content') and 'COLAB_GPU' in os.environ),
    )
    for platform_name, is_detected in detectors:
        if is_detected():
            return platform_name

    # Assume local if neither Colab nor Kaggle
    return "Local Machine"

# Detect the environment once, then report it together with basic system details
env = check_environment()
print(
    f"Running on: {env}",
    f"Current working directory: {os.getcwd()}",
    f"Python version: {sys.version}",
    f"Platform: {platform.platform()}",
    sep="\n",
)

In [None]:
# Install required packages based on the detected environment (Kaggle, Colab, or local)
if env == "Kaggle":
    %pip install -U transformers
    %pip install -U datasets
    %pip install -U accelerate
    %pip install -U peft
    %pip install -U trl
    %pip install -U bitsandbytes
elif env == "Google Colab":
    %pip install python-dotenv
    %pip install -U transformers
    %pip install -U datasets
    %pip install -U accelerate
    %pip install -U peft
    %pip install -U trl
    %pip install -U bitsandbytes
else:
    print("Please install the required packages manually for local execution.")

In [None]:
# Import libraries for Hugging Face authentication and environment variable management
from huggingface_hub import login
import os

# python-dotenv is optional: when it is missing, only variables that are
# already present in the process environment are consulted.
try:
    from dotenv import load_dotenv
except ImportError:
    load_dotenv = None
    print("python-dotenv not installed. Please install it to manage environment variables.")

# Load environment variables from .env file (if present)
if load_dotenv is not None:
    load_dotenv()

hf_token = ''
# Retrieve Hugging Face token from the appropriate source depending on environment
if env == "Kaggle":
    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        hf_token = user_secrets.get_secret("HF_TOKEN")
    except Exception as exc:
        # Report why the secret could not be read instead of hiding the error
        print(f"HF_TOKEN is not set in Kaggle Secrets ({exc})")
    else:
        print("HF_TOKEN is set in Kaggle Secrets")
else:
    hf_token = os.environ.get("HF_TOKEN")
    # Only claim success when a token was actually found
    if hf_token:
        if env == "Google Colab":
            print("HF_TOKEN is set in Colab .env file")
        else:
            print("HF_TOKEN is set in local .env file")

# Log in to Hugging Face Hub using the token from the environment (if available)
if hf_token:
    login(hf_token)
    print("✅ Hugging Face login successful.")
else:
    print("❌ HF_TOKEN not found. Please check your .env file.")

In [None]:
import torch
# Report the installed PyTorch build, then pick the GPU when one is visible
print("Torch version:", torch.__version__)
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

In [None]:
# Check GPU availability and status by running the nvidia-smi command-line tool
!nvidia-smi

## 2. Loading the Model and Tokenizer

In [None]:
# Import the model, tokenizer, and quantization-config classes from transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


In [None]:
# Configure 4-bit quantization for efficient model loading
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit precision
    bnb_4bit_use_double_quant=False,  # double quantization is turned off
    bnb_4bit_quant_type="nf4",  # 4-bit quantization type
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute dtype used with the 4-bit weights
)

In [None]:
# Load tokenizer and model from Hugging Face Hub with quantization settings
model_dir = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)

# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run locally; confirm it is actually needed for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

# Turn the generation cache off on the model config
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like a Llama-specific config field;
# confirm it has any effect for this architecture.
model.config.pretraining_tp = 1

## 3. Loading and Processing the Dataset

In [None]:
import random

def format_sciq_prompt(example, num_questions=1, include_cot=False, rng=None):
    """Turn one SciQ record into a prompt/completion pair for MCQ generation.

    Args:
        example: Mapping with the keys 'support', 'question', 'distractor1',
            'distractor2', 'distractor3' and 'correct_answer'.
        num_questions: Number of questions requested in the prompt text. Only
            the record's single question is emitted; values above 1 append a
            placeholder note to the completion.
        include_cot: When True, append a templated one-sentence explanation.
        rng: Optional object with a ``shuffle`` method (e.g. a seeded
            ``random.Random``) used to order the options. Defaults to the
            module-level ``random`` generator.

    Returns:
        dict: ``{"prompt": str, "completion": str}``.
    """
    shuffler = random if rng is None else rng

    # Extract context and question details
    context = example['support']
    question = example['question'].rstrip()
    # Only add a question mark when the question does not already end with
    # one, so the completion never contains "??".
    if not question.endswith('?'):
        question += '?'

    # Shuffle the four options so the correct answer lands on a random label
    correct_answer = example['correct_answer']
    options = [example['distractor1'], example['distractor2'], example['distractor3'], correct_answer]
    shuffler.shuffle(options)
    option_labels = ['A', 'B', 'C', 'D']
    correct_label = option_labels[options.index(correct_answer)]

    # Create the prompt
    prompt = (
        f"Given the context: {context}\n"
        f"Generate {num_questions} multiple-choice question(s) with four options each and indicate the correct answer.\n"
    )

    # Format the completion: question, one line per option, then the answer label
    option_lines = "".join(
        f"{label}) {option}\n" for label, option in zip(option_labels, options)
    )
    completion = (
        f"Question: {question}\n"
        f"{option_lines}"
        f"Correct Answer: {correct_label}"
    )

    # If include_cot is True, add a reasoning explanation (example placeholder)
    if include_cot:
        completion += (
            f"\nExplanation: The correct answer is {correct_answer} because it directly corresponds to the information provided in the context."
        )

    # SciQ provides a single question per record, so a request for more than
    # one question only adds a placeholder note; real extra questions would
    # require external generation.
    if num_questions > 1:
        completion += (
            "\n\nNote: Additional questions would be generated here based on the context, "
            "each with four options and a correct answer."
        )

    return {"prompt": prompt, "completion": completion}

In [None]:
# Import the datasets library to load and process the training dataset
from datasets import load_dataset
from matplotlib import pyplot as plt

# Load the training split of the SciQ dataset
train_dataset = load_dataset(
    "allenai/sciq",
    split="train",
)

# FIXME: Uncomment to train on only the first 10 samples
# train_dataset = train_dataset.select(range(10))

In [None]:
import pandas as pd

# Preview the first rows of the raw training split as a DataFrame
df = pd.DataFrame(train_dataset)
df.head()

In [None]:
# Apply format_sciq_prompt to every training record, adding the
# 'prompt' and 'completion' fields it returns
formatted_train_dataset = train_dataset.map(format_sciq_prompt)

In [None]:
# Show the prompt and completion of the first formatted training example
first_formatted = formatted_train_dataset[0]
print("Prompt:\n" + first_formatted['prompt'])
print("Completion:\n" + first_formatted['completion'])

In [None]:
def tokenize_example(batch, max_length=512):
    """Tokenize a batch of prompt/completion pairs into training sequences.

    Each sequence is laid out as prompt, newline, completion, EOS token.

    Args:
        batch: Mapping with parallel lists under 'prompt' and 'completion'.
        max_length: Sequences longer than this many tokens are truncated.

    Returns:
        The tokenizer output for the batch.
    """
    texts = [
        f"{prompt}\n{completion}{tokenizer.eos_token}"
        for prompt, completion in zip(batch['prompt'], batch['completion'])
    ]
    # No padding here: sequences keep their true length so the data collator
    # can pad each batch to its longest member and length-based grouping in
    # the trainer has real lengths to work with.
    return tokenizer(texts, truncation=True, max_length=max_length)

In [None]:
# Tokenize the formatted examples in batches and drop the raw 'prompt'/'completion' text columns
tokenized_dataset = formatted_train_dataset.map(tokenize_example, batched=True, remove_columns=["prompt", "completion"])

In [None]:
# Import data collator for language modeling tasks
from transformers import DataCollatorForLanguageModeling

# Define the data collator, disabling masked language modeling (mlm)
# NOTE(review): if this tokenizer's pad token is the same as its EOS token, the
# collator presumably masks EOS positions out of the labels; verify.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

## 4. Model Inference Before Fine-Tuning

In [None]:
# inference_prompt_style = (
#     "Given the context: {}.\n"
#     "Generate 1 multiple-choice question with four options and indicate the correct answer."
# )

In [None]:
def generate_mcqa(prompt, model, tokenizer, max_length=1000):
    """Sample one continuation of ``prompt`` from ``model``.

    The model is switched to evaluation mode as a side effect, so dropout is
    inactive while generating (relevant right after training).

    Args:
        prompt: Text to continue.
        model: Causal language model exposing ``eval()``, ``device`` and
            ``generate()``.
        tokenizer: Tokenizer that matches ``model``.
        max_length: Value passed to ``generate`` as ``max_length``.

    Returns:
        list[str]: The decoded output sequences with special tokens removed.
    """
    model.eval()

    # The prompt is tokenized as-is. Appending the EOS token would mark the
    # sequence as finished before generation starts, which does not match the
    # training layout (prompt, newline, completion, EOS).
    inputs = tokenizer(
        [prompt],
        return_tensors="pt"
    ).to(model.device)

    # Pass an explicit pad id, falling back to EOS when the tokenizer has none
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        inputs.input_ids,
        max_length=max_length,
        attention_mask=inputs.attention_mask,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=pad_token_id,
        use_cache=True,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
# Generate from the first training prompt with the model as it is at this point in the notebook
prompt = formatted_train_dataset[0]['prompt']
print(prompt)
generated = generate_mcqa(prompt, model, tokenizer)

In [None]:
# Show the raw list returned by generate_mcqa
print(generated)

In [None]:
# Print the first (and only requested) generated sequence as readable text
print("Inference Before Fine-Tuning:")
print(generated[0])

## 5. Setting up the model

In [None]:
# Import LoRA configuration and model wrapping utilities
from peft import LoraConfig, get_peft_model

# LoRA config
peft_config = LoraConfig(
    lora_alpha=16,  # Scaling factor for LoRA
    lora_dropout=0.1,  # Add a slight dropout for regularization
    r=4,  # Rank of the LoRA update matrices
    # use_rslora=True,  # Stabilize scaling
    # bias="none",  # No bias reparameterization
    init_lora_weights="gaussian",  # Initialization scheme for the LoRA weights
    task_type="CAUSAL_LM",  # Task type: Causal Language Modeling
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # "gate_proj",
        # "up_proj",
        # "down_proj",
    ],  # Only the four listed projection modules get LoRA adapters
)

# Wrap the model with LoRA configuration
# (this rebinds `model`, so re-running the cell wraps the already-wrapped model again)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # Report trainable vs. total parameter counts

In [None]:
# Import the SFTTrainer for supervised fine-tuning
from trl import SFTTrainer
from transformers import TrainingArguments

# Training Arguments
training_arguments = TrainingArguments(
    output_dir="../models/DeepSeek-R1-Distill-Qwen-1.5B-SciQ",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 samples per optimizer step per device
    gradient_checkpointing=True,
    save_strategy="steps",
    save_steps=500,  # must be a step count; a boolean True is read as 1, i.e. a checkpoint after every step
    optim="paged_adamw_8bit",
    num_train_epochs=1,
    logging_steps=10,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=1e-4,
    fp16=False,  # FP16 mixed precision is off; enable only if BF16 is not supported
    bf16=False,  # BF16 mixed precision is off; enable if the GPU supports it
    group_by_length=True,
    report_to="tensorboard",  # tensorboard, wandb, etc. can be used here
)

# Initialize the Trainer
# `model` has already been wrapped by get_peft_model, so no peft_config is
# passed here: supplying both is redundant, and newer TRL releases may reject
# a PeftModel combined with a peft_config.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

## 6. Model Training

In [None]:
# Load the TensorBoard notebook extension and open it on the training log directory
%load_ext tensorboard
%tensorboard --logdir ../models/DeepSeek-R1-Distill-Qwen-1.5B-SciQ/runs  # Point to your output_dir/runs

In [None]:
# Clear GPU memory and disable cache before training
import gc, torch

gc.collect()  # run Python garbage collection first
torch.cuda.empty_cache()  # then release cached CUDA memory that is no longer in use
model.config.use_cache = False  # keep the generation cache off on the model config

# Start the training process
trainer.train()

## 7. Model inference after fine-tuning

In [None]:
# Re-run generation on the first training prompt (the same one used before
# fine-tuning) so the two outputs can be compared
prompt = formatted_train_dataset[0]['prompt']
generated = generate_mcqa(prompt, model, tokenizer)
print("Inference After Fine-Tuning:")
print(generated[0])

## 8. Saving the model

In [None]:
# Push the fine-tuned model and tokenizer to Hugging Face Hub
# (requires the Hugging Face login performed earlier in the notebook)
new_model_name = "DeepSeek-R1-Distill-Qwen-1.5B-SciQ"
trainer.model.push_to_hub(new_model_name)
# NOTE(review): processing_class is presumably the tokenizer held by the trainer; confirm
trainer.processing_class.push_to_hub(new_model_name)

## 9. Loading the Adapter and testing the model

In [None]:
# Clean up model and trainer objects, and clear GPU memory
import gc

del model
del trainer
# Collect unreachable objects first so their CUDA tensors are actually
# released before the cache is emptied (same order as the pre-training cell).
gc.collect()
torch.cuda.empty_cache()

In [None]:
from huggingface_hub import whoami

# Get the current user's information from the Hugging Face Hub
user_info = whoami()

# Extract the username from the 'name' field of the response
hf_user = user_info['name']
print(f"Username: {hf_user}")

In [None]:
# Re-import necessary libraries for loading the fine-tuned model and tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch

# Base model
base_model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Your fine-tuned LoRA adapter repository (built from the logged-in username)
lora_adapter_id = f"{hf_user}/DeepSeek-R1-Distill-Qwen-1.5B-SciQ"

# Load the model in 4-bit (same settings as the configuration used before training)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load base model
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run locally; confirm it is actually needed for this checkpoint.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Attach the LoRA adapter
model = PeftModel.from_pretrained(
    base_model,
    lora_adapter_id,
    device_map="auto",
    trust_remote_code=True,
)

# Load tokenizer (from the base model repository, not the adapter repository)
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)

In [None]:
# Load the test split of the SciQ dataset
test_dataset = load_dataset(
    "allenai/sciq",
    split="test",
)

# Keep only the first 20 test records
test_dataset = test_dataset.select(range(20))

# Apply format_sciq_prompt to every test record, adding 'prompt' and 'completion' fields
formatted_test_dataset = test_dataset.map(format_sciq_prompt)

In [None]:
def add_generated_response(example):
    """Attach the model's generated text to one test record.

    The record is formatted into a prompt, passed to generate_mcqa with the
    notebook's current model and tokenizer, and the first returned sequence is
    stored under 'generated_response'.
    """
    formatted = format_sciq_prompt(example, num_questions=1, include_cot=False)
    responses = generate_mcqa(formatted['prompt'], model, tokenizer)
    example['generated_response'] = responses[0]
    return example

# Run generation for every test record, producing the new column
testing_set = test_dataset.map(add_generated_response)

In [None]:
# Convert the test set (including the 'generated_response' column) to a
# DataFrame and show its first rows
df = pd.DataFrame(testing_set)
df.head()

In [None]:
import os
import shutil
import pandas as pd
from datetime import datetime

# Save the testing set with generated responses to a timestamped CSV file
# in the current working directory (same for every environment)
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
output_file = f"sciq_test_with_responses_{timestamp}.csv"
df.to_csv(output_file, index=False)
saved_path = output_file

if env == "Google Colab":
    # On Colab, additionally move the file into Google Drive
    from google.colab import drive

    drive.mount('/content/drive')

    # Create the directory if it doesn't exist
    output_dir = "/content/drive/My Drive/colab_outputs"
    os.makedirs(output_dir, exist_ok=True)

    # Move the file with shutil rather than a shell command, so the space in
    # the Drive path needs no quoting
    saved_path = shutil.move(output_file, os.path.join(output_dir, output_file))

# Report where the file finally lives
print(f"Testing set with generated responses saved to {saved_path}")

