# DeepSeek-R1-Distill-Qwen-1.5B-SciQ
## Fine-Tuning DeepSeek-R1-Distill-Qwen-1.5B
This notebook demonstrates how to fine-tune the DeepSeek-R1-Distill-Qwen-1.5B model on the SciQ dataset for multiple-choice question answering (MCQA) tasks. The process includes setting up the environment, loading the model and tokenizer, processing the dataset, training the model, and evaluating its performance.

[![Open in Colab](https://img.shields.io/badge/Open%20in-Colab-orange?logo=google&logoColor=white)](https://colab.research.google.com/drive/1GVlPxUok2vym4Yku1-_tBmcA-STb-ouq?usp=sharing)

## 1. Setting Up

In [None]:
import os
import sys
import platform

def check_environment():
    # Check for Google Colab
    if 'google.colab' in sys.modules:
        return "Google Colab"

    # Check for Kaggle
    if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
        return "Kaggle"

    # Check for local machine
    if os.path.exists('/content') and 'COLAB_GPU' in os.environ:
        # Double-check Colab (some Colab envs may not import google.colab)
        return "Google Colab"
    else:
        # Assume local if neither Colab nor Kaggle
        return "Local Machine"

# Print environment details
env = check_environment()
print(f"Running on: {env}")
print(f"Current working directory: {os.getcwd()}")
print(f"Python version: {sys.version}")
print(f"Platform: {platform.platform()}")

# Check specific environment variables (e.g., from .env or Kaggle Secrets)
if env == "Kaggle":
    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        api_key = user_secrets.get_secret("API_KEY")
        print("API_KEY is set in Kaggle Secrets")
    except:
        print("API_KEY is not set in Kaggle Secrets")
elif env == "Google Colab":
    try:
        from dotenv import load_dotenv
        load_dotenv()
        print("API_KEY is set in Colab .env" if 'API_KEY' in os.environ else "API_KEY is not set in Colab .env")
    except ImportError:
        print("python-dotenv not installed in Colab")
else:
    print("API_KEY is set locally" if 'API_KEY' in os.environ else "API_KEY is not set locally")

In [1]:
import torch
print("Torch version:", torch.__version__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Torch version: 2.7.1+cu128
Using device: cuda


In [2]:
# Import required libraries for authentication and environment variable management
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Get token from environment variable
hf_token = os.environ.get("HF_TOKEN")

# Log in to Hugging Face Hub using the token from the environment
if hf_token:
    login(hf_token)
    print("✅ Hugging Face login successful.")
else:
    print("❌ HF_TOKEN not found. Please check your .env file.")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


✅ Hugging Face login successful.


In [3]:
# Check GPU availability and status
!nvidia-smi

Wed Jul 30 16:59:46 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 577.00                 Driver Version: 577.00         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   46C    P8              3W /   87W |     694MiB /   8188MiB |      1%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## 2. Loading the Model and Tokenizer

In [4]:
# Import model and tokenizer classes for tensor operations
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


In [5]:
# Configure 4-bit quantization for efficient model loading
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [6]:
# Load tokenizer and model from Hugging Face Hub with quantization settings
model_dir = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

model.config.use_cache = False
model.config.pretraining_tp = 1

## 3. Loading and Processing the Dataset

In [7]:
def format_sciq_prompt(example, num_questions=1, include_cot=False):
    # Extract context and question details
    context = example['support']
    question = example['question']
    options = [example['distractor1'], example['distractor2'], example['distractor3'], example['correct_answer']]
    # Assign correct answer to one of the options (e.g., D)
    correct_answer = example['correct_answer']
    option_labels = ['A', 'B', 'C', 'D']
    correct_label = 'D'  # Assuming correct_answer is the last option for consistency

    # Create the prompt
    prompt = (
        f"Given the context: {context}\n"
        f"Generate {num_questions} multiple-choice question(s) with four options each and indicate the correct answer."
    )

    # Format the completion
    completion = (
        f"Question: {question}?\n"
        f"{option_labels[0]}) {options[0]}\n"
        f"{option_labels[1]}) {options[1]}\n"
        f"{option_labels[2]}) {options[2]}\n"
        f"{option_labels[3]}) {options[3]}\n"
        f"Correct Answer: {correct_label}"
    )

    # If include_cot is True, add a reasoning explanation (example placeholder)
    if include_cot:
        cot_explanation = (
            f"\nExplanation: The correct answer is {correct_answer} because it directly corresponds to the information provided in the context."
        )
        completion += cot_explanation

    # If num_questions > 1, assume additional questions are provided or need to be generated
    # For simplicity, this example uses the single question from SciQ
    # Augmentation for multiple questions would require external generation (see notes below)
    if num_questions > 1:
        completion += (
            "\n\nNote: Additional questions would be generated here based on the context, "
            "each with four options and a correct answer."
        )

    return {"prompt": prompt, "completion": completion}

In [8]:
# Import the datasets library to load and process the training dataset
from datasets import load_dataset
from matplotlib import pyplot as plt

# Load the medical reasoning dataset
dataset = load_dataset(
    "allenai/sciq",
    split="train",
    trust_remote_code=True,
)

# FIXME: Take first 10 samples for training
# Take first 10 samples for training
dataset = dataset.select(range(10))

In [9]:
import pandas as pd

df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,question,distractor3,distractor1,distractor2,correct_answer,support
0,What type of organism is commonly used in prep...,viruses,protozoa,gymnosperms,mesophilic organisms,"Mesophiles grow best in moderate temperature, ..."
1,What phenomenon makes global winds blow northe...,tropical effect,muon effect,centrifugal effect,coriolis effect,Without Coriolis Effect the global winds would...
2,Changes from a less-ordered state to a more-or...,endothermic,unbalanced,reactive,exothermic,Summary Changes of state are examples of phase...
3,What is the least dangerous radioactive decay?,zeta decay,beta decay,gamma decay,alpha decay,All radioactive decay is dangerous to living t...
4,Kilauea in hawaii is the world’s most continuo...,magma,greenhouse gases,carbon and smog,smoke and ash,Example 3.5 Calculating Projectile Motion: Hot...


In [None]:
# Format the dataset using the defined prompt formatting function
formatted_dataset = dataset.map(format_sciq_prompt)

In [12]:
print("Prompt:\n" + formatted_dataset[0]['prompt'])
print("Completion:\n" + formatted_dataset[0]['completion'])

Prompt:
Given the context: Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.
Generate 1 multiple-choice question(s) with four options each and indicate the correct answer.
Completion:
Question: What type of organism is commonly used in preparation of foods such as cheese and yogurt??
A) protozoa
B) gymnosperms
C) viruses
D) mesophilic organisms
Correct Answer: D


In [13]:
def tokenize_example(batch):
    texts = [f"{prompt}\n{completion}{tokenizer.eos_token}" for prompt, completion in zip(batch['prompt'], batch['completion'])]
    return tokenizer(texts, truncation=True, max_length=512)

In [14]:
tokenized_dataset = formatted_dataset.map(tokenize_example, batched=True, remove_columns=["prompt", "completion"])

In [15]:
# Split dataset for training and evaluation
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)

In [16]:
# Import data collator for language modeling tasks
from transformers import DataCollatorForLanguageModeling

# Define the data collator, disabling masked language modeling (mlm)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

## 4. Model Inference Before Fine-Tuning

In [None]:
inference_prompt_style = (
    "Given the context: {}.\n"
    "Generate 1 multiple-choice question with four options and indicate the correct answer."
)

In [17]:
def generate_mcqa(prompt, model, tokenizer, max_length=1000):
    inputs = tokenizer(
        [prompt.format(prompt)+ tokenizer.eos_token],
        return_tensors="pt"
    ).to(device)
    outputs = model.generate(
        inputs.input_ids,
        max_length=max_length,
        attention_mask=inputs.attention_mask,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
prompt = formatted_dataset[0]['prompt']
generated = generate_mcqa(prompt, model, tokenizer)

In [19]:
print("Inference Before Fine-Tuning:")
print(generated[0])

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Inference Before Fine-Tuning:
Given the context: The water cycle describes how water evaporates from the surface of the earth, rises into the atmosphere, cools and condenses into clouds, and falls back to the surface as precipitation.
Generate 1 multiple-choice question with four options and indicate the correct answer. The correct answer must be chosen by the user.
The question should be about the process of water evaporating from the surface of the earth.
The options should be: A) Water evaporates from the surface of the earth, which is the process of the water cycle. B) Water evaporates from the surface of the earth, which is not the process of the water cycle. C) Water evaporates from the surface of the earth, which is a process of the water cycle. D) Water evaporates from the surface of the earth, which is not a process of the water cycle.
To determine if a question is appropriate, you can look at the options and see if they are either both correct or both incorrect. If the option

## 5. Setting up the model

In [None]:
# Import LoRA configuration and model wrapping utilities
from peft import LoraConfig, get_peft_model

# LoRA config
peft_config = LoraConfig(
    lora_alpha=16,  # Scaling factor for LoRA
    lora_dropout=0.05,  # Add a slight dropout for regularization
    r=64,  # Rank of the LoRA update matrices
    bias="none",  # No bias reparameterization
    task_type="CAUSAL_LM",  # Task type: Causal Language Modeling
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # Target modules for LoRA
)

# Wrap the model with LoRA configuration
model = get_peft_model(model, peft_config)

In [None]:
# Import the SFTTrainer for supervised fine-tuning
from trl import SFTTrainer
from transformers import TrainingArguments

# Training Arguments
training_arguments = TrainingArguments(
    output_dir="../models/DeepSeek-R1-Distill-Qwen-1.5B-SciQ",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    # fp16=not is_bfloat16_supported(), # Use FP16 if BF16 is not supported
    # bf16=is_bfloat16_supported(),     # Use BF16 if supported
    group_by_length=True,
    report_to="none",  # Disable logging to external tools for simplicity
)

# Initialize the Trainer
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=formatted_dataset,
    peft_config=peft_config,
    data_collator=data_collator,
)

## 6. Model Training

In [None]:
# Clear GPU memory and disable cache before training
import gc, torch

gc.collect()
torch.cuda.empty_cache()
model.config.use_cache = False

# Start the training process
trainer.train()

In [None]:
# Access log history after training
log_history = trainer.state.log_history

# Convert to DataFrame for easier viewing and saving
df_logs = pd.DataFrame(log_history)

# Print the last few log entries
print(df_logs.tail())

# Optionally, save to CSV
df_logs.to_csv("training_metrics.csv", index=False)
print("Training metrics saved to training_metrics.csv")

## 7. Model inference after fine-tuning

In [None]:
# Evaluate the model inferences after fine-tuning
prompt = formatted_dataset[0]['prompt']
generated = generate_mcqa(prompt, model, tokenizer)
print("Inference Before Fine-Tuning:")
print(generated[0])

## 8. Saving the model

In [None]:
# Push the fine-tuned model and tokenizer to Hugging Face Hub
new_model_name = "DeepSeek-R1-Distill-Qwen-1.5B-SciQ"
trainer.model.push_to_hub(new_model_name)
trainer.processing_class.push_to_hub(new_model_name)

## 9. Loading the Adopter and testing the model

In [None]:
# Clean up model and trainer objects, and clear GPU memory
del model
del trainer
torch.cuda.empty_cache()

In [None]:
from huggingface_hub import whoami

# Get the current user's information
user_info = whoami()

# Extract the username
hf_user = user_info['name']
print(f"Username: {hf_user}")

In [None]:
# Re-import necessary libraries for loading the fine-tuned model and tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch

# Base model
base_model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Your fine-tuned LoRA adapter repository
lora_adapter_id = f"{hf_user}/DeepSeek-R1-Distill-Qwen-1.5B-SciQ"

# Load the model in 4-bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Attach the LoRA adapter
model = PeftModel.from_pretrained(
    base_model,
    lora_adapter_id,
    device_map="auto",
    trust_remote_code=True,
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)

In [None]:
# Load the medical reasoning dataset
testing_set = load_dataset(
    "allenai/sciq",
    split="test",
    trust_remote_code=True,
)

In [None]:
for record in testing_set:
    # Format the prompt for inference
    prompt = format_sciq_prompt(record, num_questions=1, include_cot=False)['prompt']

    # Generate a response using the model
    generated = generate_mcqa(prompt, model, tokenizer)

    # Add generated response to the record
    record['generated_response'] = generated[0]

In [None]:
# Show the first 5 records with generated responses
df = pd.DataFrame(testing_set)
df.head()

In [None]:
# Save the testing set with generated responses to a CSV file
output_file = "sciq_test_with_responses.csv"
df.to_csv(output_file, index=False)
print(f"Testing set with generated responses saved to {output_file}")