# Fine-Tuning DeepSeek-R1-Distill-Qwen-1.5B for Medical Reasoning

## 1. Setting Up

In [None]:
# Import required libraries for authentication and environment variable management
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Read variables from a local .env file into the process environment
load_dotenv()

# Look up the Hugging Face access token; the result is None when it is not set
hf_token = os.getenv("HF_TOKEN")

# Authenticate with the Hugging Face Hub only when a non-empty token was found
if not hf_token:
    print("❌ HF_TOKEN not found. Please check your .env file.")
else:
    login(hf_token)
    print("✅ Hugging Face login successful.")


## 2. Loading the Model and Tokenizer

In [None]:
# Import model and tokenizer classes, and torch for tensor operations
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch


In [None]:
# Quantization settings passed to `from_pretrained` when the model is loaded
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load the weights in 4-bit form
    bnb_4bit_quant_type="nf4",              # use the "nf4" 4-bit quantization type
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
    bnb_4bit_use_double_quant=False,        # double quantization is switched off
)

In [None]:
# Hub repository used for both the tokenizer and the model weights
model_dir = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Tokenizer (fast implementation requested)
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)

# Causal LM loaded with the quantization settings from `bnb_config`
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
)

# Config flags set right after loading: `use_cache` off, `pretraining_tp` = 1
for option_name, option_value in (("use_cache", False), ("pretraining_tp", 1)):
    setattr(model.config, option_name, option_value)

In [None]:
# Check GPU availability and status.
# The leading `!` is the IPython/Jupyter shell escape, so this line only works
# inside a notebook (it is not valid syntax in a plain Python script).
!nvidia-smi

## 3. Loading and Processing the Dataset

In [None]:
# Prompt template for training examples.
# Placeholders (positional): 1) the question text, 2) the reference response.
# The layout, including the blank line before "### Question:", is kept the same
# as the inference prompt template used in this notebook, so the model is
# trained on exactly the prompt shape it is later queried with.
train_prompt_style = """
Please answer with one of the options in the bracket. Write reasoning in between <analysis></analysis>. Write the answer in between <answer></answer>.

### Question:
{}

### Response:
{}"""

In [None]:
# End-of-sequence token appended to every training response
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

# Marker that is stripped from the start of each raw question
QUESTION_PREFIX = "Q:"


def formatting_prompts_func(examples):
    """Build the training text for a batch of dataset rows.

    Args:
        examples: Batch mapping with parallel "input" (question) and
            "output" (response) sequences of strings, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        dict: ``{"text": [...]}`` holding one formatted prompt per row, made
        by filling ``train_prompt_style`` with the cleaned question and the
        EOS-terminated response.
    """
    texts = []
    for question, response in zip(examples["input"], examples["output"]):
        # Remove only a leading "Q:" marker. A blanket str.replace would also
        # delete any "Q:" that happens to occur inside the question body.
        if question.startswith(QUESTION_PREFIX):
            question = question[len(QUESTION_PREFIX):]

        # Terminate the response with the EOS token exactly once
        if not response.endswith(EOS_TOKEN):
            response += EOS_TOKEN

        texts.append(train_prompt_style.format(question, response))
    return {"text": texts}

In [None]:
# `load_dataset` fetches the training corpus
from datasets import load_dataset

# Train split of the medical reasoning dataset
dataset = load_dataset("mamachang/medical-reasoning", split="train", trust_remote_code=True)

# Run the prompt formatter over the rows in batches; it contributes the "text" column
dataset = dataset.map(formatting_prompts_func, batched=True)

# Print one formatted example as a sanity check
sample_row = dataset[10]
print(sample_row["text"])

In [None]:
# Collator class that batches examples for language-modeling training
from transformers import DataCollatorForLanguageModeling

# mlm=False turns masked language modeling off
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

## 4. Model Inference Before Fine-Tuning

In [None]:
# Prompt template for inference.
# Single positional placeholder: the question text. The template ends with an
# opening <analysis> tag followed by a newline.
inference_prompt_style = (
    "\n"
    "Please answer with one of the options in the bracket. "
    "Write reasoning in between <analysis></analysis>. "
    "Write the answer in between <answer></answer>.\n"
    "\n"
    "### Question:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "<analysis>\n"
)

In [None]:
# Select a sample question from the dataset for inference
question = dataset[10]["input"]
# Drop only the leading "Q:" marker; a blanket replace would also delete any
# "Q:" that occurs inside the question text.
if question.startswith("Q:"):
    question = question[len("Q:"):]

# Build the prompt WITHOUT a trailing EOS token. EOS marks the end of a
# sequence, so appending it would ask the model to continue text that has
# already been terminated.
prompt_text = inference_prompt_style.format(question)

# Tokenize the prompt and move the tensors to the device the model is on
inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)

# Generate a response from the model
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)

# Decode and print everything that follows the "### Response:" header
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(response[0].split("### Response:")[1])

In [None]:
# LoRA configuration class and the helper that wraps a model with adapters
from peft import LoraConfig, get_peft_model

# Names of the modules that receive LoRA adapters
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# LoRA hyperparameters
peft_config = LoraConfig(
    task_type="CAUSAL_LM",  # causal language modeling
    r=64,  # rank of the LoRA update matrices
    lora_alpha=16,  # LoRA scaling factor
    lora_dropout=0.05,  # light dropout for regularization
    bias="none",  # no bias reparameterization
    target_modules=lora_target_modules,
)

# Replace `model` with its LoRA-wrapped version
model = get_peft_model(model, peft_config)

## 5. Setting up the model

In [None]:
# Import the SFTTrainer for supervised fine-tuning
import torch
from transformers import TrainingArguments
from trl import SFTTrainer

# Decide the mixed-precision mode once, using PyTorch's own capability check.
# Exactly one of fp16 / bf16 is enabled below.
use_bf16 = torch.cuda.is_bf16_supported()

# Training Arguments
training_arguments = TrainingArguments(
    output_dir="../models/DeepSeek-R1-Distill-Qwen-1.5B-Medical-Reasoning",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=not use_bf16,  # Use FP16 if BF16 is not supported
    bf16=use_bf16,      # Use BF16 if supported
    group_by_length=True,
    report_to="tensorboard",
)

# Initialize the Trainer.
# NOTE(review): `model` has already been wrapped by `get_peft_model` in an
# earlier cell, so passing `peft_config` again looks redundant — confirm how
# the installed TRL version treats an already-wrapped model.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    peft_config=peft_config,
    data_collator=data_collator,
)

## 6. Model Training

In [None]:
# Housekeeping before training: keep `use_cache` off, run the garbage
# collector, then release cached CUDA memory
import gc
import torch

model.config.use_cache = False
gc.collect()
torch.cuda.empty_cache()

# Run the fine-tuning loop
trainer.train()

## 7. Model inference after fine-tuning

In [None]:
# Evaluate the model inferences after fine-tuning.
# Make sure dropout (lora_dropout=0.05 in the LoRA config) is disabled during
# generation; the model may still be in training mode after trainer.train().
model.eval()

question = dataset[10]["input"]
# Drop only the leading "Q:" marker; a blanket replace would also delete any
# "Q:" that occurs inside the question text.
if question.startswith("Q:"):
    question = question[len("Q:"):]

# Build the prompt WITHOUT a trailing EOS token. EOS marks the end of a
# sequence, and the training texts place it only after the response.
prompt_text = inference_prompt_style.format(question)

# Tokenize the prompt and move the tensors to the device the model is on
inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)

# Decode and print everything that follows the "### Response:" header
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(response[0].split("### Response:")[1])

In [None]:
# Show the dataset's reference output for sample 10 next to the generated answer
reference_answer = dataset[10]["output"]
print(reference_answer)

In [None]:
# Evaluate the model on a different question from the dataset.
# Make sure dropout (lora_dropout=0.05 in the LoRA config) is disabled during
# generation; the model may still be in training mode after trainer.train().
model.eval()

question = dataset[100]["input"]
# Drop only the leading "Q:" marker; a blanket replace would also delete any
# "Q:" that occurs inside the question text.
if question.startswith("Q:"):
    question = question[len("Q:"):]

# Build the prompt WITHOUT a trailing EOS token. EOS marks the end of a
# sequence, and the training texts place it only after the response.
prompt_text = inference_prompt_style.format(question)

# Tokenize the prompt and move the tensors to the device the model is on
inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)

# Decode and print everything that follows the "### Response:" header
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(response[0].split("### Response:")[1])

In [None]:
# Show the dataset's reference output for sample 100 next to the generated answer
reference_answer = dataset[100]["output"]
print(reference_answer)

## 8. Saving the model

In [None]:
# Push the fine-tuned model and tokenizer to Hugging Face Hub.
# Both `trainer.model` and `trainer.processing_class` are pushed under the same
# repository name, one after the other.
# NOTE(review): presumably this relies on the Hub login performed in the setup
# cell having succeeded — confirm before running.
new_model_name = "DeepSeek-R1-Distill-Qwen-1.5B-Medical-Reasoning"
trainer.model.push_to_hub(new_model_name)
trainer.processing_class.push_to_hub(new_model_name)

## 9. Loading the Adapter and testing the model

In [None]:
# Clean up model and trainer objects, and clear GPU memory
import gc

del model
del trainer

# Run the garbage collector first so that objects which were only reachable
# through `model` / `trainer` are actually destroyed; only then can
# empty_cache() hand their cached CUDA memory back to the driver.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Libraries needed to rebuild the quantized base model and attach the adapter
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Hub repositories: the base checkpoint and the fine-tuned LoRA adapter
base_model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
lora_adapter_id = "kingabzpro/DeepSeek-R1-Distill-Qwen-1.5B-Medical-Reasoning"

# 4-bit quantization settings for loading the base model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Step 1: the quantized base model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Step 2: the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(
    base_model,
    lora_adapter_id,
    trust_remote_code=True,
    device_map="auto",
)

# Step 3: the tokenizer, taken from the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)


In [None]:
# Inference example
# Prompt layout: instruction line, blank line, "### Question:", the question,
# blank line, "### Response:", then an opening <analysis> tag. The question is
# inserted through the placeholder so the surrounding layout cannot drift
# (no stray punctuation after the options, no extra blank lines).
prompt_template = """
Please answer with one of the options in the bracket. Write reasoning in between <analysis></analysis>. Write the answer in between <answer></answer>.

### Question:
{}

### Response:
<analysis>
"""

sample_question = """A research group wants to assess the relationship between childhood diet and cardiovascular disease in adulthood.
A prospective cohort study of 500 children between 10 to 15 years of age is conducted in which the participants' diets are recorded for 1 year and then the patients are assessed 20 years later for the presence of cardiovascular disease.
A statistically significant association is found between childhood consumption of vegetables and decreased risk of hyperlipidemia and exercise tolerance.
When these findings are submitted to a scientific journal, a peer reviewer comments that the researchers did not discuss the study's validity.
Which of the following additional analyses would most likely address the concerns about this study's design?
{'A': 'Blinding', 'B': 'Crossover', 'C': 'Matching', 'D': 'Stratification', 'E': 'Randomization'}"""

# str.format does not interpret braces inside its arguments, so the option
# dictionary in the question is inserted verbatim.
prompt = prompt_template.format(sample_question)

# Tokenize the prompt (no trailing EOS token: EOS marks the end of a sequence,
# so it must not precede the text the model is asked to generate) and move the
# tensors to the device the model is on
inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

# Generate a response from the model
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)

# Decode and print everything that follows the "### Response:" header
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(response[0].split("### Response:")[1])

