In [1]:
pip install transformers datasets torch accelerate huggingface_hub

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [2]:
# Install the latest version
!pip install bitsandbytes --upgrade

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.2


In [3]:
import bitsandbytes
print(bitsandbytes.__version__)

0.45.2


In [4]:
!pip install peft



In [5]:
from huggingface_hub import login
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import torch

In [None]:
# Replace 'your-huggingface-token' with your actual token from Hugging Face
hf_token = "paste_ur_token"
login(token=hf_token)
print("Successfully logged in to Hugging Face!")

Successfully logged in to Hugging Face!


In [7]:
# Step 2: Load and subset dataset
dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en")
if "validation" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.1)
    train_dataset = dataset["train"].shuffle(seed=42).select(range(4000))  # Subset to 4,000 samples
    eval_dataset = dataset["test"].shuffle(seed=42).select(range(400))     # Subset to 400 samples
else:
    train_dataset = dataset["train"].shuffle(seed=42).select(range(4000))
    eval_dataset = dataset["validation"].shuffle(seed=42).select(range(400))
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

medical_o1_sft.json:   0%|          | 0.00/74.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25371 [00:00<?, ? examples/s]

Training samples: 4000
Validation samples: 400


In [8]:
# Step 3: Preprocess dataset
model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    inputs = [f"Question: {q}\nAnswer: {a}" for q, a in zip(examples["Question"], examples["Response"])]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")  # Reduced max_length
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_eval = eval_dataset.map(preprocess_function, batched=True, remove_columns=eval_dataset.column_names)
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_eval.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
print("Dataset tokenized successfully!")

tokenizer_config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Dataset tokenized successfully!


In [9]:
# Step 4: Load model with 4-bit quantization
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
    token=hf_token,
)
print("Base model loaded with 4-bit quantization!")

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Base model loaded with 4-bit quantization!


In [10]:
# Step 5: Add LoRA adapters
lora_config = LoraConfig(
    r=8,  # Reduced rank for faster computation
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("LoRA adapters added to the model!")

trainable params: 4,194,304 || all params: 6,744,707,072 || trainable%: 0.0622
LoRA adapters added to the model!


In [11]:
# Step 6: Training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,  # Increased to 2
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,  # Adjusted for effective batch size of 16
    evaluation_strategy="no",  # Skip evaluation to save time
    save_strategy="steps",
    save_steps=500,
    learning_rate=3e-5,  # Slightly higher LR for faster convergence
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
    max_grad_norm=0.3,
    optim="paged_adamw_8bit",
)
print("Training arguments configured!")

Training arguments configured!




In [12]:
# Step 7: Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=None,  # No evaluation dataset
)
print("Starting fine-tuning...")
trainer.train()
print("Fine-tuning completed!")



Starting fine-tuning...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmudulisunil89[0m ([33mmudulisunil89-vssut[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
10,2.8102
20,2.1595
30,1.7079
40,1.6442
50,1.5615
60,1.578
70,1.4492
80,1.6156
90,1.5065
100,1.4281


Fine-tuning completed!


In [16]:
model.save_pretrained("fine-tuned-deepseek-r1-1.5b")
tokenizer.save_pretrained("fine-tuned-deepseek-r1-1.5b")

('fine-tuned-deepseek-r1-1.5b/tokenizer_config.json',
 'fine-tuned-deepseek-r1-1.5b/special_tokens_map.json',
 'fine-tuned-deepseek-r1-1.5b/tokenizer.json')

In [20]:
# from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# from peft import PeftModel
# import torch

# # Step 1: Define model path (where you saved it)
# model_path = "fine-tuned-deepseek-r1-1.5b"
# print(f"Loading fine-tuned model from: {model_path}")

# # Step 2: Load fine-tuned model with 4-bit quantization
# quant_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_use_double_quant=True,
# )

# # Load base model with quantization, forcing all layers to GPU
# base_model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"
# model = AutoModelForCausalLM.from_pretrained(
#     base_model_name,
#     quantization_config=quant_config,
#     device_map={"": 0},  # Force all layers to GPU (cuda:0)
#     token="your-huggingface-token"  # Replace if needed
# )

# # Load your fine-tuned LoRA adapters from the saved path
# model = PeftModel.from_pretrained(model, model_path)

# # Load tokenizer from the same saved path
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# if tokenizer.pad_token is None:
#     tokenizer.pad_token = tokenizer.eos_token

# # Step 3: Medical CoT Prompt Template
# def generate_medical_cot_response(prompt, max_new_tokens=200):  # Reduced for memory
#     medical_cot_prompt = f"""
#     You are an AI assistant specialized in medical knowledge.
#     Please analyze the following question step by step using reliable medical reasoning.

#     Patient Query: {prompt}

#     Step 1: Identify the key medical terms in the query.
#     Step 2: Explain the underlying medical condition(s) involved.
#     Step 3: Discuss possible causes, symptoms, and risk factors.
#     Step 4: Suggest general diagnostic approaches and treatments.
#     Step 5: Provide preventive measures and lifestyle recommendations.

#     Now, provide a structured response based on the above steps.
#     """

#     inputs = tokenizer(medical_cot_prompt, return_tensors="pt").to("cuda")

#     with torch.no_grad():
#         output = model.generate(
#             **inputs,
#             max_new_tokens=max_new_tokens,
#             do_sample=True,
#             temperature=0.5,
#             top_k=50,
#             top_p=0.9,
#             use_cache=True
#         )

#     return tokenizer.decode(output[0], skip_special_tokens=True)

# # Step 4: Test the Medical CoT Prompt
# medical_prompt = "What are the symptoms and treatment options for Type 2 Diabetes?"
# output = generate_medical_cot_response(medical_prompt)
# print("Generated Response:")
# print(output)