In [None]:
#%%capture
#! pip install unsloth
#! pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git


In [None]:
#!pip install adapter-transformers


📝 Full Kaggle Notebook Code for Fine-Tuning DeepSeek R1          
I'll structure it in steps:                        
                         
1️⃣ Setup & Install Dependencies                
2️⃣ Authenticate with Hugging Face & W&B                  
3️⃣ Load DeepSeek R1 & Tokenizer                               
4️⃣ Load Three Hugging Face Datasets                                
5️⃣ Preprocess Data for Fine-Tuning                                       
6️⃣ Fine-Tune the Model                                           
7️⃣ Save & Download Fine-Tuned Model               

In [None]:
#!pip install datasets


In [None]:
#from datasets import load_dataset


In [None]:
#from kaggle_secrets import UserSecretsClient
#user_secrets = UserSecretsClient()
#secret_value_0 = user_secrets.get_secret("HF_TOKEN")
#secret_value_1 = user_secrets.get_secret("wnb ")


### Import all relevant packages

In [None]:
#Modules for fine tuning
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer # trainer for supervised fine tuning
from unsloth import is_bf16_supported # checks if the hardware supports bfloat16 presion

# Hugging Face modules
from huggingface_hub import login #lets you to login to api
from transformers import TrainingArguments #defines training hyperparameters
#from dataset import load_dataset #lets you to load fine tuning datasets
#import weights and biases
import wandb
# Import Kaggle secrets
from kaggle_secrets import UserSecretsClient
from datasets import load_dataset, concatenate_datasets

In [None]:
#! pip uninstall torch torchvision torchaudio
#! pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


### Retrieve API keys and log in to Hugging Face and Weights & Biases

In [None]:
# Read both credentials from Kaggle's secret store; "HF_TOKEN" and "wnb" are
# the labels under which the secrets are looked up.
user_secrets = UserSecretsClient()
hugging_face_token = user_secrets.get_secret("HF_TOKEN")
wnb_token = user_secrets.get_secret("wnb")

# Authenticate against the Hugging Face Hub.
login(hugging_face_token)

# Authenticate against Weights & Biases, then open the run that tracks training.
wandb.login(key=wnb_token)
run = wandb.init(project='deepseek', job_type='training', anonymous="allow")

### Load DeepSeek-R1 (distilled Llama 8B) and the tokenizer

In [None]:
from transformers import AutoModelForCausalLM

# Loader settings: the maximum sequence length, the dtype (None presumably
# leaves the choice to the loader) and whether to load 4-bit quantized weights.
max_seq_length, dtype, load_in_4bit = 2048, None, True

# Fetch the DeepSeek-R1 distilled Llama-8B checkpoint and its tokenizer via Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    token=hugging_face_token,
    model_name="unsloth/DeepSeek-R1-Distill-Llama-8B",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
    device_map="auto",  # let the loader choose device placement
)






###  Load Three Hugging Face Datasets

In [None]:


# Sampling controls: a fixed seed makes the shuffled subsets identical across
# kernel restarts, and the cap bounds how many rows each dataset contributes.
SAMPLE_SEED = 42
MAX_ROWS_PER_DATASET = 5000


def sample_rows(dataset, max_rows=MAX_ROWS_PER_DATASET, seed=SAMPLE_SEED):
    """Return ``dataset`` shuffled with ``seed`` and cut to at most ``max_rows`` rows.

    Args:
        dataset: Object exposing ``shuffle(seed=...)``, ``select(indices)`` and ``len()``.
        max_rows: Upper bound on the number of rows kept.
        seed: Seed forwarded to ``shuffle`` so the selection is reproducible.

    Returns:
        The shuffled subset; the whole shuffled dataset when it has fewer than
        ``max_rows`` rows.
    """
    return dataset.shuffle(seed=seed).select(range(min(max_rows, len(dataset))))


# Load datasets
dataset_1 = load_dataset("OpenAssistant/oasst1", split="train[:10%]")  # Example: Chatbot fine-tuning
dataset_2 = load_dataset("databricks/databricks-dolly-15k", split="train")  # Instruction-based dataset
dataset_3 = load_dataset("mlabonne/guanaco-llama2-1k", split="train")  # LLaMA-2 fine-tuning dataset

# Check dataset sizes
print("Dataset 1 size:", len(dataset_1))
print("Dataset 2 size:", len(dataset_2))
print("Dataset 3 size:", len(dataset_3))

# Limit the number of samples based on the dataset size
dataset_1 = sample_rows(dataset_1)
dataset_2 = sample_rows(dataset_2)
dataset_3 = sample_rows(dataset_3)

# Concatenate datasets using concatenate_datasets()
# NOTE(review): the three datasets do not share one column layout (the
# preprocessing cell reads `text` from two of them and
# `instruction`/`context`/`response` from the other), so this row-wise
# concatenation relies on `datasets` aligning the schemas -- presumably by
# filling missing columns with nulls; confirm before using this object.
# `combined_datasets` is rebuilt from the tokenized datasets in the
# preprocessing cell.
combined_datasets = concatenate_datasets([dataset_1, dataset_2, dataset_3])



In [None]:
# Check the columns of the datasets
# Print the column names of dataset_1 so the preprocessing step can pick the right fields.
print(dataset_1.column_names)



# Bare expression: as the last statement of the cell it displays dataset_1's repr.
dataset_1

In [None]:
# Print the column names of dataset_2.
print(dataset_2.column_names)


In [None]:

# Print the column names of dataset_3.
print(dataset_3.column_names)

### Preprocess Data for Fine-Tuning

In [None]:
# Preprocess function to handle the tokenization based on the column structure
def preprocess_function(examples, dataset_name, max_length=2048):
    """Tokenize one batch of examples according to the layout of its source dataset.

    Args:
        examples: Batch mapping column names to lists of values.
        dataset_name: Selects how the text is assembled:
            ``"dataset_1"`` / ``"dataset_3"`` use the ``text`` column as is;
            ``"dataset_2"`` joins ``instruction``, ``context`` and ``response``
            into a single prompt string per row.
        max_length: Length every sequence is truncated and padded to.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the assembled texts.

    Raises:
        ValueError: If ``dataset_name`` is not one of the three supported names
            (previously this fell through and silently returned ``None``).
    """
    if dataset_name in ("dataset_1", "dataset_3"):
        texts = examples["text"]
    elif dataset_name == "dataset_2":
        texts = [
            f"Instruction: {instruction} \nContext: {context} \nResponse: {response}"
            for instruction, context, response in zip(
                examples["instruction"], examples["context"], examples["response"]
            )
        ]
    else:
        raise ValueError(
            f"Unknown dataset_name {dataset_name!r}; "
            "expected 'dataset_1', 'dataset_2' or 'dataset_3'."
        )
    return tokenizer(texts, truncation=True, padding="max_length", max_length=max_length)

# Seed for the train/eval split so the same rows land in each split on every run.
SPLIT_SEED = 42


def _tokenize_dataset(dataset, dataset_name):
    """Run preprocess_function over ``dataset`` in batches and keep only its outputs.

    The raw columns are removed so that every tokenized dataset ends up with the
    same columns (whatever the tokenizer returns), which keeps the concatenated
    dataset free of columns that exist for only some of the rows.
    """
    return dataset.map(
        preprocess_function,
        batched=True,
        fn_kwargs={"dataset_name": dataset_name},
        remove_columns=dataset.column_names,
    )


# Apply preprocessing function to each dataset
tokenized_dataset_1 = _tokenize_dataset(dataset_1, "dataset_1")
tokenized_dataset_2 = _tokenize_dataset(dataset_2, "dataset_2")
tokenized_dataset_3 = _tokenize_dataset(dataset_3, "dataset_3")

# Combine the datasets into one (concatenate_datasets comes from the import cell)
combined_datasets = concatenate_datasets([tokenized_dataset_1, tokenized_dataset_2, tokenized_dataset_3])

# Split the combined dataset into train (90%) and test (10%)
train_test_split = combined_datasets.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Access the training and testing datasets
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# Optionally, print the first few examples after tokenization
print("Train dataset example:", train_dataset[:2])


In [None]:
#!pip install transformers>=4.8


In [None]:
#!pip install adapter-transformers


In [None]:
#!pip install adapter-transformers

In [None]:
# Check the column names of the dataset
#print(train_dataset.column_names)


In [None]:
# Trainer and its hyperparameter container from Hugging Face Transformers.
# The former `from adapter_transformers import ...` line was dropped: the
# adapter-transformers distribution does not ship a module with that name, so
# the import raised ModuleNotFoundError and stopped the notebook here.
from transformers import Trainer, TrainingArguments


### Fine-Tune the Model 

In [None]:
from unsloth import FastLanguageModel, is_bf16_supported
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Sequence length used both when loading the model and when truncating inputs.
FINETUNE_MAX_SEQ_LENGTH = 2048
# Seed for the train/eval split so the split is the same on every run.
FINETUNE_SPLIT_SEED = 42

# Step 1: Load the 4-bit quantized base model.
# A fresh copy is loaded so that re-running this cell never stacks a second
# set of adapter weights on a model that was already wrapped.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/DeepSeek-R1-Distill-Llama-8B",
    max_seq_length=FINETUNE_MAX_SEQ_LENGTH,
    dtype=None,
    load_in_4bit=True,
    token=hugging_face_token,
)

# Step 2: Attach trainable LoRA adapters through Unsloth.
# This replaces the adapter-transformers calls (AdapterConfig / add_adapter /
# set_active_adapters), which are not part of the model object Unsloth returns.
# The hyperparameters follow the Unsloth LoRA examples -- tune as needed.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)


# Step 3: Tokenize dataset_1.
# Named differently from the two-argument preprocess_function defined in the
# preprocessing cell so that this cell no longer shadows it.
def tokenize_text_batch(examples):
    """Tokenize the ``text`` column of a batch, truncating to FINETUNE_MAX_SEQ_LENGTH.

    No padding is applied here; the data collator pads each batch to its own
    longest sequence instead of padding every row to the maximum length.
    """
    return tokenizer(examples["text"], truncation=True, max_length=FINETUNE_MAX_SEQ_LENGTH)


tokenized_dataset_1 = dataset_1.map(tokenize_text_batch, batched=True)

# Split the dataset into train (90%) and test (10%) sets
train_test_split = tokenized_dataset_1.train_test_split(test_size=0.1, seed=FINETUNE_SPLIT_SEED)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# Step 4: Define Training Arguments
_use_bf16 = is_bf16_supported()
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # current name of the former `evaluation_strategy` argument
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=1000,
    save_total_limit=3,
    # Pick the mixed-precision mode the hardware supports, as the Unsloth examples do.
    fp16=not _use_bf16,
    bf16=_use_bf16,
)

# Step 5: Initialize the Trainer
# The trainer receives views without the raw dataset columns; train_dataset and
# eval_dataset themselves keep those columns (including `text`).
# The collator builds `labels` from `input_ids` (mlm=False selects causal-LM
# labels); without labels the model cannot return a loss.
_raw_columns = dataset_1.column_names
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset.remove_columns(_raw_columns),
    eval_dataset=eval_dataset.remove_columns(_raw_columns),
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# Step 6: Start Training
trainer.train()

# Step 7: Evaluate the Model
trainer.evaluate()


In [None]:
# Define training arguments
_sft_use_bf16 = is_bf16_supported()  # imported from unsloth in the import cell
training_args = TrainingArguments(
    output_dir="./deepseek-finetuned",  # Output directory
    per_device_train_batch_size=2,  # Adjust batch size based on memory
    per_device_eval_batch_size=2,  # Evaluation batch size
    learning_rate=2e-5,  # Learning rate for fine-tuning
    weight_decay=0.01,  # Weight decay for regularization
    num_train_epochs=3,  # Number of epochs for training
    logging_dir="./logs",  # Logging directory
    logging_steps=10,  # Log every 10 steps
    eval_strategy="epoch",  # Evaluate at the end of each epoch (formerly `evaluation_strategy`)
    save_strategy="epoch",  # Save model at the end of each epoch
    # Pick the mixed-precision mode the hardware supports, as the Unsloth examples do.
    fp16=not _sft_use_bf16,
    bf16=_sft_use_bf16,
    report_to="wandb"  # Log metrics to W&B
)

# SFTTrainer is told to read the `text` column, so fail early and clearly when a
# split lacks it. This replaces two `map` calls that copied `text` onto itself
# and whose only effect was a KeyError raised from inside `map`.
_splits_missing_text = [
    split_name
    for split_name, split in (("train", train_dataset), ("eval", eval_dataset))
    if "text" not in split.column_names
]
if _splits_missing_text:
    raise KeyError(f"No 'text' column in split(s): {', '.join(_splits_missing_text)}")

# Now when passing to the trainer, the correct field will be used
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_args,
    tokenizer=tokenizer,
    dataset_text_field="text"  # Specify the correct text field here
)
# Start fine-tuning
trainer.train()



