### Installation

In [13]:
%%capture
# Install Unsloth and its dependencies; %%capture keeps the pip output out of the notebook.
import os, re
# Colab is detected by searching for "COLAB_" in the concatenated environment variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # `v` is the leading "major.minor" part of the installed torch version; it selects the xformers pin below.
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    # torch 2.9 -> xformers 0.0.33.post1, torch 2.8 -> 0.0.32.post2, anything else -> 0.0.29.post3
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # --no-deps installs only the named packages and skips their dependency resolution.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# These two pins run in both branches, after the installs above.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

### Unsloth

In [14]:
from unsloth import FastLanguageModel
import torch

# Loader settings.
max_seq_length = 2048 # Any value works; RoPE scaling is handled internally.
dtype = None # None = auto detection. Float16 for Tesla T4 / V100, Bfloat16 for Ampere+.
load_in_4bit = True # 4bit quantization reduces memory usage. May be set to False.

# Pre-quantized 4bit checkpoints (faster download, no OOMs).
# More models at https://huggingface.co/unsloth
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",          # New Mistral v3
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",               # Llama-3, 15 trillion tokens
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",            # Phi-3
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",                 # Gemma
]

# Keyword arguments for the loader, collected in one place.
# For gated models such as meta-llama/Llama-2-7b-hf, also add token = "hf_...".
model_load_options = dict(
    model_name = "unsloth/mistral-7b-v0.3", # "unsloth/mistral-7b" for 16bit loading
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model, tokenizer = FastLanguageModel.from_pretrained(**model_load_options)

==((====))==  Unsloth 2026.1.4: Fast Mistral patching. Transformers: 4.56.2.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [15]:
# Modules that receive LoRA adapters: the attention and MLP projections,
# plus the embedding matrix and output head (added for continual pretraining).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "embed_tokens", "lm_head",
]

model = FastLanguageModel.get_peft_model(
    model,
    target_modules = lora_target_modules,
    r = 128,                 # Any number > 0; suggested 8, 16, 32, 64, 128
    lora_alpha = 32,
    lora_dropout = 0,        # Any value is supported, 0 is the optimized path
    bias = "none",           # Any value is supported, "none" is the optimized path
    # True or "unsloth"; "unsloth" is intended for very long context / lower VRAM
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = True,       # Rank stabilized LoRA
    loftq_config = None,     # LoftQ is not used
)

Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM




Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


In [18]:
from datasets import load_dataset

# Fetch the "train" split of the Tamil Wikipedia markdown dataset.
dataset = load_dataset("wickkiey/tamil-wikipedia-markdown", split="train")

# Hold out 10% of the rows for validation; the seed keeps the split repeatable.
dataset_splits = dataset.train_test_split(test_size=0.1, seed=42)

EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Return the batch's "text" entries with the tokenizer's EOS token appended to each."""
    terminated_texts = []
    for document in examples["text"]:
        terminated_texts.append(document + EOS_TOKEN)
    return {"text": terminated_texts}


# Apply the EOS formatting to both splits, one batch at a time.
train_dataset = dataset_splits['train'].map(formatting_prompts_func, batched = True)
val_dataset = dataset_splits['test'].map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/153913 [00:00<?, ? examples/s]

Map:   0%|          | 0/17102 [00:00<?, ? examples/s]

In [19]:
# Report the number of rows in each split, one line per split.
print(
    f"Training dataset size: {len(train_dataset)}",
    f"Validation dataset size: {len(val_dataset)}",
    sep="\n",
)

Training dataset size: 153913
Validation dataset size: 17102


### Save Finetuned Model
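To save the finetuned model to your Google Drive, we first need to mount your Drive. Then, we can use the `save_pretrained` method for both the model and the tokenizer. Note that the first save cell in this section runs before training, so it writes the not-yet-trained model; the save cell that follows training writes the finetuned weights to the same path.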

In [7]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [8]:
# Show the "text" field of the first five rows, each preceded by a separator line.
preview_texts = dataset[:5]["text"]
for article in preview_texts:
    print("=========================", article, sep="\n")

# விக்கிப்பீடியா:கலந்துரையாடல்

***இங்கு தமிழ் விக்கிப்பீடியாவைப் பற்றிய உங்கள் பொதுவான கருத்துக்கள், பாராட்டுக்கள் மற்றும் ஆலோசனைகளைத் தெரிவிக்கலாம். உங்கள் கருத்துகளுக்கு மற்ற விக்கிப்பீடியர்கள் பதில் அளிப்பார்கள். தகுந்த ஆலோசனைகளை உடனே செயற்படுத்தவும் செய்வோம். விக்கிப்பீடியா திட்டத்தின் வளர்ச்சி, மேலாண்மை குறித்த கலந்துரையாடலுக்கு விக்கிப்பீடியா:ஆலமரத்தடிக்குச் செல்லுங்கள். உங்கள் கருத்துக்களைப் பதிய மேலே காணும் *தொகு* இணைப்பை சொடுக்கவும். அடுத்து வரும் பக்கத்தில் உள்ள தொகுப்புப் பெட்டியில் கருத்தை உள்ளிட்டு "பக்கத்தைச் சேமிக்கவும்" என்பதை அழுத்தவும். நன்றி.***  

## முந்தைய கலந்துரையாடல்கள்
- **தொகுப்பு 01** (மயூரநாதனின் முதற் குறிப்பு!, தமிழ் விக்கிப்பீடியா 1000 கட்டுரைகள்.)

- **தொகுப்பு 02**
மிகவும் அருமையான முயற்சி வாழ்த்துக்கள்  

## பயனர் கருத்துகள்
உங்கள் கருத்துகளை இதன் கீழ் இடவும். கையெழுத்து இடப் பார்க்கவும்: விக்கிப்பீடியா:கையெழுத்து  

First time I saw the Tamil version of Wikimedia site. It's great...keep it up. Unfortunately I don't know how to use the tamil font her

In [9]:
import os

# NOTE(review): in the notebook order shown, this cell comes before the cell
# that calls trainer.train(), so what is written here is the model as it was
# before training, even though the path and message say "finetuned".
# Confirm this early save is intended.

# Target folder on the mounted Google Drive; created on first use.
save_path = "/content/drive/MyDrive/mistral_tamil_finetuned_model"
os.makedirs(save_path, exist_ok=True)

# Write the model first, then the tokenizer, into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"Finetuned model and tokenizer saved to {save_path}")

Finetuned model and tokenizer saved to /content/drive/MyDrive/mistral_tamil_finetuned_model


In [20]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import UnslothTrainer, UnslothTrainingArguments

# Build the trainer for continued pretraining on the "text" column.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = val_dataset, # Validation split; evaluated per the eval_* settings below
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8, # Effective batch size per device = 2 x 8 = 16

        # Without an explicit strategy the default ("no") applies and the
        # eval_dataset passed above is tokenized but never evaluated.
        eval_strategy = "steps",
        eval_steps = 1000, # Evaluate periodically; lower for more frequent validation loss
        per_device_eval_batch_size = 2, # Match the train batch size to limit eval memory

        warmup_ratio = 0.1,
        num_train_epochs = 10,
        # max_steps = 2, # Uncomment for a quick smoke test instead of full training

        learning_rate = 5e-5,
        embedding_learning_rate = 5e-6, # Separate, smaller rate for embed_tokens / lm_head

        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.00,
        lr_scheduler_type = "cosine",
        seed = 3407,
        # Checkpoints go to Google Drive so they persist outside the Colab runtime.
        output_dir = "/content/drive/MyDrive/mistral_tamil_finetuned_checkpoints",
        save_strategy = "steps",
        save_steps = 30, # Save every 30 steps
        save_total_limit = 2, # Keep only the 2 most recent checkpoints
        report_to = "none", # Use TrackIO/WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/153913 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/17102 [00:00<?, ? examples/s]

In [11]:
import os
import glob
import re

# Define the output directory where checkpoints are saved
output_dir = "/content/drive/MyDrive/mistral_tamil_finetuned_checkpoints"


def _checkpoint_step(path):
    """Return the integer step of a `checkpoint-<step>` directory, or None.

    None is returned when the last path component does not have the exact
    form `checkpoint-<digits>` or when the path is not a directory.
    """
    name_match = re.fullmatch(r"checkpoint-(\d+)", os.path.basename(path))
    if name_match is None or not os.path.isdir(path):
        return None
    return int(name_match.group(1))


# Order checkpoints by their numeric step. A plain string sort would place
# "checkpoint-90" after "checkpoint-120" and resume from the older one.
checkpoints = sorted(
    (
        path
        for path in glob.glob(os.path.join(output_dir, "checkpoint-*"))
        if _checkpoint_step(path) is not None
    ),
    key=_checkpoint_step,
)

# The highest step is the most recent checkpoint; None means a fresh start.
latest_checkpoint = checkpoints[-1] if checkpoints else None
if latest_checkpoint is not None:
    print(f"Resuming training from checkpoint: {latest_checkpoint}")
else:
    print("No checkpoints found, starting training from scratch.")

# Train the model, resuming from the latest checkpoint if available
trainer.train(resume_from_checkpoint=latest_checkpoint)


No checkpoints found, starting training from scratch.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 25 | Num Epochs = 10 | Total steps = 20
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 603,979,776 of 7,852,003,328 (7.69% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.1129
2,1.0125
3,1.1319
4,1.0573
5,0.9434
6,0.8465
7,0.7247
8,0.7937
9,0.6484
10,0.5484


TrainOutput(global_step=20, training_loss=0.6123763591051101, metrics={'train_runtime': 582.5707, 'train_samples_per_second': 0.429, 'train_steps_per_second': 0.034, 'total_flos': 2.181655972552704e+16, 'train_loss': 0.6123763591051101, 'epoch': 10.0})

## Save the Final Finetuned Model


In [12]:
import os

# Destination folder on Google Drive for the model and tokenizer files.
save_path = "/content/drive/MyDrive/mistral_tamil_finetuned_model"

# exist_ok=True: an already existing folder is reused rather than raising.
os.makedirs(save_path, exist_ok=True)

# Save the model, then the tokenizer, side by side in the same folder.
for saveable in (model, tokenizer):
    saveable.save_pretrained(save_path)

print(f"Finetuned model and tokenizer saved to {save_path}")

Finetuned model and tokenizer saved to /content/drive/MyDrive/mistral_tamil_finetuned_model


### Verify Finetuned Model by Loading and Inference
After saving the finetuned model to a distinct Google Drive path, it is crucial to verify that the model can be loaded correctly and performs as expected for inference. This step involves loading the model and tokenizer from the specified path and then using them to generate text based on a sample prompt.