# Fine-tuning TinyLlama/TinyLlama-1.1B-Chat-v1.0 for fluency in Assamese.
## Training in two stages.

### Stage 1: Domain-adaptive pretraining
Training the model on raw Assamese text exposes it to the language's vocabulary, grammar, and cultural context, improving its generation quality in Assamese.  
### Stage 2: Task-specific fine-tuning
Fine-tuning on a conversation dataset adapts the model to dialogue patterns, ensuring it responds appropriately in Assamese conversations.  

**This approach mitigates TinyLlama's limited pretraining exposure to Assamese, reducing issues like poor tokenization or unnatural responses.**

In [None]:
%%capture
# Environment setup; %%capture suppresses the pip output of this cell.
# 1. Install pip3-autoremove and use it to uninstall torch, torchvision and torchaudio.
# 2. Reinstall them together with xformers from the PyTorch CUDA 12.1 wheel index.
# 3. Install Unsloth on top of that stack.
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth

In [None]:
from unsloth import FastLanguageModel, UnslothTrainer, is_bfloat16_supported

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer
from transformers import TrainingArguments

In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub; the push_to_hub calls later in this
# notebook upload the trained adapters and tokenizer.
login()

In [None]:
# Maximum sequence length passed to the model loader and to both SFT trainers.
# Kept at only 1024 because of limited hardware.
MAX_SEQ_LENGTH = 1024

### Loading the model in 4-bit quantized format for efficiency and applying LoRA adapters.

In [None]:
# Load the 4-bit quantized base model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_seq_length=MAX_SEQ_LENGTH,
    # dtype=None lets Unsloth pick float16 or bfloat16 for the current GPU.
    # Hard-coding float16 conflicts with the training configs below, which
    # enable bf16 whenever the GPU reports bfloat16 support.
    dtype=None,
    load_in_4bit=True,  # 4-bit quantization
)

# Wrap the base model with LoRA adapters on every attention and MLP projection;
# only these adapter weights are trained in both stages.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing=True,
    random_state=1337,
)
# Report how many parameters are trainable versus frozen.
model.print_trainable_parameters()

## Load dataset for both the stages.
### raw_dataset - contains 12k high quality Assamese sentences
### conv_dataset - contains 2k samples of user->assistant conversation

In [None]:
# Stage 1 corpus: raw Assamese text file, loaded with the "text" builder.
raw_dataset = load_dataset(
    "text",
    data_files="/kaggle/input/train-as/train-01.txt",
    split="train",
)

# Stage 2 corpus: user/assistant conversations stored as JSON Lines.
conv_dataset = load_dataset(
    "json",
    data_files="/kaggle/input/as-train-processed/conversations.jsonl",
    split="train",
)

## Stage 1 setup and training.

In [None]:
# Stage 1 (domain-adaptive pretraining) configuration.
# Decide the mixed-precision mode once, using Unsloth's capability check
# instead of torch.cuda.is_bf16_supported(), which may report True on GPUs
# without native bfloat16 support. The fp16/bf16 flags must match the dtype
# the model was loaded with.
use_bf16 = is_bfloat16_supported()

training_args_stage1 = TrainingArguments(
    output_dir="/kaggle/working/as/tinyllama-training-stage1",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 sequences per optimizer step
    max_steps=1600,
    learning_rate=1e-4,  # Smaller learning rate for stage 1
    fp16=not use_bf16,  # Use FP16 if BF16 is not supported
    bf16=use_bf16,  # Use BF16 if supported
    save_steps=200,
    logging_dir="/kaggle/working/as/logs_stage1",
    logging_steps=100,
    optim="adamw_8bit",  # Memory-efficient optimizer
    report_to="none",
    warmup_steps=100,
    dataloader_num_workers=4,  # Parallel data loading
    dataloader_pin_memory=True,  # Pinned host memory for GPU transfer
    seed=1337,
)

# Trainer over the raw-text dataset; each record's "text" field is the
# training sample.
trainer_stage1 = SFTTrainer(
    model=model,
    train_dataset=raw_dataset,  # using the raw_text dataset
    tokenizer=tokenizer,
    dataset_text_field="text",  # Field name in the dataset containing text
    max_seq_length=MAX_SEQ_LENGTH,
    dataset_num_proc=2,  # Number of processes for data preprocessing
    args=training_args_stage1,
)

In [None]:
# Run stage 1: domain-adaptive training on the raw Assamese text.
trainer_stage1.train()

### Save the domain-adapted model to a local directory and to the Hugging Face Hub

In [None]:
# Write the stage 1 adapters and the tokenizer side by side in one directory.
stage1_adapter_dir = "/kaggle/working/as/trained-stage1/tinyllama-lora-adapters"
model.save_pretrained(stage1_adapter_dir)
tokenizer.save_pretrained(stage1_adapter_dir)

In [None]:
# Upload the stage 1 adapters and the tokenizer to the same Hub repository.
stage1_repo_id = "themid6t/assamese-tinyllama-base"
model.push_to_hub(stage1_repo_id)
tokenizer.push_to_hub(stage1_repo_id)

## Stage 2 setup and training

In [None]:
def format_conversations(examples):
    """Render one conversation record as a single chat-templated string.

    A fixed system message is placed in front of the record's ``"messages"``
    list, and the result is rendered with the module-level ``tokenizer``'s
    chat template (as text, not token ids).

    Args:
        examples: A single dataset record with a ``"messages"`` list.

    Returns:
        A dict with one key, ``"text"``, holding the rendered conversation.
    """
    system_turn = {
        "role": "system",
        "content": "You are a helpful assistant fluent in Assamese.",
    }
    conversation = [system_turn] + examples["messages"]
    return {"text": tokenizer.apply_chat_template(conversation, tokenize=False)}

# Add a "text" column with the chat-formatted conversation; batched=False means
# format_conversations receives one record at a time.
conv_dataset = conv_dataset.map(format_conversations, batched=False)

In [None]:
# Sanity check: display the first 1000 characters of the first formatted conversation.
conv_dataset[0]["text"][:1000]

In [None]:
# Stage 2 (conversation fine-tuning) configuration.
# Decide the mixed-precision mode once, using Unsloth's capability check
# instead of torch.cuda.is_bf16_supported(), which may report True on GPUs
# without native bfloat16 support. The fp16/bf16 flags must match the dtype
# the model was loaded with.
use_bf16 = is_bfloat16_supported()

# NOTE(review): output_dir is a relative path outside /kaggle/working/as, so
# stage 2 checkpoints are not part of the as.zip archive created at the end of
# the notebook. Confirm whether that is intended.
training_args_stage2 = TrainingArguments(
    output_dir="./fine_tuned_tinyllama",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 sequences per optimizer step
    learning_rate=2e-4,
    num_train_epochs=10,
    # max_steps=500,  # Adjust based on dataset size
    # logging_steps=10,
    save_steps=100,
    fp16=not use_bf16,  # Use FP16 if BF16 is not supported
    bf16=use_bf16,  # Use BF16 if supported
    optim="adamw_8bit",
    report_to="none",
    # warmup_steps=50
)

# Trainer over the chat-formatted conversations. It reuses the same model
# object, so stage 2 continues from the adapters trained in stage 1.
trainer_stage2 = SFTTrainer(
    model=model,  # Reuse the same model instance
    tokenizer=tokenizer,
    train_dataset=conv_dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    args=training_args_stage2,
)

In [None]:
# Run stage 2: supervised fine-tuning on the chat-formatted conversations.
trainer_stage2.train()

### Save the conversation model to the Hugging Face Hub and to a local directory

In [None]:
# Write the stage 2 adapters and the tokenizer side by side in one directory.
stage2_adapter_dir = "/kaggle/working/as/trained-stage2/tinyllama-lora-adapters"
model.save_pretrained(stage2_adapter_dir)
tokenizer.save_pretrained(stage2_adapter_dir)

In [None]:
# Upload the stage 2 adapters and the tokenizer to the same Hub repository.
stage2_repo_id = "themid6t/assamese-tinyllama-chat"
model.push_to_hub(stage2_repo_id)
tokenizer.push_to_hub(stage2_repo_id)

### Infer and test

In [None]:
from transformers import pipeline

In [None]:
# Training leaves the model in training mode. Switch to evaluation mode so the
# LoRA dropout configured for the adapters is not applied while generating.
model.eval()

# Wrap the fine-tuned model and tokenizer in a text-generation pipeline.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")

In [None]:
# Test conversation: the system prompt used during fine-tuning plus one user turn.
messages = [
    {"role": "system", "content": "You are a helpful assistant fluent in Assamese."},
    {"role": "user", "content": "আপুনি কোন?"},
]
# add_generation_prompt=True appends the assistant-turn header, so the model
# writes the assistant's reply instead of first having to emit the role marker.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

In [None]:
# Sample a reply of up to 256 new tokens at temperature 0.7 and print the
# text of the first returned sequence.
generation_kwargs = {
    "max_new_tokens": 256,
    "do_sample": True,
    "temperature": 0.7,
}
outputs = pipe(prompt, **generation_kwargs)
first_result = outputs[0]
print(first_result["generated_text"])

In [None]:
# Create a ZIP archive of everything under /kaggle/working/as
# (stage 1 checkpoints and logs, plus the saved adapters of both stages).
!zip -r /kaggle/working/as.zip /kaggle/working/as

# Render a clickable download link for the archive in the notebook output.
from IPython.display import FileLink
FileLink(r'/kaggle/working/as.zip')  # Click the rendered link to download