### Continued Pretraining on a New Language using Unsloth

In [21]:
!pip install -U bitsandbytes accelerate unsloth transformers datasets



In [22]:
import os
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer
import warnings
warnings.filterwarnings("ignore")

In [23]:
# Check and confirm bitsandbytes installation
try:
    import bitsandbytes as bnb
    print("bitsandbytes installed successfully:", bnb.__version__)
except ImportError:
    print("Installing bitsandbytes again to ensure proper setup...")
    !pip install -U bitsandbytes
    import bitsandbytes as bnb
    print("bitsandbytes installed successfully:", bnb.__version__)

# Now import Unsloth
try:
    from unsloth import FastLanguageModel
except ImportError:
    print("Installing unsloth...")
    !pip install -U unsloth
    from unsloth import FastLanguageModel

bitsandbytes installed successfully: 0.45.5


In [24]:
# Set up environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [25]:
# Define constants
MODEL_NAME = "meta-llama/Llama-2-7b-hf"  # Base model
NEW_MODEL_NAME = "llama2-7b-swedish"  # Name for the fine-tuned model
TARGET_LANGUAGE = "swedish"  # The new language to learn
MAX_SEQ_LENGTH = 1024  # Reduced from 2048 to save memory
BATCH_SIZE = 2  # Reduced batch size for memory efficiency
GRADIENT_ACCUMULATION_STEPS = 8  # Increased to compensate for smaller batch size
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1  # Reduced for demonstration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Training on: {DEVICE}")

Training on: cuda


In [26]:
# Check for available memory
if DEVICE == "cuda":
    # Print GPU info
    !nvidia-smi

    # Consider a smaller model if memory is limited
    if torch.cuda.get_device_properties(0).total_memory < 15 * 1024 * 1024 * 1024:  # Less than 15GB
        print("Limited GPU memory detected! Switching to a smaller model...")
        MODEL_NAME = "facebook/opt-1.3b"  # Use a smaller model

Mon Apr 14 05:01:46 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   63C    P0             30W /   70W |    6740MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [27]:
# Load model and tokenizer with fallback options
try:
    # Try 4-bit first for maximum memory efficiency
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=torch.bfloat16,
        load_in_4bit=True,
    )
    print("Model loaded in 4-bit quantization")
except Exception as e:
    print(f"4-bit loading failed: {e}")
    try:
        # Try 8-bit as fallback
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=MODEL_NAME,
            max_seq_length=MAX_SEQ_LENGTH,
            dtype=torch.float16,
            load_in_8bit=True,
        )
        print("Model loaded in 8-bit quantization")
    except Exception as e:
        print(f"8-bit loading failed: {e}")
        # Final fallback to FP16
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=MODEL_NAME,
            max_seq_length=MAX_SEQ_LENGTH,
            dtype=torch.float16,
        )
        print("Model loaded in FP16 (no quantization)")

==((====))==  Unsloth 2025.3.19: Fast Opt patching. Transformers: 4.51.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded in 4-bit quantization


In [28]:
# Prepare the model for training with PEFT/LoRA adapters
# The API has changed - let's check documentation and current version
import unsloth
print(f"Unsloth version: {unsloth.__version__}")

# Using the correct API for the current version of Unsloth
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                   "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Load dataset - use a smaller subset for testing and faster training
print(f"Loading dataset for {TARGET_LANGUAGE}...")
try:
    # Try loading Oscar dataset first
    dataset = load_dataset("oscar", f"unshuffled_deduplicated_{TARGET_LANGUAGE}", split="train[:1000]")  # Small subset
except Exception as e:
    print(f"Failed to load Oscar dataset: {e}")
    # Fallback to OPUS books dataset or MC4 if available
    try:
        dataset = load_dataset("opus_books", "sv", split="train[:1000]")  # For Swedish example
    except:
        try:
            dataset = load_dataset("mc4", languages=[TARGET_LANGUAGE], split="train[:1000]")
        except:
            # Create a tiny synthetic dataset as a last resort
            print("Using synthetic data for demonstration")
            from datasets import Dataset

            # Sample Swedish sentences (replace with actual content in your target language)
            swedish_texts = [
                "Hej, hur mår du idag?",
                "Jag studerar svenska språket.",
                "Det är en vacker dag idag.",
                "Tack för din hjälp.",
                "Jag älskar att läsa böcker."
            ] * 200  # Repeat to make a larger dataset

            dataset = Dataset.from_dict({"text": swedish_texts})

Unsloth version: 2025.3.19
Unsloth: Making `model.base_model.model.model.decoder` require gradients
Loading dataset for swedish...
Failed to load Oscar dataset: BuilderConfig 'unshuffled_deduplicated_swedish' not found. Available: ['unshuffled_deduplicated_af', 'unshuffled_deduplicated_als', 'unshuffled_deduplicated_am', 'unshuffled_deduplicated_an', 'unshuffled_deduplicated_ar', 'unshuffled_deduplicated_arz', 'unshuffled_deduplicated_as', 'unshuffled_deduplicated_ast', 'unshuffled_deduplicated_av', 'unshuffled_deduplicated_az', 'unshuffled_deduplicated_azb', 'unshuffled_deduplicated_ba', 'unshuffled_deduplicated_bar', 'unshuffled_deduplicated_bcl', 'unshuffled_deduplicated_be', 'unshuffled_deduplicated_bg', 'unshuffled_deduplicated_bh', 'unshuffled_deduplicated_bn', 'unshuffled_deduplicated_bo', 'unshuffled_deduplicated_bpy', 'unshuffled_deduplicated_br', 'unshuffled_deduplicated_bs', 'unshuffled_deduplicated_bxr', 'unshuffled_deduplicated_ca', 'unshuffled_deduplicated_cbk', 'unshuffl

In [29]:
# Function to tokenize dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        return_tensors="pt",
        padding="max_length",
    )

# Process the dataset
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
    num_proc=4,
)

# Create a small validation split
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.05)

# Set up training arguments
training_args = TrainingArguments(
    output_dir=f"./output/{NEW_MODEL_NAME}",
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.03,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,
    fp16=(DEVICE == "cuda" and not torch.cuda.is_bf16_supported()),
    bf16=(DEVICE == "cuda" and torch.cuda.is_bf16_supported()),
    logging_steps=10,
    report_to="tensorboard",
    optim="adamw_torch",
    max_grad_norm=1.0,
)

Tokenizing dataset...


Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [30]:
# Create data collator for language modeling
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # We want causal language modeling, not masked language modeling
)

In [31]:
# Start training
print("Starting training...")

# Use standard HuggingFace Trainer - Unsloth optimizations are already applied to the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 950 | Num Epochs = 1 | Total steps = 59
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 4,718,592/3,000,000,000 (0.16% trained)


Step,Training Loss,Validation Loss
50,5.9588,5.364227


TrainOutput(global_step=59, training_loss=7.157562449827033, metrics={'train_runtime': 545.3183, 'train_samples_per_second': 1.742, 'train_steps_per_second': 0.108, 'total_flos': 7037185400242176.0, 'train_loss': 7.157562449827033, 'epoch': 0.9936842105263158})

In [32]:
# Save the final model
print("Saving model...")
model_path = f"./output/{NEW_MODEL_NAME}_final"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

Saving model...


('./output/llama2-7b-swedish_final/tokenizer_config.json',
 './output/llama2-7b-swedish_final/special_tokens_map.json',
 './output/llama2-7b-swedish_final/vocab.json',
 './output/llama2-7b-swedish_final/merges.txt',
 './output/llama2-7b-swedish_final/added_tokens.json',
 './output/llama2-7b-swedish_final/tokenizer.json')

In [33]:
# Test the model
print("Testing model...")
test_prompt = f"Translate this to {TARGET_LANGUAGE}: Hello, how are you today?"
inputs = tokenizer(test_prompt, return_tensors="pt").to(DEVICE)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        temperature=0.7,
        top_p=0.9,
    )

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Input: {test_prompt}")
print(f"Output: {generated_text}")

print("Done!")

Testing model...
Input: Translate this to swedish: Hello, how are you today?
Output: Translate this to swedish: Hello, how are you today?
















       ..,,,, but----,-\\\\\\\\\\ Macca
Done!
