In [None]:
# Install every library this notebook imports. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
# `datasets` is upgraded explicitly; the other packages keep whatever version
# is already present so a preinstalled torch/transformers stack is not disturbed.
# torch itself is assumed to be provided by the runtime.
%pip install -q -U datasets
%pip install -q pandas huggingface_hub tokenizers transformers accelerate peft

In [None]:
# Authenticate with the Hugging Face Hub. The push_to_hub calls later in this
# notebook need a logged-in session.
!huggingface-cli login

# **Training Tokenizer From Scratch**

This approach ensures the best performance and doesn't require much compute.

In [None]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers, decoders, processors
from transformers import PreTrainedTokenizerFast, AutoTokenizer
from datasets import load_dataset
import tempfile
import os
import time

In [None]:
def _write_training_corpus(dataset, text_column):
    """Dump the non-empty texts of ``dataset`` into a temporary UTF-8 text file.

    Each kept text is stripped of surrounding whitespace and written followed
    by a newline. Missing values (``None``) are treated as empty and skipped.

    Args:
        dataset: Iterable of mapping-like examples.
        text_column: Key under which every example stores its text.

    Returns:
        Tuple ``(path, line_count, char_count)``. The file is not deleted
        automatically; the caller is responsible for removing it.

    Raises:
        Any exception raised while iterating or writing is re-raised after the
        partially written file has been removed.
    """
    total_chars = 0
    total_lines = 0

    handle = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8')
    try:
        with handle:
            for example in dataset:
                text = (example[text_column] or "").strip()
                if text:  # Skip empty texts
                    handle.write(text + '\n')
                    total_chars += len(text)
                    total_lines += 1
    except BaseException:
        # Do not leave a half-written corpus behind.
        os.unlink(handle.name)
        raise

    return handle.name, total_lines, total_chars


def train_kikuyu_tokenizer_from_scratch(
    dataset_name="thirtyninetythree/kikuyu_monolingual_sentences",  # This is a combination of Kikuyu sentences datasets available on huggingface, deduplicated and cleaned
    text_column="text",
    vocab_size=32000,
    model_name="kikuyu-tokenizer",
):
    """
    Train a BPE tokenizer from scratch optimized for Kikuyu language

    Args:
        dataset_name: Hugging Face dataset identifier; its ``train`` split is used.
        text_column: Name of the column holding the raw sentences.
        vocab_size: Target vocabulary size handed to the BPE trainer.
        model_name: Base name for the outputs: ``{model_name}.json`` (raw
            tokenizer) and the ``{model_name}/`` directory (Hugging Face format).

    Returns:
        The trained ``PreTrainedTokenizerFast``, or ``None`` when the dataset
        cannot be loaded or contains no non-empty text.
    """

    print(f"🚀 Training Kikuyu tokenizer from scratch")
    print(f"📊 Target vocab size: {vocab_size:,}")
    print("="*60)

    # Step 1: Load dataset from Hugging Face
    print("Loading dataset from Hugging Face...")
    try:
        # force_redownload: always fetch a fresh copy instead of reusing the local cache
        dataset = load_dataset(dataset_name, split="train", download_mode="force_redownload",)
        print(f"Loaded {len(dataset):,} examples")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        print("Make sure your dataset is public or you're logged in")
        return None

    # Step 2: Prepare training data
    print("\n📝 Preparing training text...")
    temp_file, total_lines, total_chars = _write_training_corpus(dataset, text_column)

    # The corpus file is only needed until training finishes. The finally
    # clause removes it on success and on every failure path.
    try:
        if total_lines == 0:
            print(f"No non-empty text found in column '{text_column}'; nothing to train on")
            return None

        print(f"Training data prepared:")
        print(f"   Temp file: {temp_file}")
        print(f"   Lines: {total_lines:,}")
        print(f"   Characters: {total_chars:,}")
        print(f"   Avg chars/line: {total_chars/total_lines:.1f}")

        # Step 3: Initialize tokenizer
        print("\nInitializing BPE tokenizer...")

        # unk_token must be set on the model itself. Without it, characters
        # that never occurred in the training corpus are dropped silently
        # instead of being encoded as <unk>.
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

        # Normalizer: Critical for Kikuyu diacritics (ũ, ĩ, etc.)
        tokenizer.normalizer = normalizers.Sequence([
            normalizers.NFKC(),  # Unicode normalization for diacritics
            # NOT lowercasing - preserve Kikuyu capitalization
        ])

        # Pre-tokenizer: How to split text before BPE
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
            pre_tokenizers.WhitespaceSplit(),  # Split on whitespace
            pre_tokenizers.Punctuation(),     # Separate punctuation
        ])

        # Decoder: converts tokens back to text (must use the same suffix as the trainer)
        tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")

        # Step 4: Set up trainer with Kikuyu-specific special tokens
        print("⚙️ Setting up BPE trainer...")

        special_tokens = [
            "<pad>",    # Padding token
            "<unk>",    # Unknown token
            "<s>",      # Start of sequence (Llama compatibility)
            "</s>",     # End of sequence (Llama compatibility)
            "[INST]",   # Instruction start (for future chat models)
            "[/INST]",  # Instruction end
            "<<SYS>>",  # System message start
            "<</SYS>>", # System message end
        ]

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=special_tokens,
            min_frequency=2,  # Don't include very rare tokens
            continuing_subword_prefix="",  # No prefix on word-internal pieces
            end_of_word_suffix="</w>",     # End of word marker
            show_progress=True,
        )

        # Step 5: Train the tokenizer
        print(f"\nTraining tokenizer on {total_lines:,} lines...")
        start_time = time.time()

        tokenizer.train([temp_file], trainer)

        training_time = time.time() - start_time
        print(f"Training completed in {training_time:.1f} seconds")
    finally:
        # Cleanup
        os.unlink(temp_file)

    # Step 6: Add post-processor for proper token handling
    tokenizer.post_processor = processors.TemplateProcessing(
        single="<s> $A </s>",
        pair="<s> $A </s> <s> $B </s>",
        special_tokens=[
            ("<s>", tokenizer.token_to_id("<s>")),
            ("</s>", tokenizer.token_to_id("</s>")),
        ],
    )

    # Step 7: Save raw tokenizer
    tokenizer_file = f"{model_name}.json"
    tokenizer.save(tokenizer_file)
    print(f"Raw tokenizer saved: {tokenizer_file}")

    # Step 8: Create Hugging Face compatible tokenizer
    print("\nCreating Hugging Face compatible tokenizer...")

    hf_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        unk_token="<unk>",
        pad_token="<pad>",
        bos_token="<s>",
        eos_token="</s>",
        additional_special_tokens=["[INST]", "[/INST]", "<<SYS>>", "<</SYS>>"],
        clean_up_tokenization_spaces=True
    )

    # Save HuggingFace tokenizer
    hf_tokenizer.save_pretrained(model_name)
    print(f"💾 HuggingFace tokenizer saved: {model_name}/")

    # Step 9: Test the tokenizer
    print("\n🧪 Testing tokenizer...")
    test_kikuyu_tokenizer(hf_tokenizer)

    print(f"\n🎉 SUCCESS! Kikuyu tokenizer trained from scratch")
    print(f"📁 Files created:")
    print(f"   • {tokenizer_file} (raw tokenizer)")
    print(f"   • {model_name}/ (HuggingFace format)")

    return hf_tokenizer

def test_kikuyu_tokenizer(tokenizer):
    """Test the tokenizer with Kikuyu text samples"""

    test_texts = [
        "Wanjiku nĩ mũrũithia wa kũheshimu.",
        "Kamau anapenda gũthoma vitabu vingi.",
        "Mũndũ ũũ nĩ mũheshimiwa mũno gũkũ.",
        "Andũ aya nĩ marafiki ma ma.",
        "Kĩndũ kĩu nĩ kĩega mũno kũrĩ andũ othe.",
        "Nyambura aracokia mũciĩ kũuma wĩra-inĩ.",
    ]

    print("="*50)
    print("KIKUYU TOKENIZER TEST")
    print("="*50)

    total_chars = 0
    total_tokens = 0

    for i, text in enumerate(test_texts, 1):
        # Tokenize
        tokens = tokenizer.tokenize(text)
        token_ids = tokenizer.encode(text, add_special_tokens=False)

        # Stats
        total_chars += len(text)
        total_tokens += len(tokens)

        print(f"\n{i}. Text: {text}")
        print(f"   Tokens ({len(tokens)}): {tokens}")
        print(f"   IDs: {token_ids}")

        # Test decoding
        decoded = tokenizer.decode(token_ids, skip_special_tokens=True)
        if decoded.strip() == text.strip():
            print(f"   ✅ Decoding: Perfect")
        else:
            print(f"   ⚠️  Decoding: '{decoded}' (slight difference)")

    # Overall stats
    compression_ratio = total_chars / total_tokens if total_tokens > 0 else 0

    print(f"\n📊 OVERALL PERFORMANCE:")
    print(f"   Characters: {total_chars}")
    print(f"   Tokens: {total_tokens}")
    print(f"   Compression: {compression_ratio:.2f} chars/token")
    print(f"   Vocabulary size: {tokenizer.vocab_size}")

    return compression_ratio

In [None]:
def main():
    """Train the Kikuyu BPE tokenizer, save it locally and push it to the Hub.

    Training is delegated to ``train_kikuyu_tokenizer_from_scratch``. When that
    call returns ``None`` (training did not produce a tokenizer), a message is
    printed and nothing is saved or pushed.

    Returns:
        None
    """
    print("🚀 Kikuyu Tokenizer Training from Scratch")
    print("="*50)

    # Configure your dataset
    DATASET_NAME = "thirtyninetythree/kikuyu_monolingual_sentences"
    TEXT_COLUMN = "text"
    # Defined once so the banner and the trainer arguments cannot drift apart.
    VOCAB_SIZE = 32000
    MODEL_NAME = "kikuyu-bpe-tokenizer"

    print(f"📊 Dataset: {DATASET_NAME}")
    print(f"📝 Text column: {TEXT_COLUMN}")
    print(f"🎯 Target: {VOCAB_SIZE:,} vocabulary size")

    # Train tokenizer
    custom_tokenizer = train_kikuyu_tokenizer_from_scratch(
        dataset_name=DATASET_NAME,
        text_column=TEXT_COLUMN,
        vocab_size=VOCAB_SIZE,
        model_name=MODEL_NAME
    )

    # The trainer signals failure by returning None; calling methods on it
    # would raise AttributeError.
    if custom_tokenizer is None:
        print("\nTokenizer training failed; nothing to save or push.")
        return

    custom_tokenizer.save_pretrained(MODEL_NAME)
    custom_tokenizer.push_to_hub(MODEL_NAME)

    print(f"\nTraining complete!")


# Entry point: a notebook kernel (like a direct script run) executes with
# __name__ == "__main__", so running this cell starts the training pipeline.
if __name__ == "__main__":
    main()

Testing the tokenizer

In [None]:
from transformers import AutoModelForCausalLM, PreTrainedTokenizerFast
import torch

# Smoke test: feed ids from the new Kikuyu tokenizer to the *unmodified* base model.
# The base model has not been trained on this tokenizer's ids, so the generated
# text is not expected to be meaningful at this stage.
tokenizer = PreTrainedTokenizerFast.from_pretrained("thirtyninetythree/kikuyu-bpe-tokenizer")

model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model.eval()  # Set model to evaluation mode

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Example Kikuyu input text
prompt = "Niwega muno"
inputs = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False).to(device)

output_ids = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    top_p=0.95,
    temperature=0.8,
    eos_token_id=tokenizer.eos_token_id,
    # Pass the pad id explicitly, as the post-training test cell does.
    pad_token_id=tokenizer.pad_token_id,
)

output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(output_text)


# **LoRA Fine-Tuning**



In [None]:
import torch
from transformers import AutoModelForCausalLM, PreTrainedTokenizerFast, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model # type: ignore
import os


HUB_REPO_NAME = "thirtyninetythree/TinyLlama-1.1B-Kikuyu-LoRA"
TOKENIZER_PATH = "thirtyninetythree/kikuyu-bpe-tokenizer"
DATASET_PATH = "thirtyninetythree/kikuyu_monolingual_sentences"
BASE_MODEL_PATH = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# bf16 is used only when the hardware supports it. The same flag drives both the
# model dtype and the Trainer precision so the two settings cannot disagree.
USE_BF16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# --- 1. Load Tokenizer and Base Model ---
print("Loading tokenizer...")
tokenizer = PreTrainedTokenizerFast.from_pretrained(TOKENIZER_PATH)

# Add padding token if it doesn't exist (common for Llama-like models for training)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<pad>"})
    print(f"Added pad_token: '{tokenizer.pad_token}' with ID {tokenizer.pad_token_id}")

print("Loading base model...")
# trust_remote_code is intentionally not enabled: the checkpoint is loaded
# through AutoModelForCausalLM without running code shipped in the model repo.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    torch_dtype=torch.bfloat16 if USE_BF16 else torch.float32,
)

# 2. Resize Token Embeddings (Crucial for new vocabulary from your tokenizer)
print(f"Original model embedding size: {model.get_input_embeddings().weight.shape[0]}")
model.resize_token_embeddings(len(tokenizer))
print(f"Resized model embedding size: {model.get_input_embeddings().weight.shape[0]}")

# --- 3. Configure LoRA ---
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    # The tokenizer is brand new, so each token id now refers to a different
    # string than it did during pre-training. The input embeddings and the
    # output head must be trained (and saved with the adapter); otherwise they
    # stay frozen and keep their old, unrelated meaning.
    modules_to_save=["embed_tokens", "lm_head"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# --- 4. Get the PEFT (LoRA) Model ---
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# --- 5. Prepare Dataset ---
print("\nLoading and preparing dataset...")
dataset = load_dataset(DATASET_PATH, split="train")

def tokenize_function(examples):
    """Tokenize a batch of examples, truncating each text to 512 tokens."""
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    # Drop every raw column, not just "text", so only tokenizer outputs reach the collator.
    remove_columns=dataset.column_names,
)

# mlm=False -> causal language modelling: labels are derived from the input ids.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# --- 6. Set up Training Arguments ---
training_args = TrainingArguments(
    output_dir="./fine_tuned_kikuyu_llama_lora", # Temporary output directory for trainer
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs_lora",
    logging_steps=100,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    fp16=False,
    bf16=USE_BF16,
    report_to="none",
    push_to_hub=False,
)

# --- 7. Start Fine-tuning with Trainer ---
print("\nStarting LoRA fine-tuning...")
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

trainer.train()
print("\nLoRA Fine-tuning complete.")

# --- 8. Push LoRA Adapters and Tokenizer to Hugging Face Hub ---
print(f"\nPushing LoRA adapters and tokenizer to Hugging Face Hub at: {HUB_REPO_NAME}")
try:
    trainer.model.push_to_hub(HUB_REPO_NAME)
    tokenizer.push_to_hub(HUB_REPO_NAME)
    print("Successfully pushed model adapters and tokenizer to Hugging Face Hub!")
    print(f"You can view your model at: https://huggingface.co/{HUB_REPO_NAME}")
except Exception as e:
    # Best effort: a failed upload should not discard the trained model still in memory.
    print(f"Failed to push to Hugging Face Hub: {e}")
    print("Please ensure you are logged in: `huggingface-cli login`")



In [None]:
# Qualitative check: sample a continuation for a few prompts with the model
# held in `model` after the fine-tuning cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move model to device (trainer.train() should have done this, but explicit for clarity)
model.to(device)
model.eval()

# Fall back to the EOS id when the tokenizer defines no padding id.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

prompts = ["Niwega muno", "Mathomo ma Kikuyu ni", "Mwana wa ndege"]

# Sampling settings shared by every prompt.
generation_kwargs = {
    "max_new_tokens": 50,
    "do_sample": True,
    "top_p": 0.95,
    "temperature": 0.8,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
}

for prediction_number, prompt in enumerate(prompts, start=1):
    inputs = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False).to(device)
    output_ids = model.generate(**inputs, **generation_kwargs)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    print(f"\n--- Prediction {prediction_number} ---")
    print(f"Prompt: {prompt}")
    print(f"Generated Output: {output_text}")
