In [None]:
!pip install torch transformers peft bitsandbytes accelerate trl datasets "huggingface_hub[cli]"



In [None]:
# To import necessary libs
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from huggingface_hub import login
import os
from datasets import load_dataset
from trl import SFTTrainer


In [None]:
!pip install --upgrade trl
!pip show trl

Name: trl
Version: 0.18.1
Summary: Train transformer language models with reinforcement learning.
Home-page: https://github.com/huggingface/trl
Author: Leandro von Werra
Author-email: leandro.vonwerra@gmail.com
License: 
Location: /usr/local/lib/python3.11/dist-packages
Requires: accelerate, datasets, transformers
Required-by: 


In [None]:
# Base checkpoint that will be fine-tuned with LoRA adapters.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# 4-bit NF4 quantization for the frozen base weights; matmuls are computed in
# bfloat16 and double quantization is left off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Load the quantized model and let accelerate place it on the available devices.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.config.pretraining_tp = 1
# The generation cache is not needed while training.
model.config.use_cache = False

# `trust_remote_code` is intentionally not enabled: this checkpoint uses the
# tokenizer classes that ship with transformers, so there is no reason to allow
# code from the model repository to be executed.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Get the quantized base model ready for k-bit training before adapters are attached.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings: rank-16 adapters on every attention and MLP
# projection, with no bias terms trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=(
        # attention projections
        ["q_proj", "k_proj", "v_proj", "o_proj"]
        # MLP projections
        + ["gate_proj", "up_proj", "down_proj"]
    ),
)

# Wrap the base model with the adapters and report how many weights will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("--- Initial Model Setup Complete ---")

# Fetch the training split of the MLIR pass dataset from the Hugging Face Hub.
print("Loading dataset")
raw_dataset = load_dataset("WangX0111/MLIR-Passes", split="train")


def format_mlir_example(example):
    """Render one dataset row as a single Llama 3 Instruct training string.

    Args:
        example: Mapping with a "source" entry (the MLIR code shown to the
            model) and a "pass" entry (the pass invocation used as the target).

    Returns:
        dict: ``{"text": ...}`` containing the system, user and assistant turns
        laid out in the Llama 3 Instruct prompt format, i.e. each turn is
        ``<|start_header_id|>ROLE<|end_header_id|>``, a blank line, the content,
        then ``<|eot_id|>`` with nothing between consecutive turns.
    """
    system_prompt = "You are an expert MLIR compiler optimizer. Your goal is to select the single most effective MLIR pass to apply to the provided code."
    user_prompt = f"MLIR Source Code:\n```\n{example['source']}\n```"
    assistant_response = f"Optimal MLIR Pass: `{example['pass']}`"

    def render_turn(role, content):
        # One chat turn: role header, blank line, content, end-of-turn marker.
        return f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"

    # No literal <|begin_of_text|> is emitted here: the Llama 3 tokenizer
    # prepends the BOS token itself when the text is tokenized for training,
    # so hard-coding it would put two BOS tokens at the start of every sample.
    turns = (
        ("system", system_prompt),
        ("user", user_prompt),
        ("assistant", assistant_response),
    )
    return {"text": "".join(render_turn(role, content) for role, content in turns)}

# Turn every raw row into a single "text" column and drop the original columns.
print("formatting dataset")
formatted_dataset = raw_dataset.map(format_mlir_example, remove_columns=raw_dataset.column_names)

training_args = TrainingArguments(
    output_dir="./mlir_optimizer_results",  # Directory to save checkpoints and logs
    num_train_epochs=1,                     # Train for only 1 epoch (very minimal, good for quick test)
                                            # You'll likely need more for actual performance
    per_device_train_batch_size=1,          # Batch size per GPU (start with 1 due to 8GB VRAM)
    gradient_accumulation_steps=4,          # Accumulate gradients over 4 steps to simulate a larger batch size of 4
    optim="paged_adamw_8bit",               # Optimizer designed for QLoRA
    logging_steps=100,                      # Log training progress every 100 steps
    learning_rate=2e-4,                     # Standard learning rate for fine-tuning LLMs
    fp16=True,                              # Enable mixed precision training (important for VRAM)
    bf16=False,                             # Set to True if your GPU supports bfloat16, otherwise False
    max_grad_norm=0.3,                      # Clip gradients to prevent exploding gradients
    warmup_ratio=0.03,                      # Warmup learning rate over first 3% of training
    lr_scheduler_type="cosine",             # Learning rate schedule
    report_to="none",                       # Don't report to external tools (wandb etc.) for simplicity
    save_strategy="steps",                  # Save checkpoints based on steps
    save_steps=1000,                        # Save every 1000 steps
)
print("TrainingArguments configured.")

# `model` was already wrapped with the LoRA adapters by `get_peft_model`, so no
# `peft_config` is passed here: handing the trainer both a PeftModel and a
# config is redundant at best and rejected by newer TRL releases.
# The tokenizer configured above is passed explicitly so the trainer uses the
# same padding setup instead of loading a fresh tokenizer on its own.
trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,
    processing_class=tokenizer,
    args=training_args,
)
print("SFTTrainer initialized. Starting training...")

# Where the final LoRA adapters are written once training has succeeded.
final_adapter_dir = os.path.join(training_args.output_dir, "final_lora_adapters")

try:
    trainer.train()
except RuntimeError as e:
    print(f"\nTraining failed with a RuntimeError: {e}")
    if "out of memory" in str(e).lower():
        print("This often indicates GPU VRAM is insufficient.")
        print("Try reducing 'per_device_train_batch_size' to 1 (if not already),")
        print("increasing 'gradient_accumulation_steps', or reducing 'max_seq_length'.")
        print("Consider using cloud GPUs if the issue persists.")
    else:
        print("Check error message for more details.")
    # Re-raise so execution stops here: continuing would save (and later merge
    # and upload) adapters from a run that never finished.
    raise

# Only reached when training completed without error.
trainer.save_model(final_adapter_dir)
print("\nTraining complete! Model saved to:", final_adapter_dir)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196
--- Initial Model Setup Complete ---
Loading dataset from Hugging Face...
Dataset loaded with 8201 examples.
Applying formatting to dataset examples...
Dataset formatting complete.
Example formatted text:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an expert MLIR compiler optimizer. Your goal is to select the single most effective MLIR pass to apply to the provided code.
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
MLIR Source Code:
```
module {
  func.func @bar() {
    return
  }
  func.func @standalone_types(%arg0: !standalone.custom<"10">) {
    return
  }
}
```
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Optimal MLIR Pass: ` mlir-opt  --load-dialect-plugin=tandalone_libs/StandalonePluginhlibext --pass-pipeline="builtin.module(standalone-switch-bar-foo)" `<|eot_id|>
TrainingArguments configured.


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


SFTTrainer initialized. Starting training...


  return fn(*args, **kwargs)


Step,Training Loss


KeyboardInterrupt: 

In [None]:
import os

# Checkpoint the adapters were trained on, the adapter directory produced by
# the training cell, and the destination of the merged standalone model.
base_model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

lora_adapter_path = "./mlir_optimizer_results/final_lora_adapters"

output_merged_model_path = "./merged_llama3_mlir_optimizer"

os.makedirs(output_merged_model_path, exist_ok=True)

print(f"Attempting to merge LoRA adapters from: {lora_adapter_path}")
print(f"Saving merged model to: {output_merged_model_path}")


# Each stage prints a targeted hint and then re-raises. Re-raising (rather than
# calling `exit()`) keeps the notebook kernel and its state alive, preserves the
# traceback, and stops later cells from running on a half-finished result.

# Stage 1: load the base model in bfloat16 (not quantized) plus its tokenizer.
try:
    print("Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        return_dict=True,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    print("Base model loaded.")

    # `trust_remote_code` is not enabled: the tokenizer for this checkpoint
    # ships with transformers, so no repository code needs to be executed.
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right" # Ensure consistency
    print("Tokenizer loaded.")

except Exception as e:
    print(f"Error loading base model or tokenizer: {e}")
    print("Ensure you have access to Llama 3 on Hugging Face and your HF_TOKEN is set.")
    print("Also, check your GPU VRAM. Loading the base model (even if not quantized) can be VRAM intensive.")
    raise

# Stage 2: attach the trained adapters and fold them into the base weights.
try:
    print(f"Loading PEFT adapters from {lora_adapter_path}...")
    from peft import PeftModel
    ft_model = PeftModel.from_pretrained(base_model, lora_adapter_path)
    print("PEFT adapters loaded.")

    print("Merging LoRA adapters into the base model...")
    merged_model = ft_model.merge_and_unload()
    print("Merging complete. The model is now a standard Hugging Face model.")

except Exception as e:
    print(f"Error loading PEFT adapters or merging: {e}")
    print("Check if the 'lora_adapter_path' is correct and contains valid PEFT weights.")
    raise

# Stage 3: write the merged model and tokenizer side by side.
try:
    print(f"Saving merged model to {output_merged_model_path}...")
    merged_model.save_pretrained(output_merged_model_path)
    tokenizer.save_pretrained(output_merged_model_path)
    print("Merged model and tokenizer saved successfully!")
    print(f"\nYour downloadable model is now available at: {os.path.abspath(output_merged_model_path)}")
except Exception as e:
    print(f"Error saving merged model: {e}")
    # A failed save can leave an incomplete folder behind; surface the error so
    # the upload step does not publish it.
    raise

Attempting to merge LoRA adapters from: ./mlir_optimizer_results/final_lora_adapters
Saving merged model to: ./merged_llama3_mlir_optimizer
Loading base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Base model loaded.
Tokenizer loaded.
Loading PEFT adapters from ./mlir_optimizer_results/final_lora_adapters...
PEFT adapters loaded.
Merging LoRA adapters into the base model...
Merging complete. The model is now a standard Hugging Face model.
Saving merged model to ./merged_llama3_mlir_optimizer...
Merged model and tokenizer saved successfully!

Your downloadable model is now available at: /content/merged_llama3_mlir_optimizer


In [None]:
from huggingface_hub import HfApi, create_repo
import os

# Client for the Hugging Face Hub; uses the credentials of the logged-in user.
api = HfApi()

# Destination repository on the Hub, in "<namespace>/<name>" form.
repo_id = "4skin/llama3-8b-mlir-optimizer"

# Make sure the target repository exists before committing files to it.
# `exist_ok=True` keeps this cell safe to re-run once the repo is there.
api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)

# Push the whole merged-model folder (weights and tokenizer files) to the repo.
api.upload_folder(
    folder_path=output_merged_model_path,
    repo_id=repo_id,
    repo_type="model",
)
print(f"Model successfully uploaded to https://huggingface.co/{repo_id}")

Uploading...:   0%|          | 0.00/16.1G [00:00<?, ?B/s]

Model successfully uploaded to https://huggingface.co/4skin/llama3-8b-mlir-optimizer
