In [None]:
!pip install torch transformers peft bitsandbytes accelerate trl datasets "huggingface_hub[cli]"

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl
  Downloading trl-0.18.1-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  D

In [None]:
# To import necessary libs
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from huggingface_hub import login
import os
from datasets import load_dataset
from trl import SFTTrainer


In [None]:
!pip install --upgrade trl
!pip show trl

Name: trl
Version: 0.18.1
Summary: Train transformer language models with reinforcement learning.
Home-page: https://github.com/huggingface/trl
Author: Leandro von Werra
Author-email: leandro.vonwerra@gmail.com
License: 
Location: /usr/local/lib/python3.11/dist-packages
Requires: accelerate, datasets, transformers
Required-by: 


In [None]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
SEED = 42  # fixed so the randomly initialised LoRA weights are repeatable

# Seed PyTorch's generators before any trainable weights are created.
torch.manual_seed(SEED)

# 4-bit NF4 quantisation of the frozen base weights; matmuls run in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.config.pretraining_tp = 1
model.config.use_cache = False  # the generation-time KV cache is not needed while training

# `trust_remote_code` is intentionally not set: it lets the Hub repository run
# arbitrary Python locally and should only be enabled for repos that need it.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
tokenizer.padding_side = "right"

# Make the quantised model trainable, then attach LoRA adapters to every
# attention and MLP projection.
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("--- Initial Model Setup Complete ---")

print("Loading dataset")
raw_dataset = load_dataset("WangX0111/MLIR-Passes", split="train")

def format_mlir_example(example):
    """Render one dataset row as a single Llama 3 chat-formatted training string.

    Args:
        example: Mapping with a ``"source"`` entry (the MLIR code shown to the
            model) and a ``"pass"`` entry (the pass the model should answer with).

    Returns:
        A dict with a single ``"text"`` key holding the system, user and
        assistant turns, each laid out as
        ``<|start_header_id|>ROLE<|end_header_id|>\\n\\nCONTENT<|eot_id|>``.
    """
    system_prompt = "You are an expert MLIR compiler optimizer. Your goal is to select the single most effective MLIR pass to apply to the provided code."
    user_prompt = f"MLIR Source Code:\n```\n{example['source']}\n```"
    assistant_response = f"Optimal MLIR Pass: `{example['pass']}`"

    turns = (
        ("system", system_prompt),
        ("user", user_prompt),
        ("assistant", assistant_response),
    )

    # Follow the Llama 3 Instruct layout exactly: a blank line after each role
    # header and no extra newlines around <|eot_id|>.
    # <|begin_of_text|> is deliberately not written here: the tokenizer prepends
    # the BOS token itself when the text is encoded, so hard-coding it would
    # yield two BOS tokens per sample.
    text = "".join(
        f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"
        for role, content in turns
    )
    return {"text": text}

print("formatting dataset")
# Replace every original column with the single "text" column the trainer consumes.
formatted_dataset = raw_dataset.map(format_mlir_example, remove_columns=raw_dataset.column_names)

training_args = TrainingArguments(
    output_dir="./mlir_optimizer_results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch of 4 samples per device
    optim="paged_adamw_8bit",
    logging_steps=100,
    learning_rate=2e-4,
    # NOTE(review): the model is loaded with bfloat16 weights/compute dtype while
    # mixed precision here is fp16. Left unchanged because the right choice
    # depends on the GPU - confirm bf16 support before switching.
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    report_to="none",
    save_strategy="steps",
    save_steps=1000,
)
print("TrainingArguments configured.")

# `model` is already a LoRA-wrapped PeftModel, so no `peft_config` is passed
# here. The tokenizer configured during model setup is handed over explicitly
# so the trainer does not load a second, separately configured copy.
trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,
    processing_class=tokenizer,
    args=training_args,
)
print("SFTTrainer initialized. Starting training...")

final_adapter_dir = os.path.join(training_args.output_dir, "final_lora_adapters")

try:
    trainer.train()
except RuntimeError as e:
    print(f"\nTraining failed with a RuntimeError: {e}")
    if "out of memory" in str(e).lower():
        print("This often indicates GPU VRAM is insufficient.")
        print("Try reducing 'per_device_train_batch_size' to 1 (if not already),")
        print("increasing 'gradient_accumulation_steps', or reducing 'max_seq_length'.")
        print("Consider using cloud GPUs if the issue persists.")
    else:
        print("Check error message for more details.")
    # Re-raise so the cell stops here: continuing would save adapters from a
    # run that never finished and let later cells merge and upload them.
    raise

# Only reached when training finished without error.
trainer.save_model(final_adapter_dir)
print("\nTraining complete! Model saved to:", final_adapter_dir)



config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196
--- Initial Model Setup Complete ---
Loading dataset


README.md:   0%|          | 0.00/364 [00:00<?, ?B/s]

pass.csv:   0%|          | 0.00/5.90M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8201 [00:00<?, ? examples/s]

formatting dataset


Map:   0%|          | 0/8201 [00:00<?, ? examples/s]

TrainingArguments configured.


Converting train dataset to ChatML:   0%|          | 0/8201 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/8201 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/8201 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/8201 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


SFTTrainer initialized. Starting training...


  return fn(*args, **kwargs)


Step,Training Loss
100,1.2624
200,0.5865
300,0.5082
400,0.441
500,0.4047
600,0.3939
700,0.3623
800,0.3427
900,0.3506
1000,0.3255


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)



Training complete! Model saved to: ./mlir_optimizer_results


In [None]:
import os

# Merge the trained LoRA adapters into a full-precision copy of the base model
# and write the result (plus tokenizer) to disk as a standalone checkpoint.
# Every failure is reported and then re-raised: `exit()` inside a notebook asks
# the kernel to shut down, and a swallowed error would let the upload cell run
# against a missing or incomplete folder.
from peft import PeftModel

base_model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

lora_adapter_path = "./mlir_optimizer_results/final_lora_adapters"

output_merged_model_path = "./merged_llama3_mlir_optimizer"

os.makedirs(output_merged_model_path, exist_ok=True)

print(f"Attempting to merge LoRA adapters from: {lora_adapter_path}")
print(f"Saving merged model to: {output_merged_model_path}")


try:
    print("Loading base model...")
    # Loaded unquantised in bfloat16: the adapters are merged into these weights.
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        return_dict=True,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    print("Base model loaded.")

    # `trust_remote_code` is intentionally not set; see the security note in
    # the Hugging Face docs before enabling it for any repository.
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right" # Ensure consistency
    print("Tokenizer loaded.")

except Exception as e:
    print(f"Error loading base model or tokenizer: {e}")
    print("Ensure you have access to Llama 3 on Hugging Face and your HF_TOKEN is set.")
    print("Also, check your GPU VRAM. Loading the base model (even if not quantized) can be VRAM intensive.")
    raise

try:
    print(f"Loading PEFT adapters from {lora_adapter_path}...")
    ft_model = PeftModel.from_pretrained(base_model, lora_adapter_path)
    print("PEFT adapters loaded.")

    print("Merging LoRA adapters into the base model...")
    merged_model = ft_model.merge_and_unload()
    print("Merging complete. The model is now a standard Hugging Face model.")

except Exception as e:
    print(f"Error loading PEFT adapters or merging: {e}")
    print("Check if the 'lora_adapter_path' is correct and contains valid PEFT weights.")
    raise

try:
    print(f"Saving merged model to {output_merged_model_path}...")
    merged_model.save_pretrained(output_merged_model_path)
    tokenizer.save_pretrained(output_merged_model_path)
    print("Merged model and tokenizer saved successfully!")
    print(f"\nYour downloadable model is now available at: {os.path.abspath(output_merged_model_path)}")
except Exception as e:
    print(f"Error saving merged model: {e}")
    raise

Attempting to merge LoRA adapters from: ./mlir_optimizer_results/final_lora_adapters
Saving merged model to: ./merged_llama3_mlir_optimizer
Loading base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Base model loaded.
Tokenizer loaded.
Loading PEFT adapters from ./mlir_optimizer_results/final_lora_adapters...
PEFT adapters loaded.
Merging LoRA adapters into the base model...
Merging complete. The model is now a standard Hugging Face model.
Saving merged model to ./merged_llama3_mlir_optimizer...
Merged model and tokenizer saved successfully!

Your downloadable model is now available at: /content/merged_llama3_mlir_optimizer


In [None]:
from huggingface_hub import HfApi, create_repo
import os

# Publish the merged checkpoint folder to the Hugging Face Hub.
api = HfApi()

repo_id = "4skin/llama3-8b-mlir-optimizer"

# `upload_folder` expects the target repository to exist already. Create it on
# first run; `exist_ok=True` makes this a no-op on every later run.
create_repo(repo_id, repo_type="model", exist_ok=True)

api.upload_folder(
    folder_path=output_merged_model_path,
    repo_id=repo_id,
    repo_type="model",
)
print(f"Model successfully uploaded to https://huggingface.co/{repo_id}")

Uploading...:   0%|          | 0.00/16.1G [00:00<?, ?B/s]

Model successfully uploaded to https://huggingface.co/4skin/llama3-8b-mlir-optimizer
