## Install core libraries

In [1]:
!pip install -U peft accelerate bitsandbytes transformers datasets trl



## Set your Hugging Face access token

In [2]:
import os, getpass
os.environ["HF_TOKEN"] = getpass.getpass("Paste the HF token here: ") # my HF token: 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

Paste the HF token here: ··········


## Load training and validation data

In [3]:
from datasets import load_dataset

data = load_dataset("json", data_files={
    "train": "/content/train.jsonl",
    "validation": "/content/dev.jsonl"
})

## Load 4-bit Llama-3 and attach a LoRA adapter

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import (
    LoraConfig, get_peft_model, prepare_model_for_kbit_training
)

model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_cfg,
    trust_remote_code=True,
)


model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


trainable params: 6,815,744 || all params: 8,037,076,992 || trainable%: 0.0848


## Tokenize dataset

In [5]:
lengths = [len(tokenizer(t).input_ids) for t in data["train"]["text"]]
print("avg", sum(lengths)/len(lengths))
print("p95", sorted(lengths)[int(0.95*len(lengths))-1])

avg 1612.625
p95 3565


In [6]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Tokenize
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

"""
NOTE: current `max_length` is intentionally kept far below the
average prompt length because of our 15 GB GPU limit.  As a result,
many tokens are truncated.  Revisit this setting when more GPU
memory (or a smaller prompt) is available.
"""

tokenized_dataset = data.map(tokenize_function, batched=True)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

## Define training arguments

In [7]:
training_args = TrainingArguments(
    output_dir           = "./llama-finetuned",
    per_device_train_batch_size = 1,
    gradient_accumulation_steps  = 3,  # helpful to simluate large batches when memory can't fit one
    per_device_eval_batch_size  = 1,
    num_train_epochs      = 5,
    do_eval              = True,
    eval_steps           = 500,
    save_steps           = 500,
    logging_steps        = 1,
    learning_rate        = 2e-5,
    weight_decay         = 0.01, # seems necesary to prevent overfitting
    fp16                 = True,
    report_to            = "none",
    lr_scheduler_type    = "linear",
    warmup_ratio         = 0.05
)

## Train

In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [9]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
1,1.9917
2,1.9202
3,1.9181
4,1.4032
5,1.63
6,1.657
7,1.5618
8,1.7681
9,1.7012
10,1.8803


TrainOutput(global_step=70, training_loss=1.6349440574645997, metrics={'train_runtime': 579.3075, 'train_samples_per_second': 0.345, 'train_steps_per_second': 0.121, 'total_flos': 4615213311590400.0, 'train_loss': 1.6349440574645997, 'epoch': 5.0})

In [10]:
eval_results = trainer.evaluate()
print(eval_results["eval_loss"])

1.470529317855835


In [11]:
import re, numpy as np

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    texts = tokenizer.batch_decode(preds, skip_special_tokens=True)

    numbers = []
    for t in texts:
        m = re.search(r"\d+(\.\d)?", t)
        numbers.append(float(m.group()) if m else np.nan)

    mae  = np.mean(np.abs(np.array(numbers) - labels))
    rmse = np.sqrt(np.mean((np.array(numbers) - labels) ** 2))
    return {"mae": mae, "rmse": rmse}

## Save the LoRA adapter

In [12]:
adapter_dir = "./llama3_lora_adapter"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

('./llama3_lora_adapter/tokenizer_config.json',
 './llama3_lora_adapter/special_tokens_map.json',
 './llama3_lora_adapter/tokenizer.json')

## Load base model + LoRA for inference

In [13]:
from peft import PeftModel
from transformers import TextIteratorStreamer

BASE_MODEL = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"
LORA_DIR   = "./llama3_lora_adapter"

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
base      = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_cfg,
    device_map="auto",
    trust_remote_code=True,
)

model = PeftModel.from_pretrained(base, LORA_DIR).eval()



## Build prompt using identical field order

In [14]:
import json, pandas as pd, re
from pathlib import Path
from threading import Thread

first_line = Path("train.jsonl").read_text(encoding="utf-8").splitlines()[0]
sample_text = json.loads(first_line)["text"]
feature_cols = [
    ln.split(":")[0].strip()
    for ln in sample_text.splitlines()
    if ln and not ln.startswith("###")
]

def safe(v):
    if pd.isna(v):
        return ""
    if isinstance(v, (list, dict)):
        return json.dumps(v, ensure_ascii=False)
    return str(v).replace("\n", " ").strip()

def row_to_prompt(row):
    lines = [f"{col}: {safe(row.get(col, ''))}" for col in feature_cols]
    return (
        "### MOVIE FEATURES\n" +
        "\n".join(lines) +
        "\n\n### TASK\n"
        "Predict this movie's TMDB rating on a 0–10 scale (one decimal place).\n\n"
        "### ANSWER\n"
    )

example = {
    "title": "The Bourne Identity",
    "budget": 60000000,
    "release_date": "2002-06-14",
    "runtime": 119,
    "genres_names": "Action, Thriller",
    "actor_1_name": "Matt Damon",
    "director_1_name": "Doug Liman",
    "imdb_rating": 7.9
}
prompt = row_to_prompt(example)

## Stream generation

In [15]:
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

Thread(target=model.generate,
       kwargs=dict(**inputs, streamer=streamer, max_new_tokens=8)).start()

print("\nModel output:")
generated_text = "".join(tok for tok in streamer)
match = re.search(r"\d+(\.\d)?", generated_text)
print("Predicted rating:", match.group() if match else "N/A")


Model output:
Predicted rating: 8.0


## Push LoRA adapter to Hugging Face Hub

In [16]:
from getpass import getpass
HF_TOKEN = os.environ["HF_TOKEN"]
REPO_LORA = "YijingOlivia/llama3-movie-rating-lora"

model.push_to_hub(REPO_LORA, token=HF_TOKEN)
tokenizer.push_to_hub(REPO_LORA, token=HF_TOKEN)

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/YijingOlivia/llama3-movie-rating-lora/commit/fac33bfb5da2b561e03838ff36ad72869dc3a06a', commit_message='Upload tokenizer', commit_description='', oid='fac33bfb5da2b561e03838ff36ad72869dc3a06a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/YijingOlivia/llama3-movie-rating-lora', endpoint='https://huggingface.co', repo_type='model', repo_id='YijingOlivia/llama3-movie-rating-lora'), pr_revision=None, pr_num=None)