<a href="https://colab.research.google.com/github/withpi/cookbook-withpi/blob/main/colabs/Low_Rank_Adaptation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://withpi.ai"><img src="https://withpi.ai/logoFullBlack.svg" width="240"></a>

<a href="https://code.withpi.ai"><font size="4">Documentation</font></a>

<a href="https://play.withpi.ai"><font size="4">Technique Catalog</font></a>

# Low Rank Adaptation

This is the companion notebook to the Low Rank Adaptation playground.

Assuming you have some "golden" responses, this notebook helps you fine-tune an adapter that aligns a small model more closely with the preferences expressed in those responses.

## Install and initialize SDK

This notebook needs a T4 GPU.  Make sure to select this explicitly above before proceeding.

You'll need a WITHPI_API_KEY from https://play.withpi.ai.  Add it to your notebook secrets (the key symbol) on the left.

Run the cell below to install packages and load the SDK

In [None]:
%%capture

import os
from google.colab import files, userdata

# Load the notebook secret into the environment so the Pi Client can access it.
os.environ["WITHPI_API_KEY"] = userdata.get('WITHPI_API_KEY')

# Install the third-party packages that the imports below rely on.
%pip install withpi litellm httpx datasets jinja2 tqdm

# Import a bunch of useful libraries for later.
from concurrent.futures import ThreadPoolExecutor
import json
from pathlib import Path
import re

import datasets
import httpx
import jinja2
from litellm import completion
from tqdm.notebook import tqdm
from withpi import PiClient
from withpi.types import Contract

# WITHPI_API_KEY was already exported to the environment at the top of this
# cell, so the Pi Client can be constructed directly without repeating the
# assignment.
client = PiClient()

def print_contract(contract: Contract):
  """print_contract writes each dimension label of the contract to stdout,
  followed by one tab-indented line per sub-dimension description"""
  for dim in contract.dimensions:
    print(dim.label)
    descriptions = [sub.description for sub in dim.sub_dimensions]
    for text in descriptions:
      print(f"\t{text}")

def generate(system: str, user: str, model: str) -> str:
  """generate sends a two-message conversation (system prompt, then user
  prompt) to the given model through LiteLLM and returns the content of the
  first choice"""
  system_message = {"content": system, "role": "system"}
  user_message = {"content": user, "role": "user"}
  reply = completion(model=model, messages=[system_message, user_message])
  return reply.choices[0].message.content

class printer(str):
  """printer is a str whose repr is the raw text itself, so that embedded
  newlines are shown as line breaks instead of escaped characters"""

  def __repr__(self):
    # Hand back the string unchanged rather than a quoted, escaped form.
    return self
def print_response(response: str):
  """print_response shows an LLM response in the notebook output, wrapping it
  in printer so that newlines are respected"""
  pretty = printer(response)
  display(pretty)

def print_scores(pi_scores):
  """print_scores writes a Pi Score response to stdout: one line per dimension
  with its total score, one tab-indented line per subdimension score, and
  finally a separator followed by the overall total score."""
  for name, dimension in pi_scores.dimension_scores.items():
    print(f"{name}: {dimension.total_score}")
    for sub_name, sub_score in dimension.subdimension_scores.items():
      print(f"\t{sub_name}: {sub_score}")
    # Blank lines between dimensions.
    print("\n")
  print("---------------------", f"Total score: {pi_scores.total_score}", sep="\n")

def save_file(filename: str, model: str):
  """save_file writes the model text to filename and then offers that file
  as a download"""
  destination = Path(filename)
  destination.write_text(model)
  files.download(filename)

def load_contract(url: str) -> Contract:
  """load_contract pulls a Contract JSON blob locally with validation.

  Args:
    url: location of the Contract JSON document.

  Returns:
    The Contract parsed and validated from the response body.

  Raises:
    httpx.HTTPStatusError: if the server does not answer with a success status.
  """
  resp = httpx.get(url)
  # Fail on HTTP errors here; otherwise an error page would be handed to the
  # validator and surface as a confusing validation failure.
  resp.raise_for_status()
  return Contract.model_validate_json(resp.content)

def load_and_split_dataset(url: str) -> datasets.DatasetDict:
  """load_and_split_dataset loads the Parquet file at url as a single "train"
  split and then divides it 90/10 into train and test"""
  full_dataset = datasets.load_dataset('parquet', data_files=url, split="train")
  return full_dataset.train_test_split(test_size=0.1)

def do_bulk_inference(dataset, system, model):
  """do_bulk_inference performs inference on the 'input' column of dataset, using
  the provided system prompt.  The model identified will be used via LiteLLM.

  Requests are issued from a pool of 4 worker threads, and the returned list of
  responses is in the same order as the rows of dataset.  If any request
  raises, that exception propagates to the caller."""

  def do_generate(user, pbar):
    result = generate(system, user, model)
    pbar.update(1)
    return result

  # Manage the progress bar with a context manager so it is always closed,
  # including when one of the requests fails.
  with tqdm(total=len(dataset)) as pbar:
    with ThreadPoolExecutor(max_workers=4) as executor:
      futures = [executor.submit(do_generate, row["input"], pbar)
                 for row in dataset]
    # Leaving the executor block waits for every submitted request, so the
    # results can be collected in submission order here.
    return [future.result() for future in futures]

def do_bulk_templated_inference(dataset, optimized, model):
  """do_bulk_templated_inference performs inference on the 'input' column of dataset,
  using the provided optimized prompt.  It should be a Jinja2 template as returned
  by DSPy.

  The template is rendered with each row's input and must render to a JSON list
  of chat messages.  The text between the '[[ ## response ## ]]' and
  '[[ ## completed ## ]]' markers of each model reply is returned, in the same
  order as the rows of dataset.

  Raises:
    ValueError: if a model reply does not contain both markers.
  """
  prompt_template = jinja2.Template(optimized)
  result_extractor = re.compile(r".*\[\[ ## response ## \]\](.*)\[\[ ## completed ## \]\]", re.DOTALL)

  def do_generate(prompt: str, pbar) -> str:
    messages = json.loads(prompt_template.render(input=prompt))
    result = completion(model=model,
                        messages=messages).choices[0].message.content

    pbar.update(1)
    # A reply without the expected markers (or with no content at all) would
    # otherwise fail with an opaque error on None; report it explicitly.
    found = result_extractor.match(result) if result else None
    if found is None:
      raise ValueError(
          "Model response is missing the '[[ ## response ## ]]' / "
          f"'[[ ## completed ## ]]' markers: {result!r}")
    return found.group(1)

  # Manage the progress bar with a context manager so it is always closed,
  # including when one of the requests fails.
  with tqdm(total=len(dataset)) as pbar:
    with ThreadPoolExecutor(max_workers=4) as executor:
      futures = [executor.submit(do_generate, row["input"], pbar)
                 for row in dataset]
    # Leaving the executor block waits for every submitted request, so the
    # results can be collected in submission order here.
    return [future.result() for future in futures]

def stream_response(job_id: str, method):
  """stream_response prints the messages of a job until it leaves the queue
  and stops running, then returns the final job status

  method should be a Pi client object with `retrieve` and `stream_messages`
  endpoints.  This is primarily for convenience."""
  pending_states = ('QUEUED', 'RUNNING')

  while True:
    job = method.retrieve(job_id=job_id)
    if job.state not in pending_states:
      return job

    # Still pending: print every streamed line, then poll the status again.
    streaming = method.with_streaming_response
    with streaming.stream_messages(job_id=job_id, timeout=None) as stream:
      for line in stream.iter_lines():
        print(line)

# Install the packages used for local inference; vllm and huggingface_hub are
# imported by the model-loading cell further down.
%pip install vllm huggingface_hub bitsandbytes


# Load a contract and dataset

We have a pre-existing contract you can play with.


In [None]:
# Locations of the sample contract and its example dataset.
aesop_contract_url = "https://raw.githubusercontent.com/withpi/cookbook-withpi/refs/heads/main/contracts/aesop_ai.json"
aesop_examples_url = "https://raw.githubusercontent.com/withpi/cookbook-withpi/refs/heads/main/datasets/aesop_ai_examples.parquet"

# The examples are loaded as a single unsplit "train" split.
aesop_contract = load_contract(aesop_contract_url)
aesop = datasets.load_dataset('parquet', data_files=aesop_examples_url, split="train")
aesop

## Kick off the job

The SFT job internally performs a 90/10 train-test split, which is why the loader is not splitting the input data.

This process takes a while; please be patient while a cloud GPU is acquired, fine-tuning is performed, and a result is returned.

In [None]:
# Start the SFT job, sending every dataset row as one input/output example.
status = client.model.sft.start_job(
    contract=aesop_contract,
    examples=[{'llm_input': row['input'], 'llm_output': row['output']} for row in aesop],
    base_sft_model='LLAMA_3.2_3B',
    num_train_epochs=1
)

# Print the job's messages until it is no longer queued or running.
response = stream_response(status.job_id, client.model.sft)

# A job that ended without a model would otherwise fail on the index below
# with an error that hides the job's final state.
if not response.trained_models:
  raise RuntimeError(
      f"SFT job {status.job_id} ended in state {response.state!r} "
      "without producing a trained model")
repo_name = response.trained_models[0].hf_model_name

## Load Model

You'll need to acknowledge access to the Llama gated model on Hugging Face and generate a token. You can put this into your notebook secrets under "HF_TOKEN" and run the cell below.

This will load the model into the local GPU.  It takes a few minutes.

In [None]:
# Load the Hugging Face token from the notebook secrets into the environment.
# NOTE(review): presumably huggingface_hub and vLLM read HF_TOKEN from the
# environment to access the gated Llama weights - confirm.
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

from huggingface_hub import snapshot_download

# Download the adapter repository named by the finished SFT job; the returned
# path is passed to LoRARequest in the inference cell.
adapter_path = snapshot_download(repo_id=repo_name)

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Load the base Llama model with LoRA support enabled so that the downloaded
# adapter can be supplied per request.
llm = LLM(model="meta-llama/Llama-3.2-3B-Instruct", dtype='half', max_model_len=2048, enable_lora=True)

## Try it out!

Now let's do some local inference with this model.

In [None]:
# Generation settings for the sample request.
sampling_params = SamplingParams(temperature=0, max_tokens=512, stop=["[/assistant]"])

# The contract description is the system prompt; the user turn is the topic.
chat_messages = [
  {"content": aesop_contract.description, "role": "system"},
  {"content": "The importance of sharing", "role": "user"},
]

# Apply the downloaded adapter to this request.
adapter_request = LoRARequest("adapter", 1, adapter_path)

response = llm.chat(
    messages=chat_messages,
    sampling_params=sampling_params,
    lora_request=adapter_request,
    use_tqdm=False
)

print_response(response[0].outputs[0].text)

## Next steps

You can load this model onto hardware more powerful than the T4 included in Colab, or deploy it with your own inference provider.