# Fine-tune an LLM to follow instructions


## Load open weights

In [22]:
from gpt2_v2 import GPT2Model, GPT_CONFIG_124M, complete_text, generate_text_simple, tensor_to_text, text_to_tensor

# Turn on bias terms for the attention projections before building the model;
# the weight-loading code further down assigns checkpoint biases to
# W_Q/W_K/W_V.bias. Note this mutates the config object imported from gpt2_v2.
GPT_CONFIG_124M.update({"qkv_bias": True})
# Freshly constructed model; no pretrained weights have been loaded yet.
model = GPT2Model(GPT_CONFIG_124M)


In [4]:
# Print every parameter name with its shape; these names are the targets
# that the checkpoint arrays are copied into later in the notebook.
for name, param in model.named_parameters():
    print(name, param.shape)

tok_emb.weight torch.Size([50257, 768])
pos_emb.weight torch.Size([1024, 768])
blocks.0.attn.W_Q.weight torch.Size([768, 768])
blocks.0.attn.W_Q.bias torch.Size([768])
blocks.0.attn.W_K.weight torch.Size([768, 768])
blocks.0.attn.W_K.bias torch.Size([768])
blocks.0.attn.W_V.weight torch.Size([768, 768])
blocks.0.attn.W_V.bias torch.Size([768])
blocks.0.attn.out_proj.weight torch.Size([768, 768])
blocks.0.attn.out_proj.bias torch.Size([768])
blocks.0.ff.layers.0.weight torch.Size([3072, 768])
blocks.0.ff.layers.0.bias torch.Size([3072])
blocks.0.ff.layers.2.weight torch.Size([768, 3072])
blocks.0.ff.layers.2.bias torch.Size([768])
blocks.0.ln1.weight torch.Size([768])
blocks.0.ln1.bias torch.Size([768])
blocks.0.ln2.weight torch.Size([768])
blocks.0.ln2.bias torch.Size([768])
blocks.1.attn.W_Q.weight torch.Size([768, 768])
blocks.1.attn.W_Q.bias torch.Size([768])
blocks.1.attn.W_K.weight torch.Size([768, 768])
blocks.1.attn.W_K.bias torch.Size([768])
blocks.1.attn.W_V.weight torch.Size(

In [25]:
# Baseline: generate from the model before any pretrained weights are loaded.
# eval() switches the module to inference mode (the model contains Dropout layers).
model.eval()
# NOTE(review): the third argument is presumably the number of new tokens to
# generate -- confirm against complete_text in gpt2_v2.
result = complete_text("at the start of", model,15)
print("Output text:\n", result)

Output text:
 at the start ofYears Gadget bookuggisheffectsdozenispers POW sang Nikolonic Springfield morbid signallingug


### Download GPT-2 from OpenAI

In [9]:
import json
import os
import urllib
import urllib.error
import urllib.request
from urllib.parse import urljoin

import numpy as np
import tensorflow as tf
from tqdm import tqdm

def download_file(url, destination, backup_url=None):
    """Download ``url`` to ``destination``, trying ``backup_url`` if the primary fails.

    The payload is streamed into ``<destination>.part`` and only moved to
    ``destination`` once the transfer has finished, so an interrupted or
    truncated download never leaves a partial file at ``destination``.

    Args:
        url: Primary URL to fetch.
        destination: Local path the downloaded file is written to.
        backup_url: Optional URL tried when the primary raises an HTTP/URL error.

    Returns:
        None. Failures are reported with ``print`` instead of being raised.
    """
    def _attempt_download(download_url):
        """Fetch one URL into ``destination``; return True or raise."""
        with urllib.request.urlopen(download_url) as response:
            # 0 means the server sent no Content-Length, i.e. the size is unknown.
            total_size = int(response.headers.get("Content-Length", 0))
            if (
                total_size > 0
                and os.path.exists(destination)
                and os.path.getsize(destination) == total_size
            ):
                print(f"File already exists and is up-to-date: {destination}")
                return True

            partial_path = f"{destination}.part"
            written = 0
            try:
                with tqdm(total=total_size, unit="iB", unit_scale=True, desc=os.path.basename(download_url)) as pbar, \
                     open(partial_path, "wb") as f:
                    for chunk in iter(lambda: response.read(1024), b""):
                        f.write(chunk)
                        written += len(chunk)
                        pbar.update(len(chunk))

                # A stream that ends early is a failed download, not a success.
                if total_size and written != total_size:
                    raise urllib.error.URLError(
                        f"incomplete download: got {written} of {total_size} bytes"
                    )
                # Publish the file only after the whole payload is on disk.
                os.replace(partial_path, destination)
            finally:
                # Still present only when the transfer did not complete.
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            return True

    candidates = [url]
    if backup_url:
        candidates.append(backup_url)

    try:
        for position, candidate in enumerate(candidates):
            if position > 0:
                print(f"Primary URL failed. Trying backup URL: {backup_url}")
            try:
                if _attempt_download(candidate):
                    return
            except (urllib.error.HTTPError, urllib.error.URLError):
                # Network-level failure: fall through to the next candidate.
                continue
        print(f"Failed to download from primary URL ({url})"
              + (f" and backup URL ({backup_url})" if backup_url else "") + ".\n"
              "Check your internet connection or the file availability.\n"
              "For help, visit: https://github.com/rasbt/LLMs-from-scratch/discussions/273")
    except Exception as e:
        # Covers unexpected errors from either the primary or the backup attempt.
        print(f"Unexpected error: {e}")


def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    """Read every variable of a TensorFlow checkpoint into a nested dict.

    The first "/"-separated component of each variable name is dropped. If the
    next component starts with "h", the digits after it select an entry of
    ``params["blocks"]``; otherwise the variable is stored at the top level.
    Intermediate components become nested dicts and the final component is the
    key holding the squeezed array.

    Args:
        ckpt_path: Checkpoint path passed to ``tf.train.list_variables`` and
            ``tf.train.load_variable``.
        settings: Mapping providing ``"n_layer"``, the number of block dicts to
            pre-allocate.

    Returns:
        dict: ``{"blocks": [dict, ...], <other top-level keys>: ...}``.
    """
    layer_count = settings["n_layer"]
    params = {"blocks": [dict() for _ in range(layer_count)]}

    for var_name, _shape in tf.train.list_variables(ckpt_path):
        # Singleton dimensions are squeezed away on load.
        weights = np.squeeze(tf.train.load_variable(ckpt_path, var_name))

        # Everything after the leading path component.
        path = var_name.split("/")[1:]
        head = path[0]

        if head.startswith("h"):
            # "h<N>" addresses the N-th block.
            node = params["blocks"][int(head[1:])]
        else:
            node = params

        # Walk (creating as needed) down to the dict that owns the leaf.
        for part in path[1:-1]:
            node = node.setdefault(part, {})

        node[path[-1]] = weights

    return params


def download_and_load_gpt2(model_size, models_dir):
    """Fetch the GPT-2 checkpoint files for ``model_size`` and load them.

    Files already present in ``<models_dir>/<model_size>`` are not downloaded
    again. After the download loop the directory is verified, because
    ``download_file`` reports failures only by printing.

    Args:
        model_size: One of "124M", "355M", "774M", "1558M".
        models_dir: Directory under which a per-size sub-directory is created.

    Returns:
        tuple: ``(settings, params)`` where ``settings`` is the parsed
        ``hparams.json`` and ``params`` is the nested dict produced by
        ``load_gpt2_params_from_tf_ckpt``.

    Raises:
        ValueError: If ``model_size`` is not an allowed size.
        FileNotFoundError: If any expected file is still missing after the
            download attempts, or no TensorFlow checkpoint can be located.
    """
    allowed_sizes = {"124M", "355M", "774M", "1558M"}
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size must be one of {allowed_sizes}")

    model_dir = os.path.join(models_dir, model_size)
    os.makedirs(model_dir, exist_ok=True)

    base_url = f"https://openaipublic.blob.core.windows.net/gpt-2/models/{model_size}/"
    backup_url = f"https://f001.backblazeb2.com/file/LLMs-from-scratch/gpt2/{model_size}/"

    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]

    for fname in filenames:
        dst = os.path.join(model_dir, fname)
        if os.path.exists(dst):
            print(f"Already exists: {fname}, skipping download.")
            continue
        primary = urljoin(base_url, fname)
        backup = urljoin(backup_url, fname)
        print(f"Downloading {fname} ...")
        download_file(primary, dst, backup)

    # Fail here with a clear message instead of an opaque error further down.
    missing = [
        fname for fname in filenames
        if not os.path.exists(os.path.join(model_dir, fname))
    ]
    if missing:
        raise FileNotFoundError(
            f"Missing GPT-2 files in {model_dir} after download: {', '.join(missing)}"
        )

    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    if tf_ckpt_path is None:
        raise FileNotFoundError(f"No TensorFlow checkpoint found in {model_dir}")

    with open(os.path.join(model_dir, "hparams.json"), "r", encoding="utf-8") as f:
        settings = json.load(f)

    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)
    return settings, params


In [26]:
# Fetch (or reuse) the 124M checkpoint under ./gpt2/124M and load it:
# `settings` is the parsed hparams.json, `params` the nested weight dict.
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")
print("Settings:", settings)
# Only the top-level keys are printed; the values are the weight arrays.
print("Params:", params.keys())

Already exists: checkpoint, skipping download.
Already exists: encoder.json, skipping download.
Already exists: hparams.json, skipping download.
Already exists: model.ckpt.data-00000-of-00001, skipping download.
Already exists: model.ckpt.index, skipping download.
Already exists: model.ckpt.meta, skipping download.
Already exists: vocab.bpe, skipping download.
Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
Params: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [30]:
import torch
import numpy as np

def assign_(left, right):
    """Overwrite ``left`` in place with the values of ``right``.

    Args:
        left: Destination tensor; its dtype and device are applied to ``right``.
        right: Source values, converted with ``torch.as_tensor``.

    Raises:
        ValueError: If ``right`` is None, has no elements, or its shape
            differs from ``left``'s.
    """
    if right is None:
        raise ValueError("'right' cannot be None")

    # Convert once, adopting the destination's dtype and device.
    right_tensor = torch.as_tensor(right, device=left.device, dtype=left.dtype)

    is_empty = right_tensor.numel() == 0
    if is_empty:
        raise ValueError("'right' cannot be Empty")

    shapes_match = left.shape == right_tensor.shape
    if not shapes_match:
        raise ValueError(f"Shape mismatch: {left.shape} vs {right_tensor.shape}")

    # The copy is done with gradient tracking disabled.
    with torch.no_grad():
        left.copy_(right_tensor)

def load_weights_into_gpt(gpt, params):
    """Copy the checkpoint arrays in ``params`` into the parameters of ``gpt``.

    Args:
        gpt: Model exposing ``pos_emb``, ``tok_emb``, ``blocks``, ``final_norm``
            and ``out_head``. Each block exposes ``attn.W_Q/W_K/W_V/out_proj``,
            ``ff.layers[0]``, ``ff.layers[2]``, ``ln1`` and ``ln2``.
        params: Nested dict with top-level keys ``"wpe"``, ``"wte"``, ``"g"``,
            ``"b"`` and ``"blocks"`` (one dict per block).

    Raises:
        ValueError: If the model and the checkpoint disagree on the number of
            blocks, or if ``assign_`` rejects a source array.
    """
    model_blocks = gpt.blocks
    ckpt_blocks = params["blocks"]
    # zip() would silently stop at the shorter sequence and leave the
    # remaining layers with their random initialisation.
    if len(model_blocks) != len(ckpt_blocks):
        raise ValueError(
            f"Block count mismatch: model has {len(model_blocks)}, "
            f"checkpoint has {len(ckpt_blocks)}"
        )

    assign_(gpt.pos_emb.weight, params["wpe"])
    assign_(gpt.tok_emb.weight, params["wte"])

    for block, pblock in zip(model_blocks, ckpt_blocks):
        # Attention QKV: c_attn holds the three projections side by side on
        # the last axis; split it into equal thirds.
        qw, kw, vw = np.split(pblock["attn"]["c_attn"]["w"], 3, axis=-1)
        qb, kb, vb = np.split(pblock["attn"]["c_attn"]["b"], 3, axis=-1)
        # Checkpoint matrices are transposed to match the Linear weight layout.
        assign_(block.attn.W_Q.weight, qw.T)
        assign_(block.attn.W_K.weight, kw.T)
        assign_(block.attn.W_V.weight, vw.T)
        assign_(block.attn.W_Q.bias, qb)
        assign_(block.attn.W_K.bias, kb)
        assign_(block.attn.W_V.bias, vb)

        # Attention output projection
        assign_(block.attn.out_proj.weight, pblock["attn"]["c_proj"]["w"].T)
        assign_(block.attn.out_proj.bias,   pblock["attn"]["c_proj"]["b"])

        # Feedforward (index 1 of ff.layers is skipped; it has no parameters to load)
        assign_(block.ff.layers[0].weight, pblock["mlp"]["c_fc"]["w"].T)
        assign_(block.ff.layers[0].bias,   pblock["mlp"]["c_fc"]["b"])
        assign_(block.ff.layers[2].weight, pblock["mlp"]["c_proj"]["w"].T)
        assign_(block.ff.layers[2].bias,   pblock["mlp"]["c_proj"]["b"])

        # LayerNorms: "g" -> weight, "b" -> bias
        assign_(block.ln1.weight, pblock["ln_1"]["g"])
        assign_(block.ln1.bias, pblock["ln_1"]["b"])
        assign_(block.ln2.weight, pblock["ln_2"]["g"])
        assign_(block.ln2.bias, pblock["ln_2"]["b"])

    assign_(gpt.final_norm.weight, params["g"])
    assign_(gpt.final_norm.bias, params["b"])
    # The output head is initialised from the token-embedding matrix.
    assign_(gpt.out_head.weight,  params["wte"])


In [41]:
# Overwrite the model's parameters in place with the downloaded checkpoint.
load_weights_into_gpt(model, params)
model.to("cpu")
# Inference mode (the model contains Dropout layers). As the cell's last
# expression, this also displays the module structure below.
model.eval()


GPT2Model(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=768, bias=True)
        (W_K): Linear(in_features=768, out_features=768, bias=True)
        (W_V): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (ln1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlo

In [42]:
# Sanity check after loading: print each parameter's shape, mean and standard deviation.
for name, param in model.named_parameters():
    print(name, param.shape, param.mean().item(), param.std().item())


tok_emb.weight torch.Size([50257, 768]) 0.00037981756031513214 0.14369554817676544
pos_emb.weight torch.Size([1024, 768]) -0.0006787165184505284 0.1226913258433342
blocks.0.attn.W_Q.weight torch.Size([768, 768]) 0.00015374351642094553 0.2386905699968338
blocks.0.attn.W_Q.bias torch.Size([768]) -0.007821076549589634 0.3427544832229614
blocks.0.attn.W_K.weight torch.Size([768, 768]) 1.2351122677500825e-05 0.2432965785264969
blocks.0.attn.W_K.bias torch.Size([768]) 0.0048723239451646805 0.18297071754932404
blocks.0.attn.W_V.weight torch.Size([768, 768]) -5.968316145299468e-06 0.05811797454953194
blocks.0.attn.W_V.bias torch.Size([768]) 0.0008267878438346088 0.04772818833589554
blocks.0.attn.out_proj.weight torch.Size([768, 768]) -0.0001613790518604219 0.1474614143371582
blocks.0.attn.out_proj.bias torch.Size([768]) -0.00691022165119648 0.2589662969112396
blocks.0.ff.layers.0.weight torch.Size([3072, 768]) -0.0007485305541194975 0.14116929471492767
blocks.0.ff.layers.0.bias torch.Size([307

In [94]:
from gpt2_v2 import complete_text
# Same prompt as the baseline cell, now with the checkpoint weights loaded,
# so the two outputs can be compared directly.
result = complete_text("at the start of", model,15)
print("Output text:\n", result)

Output text:
 at the start of the game, and then the game ends.

The game is a


In [101]:
# Fix the RNG seed so the sampled continuation is repeatable.
torch.manual_seed(123)

# NOTE(review): `tokenizer` is not defined or imported in any cell shown in
# this notebook; on a fresh kernel this cell would presumably raise NameError.
# Define it explicitly before this cell -- TODO confirm where it comes from.
token_ids = generate_text_simple(
    model=model,
    idx=text_to_tensor("at the start of", tokenizer),
    max_new_tokens=15,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=25,
    temperature=1.4
)

print("Output text:\n", tensor_to_text(token_ids, tokenizer))

Output text:
 at the start of an international series of events. You don't have to worry about who has


## Instruction Finetuning