# Fine-tune LLM to follow instructions


## Load open weights

In [11]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load pretrained GPT-2 (124M, "gpt2")
hf_model = GPT2LMHeadModel.from_pretrained("gpt2")  # or "gpt2-medium", "gpt2-large", "gpt2-xl"
hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
hf_model.eval()  # Set to evaluation mode


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [13]:
# just use the model from Hugging Face
start_context = "at the start of"
inputs = hf_tokenizer(start_context, return_tensors="pt")
outputs = hf_model.generate(**inputs, max_new_tokens=50)
print(hf_tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


at the start of the game, and then the game ends.

The game is a bit of a mess, but it's not a bad one. The game is a bit of a mess, but it's not a bad one. The game is a bit


In [14]:
for name, param in hf_model.named_parameters():
    print(name, param.shape)

transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

In [12]:
from gpt2_v2 import GPT2Model, GPT_CONFIG_124M, complete_text

GPT_CONFIG_124M.update({"qkv_bias": True})
model = GPT2Model(GPT_CONFIG_124M)
for name, param in model.named_parameters():
    print(name, param.shape)

tok_emb.weight torch.Size([50257, 768])
pos_emb.weight torch.Size([1024, 768])
blocks.0.attn.W_Q.weight torch.Size([768, 768])
blocks.0.attn.W_Q.bias torch.Size([768])
blocks.0.attn.W_K.weight torch.Size([768, 768])
blocks.0.attn.W_K.bias torch.Size([768])
blocks.0.attn.W_V.weight torch.Size([768, 768])
blocks.0.attn.W_V.bias torch.Size([768])
blocks.0.attn.out_proj.weight torch.Size([768, 768])
blocks.0.attn.out_proj.bias torch.Size([768])
blocks.0.ff.layers.0.weight torch.Size([3072, 768])
blocks.0.ff.layers.0.bias torch.Size([3072])
blocks.0.ff.layers.2.weight torch.Size([768, 3072])
blocks.0.ff.layers.2.bias torch.Size([768])
blocks.0.ln1.weight torch.Size([768])
blocks.0.ln1.bias torch.Size([768])
blocks.0.ln2.weight torch.Size([768])
blocks.0.ln2.bias torch.Size([768])
blocks.1.attn.W_Q.weight torch.Size([768, 768])
blocks.1.attn.W_Q.bias torch.Size([768])
blocks.1.attn.W_K.weight torch.Size([768, 768])
blocks.1.attn.W_K.bias torch.Size([768])
blocks.1.attn.W_V.weight torch.Size(

In [14]:

model.eval()
result = complete_text("at the start of", model,15)
print("Output text:\n", result)

Output text:
 at the start of keynotefl regex delic footnoteopausal Mankindyp ISBN Tyson sinking(-digy AlexandriaLuckily


### Download GPT2 from OpenAI

In [18]:
from collections import defaultdict
from tqdm import tqdm
import urllib
import os
import json
from urllib.parse import urljoin
import tensorflow as tf
import numpy as np

def download_file(url, destination, backup_url=None):
    def _attempt_download(download_url):
        with urllib.request.urlopen(download_url) as response:
            total_size = int(response.headers.get("Content-Length", 0))
            if os.path.exists(destination) and os.path.getsize(destination) == total_size:
                print(f"File already exists and is up-to-date: {destination}")
                return True

            with tqdm(total=total_size, unit="iB", unit_scale=True, desc=os.path.basename(download_url)) as pbar, \
                 open(destination, "wb") as f:
                for chunk in iter(lambda: response.read(1024), b""):
                    f.write(chunk)
                    pbar.update(len(chunk))
            return True

    try:
        if _attempt_download(url):
            return
    except (urllib.error.HTTPError, urllib.error.URLError):
        if backup_url:
            print(f"Primary URL failed. Trying backup URL: {backup_url}")
            try:
                if _attempt_download(backup_url):
                    return
            except (urllib.error.HTTPError, urllib.error.URLError):
                pass
        print(f"Failed to download from primary URL ({url})"
              + (f" and backup URL ({backup_url})" if backup_url else "") + ".\n"
              "Check your internet connection or the file availability.\n"
              "For help, visit: https://github.com/rasbt/LLMs-from-scratch/discussions/273")
    except Exception as e:
        print(f"Unexpected error: {e}")


def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    # Helper: recursively convert nested dict to normal dict
    def dictify(d):
        if isinstance(d, defaultdict):
            d = {k: dictify(v) for k, v in d.items()}
        return d

    params = defaultdict(dict)
    params["blocks"] = [defaultdict(dict) for _ in range(settings["n_layer"])]

    for name, _ in tf.train.list_variables(ckpt_path):
        array = np.squeeze(tf.train.load_variable(ckpt_path, name))
        parts = name.split("/")[1:]  # remove 'model/' prefix

        # Decide where to put the variable
        if parts and parts[0].startswith("h"):
            try:
                layer_idx = int(parts[0][1:])
                target = params["blocks"][layer_idx]
                parts = parts[1:]  # remove layer part
            except (ValueError, IndexError):
                # Fallback to top-level params dict if parsing fails
                target = params
        else:
            target = params

        # Drill down nested dicts except for last key
        for key in parts[:-1]:
            target = target.setdefault(key, {})

        target[parts[-1]] = array

    # Convert defaultdict back to normal dict
    return dictify(params)


def download_and_load_gpt2(model_size, models_dir):
    allowed_sizes = {"124M", "355M", "774M", "1558M"}
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size must be one of {allowed_sizes}")

    model_dir = os.path.join(models_dir, model_size)
    os.makedirs(model_dir, exist_ok=True)

    base_url = f"https://openaipublic.blob.core.windows.net/gpt-2/models/{model_size}/"
    backup_url = f"https://f001.backblazeb2.com/file/LLMs-from-scratch/gpt2/{model_size}/"

    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]

    for fname in filenames:
        dst = os.path.join(model_dir, fname)
        if os.path.exists(dst):
            print(f"Already exists: {fname}, skipping download.")
            continue
        primary = urljoin(base_url, fname)
        backup = urljoin(backup_url, fname)
        print(f"Downloading {fname} ...")
        download_file(primary, dst, backup)

    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    with open(os.path.join(model_dir, "hparams.json"), "r", encoding="utf-8") as f:
        settings = json.load(f)

    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)
    return settings, params


In [37]:
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")
print("Settings:", settings)
print("Params:", params.keys())

Already exists: checkpoint, skipping download.
Already exists: encoder.json, skipping download.
Already exists: hparams.json, skipping download.
Already exists: model.ckpt.data-00000-of-00001, skipping download.
Already exists: model.ckpt.index, skipping download.
Already exists: model.ckpt.meta, skipping download.
Already exists: vocab.bpe, skipping download.
Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
Params: dict_keys(['blocks', 'ln_f', 'wpe', 'wte'])


In [22]:
# Model-specific configuration overrides
model_configs = {
    "gpt2-small":  {"emb_dim": 768,  "n_layers": 12, "n_heads": 12},
    "gpt2-medium": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large":  {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl":     {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Select model and build final config
model_name = "gpt2-small"
config = {
    **GPT_CONFIG_124M,
    **model_configs[model_name],
    "qkv_bias": True
}

model = GPT2Model(config)
model.eval()


GPT2Model(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=768, bias=True)
        (W_K): Linear(in_features=768, out_features=768, bias=True)
        (W_V): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (ln1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlo

In [23]:
from gpt2_v2 import complete_text
result = complete_text("at the start of", model,15)
print("Output text:\n", result)

Output text:
 at the start of predomintes questionnairehh sucking\", refund bitters her Mexicansush GHz renovravingsect


In [35]:
import torch
import numpy as np

def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch: {left.shape} vs {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))


def load_weights_into_gpt(gpt, params):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])

    for b, (block, pblock) in enumerate(zip(gpt.blocks, params["blocks"])):
        # Attention QKV
        qw, kw, vw = np.split(pblock["attn"]["c_attn"]["w"], 3, axis=-1)
        qb, kb, vb = np.split(pblock["attn"]["c_attn"]["b"], 3, axis=-1)
        block.attn.W_Q.weight = assign(block.attn.W_Q.weight, qw.T)
        block.attn.W_K.weight   = assign(block.attn.W_K.weight, kw.T)
        block.attn.W_V.weight = assign(block.attn.W_V.weight, vw.T)
        block.attn.W_Q.bias   = assign(block.attn.W_Q.bias, qb)
        block.attn.W_K.bias     = assign(block.attn.W_K.bias, kb)
        block.attn.W_V.bias   = assign(block.attn.W_V.bias, vb)

        # Attention output projection
        block.attn.out_proj.weight = assign(block.attn.out_proj.weight, pblock["attn"]["c_proj"]["w"].T)
        block.attn.out_proj.bias   = assign(block.attn.out_proj.bias,   pblock["attn"]["c_proj"]["b"])

        # Feedforward
        block.ff.layers[0].weight = assign(block.ff.layers[0].weight, pblock["mlp"]["c_fc"]["w"].T)
        block.ff.layers[0].bias   = assign(block.ff.layers[0].bias,   pblock["mlp"]["c_fc"]["b"])
        block.ff.layers[2].weight = assign(block.ff.layers[2].weight, pblock["mlp"]["c_proj"]["w"].T)
        block.ff.layers[2].bias   = assign(block.ff.layers[2].bias,   pblock["mlp"]["c_proj"]["b"])

        # LayerNorms
        block.ln1.weight = assign(block.ln1.weight, pblock["ln_1"]["g"])
        block.ln1.bias = assign(block.ln1.bias, pblock["ln_1"]["b"])
        block.ln2.weight = assign(block.ln2.weight, pblock["ln_2"]["g"])
        block.ln2.bias = assign(block.ln2.bias, pblock["ln_2"]["b"])

    gpt.final_norm.weight = assign(gpt.final_norm.weight, params["g"])
    gpt.final_norm.bias = assign(gpt.final_norm.bias, params["b"])
    gpt.out_head.weight  = assign(gpt.out_head.weight,  params["wte"])


In [36]:
load_weights_into_gpt(model, params)
model.to("cpu")
result = complete_text("at the start of", model,15)
print("Output text:\n", result)

KeyError: 'g'