In [2]:
!pip install tiktoken

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
from importlib.metadata import version

# Third-party packages this notebook relies on, with the reason each one is needed.
pkgs = [
    "blobfile",         # fetches the pretrained weights
    "huggingface_hub",  # fetches the pretrained weights
    "tiktoken",         # tokenizer implementation
    "torch",            # model implementation
]
# Print one "<name> version: <installed version>" line per package.
for p in pkgs:
    installed = version(p)
    print(f"{p} version: {installed}")

blobfile version: 3.0.0
huggingface_hub version: 0.32.0
tiktoken version: 0.9.0
torch version: 2.4.0a0+3bcc3cddb5.nv24.7


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [5]:
class FeedForward(nn.Module):
    """Gated feed-forward block built from three bias-free linear layers.

    `cfg` must expose `emb_dim`, `hidden_size` and `dtype`. The block computes
    `fc3(silu(fc1(x)) * fc2(x))`, mapping `emb_dim -> hidden_size -> emb_dim`.
    """

    def __init__(self, cfg):
        super().__init__()
        # Options shared by all three projections.
        linear_kwargs = dict(dtype=cfg.dtype, bias=False)
        # fc1: gate branch, fc2: linear branch, fc3: projection back to emb_dim.
        self.fc1 = nn.Linear(cfg.emb_dim, cfg.hidden_size, **linear_kwargs)
        self.fc2 = nn.Linear(cfg.emb_dim, cfg.hidden_size, **linear_kwargs)
        self.fc3 = nn.Linear(cfg.hidden_size, cfg.emb_dim, **linear_kwargs)

    def forward(self, x):
        """Apply the gated feed-forward transform to the last dimension of `x`."""
        gated = F.silu(self.fc1(x)) * self.fc2(x)
        return self.fc3(gated)

In [6]:
# Context length the wavelength cutoffs are measured against.
original_context_length = 8_192
# Frequency factors; each one divides the context length to give a wavelength cutoff.
low_freq_factor, high_freq_factor = 1.0, 4.0

In [7]:
# Wavelength cutoffs: the original context length divided by each frequency factor.
low_freq_wavelen, high_freq_wavelen = (
    original_context_length / low_freq_factor,
    original_context_length / high_freq_factor,
)

In [9]:
# Display the high-frequency wavelength cutoff (original_context_length / high_freq_factor).
high_freq_wavelen

2048.0

In [10]:
# Modified RoPE (Llama 3 style).
# Llama 3 uses a base of 500_000 instead of 10_000: a larger base lowers the angular
# frequency of every dimension pair after the first (slower rotation per token).
def compute_rope_params(head_dim, theta_base=10_000.0, max_seq_len=4096, scaling_cfg=None, dtype=torch.float32):
    """Precompute the cosine/sine lookup tables for rotary position embeddings.

    Args:
        head_dim: Size of one attention head; must be even.
        theta_base: RoPE base used in `1 / theta_base ** (k / head_dim)`.
        max_seq_len: Number of positions to tabulate.
        scaling_cfg: Optional frequency-scaling config forwarded to
            `_apply_ntk_scaling`; `None` disables scaling.
        dtype: dtype used to build the index and position ranges.

    Returns:
        `(cos, sin)`, each of shape `(max_seq_len, head_dim)`. The second half of
        the last dimension repeats the first half.
    """
    assert head_dim % 2 == 0, "head_dim must be even"

    # a) Base angular frequency per dimension pair (rad / token),
    #    for k = 0, 2, 4, ..., head_dim - 2 (even indices only).
    k = torch.arange(0, head_dim, 2, dtype=dtype)
    angular_freq = 1.0 / (theta_base ** (k.float() / head_dim))  # (head_dim // 2,)

    # b) Optional NTK-style rescaling of the low-frequency band.
    # NOTE(review): max_seq_len is passed as the "original" context length; confirm this is
    # intended when the table is longer than the context the frequencies were designed for.
    if scaling_cfg is not None:
        angular_freq = _apply_ntk_scaling(
            angular_freq,
            cfg=scaling_cfg,
            seq_len_original=max_seq_len,
            dtype=dtype
        )

    # c) Absolute angle per position (theta = n * omega), then the cos/sin lookup tables.
    positions = torch.arange(max_seq_len, dtype=dtype)  # (L,)
    angles = positions[:, None] * angular_freq[None, :]  # (L, head_dim // 2)
    # Duplicate so the table covers both halves of the head dimension.
    angles = torch.cat([angles, angles], dim=1)  # (L, head_dim)

    return torch.cos(angles), torch.sin(angles)

def _apply_ntk_scaling(
        angular_freq,
        cfg,
        seq_len_original,
        dtype
):
    lmb = (2 * torch.pi) / angular_freq # 파장: tokens / cycle

    lmb_low_cut = seq_len_original / cfg.low_freq_factor
    lmb_high_cut = seq_len_original / cfg.high_freq_factor

    # 1) 저주파(λ > λ_low_cut) ➜ 주파수 factor 배 ↑
    w_scaled = torch.where(lmb > lmb_low_cut, angular_freq / cfg.factor, angular_freq)

    # 2) 중간 주파수 영역을 부드럽게 선형 보간
    smooth_ratio = (seq_len_original / lmb - cfg.low_freq_factor) / (cfg.high_freq_factor - cfg.low_freq_factor)

    w_interp = (1 - smooth_ratio) * (angular_freq / cfg.factor) +  smooth_ratio * angular_freq

    is_mid_band = (lmb < = lmb_)


def apply_rope(
        x,
        cos_lut,
        sin_lut
):
    """Rotate `x` by the position-dependent angles stored in the lookup tables.

    Dimension `i` is paired with dimension `i + D // 2` (split-halves layout).

    Args:
        x: Tensor of shape `(B, H, L, D)` with even `D`.
        cos_lut: Cosine table whose first `L` rows hold `D` values each.
        sin_lut: Sine table with the same layout as `cos_lut`.

    Returns:
        Tensor with the same shape and dtype as `x`.
    """
    B, H, L, D = x.shape
    assert D % 2 == 0, "head dimension must be even"

    # Split the head dimension into its first and second half.
    x_first, x_second = x[..., : D // 2], x[..., D // 2:]

    # Take the first L positions and add broadcast dims for batch and heads.
    cos = cos_lut[:L, :].reshape(1, 1, L, D)  # (1, 1, L, D)
    sin = sin_lut[:L, :].reshape(1, 1, L, D)

    # (x1, x2) -> (-x2, x1) is the 90-degree rotation partner of x.
    x_rot = torch.cat([-x_second, x_first], dim=-1)  # (B, H, L, D)
    out = x * cos + x_rot * sin
    # The tables may have a wider dtype than x; cast back so the dtype is preserved.
    return out.to(dtype=x.dtype)



def rescale_theta(theta_old, context_length_old, context_length_new):
    """Scale a RoPE base linearly with the change in context length.

    Args:
        theta_old: RoPE base used with `context_length_old`.
        context_length_old: Context length the old base belongs to.
        context_length_new: Context length the returned base is meant for.

    Returns:
        `theta_old * (context_length_new / context_length_old)`.
    """
    scaling_factor = context_length_new / context_length_old
    return theta_old * scaling_factor


# Alias for the original (misspelled) name, kept for backward compatibility.
resacle_theta = rescale_theta

SyntaxError: invalid syntax (1771613717.py, line 19)

In [None]:
# RoPE settings (context length and base) for Llama 2 and Llama 3.
llama_2_context_len, llama_3_context_len = 4096, 8192
llama_2_theta_base, llama_3_theta_base = 10_000, 500_000

In [None]:
# Small demo dimensions for exercising the RoPE helpers.
batch_size = 2
num_heads = 4
head_dim = 16

# Build the cos/sin lookup tables with the Llama 3 base and context length.
cos, sin = compute_rope_params(
    head_dim=head_dim,
    theta_base=llama_3_theta_base,
    max_seq_len=llama_3_context_len
)

In [None]:
# Random stand-ins for projected queries/keys: (batch, heads, tokens, head_dim).
queries = torch.randn(batch_size, num_heads, llama_3_context_len, head_dim)
keys = torch.randn(batch_size, num_heads, llama_3_context_len, head_dim)

# Apply rotary position embeddings
queries_rot = apply_rope(queries, cos, sin)
keys_rot = apply_rope(keys, cos, sin)

In [None]:
# GQA (grouped-query attention)
# Reduces the number of key and value projections by sharing them among multiple attention heads.
# Each attention head still has its own query, but the queries in a group attend to the same keys and values.
# This cuts both the number of matmuls and the number of parameters.

In [None]:
class SharedBuffers:
    """Cache of `(mask, cos, sin)` tensors shared by layers with identical settings."""

    # Maps a settings key to the (mask, cos, sin) tuple built for those settings.
    _buffers = {}

    @staticmethod
    def _config_key(freq_config):
        """Turn a frequency config (dict, attribute object or None) into a hashable key."""
        if not freq_config:
            return None
        if isinstance(freq_config, dict):
            return tuple(sorted(freq_config.items()))
        # Attribute-style configs (e.g. SimpleNamespace) are keyed by their attributes.
        return tuple(sorted(vars(freq_config).items()))

    @staticmethod
    def get_buffers(context_length, head_dim, rope_base, freq_config, dtype=torch.float32):
        """Return the cached `(mask, cos, sin)` for these settings, building them on first use.

        Args:
            context_length: Number of positions covered by the mask and the RoPE tables.
            head_dim: Attention head size passed to `compute_rope_params`.
            rope_base: RoPE base passed to `compute_rope_params`.
            freq_config: Optional frequency-scaling config; falsy values disable scaling.
            dtype: dtype the cos/sin tables are cast to; `None` leaves them as computed.

        Returns:
            `mask`: boolean `(context_length, context_length)` tensor, True above the diagonal.
            `cos`, `sin`: the tables returned by `compute_rope_params`.
        """
        key = (context_length, head_dim, rope_base, SharedBuffers._config_key(freq_config), dtype)

        if key not in SharedBuffers._buffers:
            # True above the diagonal marks the positions attention must not see.
            mask = torch.triu(torch.ones(context_length, context_length), diagonal=1).bool()
            # Build the tables with compute_rope_params' default dtype and cast afterwards,
            # so a low-precision `dtype` cannot round the position indices.
            cos, sin = compute_rope_params(
                head_dim=head_dim,
                theta_base=rope_base,
                max_seq_len=context_length,
                scaling_cfg=freq_config or None,
            )
            if dtype is not None:
                cos, sin = cos.to(dtype), sin.to(dtype)
            SharedBuffers._buffers[key] = (mask, cos, sin)

        return SharedBuffers._buffers[key]

In [None]:
class GroupedQueryAttention(nn.Module):
    """Causal self-attention in which groups of query heads share one key/value head.

    Args:
        d_in: Size of the input embeddings.
        d_out: Total output size; split evenly across `num_heads`.
        context_length: Maximum sequence length covered by the mask and RoPE tables.
        num_heads: Number of query heads.
        num_kv_groups: Number of key/value heads; must divide `num_heads`.
        rope_base: RoPE base forwarded to `SharedBuffers.get_buffers`.
        rope_config: Optional RoPE frequency-scaling config forwarded to `SharedBuffers`.
        dtype: dtype of the linear layers and of the RoPE tables.
    """

    def __init__(
        self,
        d_in,
        d_out,
        context_length,
        num_heads,
        num_kv_groups,
        rope_base=10_000,
        rope_config=None,
        dtype=None
    ):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
        assert num_heads % num_kv_groups == 0, "num_heads must be divisible by num_kv_groups"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        # Keys and values are projected to num_kv_groups heads instead of num_heads.
        self.W_key = nn.Linear(d_in, num_kv_groups * self.head_dim, bias=False, dtype=dtype)
        self.W_value = nn.Linear(d_in, num_kv_groups * self.head_dim, bias=False, dtype=dtype)
        self.num_kv_groups = num_kv_groups
        # Number of query heads served by each key/value head.
        self.group_size = num_heads // num_kv_groups

        self.W_query = nn.Linear(d_in, d_out, bias=False, dtype=dtype)
        self.out_proj = nn.Linear(d_out, d_out, bias=False, dtype=dtype)

        mask, cos, sin = SharedBuffers.get_buffers(context_length, self.head_dim, rope_base, rope_config, dtype)

        self.register_buffer("mask", mask)
        self.register_buffer("cos", cos)
        self.register_buffer("sin", sin)

    def forward(self, x):
        """Compute causal grouped-query attention.

        Args:
            x: Tensor of shape `(b, num_tokens, d_in)`.

        Returns:
            Tensor of shape `(b, num_tokens, d_out)`.
        """
        b, num_tokens, d_in = x.shape

        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)

        # Split the projections into heads and move the head axis ahead of the token axis.
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)    # (b, num_heads, num_tokens, head_dim)
        keys = keys.view(b, num_tokens, self.num_kv_groups, self.head_dim).transpose(1, 2)      # (b, num_kv_groups, num_tokens, head_dim)
        values = values.view(b, num_tokens, self.num_kv_groups, self.head_dim).transpose(1, 2)  # (b, num_kv_groups, num_tokens, head_dim)

        # Rotary position embeddings go on queries and keys only.
        keys = apply_rope(keys, self.cos, self.sin)
        queries = apply_rope(queries, self.cos, self.sin)

        # Repeat each key/value head group_size times so it lines up with its query heads.
        keys = keys.repeat_interleave(self.group_size, dim=1)
        values = values.repeat_interleave(self.group_size, dim=1)

        # (b, num_heads, num_tokens, num_tokens)
        attn_scores = queries @ keys.transpose(2, 3)

        # Hide future positions before the softmax.
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        assert keys.shape[-1] == self.head_dim
        attn_weights = torch.softmax(attn_scores / self.head_dim ** 0.5, dim=-1)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.reshape(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec



In [None]:
class Llama3Model(nn.Module):
    """Skeleton of the Llama 3 decoder: token embedding, transformer blocks, final norm, output head.

    `cfg` must expose `vocab_size`, `emb_dim`, `dtype`, `n_layers`, `context_length`,
    `orig_context_length` and `rope_base`.

    NOTE(review): this class is still work in progress - it has no `forward()` yet, and
    `TransformerBlock` is not defined in this part of the notebook.
    """

    def __init__(self, cfg):
        super().__init__()

        # Rescale the RoPE base before the blocks are built, so blocks constructed from
        # `cfg` see the rescaled value.
        # NOTE(review): this mutates the caller's cfg, so building a second model from the
        # same cfg rescales again whenever the two context lengths differ - confirm intent.
        cfg.rope_base = rescale_theta(cfg.rope_base, cfg.orig_context_length, cfg.context_length)

        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.emb_dim, dtype=cfg.dtype)

        self.blocks = nn.ModuleList(
            [TransformerBlock(cfg) for _ in range(cfg.n_layers)]
        )

        # NOTE(review): the attribute name looks like a typo for `final_norm`; left unchanged
        # because it is part of the module's state-dict keys.
        self.final_form = nn.RMSNorm(cfg.emb_dim, eps=1e-5, dtype=cfg.dtype)
        self.out_head = nn.Linear(cfg.emb_dim, cfg.vocab_size, bias=False, dtype=cfg.dtype)

        # Boolean causal mask (True above the diagonal); excluded from the state dict.
        self.register_buffer(
            "mask", torch.triu(torch.ones(cfg.context_length, cfg.context_length), diagonal=1).bool(), persistent=False
        )

        # TODO: the original cell ended in an unfinished `cos, sin =` assignment.
        # GroupedQueryAttention already obtains its RoPE tables from SharedBuffers, so no
        # model-level tables are created here; add them if the blocks need them passed in.


In [None]:
# tokenizer
# Llama 3 reverted to the BPE tokenizer from tiktoken; specifically, it uses the GPT-4 tokenizer with an extended vocabulary


In [None]:
import os
from pathlib import Path

import tiktoken
from tiktoken.load import load_tiktoken_bpe


class Tokenizer:
    """Wrapper around a tiktoken BPE encoding, extended with the Llama 3 special tokens."""

    def __init__(self, model_path):
        """Load the BPE ranks from `model_path` and build the encoding.

        Args:
            model_path: Path to the tiktoken BPE model file.

        Raises:
            FileNotFoundError: If `model_path` is not an existing file.
        """
        # Check the path up front so a missing file fails with a clear message.
        if not Path(model_path).is_file():
            raise FileNotFoundError(f"Model file {model_path} not found")
        mergeable_ranks = load_tiktoken_bpe(model_path)

        self.special_tokens = {
            "<|begin_of_text|>": 128000,
            "<|end_of_text|>": 128001,
            "<|start_header_id|>": 128006,
            "<|end_header_id|>": 128007,
            "<|eot_id|>": 128009,
        }

        # Fill ids 128002..128257 that are not already taken with reserved placeholders.
        taken_ids = set(self.special_tokens.values())
        self.special_tokens.update({
            f"<|reserved_{i}|>": 128002 + i for i in range(256) if (128002 + i) not in taken_ids
        })

        self.model = tiktoken.Encoding(
            name=Path(model_path).name,
            pat_str=r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+",
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens
        )

    def encode(self, text, bos=False, eos=False, allowed_special=set(), disallowed_special=()):
        """Encode `text` into token ids.

        Args:
            text: String to encode.
            bos: If true, prepend the `<|begin_of_text|>` id.
            eos: If true, append the `<|end_of_text|>` id.
            allowed_special: Forwarded to the underlying encoding's `encode`.
            disallowed_special: Forwarded to the underlying encoding's `encode`.

        Returns:
            List of token ids.
        """
        if bos:
            tokens = [self.special_tokens["<|begin_of_text|>"]]
        else:
            tokens = []

        tokens += self.model.encode(text, allowed_special=allowed_special, disallowed_special=disallowed_special)

        if eos:
            tokens.append(self.special_tokens["<|end_of_text|>"])
        return tokens

    def decode(self, tokens):
        """Decode a sequence of token ids back into text."""
        return self.model.decode(tokens)

In [None]:
from huggingface_hub import hf_hub_download

# Download the tokenizer model from the Hugging Face Hub into ./Llama-3-8B.
tokenizer_file_path = hf_hub_download(
    repo_id="meta-llama/Meta-Llama-3-8B",
    filename="original/tokenizer.model",
    local_dir="Llama-3-8B"
)

In [None]:
# blobfile is required to use the Llama 3 files; it is needed for working with datasets and models kept in cloud storage
# !pip install blobfile

In [None]:
# Build the tokenizer from the downloaded tokenizer model file.
tokenizer = Tokenizer(model_path=tokenizer_file_path)

In [None]:
# NOTE(review): `generate`, `model`, `device`, `cfg`, `text_to_token_ids` and
# `token_ids_to_text` are not defined in the cells shown above - confirm where they come from.
prompt_ids = text_to_token_ids("Every effort", tokenizer).to(device)

# Generate up to 30 new tokens from the prompt (top_k=1, temperature=0).
token_ids = generate(
    model=model,
    idx=prompt_ids,
    max_new_tokens=30,
    context_size=cfg.context_length,
    top_k=1,
    temperature=0.
)

generated_text = token_ids_to_text(token_ids, tokenizer)
print("Output text:\n", generated_text)

In [None]:
from safetensors.torch import load_file

# Merge the four safetensors shards into one name -> tensor mapping.
# This must be a dict: each shard loads as a mapping and is merged with `.update()`.
combined_weights = {}

for i in range(1, 5):
    weights_file = hf_hub_download(
        repo_id="meta-llama/Meta-Llama-3-8B",
        filename=f"model-0000{i}-of-00004.safetensors",
        local_dir="Llama-3-8B"
    )
    current_weights = load_file(weights_file)
    combined_weights.update(current_weights)