<a href="https://colab.research.google.com/github/yuzhipeng588/llm/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import collections

import ast
import json
import os
import regex

from typing import Dict, List, Tuple

from transformers import PreTrainedTokenizer
from transformers import AutoTokenizer

def get_stats(tokens_trunk: list[list[int]]):
  """Count how often each adjacent token pair occurs across all chunks.

  Every inner list is scanned on its own, so a pair never spans two chunks.

  Args:
    tokens_trunk: Token-id sequences, one per pre-tokenized text chunk.

  Returns:
    A ``collections.defaultdict(int)`` mapping ``(left, right)`` pairs to the
    number of times they occur, in first-seen order.
  """
  pair_counts = collections.defaultdict(int)
  for chunk in tokens_trunk:
    # zip stops at the shorter argument, which yields every adjacent pair.
    for left, right in zip(chunk, chunk[1:]):
      pair_counts[(left, right)] += 1
  return pair_counts

def merge(tokens_trunk: list[list[int]], new_token_pair: tuple, new_token: int) -> list[list[int]]:
  """Replace every occurrence of a token pair with a single new token.

  Each chunk is scanned left to right and matches are consumed greedily, so
  overlapping candidates such as ``1 1 1`` with pair ``(1, 1)`` become
  ``new_token 1``.

  Args:
    tokens_trunk: Token-id sequences, one per chunk. Any indexable sequence
      (list or tuple) is accepted.
    new_token_pair: The ``(left, right)`` pair of token ids to merge.
    new_token: The token id that replaces the pair.

  Returns:
    A new list of lists; the input is not modified.
  """
  # Unpack once instead of building list(new_token_pair) and a slice at every
  # position; comparing elements also works when a chunk is not a list.
  first, second = new_token_pair
  new_tokens_trunk = []
  for tokens in tokens_trunk:
    new_tokens = []
    last = len(tokens) - 1
    i = 0
    while i <= last:
      if i < last and tokens[i] == first and tokens[i + 1] == second:
        new_tokens.append(new_token)
        i += 2
      else:
        new_tokens.append(tokens[i])
        i += 1
    new_tokens_trunk.append(new_tokens)
  return new_tokens_trunk

class SimpleUtf8Tokenizer(PreTrainedTokenizer):
  """Byte-level BPE tokenizer that works on UTF-8 bytes.

  Text is first split into chunks with a regular expression. Every chunk is
  turned into its UTF-8 byte values, and learned pair merges are then applied
  inside each chunk (never across chunk boundaries).

  It can be constructed in two ways:
    * ``vocab_file`` + ``merges_file``: paths to files in the format written
      by ``save_vocabulary``.
    * ``vocab`` + ``merges``: in-memory data as produced by ``train``, where
      ``vocab`` maps token id -> bytes and ``merges`` maps
      ``(left_id, right_id)`` -> merged token id.
  """

  # Pre-tokenization pattern, compiled once and shared by both preprocess_*
  # helpers. It needs the third-party `regex` module (possessive quantifiers
  # and \p{...} classes are not supported by the stdlib `re`).
  _SPLIT_PATTERN = regex.compile(r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s""")

  def __init__(self, vocab_file=None,
        merges_file=None,
        vocab=None,
        merges=None, **kwargs):
    """Load the vocabulary and merge table, then initialise the base class.

    Args:
      vocab_file: Path to a vocabulary file (one ``<bytes literal> <id>`` per
        line). Used together with ``merges_file``.
      merges_file: Path to a merges file (one ``<left> <right> <merged>`` line
        of integer ids per merge).
      vocab: In-memory mapping of token id -> bytes. Used together with
        ``merges`` when no files are given.
      merges: In-memory mapping of ``(left_id, right_id)`` -> merged id.
      **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.

    Raises:
      ValueError: If neither both file paths nor both in-memory objects are
        provided.
    """
    # Handle both loading from files and instantiation from in-memory data
    if vocab_file and merges_file:
      # self.vocab: bytes -> id, self.reverse_vocab: id -> bytes
      self.vocab = {}
      with open(vocab_file, "r", encoding="utf-8") as f:
        for line in f.read().splitlines():
          if not line.strip():
            continue
          # The bytes literal may itself contain spaces (e.g. b' '), so only
          # the last space separates it from the id.
          token_repr, token_id = line.rsplit(" ", 1)
          self.vocab[ast.literal_eval(token_repr)] = int(token_id)
      self.reverse_vocab = {v: k for k, v in self.vocab.items()}

      self.merges = {}
      with open(merges_file, "r", encoding="utf-8") as f:
        for line in f.read().splitlines():
          fields = line.split()
          if not fields:
            continue
          # The pair must be stored as ints: encode() looks up the integer
          # pairs returned by get_stats(), and string keys would never match.
          self.merges[(int(fields[0]), int(fields[1]))] = int(fields[2])
    elif vocab is not None and merges is not None:
      # `is not None` so that an empty merge table (nothing left to merge
      # during training) is still a valid tokenizer.
      self.reverse_vocab = vocab
      self.vocab = {v: k for k, v in vocab.items()}
      self.merges = merges
    else:
      raise ValueError(
          "You must provide either vocab/merges data or file paths."
      )
    # id -> str view of every token. latin-1 maps each byte to exactly one
    # code point, so arbitrary byte strings survive the round trip.
    self.id_to_token_str = {k: v.decode('latin-1') for k, v in self.reverse_vocab.items()}
    super().__init__(**kwargs)

  @classmethod
  def preprocess_to_token(cls, text: str) -> list[list[int]]:
    """Split ``text`` into chunks and return each chunk as UTF-8 byte values.

    Raises:
      ValueError: If a chunk cannot be encoded as UTF-8.
    """
    text_trunks = cls._SPLIT_PATTERN.findall(text)
    try:
      # Iterating a bytes object already yields ints in range(256).
      tokens_trunk = [list(t.encode('utf-8')) for t in text_trunks]
    except UnicodeEncodeError as err:
      raise ValueError("Text must be encoded in UTF-8") from err
    return tokens_trunk

  @classmethod
  def preprocess_to_bytes(cls, text: str) -> list[bytes]:
    """Split ``text`` into chunks and return each chunk as UTF-8 bytes.

    Raises:
      ValueError: If a chunk cannot be encoded as UTF-8.
    """
    text_trunks = cls._SPLIT_PATTERN.findall(text)
    try:
      bytes_trunk = [t.encode('utf-8') for t in text_trunks]
    except UnicodeEncodeError as err:
      raise ValueError("Text must be encoded in UTF-8") from err
    return bytes_trunk

  @classmethod
  def train(cls, text: str, vocab_size: int, **kwargs):
    """Learn a BPE vocabulary from ``text`` and return a ready tokenizer.

    Ids 0-255 are the raw byte values. Special tokens passed through
    ``kwargs`` (``unk_token``, ``pad_token``, ``eos_token``, ``sep_token``)
    get the next ids, and every learned merge gets the id after that.
    Training stops when ``vocab_size`` entries exist or when the text holds no
    adjacent pair any more.

    Args:
      text: Training corpus.
      vocab_size: Target number of vocabulary entries.
      **kwargs: Forwarded to the constructor; the special-token keys above
        are additionally reserved in the vocabulary.

    Returns:
      A new tokenizer instance built from the learned vocabulary and merges.
    """
    tokens_trunk = cls.preprocess_to_token(text)

    # Ordered, de-duplicated list: iterating a set of strings would hand out
    # special-token ids in an order that can differ from run to run.
    special_tokens = list(dict.fromkeys(
        kwargs[name]
        for name in ('unk_token', 'pad_token', 'eos_token', 'sep_token')
        if name in kwargs
    ))

    assert vocab_size > 256 + len(special_tokens), "Token limit must be greater than 256 + special tokens"
    bytes_vocab = {i: bytes([i]) for i in range(256)}
    for token in special_tokens:
      bytes_vocab[len(bytes_vocab)] = token.encode('utf-8')
    merges = {}
    while vocab_size > len(bytes_vocab):
      stats = get_stats(tokens_trunk)
      if not stats:
        # Every chunk has collapsed to a single token; nothing left to merge.
        break
      new_token_pair = max(stats, key=stats.get)
      new_token = len(bytes_vocab)

      # The merged token's bytes are the concatenation of its two parts.
      bytes_vocab[new_token] = bytes_vocab[new_token_pair[0]] + bytes_vocab[new_token_pair[1]]
      tokens_trunk = merge(tokens_trunk, new_token_pair, new_token)
      merges[(new_token_pair[0], new_token_pair[1])] = new_token
      print("Vocab Size: ", len(bytes_vocab))

    return cls(vocab=bytes_vocab, merges=merges, **kwargs)

  @property
  def vocab_size(self) -> int:
    """Number of entries in the vocabulary (bytes, special tokens, merges)."""
    return len(self.vocab)

  def get_vocab(self) -> Dict[bytes, int]:
    """Return the vocabulary as a mapping of token bytes to token id.

    NOTE(review): the keys are ``bytes``, not ``str`` as the base-class
    contract describes. This is left unchanged because switching the key type
    would alter how the base class registers special tokens - confirm before
    changing.
    """
    return self.vocab

  def _convert_token_to_id(self, token: str) -> int:
    """Return the id of ``token``.

    ``token`` is the latin-1 string form produced by ``_convert_id_to_token``;
    a raw ``bytes`` token is accepted as well.

    Raises:
      KeyError: If the token is not in the vocabulary.
    """
    if isinstance(token, bytes):
      return self.vocab[token]
    try:
      # The vocabulary is keyed by bytes, so map the string back first.
      key = token.encode('latin-1')
    except UnicodeEncodeError:
      # A character above U+00FF cannot come from any byte-level token.
      raise KeyError(token) from None
    return self.vocab[key]

  def _convert_id_to_token(self, index: int) -> str:
    """Return the latin-1 string form of the token bytes stored for ``index``.

    Raises:
      KeyError: If ``index`` is not in the vocabulary.
    """
    return self.id_to_token_str[index]

  def save_vocabulary(
      self, save_directory: str, filename_prefix: str | None = None
  ) -> Tuple[str, str]:
    """Write the vocabulary and merges to ``save_directory``.

    The vocabulary file has one ``<bytes literal> <id>`` line per token and
    the merges file one ``<left id> <right id> <merged id>`` line per merge -
    the formats ``__init__`` reads back.

    Args:
      save_directory: Target directory; created if missing.
      filename_prefix: Optional string prepended directly to both file names.

    Returns:
      ``(vocab_file, merges_file)`` paths.
    """
    os.makedirs(save_directory, exist_ok=True)
    prefix = filename_prefix or ""

    # Save the vocabulary file
    vocab_file = os.path.join(save_directory, prefix + "vocab.txt")
    with open(vocab_file, "w", encoding="utf-8") as f:
      for token_bytes, token_id in self.vocab.items():
        # The bytes repr is ASCII-safe and is parsed with ast.literal_eval.
        f.write(f"{token_bytes} {token_id}\n")

    # Save the merges file
    merges_file = os.path.join(save_directory, prefix + "merges.txt")
    with open(merges_file, "w", encoding="utf-8") as f:
      for pair, merged_id in self.merges.items():
        f.write(f"{pair[0]} {pair[1]} {merged_id}\n")

    return (vocab_file, merges_file)

  def encode(self, text):
    """Encode ``text`` into a flat list of token ids.

    Starting from raw bytes, the learned merge whose pair is currently most
    frequent in the text is applied repeatedly until no adjacent pair has a
    learned merge. Note that the order is driven by frequency in the text
    being encoded, not by the order in which merges were learned.

    Returns:
      A list of ints; empty when ``text`` yields no chunks.
    """
    encoded_text_trunks = self.preprocess_to_token(text)

    while True:
      stats = get_stats(encoded_text_trunks)
      # Only pairs with a learned merge are candidates. An empty candidate
      # list also covers empty text and single-byte chunks, where max() on
      # the raw stats would raise.
      candidates = [pair for pair in stats if pair in self.merges]
      if not candidates:
        break
      next_token_pair = max(candidates, key=stats.get)
      encoded_text_trunks = merge(encoded_text_trunks, next_token_pair, self.merges[next_token_pair])
    return [token for tokens in encoded_text_trunks for token in tokens]

  def decode(self, tokens):
    """Decode token ids back to text.

    The bytes of all tokens are concatenated and decoded as UTF-8; invalid
    byte sequences become the replacement character instead of raising.

    Raises:
      KeyError: If a token id is not in the vocabulary.
    """
    return b''.join([self.reverse_vocab[token] for token in tokens]).decode('utf-8', errors='replace')

# Register the custom slow tokenizer with AutoTokenizer under the key
# "SimpleUtf8Tokenizer".
# NOTE(review): the transformers docs describe the first argument of
# AutoTokenizer.register as a config class; a plain string is passed here -
# confirm this is intended.
AutoTokenizer.register("SimpleUtf8Tokenizer", slow_tokenizer_class=SimpleUtf8Tokenizer)

In [None]:
from transformers import AutoTokenizer

from huggingface_hub import hf_hub_download

# Download a specific file from a model repository
# (hf_hub_download returns the local cache path of the downloaded file.)
file_path_vocab = hf_hub_download(repo_id="thaitea2021/experimental", filename="vocab.txt")
print(f"File downloaded to: {file_path_vocab}")
file_path_merges = hf_hub_download(repo_id="thaitea2021/experimental", filename="merges.txt")
print(f"File downloaded to: {file_path_merges}")

# Download a file from a dataset repository
# file_path = hf_hub_download(repo_id="google/fleurs", filename="README.md", repo_type="dataset")
# print(f"File downloaded to: {file_path}")

# Same registration as at the end of the tokenizer cell, repeated so this cell
# can be run on its own.
AutoTokenizer.register("SimpleUtf8Tokenizer", slow_tokenizer_class=SimpleUtf8Tokenizer)

# NOTE(review): save_directory is assigned but not used anywhere in this cell.
save_directory = "./simple_utf8_tokenizer"
print("\nLoading tokenizer back...")
# The two downloaded paths are passed explicitly as vocab_file / merges_file,
# which are the keyword arguments SimpleUtf8Tokenizer.__init__ reads.
loaded_tokenizer = AutoTokenizer.from_pretrained(
    "thaitea2021/experimental", trust_remote_code=True, vocab_file=file_path_vocab, merges_file=file_path_merges
)

# Verify it works
# Round trip: encode a short string, then decode the ids back to text.
test_text = "hello custom"
encoded = loaded_tokenizer.encode(test_text)
print(f"\nOriginal text: '{test_text}'")
print(f"Encoded with loaded tokenizer: {encoded}")
decoded = loaded_tokenizer.decode(encoded)
print(f"Decoded text: '{decoded}'")

File downloaded to: /root/.cache/huggingface/hub/models--thaitea2021--experimental/snapshots/d556d1a512c3229b97edfbcc4a74c0949e0904f4/vocab.txt
File downloaded to: /root/.cache/huggingface/hub/models--thaitea2021--experimental/snapshots/d556d1a512c3229b97edfbcc4a74c0949e0904f4/merges.txt

Loading tokenizer back...
{b'\x00': 0, b'\x01': 1, b'\x02': 2, b'\x03': 3, b'\x04': 4, b'\x05': 5, b'\x06': 6, b'\x07': 7, b'\x08': 8, b'\t': 9, b'\n': 10, b'\x0b': 11, b'\x0c': 12, b'\r': 13, b'\x0e': 14, b'\x0f': 15, b'\x10': 16, b'\x11': 17, b'\x12': 18, b'\x13': 19, b'\x14': 20, b'\x15': 21, b'\x16': 22, b'\x17': 23, b'\x18': 24, b'\x19': 25, b'\x1a': 26, b'\x1b': 27, b'\x1c': 28, b'\x1d': 29, b'\x1e': 30, b'\x1f': 31, b' ': 32, b'!': 33, b'"': 34, b'#': 35, b'$': 36, b'%': 37, b'&': 38, b"'": 39, b'(': 40, b')': 41, b'*': 42, b'+': 43, b',': 44, b'-': 45, b'.': 46, b'/': 47, b'0': 48, b'1': 49, b'2': 50, b'3': 51, b'4': 52, b'5': 53, b'6': 54, b'7': 55, b'8': 56, b'9': 57, b':': 58, b';': 59, b'<

In [None]:
# Notebook shell escape: installs datatrove into the running kernel's environment.
!pip install datatrove
from datatrove.pipeline.readers import ParquetReader

# Build a streaming reader over the FineWeb parquet files; nothing is read
# until data_reader() is called.
# limit determines how many documents will be streamed (remove for all)
# to fetch a specific dump: hf://datasets/HuggingFaceFW/fineweb/data/CC-MAIN-2024-10
# replace "data" with "sample/100BT" to use the 100BT sample
data_reader = ParquetReader("hf://datasets/HuggingFaceFW/fineweb/data", limit=5000)



In [None]:
# Tokenize every streamed document; `data` becomes a list with one list of
# token ids per document.
data = []
for i, doc in enumerate(data_reader()):  # `i` is not used in the loop body
  data.append(loaded_tokenizer.encode(doc.text))

# 90/10 train/validation split by document, in stream order (no shuffling).
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

[32m2025-09-14 05:30:25.804[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file CC-MAIN-2013-20/000_00000.parquet, 1/27468[0m


In [None]:
# Inspect the validation split: prints up to the first 100 documents, each as
# its full list of token ids.
print(val_data[:100])

[[83, 111, 109, 101, 116, 104, 105, 110, 103, 32, 97, 32, 98, 105, 116, 46, 46, 46, 111, 117, 116, 32, 111, 102, 32, 116, 104, 101, 32, 111, 114, 100, 105, 110, 97, 114, 121, 32, 102, 111, 114, 32, 109, 121, 32, 119, 101, 101, 107, 101, 110, 100, 32, 109, 117, 115, 105, 110, 103, 115, 46, 10, 73, 32, 97, 109, 32, 103, 111, 105, 110, 103, 32, 116, 111, 32, 104, 97, 118, 101, 32, 97, 32, 108, 105, 116, 116, 108, 101, 32, 34, 71, 105, 114, 108, 115, 32, 67, 97, 109, 112, 34, 32, 97, 116, 32, 104, 111, 109, 101, 44, 32, 97, 108, 108, 32, 116, 111, 32, 109, 121, 115, 101, 108, 102, 46, 10, 73, 32, 97, 109, 32, 103, 111, 105, 110, 103, 32, 116, 111, 32, 108, 111, 97, 100, 32, 117, 112, 32, 111, 110, 32, 121, 117, 109, 109, 121, 32, 118, 101, 103, 101, 116, 97, 114, 105, 97, 110, 32, 102, 111, 111, 100, 115, 32, 40, 104, 111, 109, 101, 32, 109, 97, 100, 101, 32, 97, 110, 100, 32, 110, 111, 116, 32, 104, 111, 109, 101, 32, 109, 97, 100, 101, 41, 44, 32, 112, 117, 116, 32, 111, 117, 116, 32, 97

In [None]:
import torch

# Fix the RNG seed so parameter initialisation and batch sampling are repeatable.
torch.manual_seed(1337)

# Number of entries in the tokenizer vocabulary.
token_limit = loaded_tokenizer.vocab_size
# Documents per batch.
batch_size = 16
# Tokens per sample; also the size of the position-embedding table.
context_length = 128
# Width of token and position embeddings.
embed_size = 64
# Attention heads per block.
num_heads = 8
# Number of stacked transformer blocks.
n_layers = 4
# Width of one attention head, so that all heads concatenated give embed_size.
head_size = embed_size // num_heads
# Size of the model's embedding table / output layer: one id more than the
# tokenizer vocabulary.
# NOTE(review): presumably reserves an id for a token added on top of the
# vocabulary - confirm.
vocab_size = token_limit + 1
# Dropout probability shared by all dropout layers.
dropout = 0.2

# Evaluate the loss every eval_interval training steps.
eval_interval = 50
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Number of batches averaged per split in estimate_loss().
eval_iters = 10

def padding(one_data):
  """Force a 1-D token tensor to exactly ``context_length`` elements.

  Shorter tensors are right-padded with zeros of the same dtype on the same
  device; longer (or equal-length) tensors are cut to their first
  ``context_length`` elements.
  """
  missing = context_length - len(one_data)
  if missing <= 0:
    # Already long enough: keep only the leading window.
    return one_data[:context_length]
  filler = torch.zeros(missing, dtype=one_data.dtype, device=one_data.device)
  return torch.cat((one_data, filler))

def get_batch(split):
    """Build one batch of inputs and next-token targets.

    ``batch_size`` consecutive documents are taken from a random start index
    in the chosen split. Each document is padded or cut to ``context_length``
    for the inputs. The targets are the same document shifted left by one
    token with the encoding of ``'<unk>'`` appended, then padded or cut the
    same way.

    Args:
        split: ``'train'`` selects ``train_data``; anything else selects
            ``val_data``.

    Returns:
        ``(x, y)``, both of shape ``(batch_size, context_length)`` and placed
        on ``device`` so they can be fed to a model living on that device.
    """
    data = train_data if split == 'train' else val_data
    # randint's upper bound is exclusive; +1 makes the last full window
    # reachable and allows len(data) == batch_size.
    start = int(torch.randint(len(data) - batch_size + 1, size=()))
    docs = data[start:start + batch_size]
    # Encode the end marker once per batch instead of once per sample.
    end_marker = loaded_tokenizer.encode('<unk>')
    # Explicit dtype so an empty document still yields an integer tensor.
    x = torch.stack([padding(torch.tensor(doc, dtype=torch.long)) for doc in docs])
    y = torch.stack([padding(torch.tensor(doc[1:] + end_marker, dtype=torch.long)) for doc in docs])
    return x.to(device), y.to(device)


@torch.no_grad()
def estimate_loss():
    """Average the model loss over ``eval_iters`` random batches per split.

    The model is put in eval mode for the measurement and back in train mode
    afterwards.

    Returns:
        A dict with keys ``'train'`` and ``'val'`` holding the mean loss of
        each split as a 0-dim tensor.
    """
    def _mean_loss(split):
        # One slot per sampled batch.
        samples = torch.zeros(eval_iters)
        for slot in range(eval_iters):
            inputs, labels = get_batch(split)
            _, batch_loss = model(inputs, labels)
            samples[slot] = batch_loss.item()
        return samples.mean()

    model.eval()
    out = {split: _mean_loss(split) for split in ('train', 'val')}
    model.train()
    return out

In [None]:

class Head(torch.nn.Module):
    """
    One head of causal self-attention.

    Projects the input to keys, queries and values of width ``head_size``,
    masks attention to later positions, and returns the attention-weighted
    values.
    """

    def __init__(self, head_size):
        super().__init__()
        # y = xAT + b, no bias -> b = 0
        self.key = torch.nn.Linear(embed_size, head_size, bias=False)
        self.query = torch.nn.Linear(embed_size, head_size, bias=False)
        self.value = torch.nn.Linear(embed_size, head_size, bias=False)
        # Lower-triangular mask: position i may attend to positions <= i only.
        self.register_buffer('tril', torch.tril(torch.ones(context_length, context_length)))
        # Created once here (it has no parameters, so state_dict is unchanged)
        # instead of being rebuilt, and never used, on every forward call.
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x):
        """Return the attended values for ``x`` of shape B * C * E as B * C * H."""
        B,C,E = x.shape
        k = self.key(x) # B * C * H
        q = self.query(x) # B * C * H
        # NOTE(review): scores are scaled by the embedding width E, not by the
        # head width H used in standard scaled dot-product attention. Kept
        # as-is so existing checkpoints produce the same attention weights -
        # confirm before changing.
        weight = q @ k.transpose(-2, -1) * E**-0.5 # B * C * C
        weight = weight.masked_fill(self.tril[:C, :C] == 0, float('-inf'))
        weight = torch.nn.functional.softmax(weight, dim=-1)
        # Randomly drop attention links during training; a no-op in eval mode.
        weight = self.dropout(weight)
        v = self.value(x) # B * C * H
        out = weight @ v # B * C * H
        return out

class MultiHeadAttention(torch.nn.Module):
    """
    Several attention heads run side by side on the same input.

    The head outputs are concatenated along the last dimension, passed through
    a linear projection of width ``embed_size`` and then through dropout.
    """
    def __init__(self, num_heads, head_size):
        super().__init__()
        head_modules = [Head(head_size) for _ in range(num_heads)]
        self.heads = torch.nn.ModuleList(head_modules)
        self.proj = torch.nn.Linear(embed_size, embed_size)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x):
        # Each head sees the full input; outputs are joined on the feature axis.
        per_head = [head(x) for head in self.heads]
        combined = torch.cat(per_head, dim=-1)
        projected = self.proj(combined)
        return self.dropout(projected)

class FeedForward(torch.nn.Module):
    """
    Position-wise MLP: expand to four times the embedding width, apply ReLU,
    project back to the embedding width, then apply dropout.
    """
    def __init__(self, embed_size):
        super().__init__()
        hidden_size = 4 * embed_size
        layers = [
            torch.nn.Linear(embed_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, embed_size),
            torch.nn.Dropout(dropout),
        ]
        self.net = torch.nn.Sequential(*layers)

    def forward(self, x):
        transformed = self.net(x)
        return transformed

class Block(torch.nn.Module):
    """
    Transformer block: multi-head attention, then a feed-forward network.

    Each sub-layer is wrapped in a residual connection, and layer
    normalisation is applied after the residual sum.
    """

    def __init__(self, num_heads, embed_size):
        super().__init__()
        # The heads split the embedding width evenly between them.
        self.ma = MultiHeadAttention(num_heads, embed_size // num_heads)
        self.ffwd = FeedForward(embed_size)
        # LayerNorm normalises over the embedding dimension of each position,
        # so it does not depend on the other samples in the batch.
        self.ln1 = torch.nn.LayerNorm(embed_size)
        self.ln2 = torch.nn.LayerNorm(embed_size)

    def forward(self, x):
        attended = self.ln1(x + self.ma(x))
        return self.ln2(attended + self.ffwd(attended))

class LMHead(torch.nn.Module):
    """
    Output projection whose weight matrix is shared with another parameter.

    ``shared_weights`` has shape V * E (e.g. an embedding table). The linear
    layer maps E -> V, reuses that tensor as its weight, and keeps its own bias.
    """
    def __init__(self, shared_weights):
        super().__init__()
        out_features = shared_weights.shape[0]  # V
        in_features = shared_weights.shape[1]  # E
        projection = torch.nn.Linear(in_features, out_features)
        # Swap the freshly initialised weight for the shared parameter.
        projection.weight = shared_weights
        self.lm_head = projection

    def forward(self, x):
        logits = self.lm_head(x)
        return logits

In [None]:
class TinyGPT(torch.nn.Module):
    """
    Small decoder-only transformer language model.

    Token and position embeddings are summed, passed through ``n_layers``
    transformer blocks and a final layer norm, and projected to vocabulary
    logits by a head that shares its weight with the token embedding table.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = torch.nn.Embedding(vocab_size, embed_size) # V * E
        self.position_embedding_table = torch.nn.Embedding(context_length, embed_size) # C * E
        self.blocks = torch.nn.Sequential(*[Block(num_heads, embed_size) for _ in range(n_layers)])
        self.layer_norm = torch.nn.LayerNorm(embed_size)
        # Weight tying: the output projection reuses the embedding matrix.
        self.lm_head = LMHead(self.token_embedding_table.weight)

    def forward(self, input, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            input: Token ids of shape B * C, with C <= ``context_length``.
            targets: Optional token ids of shape B * C.

        Returns:
            ``(logits, loss)``. Without targets, logits have shape
            B * C * VocabSize and loss is ``None``. With targets, logits are
            flattened to (B*C) * VocabSize and loss is a scalar tensor.
        """
        B, C = input.shape
        tok_emb = self.token_embedding_table(input) # B * C * E
        # Use the actual sequence length and the input's own device, so that
        # sequences shorter than context_length work and no global device is
        # needed.
        positions = torch.arange(C, device=input.device)
        pos_emb = self.position_embedding_table(positions) # C * E, broadcast over B
        # typ_emb = self.token_embedding_table(token_type_input) # B * C * E for special token like start, end, separator etc
        hidden_layer = tok_emb + pos_emb # + typ_emb
        hidden_layer = self.blocks(hidden_layer)
        hidden_layer = self.layer_norm(hidden_layer)
        logits = self.lm_head(hidden_layer)
        loss = None
        if targets is not None:
            B, C, VocabSize = logits.shape
            # cross_entropy expects (N, classes) logits and (N,) targets.
            logits = logits.view(B*C, VocabSize)
            targets = targets.view(B*C)
            loss = torch.nn.functional.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=500):
        """Sample ``max_new_tokens`` tokens autoregressively.

        Args:
            idx: Token ids of shape B * C used as the prompt.
            max_new_tokens: Number of tokens to append.

        Returns:
            Tensor of shape B * (C + max_new_tokens): the prompt followed by
            the sampled tokens.
        """
        # Sample with dropout disabled, then restore the previous mode so a
        # surrounding training loop is unaffected.
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop idx to the last context_length tokens
                idx_cond = idx[:, -context_length:]
                # get the predictions
                logits, _ = self(idx_cond) # B * C * VocabSize
                # focus only on the last time step
                logits = logits[:, -1, :] # B * VocabSize
                probs = torch.nn.functional.softmax(logits, dim=-1) # B * VocabSize
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1) # B * 1

                idx = torch.cat((idx, idx_next), dim=1) # B * (C+1)
        finally:
            self.train(was_training)
        return idx

In [None]:
# Build the model and move its parameters to the selected device.
# nn.Module.to() returns the module itself, so `m` and `model` are the same object.
model = TinyGPT()
m = model.to(device)

# Step counter; the training loop increments it once per optimizer step.
epoch = 0

# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

0.373953 M parameters
step 0: train loss 40.2059, val loss 40.4856
step 10: train loss 10.5619, val loss 10.7978
step 20: train loss 8.1467, val loss 8.0847
step 30: train loss 6.6570, val loss 6.7441
step 40: train loss 5.8690, val loss 5.8300
step 50: train loss 5.4452, val loss 5.5392
step 60: train loss 5.0096, val loss 5.1285
step 70: train loss 4.9245, val loss 4.7813
step 80: train loss 4.5234, val loss 4.5445
step 90: train loss 4.3714, val loss 4.6010
step 99: train loss 4.1858, val loss 4.3227


In [None]:
# Re-states the learning rate from the hyperparameter cell so this cell can be
# rerun on its own, then creates the AdamW optimizer over all model parameters.
learning_rate = 1e-3
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
!pip install huggingface_hub
from huggingface_hub import hf_hub_download

CHECKPOINT_PATH = "tiny_gpt_checkpoint.pt"

# The repository you want to download from
repo_id = "thaitea2021/experimental"

# Download the file and get its local cached path
local_file_path = hf_hub_download(
    repo_id=repo_id,
    filename=CHECKPOINT_PATH
)

print(f"File downloaded to: {local_file_path}")


# --- Load the checkpoint ---
print(f"\nLoading checkpoint from {local_file_path}...")
checkpoint = torch.load(local_file_path)

model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch_loaded = checkpoint['epoch']
loss_loaded = checkpoint['loss']

print("Checkpoint loaded successfully.")
print(f"Resuming training from epoch {epoch_loaded + 1}")



tiny_gpt_checkpoint.pt:   0%|          | 0.00/6.77M [00:00<?, ?B/s]

File downloaded to: /root/.cache/huggingface/hub/models--thaitea2021--experimental/snapshots/e14ccd4c656060fe026486b2af9e6f576946e100/tiny_gpt_checkpoint.pt

Loading checkpoint from /root/.cache/huggingface/hub/models--thaitea2021--experimental/snapshots/e14ccd4c656060fe026486b2af9e6f576946e100/tiny_gpt_checkpoint.pt...
Checkpoint loaded successfully.
Resuming training from epoch 10001


In [None]:
# Number of optimizer steps to run in this session.
max_iter = 10000

# The loop variable is named `step` rather than `iter` so the builtin iter()
# is not shadowed for the rest of the notebook session.
for step in range(max_iter):
    # Report the averaged train/val loss periodically and on the final step.
    if step % eval_interval == 0 or step == max_iter - 1 or step % 100 == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # One optimisation step on a fresh random training batch.
    X, Y = get_batch('train')
    logits, loss = model(X, Y)
    # set_to_none=True drops the old gradients instead of zero-filling them.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    # `epoch` counts optimizer steps and is what gets saved in the checkpoint.
    epoch += 1

step 0: train loss 1.8339, val loss 1.9207
step 50: train loss 1.8407, val loss 1.9261
step 100: train loss 1.8279, val loss 1.9311
step 150: train loss 1.8791, val loss 1.9000
step 200: train loss 1.8090, val loss 1.9401
step 250: train loss 1.8106, val loss 1.9545
step 300: train loss 1.8331, val loss 1.9321
step 350: train loss 1.8615, val loss 1.9256
step 400: train loss 1.8311, val loss 1.9545
step 450: train loss 1.8960, val loss 1.9126
step 500: train loss 1.8280, val loss 1.9307
step 550: train loss 1.8417, val loss 1.9419
step 600: train loss 1.8779, val loss 1.8962
step 650: train loss 1.8496, val loss 1.9086
step 700: train loss 1.8781, val loss 1.9679
step 750: train loss 1.8238, val loss 1.9377
step 800: train loss 1.8713, val loss 1.9258
step 850: train loss 1.8704, val loss 1.9586
step 900: train loss 1.8331, val loss 1.9056
step 950: train loss 1.8606, val loss 1.9064
step 1000: train loss 1.9402, val loss 1.9584
step 1050: train loss 1.8448, val loss 1.9207
step 1100: 

In [None]:
# Earlier experiment, kept for reference: generate from a hand-written prompt.
#context = padding(torch.tensor(loaded_tokenizer.encode("Requirement already satisfied: datatrove")))
#print(context)
#tokens = model.generate(context.unsqueeze(0), max_new_tokens=100)[0]
# Use the first input row of a random validation batch as the prompt
# ([:1] keeps the batch dimension, giving shape 1 * context_length).
context = get_batch('val')[0][:1]
# Show the prompt as text.
print(loaded_tokenizer.decode(list(map(int, context[0]))))
# Sample 200 further tokens and print prompt + continuation as text.
tokens = model.generate(context, max_new_tokens=200)[0]
print(loaded_tokenizer.decode(list(map(int, tokens))))

In [None]:
# Local file the checkpoint is written to (relative to the working directory).
CHECKPOINT_PATH = "tiny_gpt_checkpoint.pt"

# --- Save the checkpoint ---
print(f"Saving checkpoint to {CHECKPOINT_PATH}...")
torch.save({
    # Step counter maintained by the training loop.
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    # Loss of the last training batch.
    # NOTE(review): this stores the loss tensor itself rather than a Python
    # float (loss.item()) - confirm this is intended.
    'loss': loss,
}, CHECKPOINT_PATH)

print("Checkpoint saved successfully.")

In [None]:
# Authenticate with the Hugging Face Hub; in a notebook, login() shows an
# interactive token prompt (the widget appears in the cell output).
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi

# Create an API client using the token from the HF_TOKEN environment variable
# (`os` is imported in the first cell of the notebook).
api = HfApi(token=os.getenv("HF_TOKEN"))
# Upload the contents of /content/ to the model repository.
# NOTE(review): this uploads the whole folder, not just the checkpoint -
# confirm that everything under /content/ is meant to be published.
api.upload_folder(
    folder_path="/content/",
    repo_id="thaitea2021/experimental",
    repo_type="model",
)

In [None]:
# Scratch cell: explore how position ids are built and reshaped for a batch.
# The result of this first expression is not used.
torch.arange(3, device=device)
# Random batch of token ids, shape (3, 4).
input_ids = torch.randint(0, vocab_size, (3, 4))
print(input_ids.shape)
# Pretend two tokens were already processed: positions start at past_length.
past_length = 2
position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long,
                                        device=input_ids.device)
print(position_ids)
# unsqueeze(0) adds a leading batch axis, unsqueeze(1) a trailing axis.
print(position_ids.unsqueeze(0))
print(position_ids.unsqueeze(1))
# Repeat the same position row for every sequence in the batch (a view, no copy).
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
print(position_ids)

# Flatten any leading dimensions so both tensors are (rows, seq_len).
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_ids.size(-1))
position_ids = position_ids.view(-1, position_ids.size(-1))
print(position_ids)

# Same three statements repeated; on already 2-D tensors they change nothing.
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_ids.size(-1))
position_ids = position_ids.view(-1, position_ids.size(-1))
print(position_ids)

torch.Size([3, 4])
tensor([2, 3, 4, 5])
tensor([[2, 3, 4, 5]])
tensor([[2],
        [3],
        [4],
        [5]])


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

In [None]:
# Scratch cell: look at Tensor.dim_order.
t = torch.ones(context_length, context_length)
# NOTE(review): dim_order is referenced without parentheses on both lines, so
# the attribute itself is evaluated rather than called - presumably
# t.dim_order() was intended; confirm.
t.dim_order
torch.empty((2, 3, 5, 7)).dim_order

In [None]:
# Scratch cell: a 2x2 lower-triangular matrix of ones, transposed - i.e. an
# upper-triangular mask. The bare name on the last line displays it.
tril = torch.tril(torch.ones(2,2)).transpose(-2,-1)
tril

tensor([[1., 1.],
        [0., 1.]])

In [None]:
# Scratch cell: masking followed by softmax, using `tril` from the previous cell.
w = torch.randn(3,2,2)
print(w)
print(tril)
# Positions where the mask is 0 are set to -inf; the 2x2 mask is broadcast
# over the leading dimension of w.
w = w.masked_fill(tril[:2, :2] == 0, float('-inf'))
print(w)
# softmax turns -inf scores into probability 0; each row sums to 1.
torch.nn.functional.softmax(w, dim=-1)

tensor([[[ 0.2673,  0.2212],
         [ 1.1675, -0.4633]],

        [[-0.5700, -0.2226],
         [-3.2702, -0.3554]],

        [[-0.0994, -1.0010],
         [-1.0627, -0.2304]]])
tensor([[1., 1.],
        [0., 1.]])
tensor([[[ 0.2673,  0.2212],
         [   -inf, -0.4633]],

        [[-0.5700, -0.2226],
         [   -inf, -0.3554]],

        [[-0.0994, -1.0010],
         [   -inf, -0.2304]]])


tensor([[[0.5115, 0.4885],
         [0.0000, 1.0000]],

        [[0.4140, 0.5860],
         [0.0000, 1.0000]],

        [[0.7113, 0.2887],
         [0.0000, 1.0000]]])

In [None]:
# Scratch cell: LayerNorm over the last (embedding) dimension.
batch, sentence_length, embedding_dim = 4, 3, 2
embedding = torch.randn(batch, sentence_length, embedding_dim)
layer_norm = torch.nn.LayerNorm(embedding_dim)
print(embedding)
# Inspect one embedding vector: its mean and the square root of its variance.
a = embedding[0, 0, :]
print(a.mean())
print(a.var()**0.5)
print(a)
#print((a[:embedding_dim]-a.mean())/a.std**0.5)
# Activate module
layer_norm(embedding)

tensor([[[-1.1499, -0.5856],
         [ 1.8937, -0.7417],
         [ 0.8896, -1.3776]],

        [[ 2.8078, -0.3774],
         [ 2.4030,  0.1610],
         [ 1.5563,  0.5282]],

        [[ 0.2871, -0.4187],
         [ 0.5799,  0.4286],
         [ 0.7362, -0.9675]],

        [[ 1.9276,  0.5747],
         [ 0.8793, -0.4157],
         [-0.0928,  1.2049]]])
tensor(-0.8677)
tensor(0.3991)
tensor([-1.1499, -0.5856])


tensor([[[-0.9999,  0.9999],
         [ 1.0000, -1.0000],
         [ 1.0000, -1.0000]],

        [[ 1.0000, -1.0000],
         [ 1.0000, -1.0000],
         [ 1.0000, -1.0000]],

        [[ 1.0000, -1.0000],
         [ 0.9991, -0.9991],
         [ 1.0000, -1.0000]],

        [[ 1.0000, -1.0000],
         [ 1.0000, -1.0000],
         [-1.0000,  1.0000]]], grad_fn=<NativeLayerNormBackward0>)

In [None]:
# Scratch cell: LayerNorm on an image-shaped batch.
# NOTE: this rebinds the names `input` (a builtin) and `C`/`H` at notebook scope.
N, C, H, W = 20, 5, 10, 10
input = torch.randn(N, C, H, W)
# Normalize over the last three dimensions (i.e. the channel and spatial dimensions)
# as shown in the image below
layer_norm = torch.nn.LayerNorm([C, H, W])
output = layer_norm(input)
print(output)

tensor([[[[ 1.3781e+00,  7.6709e-01, -5.8921e-01,  ...,  2.4763e-01,
            1.7541e+00, -1.6234e-01],
          [ 1.0229e+00, -9.3623e-01, -1.7670e+00,  ..., -9.0122e-01,
            8.1851e-01, -8.8513e-01],
          [ 2.3566e-02, -9.5233e-01,  3.3461e-01,  ..., -8.3450e-01,
            2.9249e+00,  1.2571e+00],
          ...,
          [ 6.5726e-01,  1.7548e+00,  1.4702e+00,  ...,  5.3477e-01,
            4.9672e-01, -1.0733e+00],
          [ 3.6879e-02,  2.4085e+00,  2.0079e-01,  ...,  7.8090e-01,
           -3.7058e-01,  1.6860e-01],
          [ 1.1246e+00, -3.3149e-01, -2.3218e-01,  ..., -2.1252e-01,
            3.4155e-01,  8.9181e-01]],

         [[-1.3383e-01, -1.8847e+00, -9.4479e-01,  ...,  1.6915e+00,
           -8.7425e-01,  4.2550e-01],
          [ 3.0735e-01,  1.4472e-01, -3.1667e+00,  ...,  2.9933e-02,
           -1.0822e+00,  3.2222e-01],
          [-4.3885e-01,  8.7548e-01,  9.2323e-01,  ...,  8.7133e-01,
           -1.9355e+00, -1.8186e+00],
          ...,
     

In [None]:
# Scratch cell: normalisation by hand and broadcasting rules.
import torch
x = torch.randn(3, 4, 5)
# Statistics over dim 1; keepdim=True leaves shape (3, 1, 5) for broadcasting.
mean = x.mean(dim=1, keepdim=True)
var = x.var(dim = 1, keepdim=True)
print(mean)
print(var)
y = torch.randn(3, 6, 4)
# Standard deviation with a small epsilon for numerical stability.
print(torch.sqrt(var+1e-5))
# Works: (3, 4, 5) broadcasts against (3, 1, 5).
(x - mean)/torch.sqrt(var+1e-5)
# Fails: y is (3, 6, 4) and its last dimension (4) cannot broadcast against
# the statistics' last dimension (5) - this is the RuntimeError in the output.
(y - mean)/torch.sqrt(var+1e-5)

tensor([[[-0.9183, -0.4827, -0.1189,  0.1915, -0.7799]],

        [[-0.5887, -0.1683,  0.5328, -0.3851,  0.2132]],

        [[-0.1058,  0.4779,  0.0840, -0.2263,  0.0799]]])
tensor([[[0.7177, 0.5801, 1.3385, 0.4480, 1.1285]],

        [[0.1726, 0.1709, 0.8791, 0.9982, 1.1172]],

        [[0.2937, 1.2579, 0.2427, 0.3702, 2.2105]]])
tensor([[[0.8472, 0.7616, 1.1569, 0.6694, 1.0623]],

        [[0.4154, 0.4134, 0.9376, 0.9991, 1.0570]],

        [[0.5419, 1.1215, 0.4927, 0.6084, 1.4868]]])


RuntimeError: The size of tensor a (4) must match the size of tensor b (5) at non-singleton dimension 2