In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import time
import numpy as np
import mmap
import random
import pickle
import argparse

In [2]:
# Prefer Apple's Metal backend (MPS) when this PyTorch build and machine
# support it; otherwise fall back to the CPU so the notebook still runs.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)

mps


In [4]:
# Build the character-level vocabulary: every distinct character that occurs
# in the vocab file, in sorted order. `vocab_size` is how many there are.
chars = ""  # placeholder until the vocab file has been read
with open('openwebtext/vocab.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

chars = list(set(text))
chars.sort()
vocab_size = len(chars)

In [6]:
import torch
import pickle

# Ensure all required class definitions are included before loading the model
class Head(torch.nn.Module):
    """Single head of causal self-attention.

    The input is projected to keys, queries and values of width ``head_size``;
    each position may only attend to itself and to earlier positions.
    Construction reads the module-level ``n_embd``, ``block_size`` and
    ``dropout`` settings.
    """

    def __init__(self, head_size):
        super().__init__()

        def projection():
            # Bias-free linear map from the embedding width to the head width.
            return torch.nn.Linear(n_embd, head_size, bias=False)

        # Instantiated in key, query, value order.
        self.key = projection()
        self.query = projection()
        self.value = projection()
        # Lower-triangular matrix kept as a non-trainable buffer. forward()
        # builds a mask sized to its input instead of reading this buffer.
        lower_triangle = torch.ones(block_size, block_size).tril()
        self.register_buffer('tril', lower_triangle)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, C); returns (B, T, head_size)."""
        _, seq_len, _ = x.shape
        keys = self.key(x)
        queries = self.query(x)
        values = self.value(x)

        # Scaled dot-product scores between every pair of positions.
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale

        # Hide future positions: entries above the diagonal become -inf so
        # that softmax assigns them zero weight.
        causal = torch.tril(torch.ones(seq_len, seq_len, device=x.device))
        scores = scores.masked_fill(causal == 0, float('-inf'))

        weights = torch.nn.functional.softmax(scores, dim=-1)
        weights = self.dropout(weights)
        return weights @ values

class MultiHeadAttention(torch.nn.Module):
    """Several attention heads run side by side and mixed by a linear layer.

    Construction reads the module-level ``n_embd`` and ``dropout`` settings.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_list = []
        for _ in range(num_heads):
            head_list.append(Head(head_size))
        self.heads = torch.nn.ModuleList(head_list)
        # Maps the concatenated head outputs back to the embedding width.
        self.proj = torch.nn.Linear(head_size * num_heads, n_embd)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x``, concatenate on the last axis, project."""
        per_head = [head(x) for head in self.heads]
        concatenated = torch.cat(per_head, dim=-1)
        projected = self.proj(concatenated)
        return self.dropout(projected)

class FeedForward(torch.nn.Module):
    """Position-wise MLP: widen to 4x, ReLU, project back, then dropout.

    The dropout probability comes from the module-level ``dropout`` setting.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            torch.nn.Linear(n_embd, hidden_width),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_width, n_embd),
            torch.nn.Dropout(dropout),
        ]
        self.net = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``; the last dimension keeps its size."""
        out = self.net(x)
        return out

class Block(torch.nn.Module):
    """One transformer layer: attention, then feed-forward.

    Each sub-layer sees a layer-normalised copy of its input and its output
    is added back onto that input (residual connection).
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # The embedding width is divided evenly between heads (floor division).
        width_per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, width_per_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = torch.nn.LayerNorm(n_embd)
        self.ln2 = torch.nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual steps."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

class GPTLanguageModel(torch.nn.Module):
    """A GPT-style language model.

    Token and learned position embeddings are summed, passed through a stack
    of ``n_layer`` transformer blocks and a final LayerNorm, then projected
    to vocabulary logits. Construction reads the module-level ``n_embd``,
    ``block_size``, ``n_head`` and ``n_layer`` settings.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = torch.nn.Embedding(vocab_size, n_embd)
        # One learned vector per position, so at most `block_size` positions.
        self.position_embedding_table = torch.nn.Embedding(block_size, n_embd)
        self.blocks = torch.nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = torch.nn.LayerNorm(n_embd)
        self.lm_head = torch.nn.Linear(n_embd, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02); zero biases."""
        if isinstance(module, torch.nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, torch.nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        """Compute logits and, when ``targets`` is given, the loss.

        Args:
            index: (B, T) tensor of token ids. T must not exceed the number
                of rows in the position embedding table.
            targets: optional (B, T) tensor of target token ids.

        Returns:
            ``(logits, None)`` with logits of shape (B, T, vocab) when
            ``targets`` is None; otherwise ``(logits, loss)`` where logits
            are flattened to (B*T, vocab) and loss is the cross-entropy.
        """
        B, T = index.shape
        tok_emb = self.token_embedding_table(index)
        pos_emb = self.position_embedding_table(torch.arange(T, device=index.device))
        x = tok_emb + pos_emb  # position vectors broadcast over the batch
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        if targets is None:
            return logits, None
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = torch.nn.functional.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients
    def generate(self, index, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``index``.

        Args:
            index: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the input ids followed by the
            sampled ids.
        """
        # The position table only has this many rows. Feeding a longer
        # sequence would look up positions that do not exist, so only the
        # most recent tokens are passed to the model.
        max_context = self.position_embedding_table.num_embeddings
        for _ in range(max_new_tokens):
            index_cond = index[:, -max_context:]
            logits, _ = self(index_cond)
            logits = logits[:, -1, :]  # keep only the last time step
            probs = torch.nn.functional.softmax(logits, dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index, index_next), dim=1)
        return index

# Hyperparameters read by the model classes when a model is constructed.
# NOTE(review): unpickling normally restores an object without calling
# __init__, so the loaded model keeps the sizes it was saved with; confirm
# these values match the training run.
block_size = 128
n_embd = 384
n_layer = 8
n_head = 8
dropout = 0.2

# Device configuration: use MPS when available, otherwise fall back to CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Load model
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load checkpoints that you created yourself or otherwise trust.
print('Loading model parameters...')
with open('model-01-20000.pkl', 'rb') as f:
    model = pickle.load(f)
print('Model loaded successfully!')

# Move the model to the appropriate device and switch to inference mode.
# Without eval() the Dropout layers stay active and randomly zero
# activations while text is being generated.
model = model.to(device)
model.eval()

# Load tokenizer (character level: one integer id per distinct character)
chars = ""
with open('openwebtext/vocab.txt', 'r', encoding='utf-8') as f:
    chars = sorted(set(f.read()))
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

Using device: mps
Loading model parameters...
Model loaded successfully!
Prompt: The devil 
Generated text:
The devil    hI     uf k-  P W[).Y   n Lጳnv Q u  Tm  N    I分e5.e ae.E eue0. 0d DC
 N Qe  Re1 u   Y  Xa w Iнe w s fMm JRE    ,TE AWl    i  peR s
KLh y TS    sw  


In [9]:
# Generate text
prompt = "The devil "
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
# NOTE(review): len(prompt) + 150 exceeds block_size (128); confirm that
# model.generate copes with contexts longer than the position table.
# Sampling needs no gradients, so skip autograd bookkeeping during inference.
with torch.no_grad():
    generated_ids = model.generate(context.unsqueeze(0), max_new_tokens=150)
generated_text = decode(generated_ids[0].tolist())
print(f"Prompt: {prompt}")
print("Generated text:")
print(generated_text)

Prompt: The devil 
Generated text:
The devil  clvR  .  “T–  n”Ie6F   Ini  v I%  x  zh o  en u  y vL sRv E %,oR𝗹0i [ 0 X7 nB47,0euWcrRd VhoAEEt 6e?oL p i e  а 
    F E
 ti    y iTei? O Ie. ” 
r   


In [12]:
import torch
import pickle

# Define model hyperparameters (must match training)
block_size = 128
n_embd = 384
n_layer = 8
n_head = 8
dropout = 0.2

# Device configuration: use MPS when available, otherwise fall back to CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Define tokenizer (character level: one integer id per distinct character)
chars = ""
with open('openwebtext/vocab.txt', 'r', encoding='utf-8') as f:
    chars = sorted(set(f.read()))
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

# Load model
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load checkpoints that you created yourself or otherwise trust.
print('Loading model parameters...')
with open('model-01-20000.pkl', 'rb') as f:
    model = pickle.load(f)
print('Model loaded successfully!')

# Move model to device and set evaluation mode (disables dropout)
model = model.to(device)
model.eval()

# Text generation with temperature
def generate_text(prompt, max_new_tokens=150, temperature=1.0):
    """Sample a continuation of ``prompt`` from the global ``model``.

    Args:
        prompt: text to continue; every character must be known to ``encode``.
        max_new_tokens: number of tokens (characters) to append.
        temperature: positive divisor applied to the logits before softmax;
            values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The prompt followed by the sampled characters, as one string.

    Raises:
        ValueError: if ``temperature`` is not positive, or if tokens are
            requested for an empty prompt (there is no last position to
            predict from).
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
    if max_new_tokens > 0 and context.numel() == 0:
        raise ValueError("prompt must contain at least one character")

    # The model has one position embedding per slot. Feeding more tokens than
    # slots would look up positions that do not exist, so only the most
    # recent tokens are passed in.
    max_context = model.position_embedding_table.num_embeddings
    with torch.no_grad():  # inference only: no autograd bookkeeping
        for _ in range(max_new_tokens):
            window = context[-max_context:]
            logits, _ = model(window.unsqueeze(0))  # Add batch dimension
            logits = logits[:, -1, :] / temperature  # Focus on the last time step
            probs = torch.nn.functional.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)  # (1, 1)
            next_token = next_token.squeeze(0)  # Remove batch dimension, shape becomes (1,)
            context = torch.cat((context, next_token), dim=0)  # Concatenate along time dimension
    return decode(context.tolist())

Loading model parameters...
Model loaded successfully!


In [13]:
# Sample a continuation of the prompt with the default settings and show it
prompt = "The devil "
generated_text = generate_text(prompt)
print("Generated text:", generated_text, sep="\n")

Generated text:
The devil  T n .eanp. I n4e Rc[… M y  I  ,Den  B   Kn  eeS .n 3      3e1r m H u  B ui e    lC,   c[e.e01 fs    aJ  nν-e   
 i   и     Ke 0 G ,Ti. o It  4     h 


In [14]:
# Build the character-level vocabulary: every distinct character that occurs
# in the vocab file, in sorted order. `vocab_size` is how many there are.
chars = ""  # placeholder until the vocab file has been read
with open('openwebtext/vocab.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

chars = list(set(text))
chars.sort()
vocab_size = len(chars)

In [15]:
# Display the number of distinct characters found in the vocab file
vocab_size

32172

In [16]:
# Two lookup tables over the sorted character list: character -> id, id -> character
string_to_int = dict(zip(chars, range(len(chars))))
int_to_string = dict(enumerate(chars))

# Character-level tokenizer: encode maps a string to a list of ids,
# decode maps a list of ids back to a string.
encode = lambda s: [string_to_int[ch] for ch in s]
decode = lambda l: ''.join(int_to_string[idx] for idx in l)

In [17]:
# Display the encoder callable itself (it is not called here)
encode

<function __main__.<lambda>(s)>