In [1]:
import pickle
import datetime

from ipywidgets import IntProgress
from IPython.display import display

import tiktoken
import torch
import pathlib as pl
import pandas as pd

# Data visualization
import plotly.express as px

from sklearn.model_selection import train_test_split

In [2]:
today = datetime.date.today().strftime("%Y%m%d")

In [3]:
torch.manual_seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"
#torch.backends.mps.is_available()
#device = torch.device("mps")

In [3]:
enc_100 = tiktoken.get_encoding('cl100k_base')
enc_p50 = tiktoken.get_encoding('p50k_base')
enc_r50 = tiktoken.get_encoding('r50k_base')
encoders = [enc_p50, enc_r50, enc_100]

encode = enc_100.encode

In [4]:
def foo(enc):
    encoded = enc.encode("det her er ret sjovt! - hvad med æg, høns or åben")
    tokens = [enc.decode([t]) for t in encoded]
    print("--begin--")
    print(f"{enc.name} => {len(tokens)}")
    print(", ".join(tokens))
    print("---end---")

for enc in encoders:
    foo(enc)

--begin--
p50k_base => 25
det,  her,  er,  ret,  s, j, ov, t, !,  -,  h, v, ad,  med,  �, �, g, ,,  h, ø, ns,  or,  �, �, ben
---end---
--begin--
r50k_base => 25
det,  her,  er,  ret,  s, j, ov, t, !,  -,  h, v, ad,  med,  �, �, g, ,,  h, ø, ns,  or,  �, �, ben
---end---
--begin--
cl100k_base => 20
det,  her,  er,  ret,  sj, ov, t, !,  -,  hvad,  med,  æ, g, ,,  h, ø, ns,  or,  å, ben
---end---


In [4]:
block_size = 256  # Number of continues tokens (history) to use for training
batch_size = 64  # Number of channels of data to feed to CPU/GPU to take advantage of parallel computing
n_embed = 384  # Number of neurons in layers / num of features to find
num_heads = 6
num_layers = 6
dropout = 0.2

lr = 3e-4  # Learning rate

In [5]:
data_dir = pl.Path("data")
model_dir = pl.Path("model")
model_dir.mkdir(parents=True, exist_ok=True)
plot_dir = pl.Path("plot")
plot_dir.mkdir(parents=True, exist_ok=True)
data_file = data_dir / "shakespeare_char" / "input.txt"
with open(data_file, 'r') as f:
    data = f.read()

In [6]:
chars = sorted(list(set(data)))
vocab_size = len(chars)

In [7]:
stoi = {ch:i for i,ch in enumerate(chars)}
encode = lambda s: [stoi[ch] for ch in s]
decode = lambda idxs: ''.join([chars[i] for i in idxs])

print(encode("hi there!"))
print(decode(encode("hi there!")))

[46, 47, 1, 58, 46, 43, 56, 43, 2]
hi there!


In [8]:
tdata = torch.tensor(encode(data), dtype=torch.long)
## Split data into train and validation data
n = int(0.9*len(tdata))
train_data = tdata[:n]
val_data = tdata[n:]

In [9]:
def get_batch(data):
    # Get random batch
    ix = torch.randint(len(data) - block_size, (batch_size,))
    X = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    X, y = X.to(device), y.to(device)
    return X, y

get_batch(train_data)

(tensor([[54, 43, 63,  ..., 54, 43, 63],
         [57,  1, 61,  ..., 47, 52,  1],
         [57,  1, 58,  ..., 46, 39, 58],
         ...,
         [23, 17,  1,  ..., 24, 17, 10],
         [43, 50, 54,  ...,  1, 21, 51],
         [63, 53, 59,  ..., 63,  1, 58]]),
 tensor([[43, 63,  1,  ..., 43, 63, 12],
         [ 1, 61, 53,  ..., 52,  1, 61],
         [ 1, 58, 46,  ..., 39, 58, 46],
         ...,
         [17,  1, 27,  ..., 17, 10,  0],
         [50, 54,  1,  ..., 21, 51, 54],
         [53, 59,  1,  ...,  1, 58, 39]]))

In [None]:
class LayerNorm1d:
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        xmean = x.mean(1, keepdims=True)
        xvar = x.var(1, keepdims=True)
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # Normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        return self.out

    def parameters(self):
        return self.gamma, self.beta

module = LayerNorm1d(100)
x = torch.randn(32, 100)
x = module(x)
x.shape

x[:,0].mean(), x[:,0].std()
x[0,:].mean(), x[0,:].std()

In [10]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(42)

class Head(nn.Module):
    def __init__(self, head_size, n_embed, block_size, dropout):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)  # Stores "what I am/have"
        self.query = nn.Linear(n_embed, head_size, bias=False)  # Stores "what am I looking for/interested in"
        self.value = nn.Linear(n_embed, head_size, bias=False)  # Stores "If you find me interesting, here is what I will communicate to you"
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        q = self.query(x) # (B,T,C)
        k = self.key(x)
        # Self attention
        wei = q @ k.transpose(-2,-1) * C ** -0.5  # Scaled to not have softmax immediately converge towards a single node (scaled attention)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        # Perform the weighted aggregation of values
        v = self.value(x)  # (B, T, C)
        out = wei @ v
        return out


class MultiHead(nn.Module):

    def __init__(self, num_heads, head_size, n_embed, block_size, dropout):
        super().__init__()
        self.heads = nn.ModuleList((Head(head_size, n_embed, block_size, dropout) for _ in range(num_heads)))
        self.proj = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = torch.cat([h(x) for h in self.heads], dim=-1)
        x = self.proj(x)
        x = self.dropout(x)
        return x


class FeedForward(nn.Module):

    """
    A simple layer followed by a non-linearity
    """
    def __init__(self, n_embed, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):

    """
    Intersperse communication and computation
    """
    def __init__(self, num_heads, n_embed, block_size, dropout):
        super().__init__()
        head_size = n_embed // num_heads
        self.ln1 = nn.LayerNorm(n_embed)
        self.sa_heads = MultiHead(num_heads, head_size, n_embed, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embed)
        self.ffwd = FeedForward(n_embed, dropout)

    def forward(self, x):
        x = self.ln1(x)
        x = x + self.sa_heads(x)
        x = self.ln2(x)
        x = x + self.ffwd(x) # (B, T, C) - each token thinks individually
        return x

class BigramLanguageModel(nn.Module):
    def __init__(self, num_layers, num_heads, block_size, vocab_size, n_embed, dropout):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embed)
        self.position_embedding = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(*((Block(num_heads, n_embed, block_size, dropout)) for _ in range(num_layers)))
        self.ln_f = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, x, y):
        B, T = x.shape
        token_embed = self.token_embedding(x) # (batch_size, time_size, C)
        pos_embed = self.position_embedding(torch.arange(T, device=x.device)) # (time_size, C)
        x = token_embed + pos_embed # tensors gets batch aligned, so pos_embed: (batch_suze, time_size, C)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x) # (batch_size, time_size, vocab_size)

        if y is None:
            return logits, None

        batch_size, block_size, vocab_size = logits.shape
        # Change layout to use cross_entropy (as it expects (batch, channel))
        logits = logits.view(batch_size * block_size, vocab_size) # Flatten two first dims
        y = y.view(batch_size*block_size) # Flatten
        loss = F.cross_entropy(logits, y)
        return logits, loss

    def generate(self, x, max_tokens):
        # X is (batch_size, time_size)
        self.eval()
        for _ in range(max_tokens):
            X_cond = x[:, -block_size:]
            logits, _ = self(X_cond, None)
            # Only take last element in block
            logits = logits[:,-1,:] # becomes (batch_size, vocab_size)
            probs = F.softmax(logits, dim=-1)
            # Sample distribution
            X_next = torch.multinomial(probs, num_samples=1) # (batch_size, 1)
            # Update X
            x = torch.cat((x, X_next), dim=1) # (batch_size, time_size+1)
        self.train()
        return x


m = BigramLanguageModel(num_layers, num_heads, block_size, vocab_size, n_embed, dropout)
m.to(device)
X = torch.zeros((1,1), dtype=torch.long)
s = decode(m.generate(X, 100)[0].tolist())
print(s)



abIg3UtHzLvn:GY$gllfDOEMbdK'eBjJMoIC&T&foMddxRDRrVLNZPeoebgLapbWIcwSOGu
:.sS,NU$V!kqUZIMgd&u,:R UHOD


In [11]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [12]:
# Create our language model and move it to device (preferably GPU)
m = BigramLanguageModel(num_layers, num_heads, block_size, vocab_size, n_embed, dropout)
m.to(device)

BigramLanguageModel(
  (token_embedding): Embedding(65, 384)
  (position_embedding): Embedding(256, 384)
  (blocks): Sequential(
    (0): Block(
      (ln1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (sa_heads): MultiHead(
        (heads): ModuleList(
          (0-5): 6 x Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ln2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
 

In [37]:
num_params = count_parameters(m)
f'{num_params:,}'

'10,788,929'

In [1]:
m.load_state_dict(torch.load(model_dir / "shakespeare_2000.torch"))
losses_file = plot_dir / "losses_2000.pickle"
if losses_file.exists():
    with open(losses_file, "rb") as fp:   # Unpickling
        losses = pickle.load(fp)
else:
    losses = []

NameError: name 'm' is not defined

In [38]:
losses = []

In [14]:
X = torch.zeros((1,1), dtype=torch.long)
s = decode(m.generate(X, 100)[0].tolist())
print(s)



HERMIONE:
O, pray the heart, namestic to the permiticw,
Ne'er king.

MENERGEIUS:
Be a?ond here, is 


In [15]:
optimizer = torch.optim.AdamW(m.parameters(), lr=lr)

In [None]:
steps = 1000

f = IntProgress(min=0, max=steps) # instantiate the bar
display(f) # display the bar

#fig2 = go.FigureWidget()
#fig2.add_scatter(y=losses)
#fig2.show()

for step in range(steps):
    f.value = step
    xb, yb = get_batch(train_data)
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    #fig2.data[0].y = losses
    #fig2.show()

with open(plot_dir / f"losses_{len(losses)}.pickle", "wb") as fp:
    pickle.dump(losses, fp)

torch.save(m.state_dict(), model_dir / f"shakespeare_{len(losses)}.torch")

In [None]:
fig = px.line(pd.DataFrame(losses, columns=["loss"]))
fig.write_html(plot_dir / f"shakespeare_loss_{len(losses)}.html")
fig.show()

IntProgress(value=0, max=1000)

In [47]:
num_tokens = 1000
X = torch.zeros((1,1), dtype=torch.long, device=device)
s = decode(m.generate(X, num_tokens)[0].tolist())
with open(plot_dir / "generated.txt", "w") as f:
    f.write(s)
print(s)


Yet say't not forly then, for me, have
here, that I have it no reque that him name,
And much time very rail whild to help.

JULIET:
Come, thence of did sole beliefs;
To corset to bid Ancius on the people,
Leave thou lives conteition this death.

CLAUDIO:
Sirr; Jlifst leaps, and propove all that feare.
Nurse, Wishman, and your child! these peoceity!
If your to trooNe to danly tongue shalt.

DUCY:
Then, let'st not him from fear, this lown comes to myself at
hour, inocking my heart and like on your fair tencients.
Sunal to every god so 'er inducking missel, man, do
happ'd her me as done.

Secrator:
Dray-medded, but a bone
As our son asholy but villain to cure,
Eith part the ignopany, sand your auntil,
Adverallaman is for zirran for thee.
But Luckent may and good livele our tribence.
Ah, good I too part, belace the born, wondly and take.

RoNCE:
Thy chooks to not a was fortune.

RICHESSSONTER:
Though Bianca comestabla myselfoodication,
Eother Clausner to man, I would thou oner to must wus

In [2]:
mydf = pd.DataFrame()
mydf["labels"] = [0] * 10
mydf.labels.loc[6:8] = 1

In [3]:
mydf.labels.max()

1

In [6]:
idx = mydf.loc[mydf.labels != 0].index[-1]

In [8]:
mydf2 = mydf.loc[:idx]

In [9]:
mydf2

Unnamed: 0,labels
0,0
1,0
2,0
3,0
4,0
5,0
6,1
7,1
8,1


In [10]:
mydf

Unnamed: 0,labels
0,0
1,0
2,0
3,0
4,0
5,0
6,1
7,1
8,1
9,0
