In [23]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-05-16 04:34:10--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.2’


2024-05-16 04:34:10 (35.4 MB/s) - ‘input.txt.2’ saved [1115394/1115394]



In [24]:
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [25]:
vocab = sorted(list(set(text)))
print(f"Vocab size: {len(vocab)}", "".join(vocab))

Vocab size: 65 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [26]:
# map chars to int and vice versa
chtoi = {ch: i for i, ch in enumerate(vocab)}
itoch = {i: ch for i, ch in enumerate(vocab)}

In [27]:
# functions to encode/decode text
encode = lambda input_str: [chtoi[ch] for ch in input_str]
decode = lambda input_arr: "".join([itoch[i] for i in input_arr])

In [28]:
# encode the data
import torch
data = torch.tensor(encode(text), dtype=torch.long)

In [29]:
# 1115395 tokens
data.shape

torch.Size([1115394])

In [30]:
# Split up the data into train and validation sets. First 90% will be train, rest val
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [31]:
# we create chunks from the dataset since we cannot feed all data at once
# max length of these chunks is called block_size/context_length
# we are going to set it to 8 for now
ctx_len = 8

In [32]:
# there are multiple examples in one chunk
# this loop demonstrates this

x = train_data[:ctx_len]
y = train_data[1:ctx_len+1]

print(x)
print(y)

for i in range(ctx_len):
  context = x[:i+1]
  target = y[i]

  print(f"For context: {context}, target is: {target}")

tensor([18, 47, 56, 57, 58,  1, 15, 47])
tensor([47, 56, 57, 58,  1, 15, 47, 58])
For context: tensor([18]), target is: 47
For context: tensor([18, 47]), target is: 56
For context: tensor([18, 47, 56]), target is: 57
For context: tensor([18, 47, 56, 57]), target is: 58
For context: tensor([18, 47, 56, 57, 58]), target is: 1
For context: tensor([18, 47, 56, 57, 58,  1]), target is: 15
For context: tensor([18, 47, 56, 57, 58,  1, 15]), target is: 47
For context: tensor([18, 47, 56, 57, 58,  1, 15, 47]), target is: 58


In [33]:
n_embd = 32

In [34]:
# create batches of sequences for training
torch.manual_seed(1337)
batch_size = 4 # 4 independant sequences will be processed by the transformer in parallel
vocab_size = 65
def get_batch(split):
  data = train_data if split == "train" else val_data
  ix = torch.randint(len(data)-ctx_len, (batch_size,)) # list of 4 random integers that are used as indexes to get data
  x = torch.stack([data[i:i+ctx_len] for i in ix])
  y = torch.stack([data[i+1:i+ctx_len+1] for i in ix])
  return x, y

x, y = get_batch('train')

In [35]:
print(x)
print(y)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [36]:
x.shape

torch.Size([4, 8])

In [37]:
x.shape

torch.Size([4, 8])

In [38]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

<torch._C.Generator at 0x7def07f4ae90>

In [144]:
dropout=0.2

In [138]:
class Head(nn.Module):
  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(ctx_len, ctx_len)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x)
    q = self.query(x)

    wei = q @ k.transpose(-2, -1) * C ** -0.5
    wei = wei.masked_fill(self.tril[:T, :T]==0, float("-inf"))
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)

    v=self.value(x)
    out = wei @ v

    return out


In [139]:
class MultiHeadAttention(nn.Module):
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embd, n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(self.proj(out))
    return out


In [140]:
class FeedForward(nn.Module):
  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4 * n_embd),
        nn.ReLU(),
        nn.Linear(4 * n_embd, n_embd), # projection layer
        nn.Dropout(dropout),
    )

  def forward(self, x):
    return self.net(x)


In [141]:
class Block(nn.Module):
  def __init__(self, n_embd, n_head):
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))

    return x

In [44]:
class TransformerModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(ctx_len, n_embd)
    # self.sa_head = Head(n_embd)
    # self.sa_heads = MultiHeadAttention(4, n_embd//4)
    # self.ffwd = FeedForward(n_embd)
    self.blocks = nn.Sequential(
        Block(n_embd, n_head=4),
        Block(n_embd, n_head=4),
        Block(n_embd, n_head=4),
        nn.LayerNorm(n_embd),
    )
    self.lm_head = nn.Linear(n_embd, vocab_size)


  def forward(self, x, y=None):
    B,T = x.shape

    tok_emb = self.token_embedding_table(x) # B,T,n_embd tensor
    pos_emb = self.position_embedding_table(torch.arange(T)) # T, n_embd
    emb = tok_emb + pos_emb # B,T, n_embd
    # emb = self.sa_head(emb)
    # emb = self.sa_heads(emb)
    # emb = self.ffwd(emb)
    emb = self.blocks(emb)
    logits = self.lm_head(emb) # B,T,vocab_size (4,8,32 in this case)

    if y is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      y = y.view(B*T)
      loss = F.cross_entropy(logits, y) # logits is B,T,C, y is B,T,1
    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is a B,T array of indices in the current context
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -ctx_len:]
      logits, _ = self(idx_cond)
      # get only last time step
      logits = logits[:, -1, :] # (B, C)
      probs = F.softmax(logits, dim=1)
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx


In [45]:
m = TransformerModel()
logits, loss = m(x, y)

In [None]:
logits.shape

In [40]:
loss

tensor(4.2769, grad_fn=<NllLossBackward0>)

In [41]:
logits[-1, :]

tensor([-9.7801e-02,  3.0377e-01,  5.3225e-01,  3.1163e-01,  4.9935e-01,
        -1.6922e-01, -6.6894e-01,  2.5641e-01,  1.2328e-01, -8.2984e-01,
         1.3829e+00, -2.7566e-01, -7.6557e-01, -7.0552e-01,  1.3452e-01,
         6.6854e-01,  1.3065e-01,  1.4249e+00,  9.0741e-01,  7.0440e-01,
         1.6653e-01,  1.0804e+00, -3.6346e-01,  3.2648e-04, -5.7357e-01,
        -1.7159e-01,  4.3415e-01, -5.5920e-01,  5.8686e-01, -5.5445e-01,
        -3.8486e-01,  5.8431e-01, -1.0340e+00, -2.3409e-01, -7.0003e-02,
         4.3208e-01, -6.0286e-01, -2.1157e-01, -1.2753e-01,  2.1167e-01,
        -1.4425e-01,  4.8266e-01, -2.7369e-02,  4.9492e-02, -4.4185e-01,
        -2.2223e-01, -1.0623e+00, -3.6501e-01, -9.6334e-01, -4.5618e-01,
        -3.0483e-02, -3.0194e-01, -2.2461e-01,  4.7548e-01,  8.9477e-01,
        -1.0481e+00,  7.2695e-04,  5.0593e-01, -8.4274e-01, -3.1749e-01,
        -7.3505e-01,  2.3815e-01,  3.3408e-01,  6.0319e-02, -1.2787e-01],
       grad_fn=<SliceBackward0>)

In [46]:
idx1 = torch.zeros((1,1), dtype=torch.long)
output1 = m.generate(idx1, max_new_tokens=100)
print(decode(output1[0].tolist()))


Mgh&pPVMPEUfLBvpg$Gg!;FJUH!u!UxHJBFNkTgJG?ZWq''pfuPyPBlnqp$esqgj:kCNtHZ eZyuuUtE FNWCa:O'&BIIurFdogg


In [47]:
eval_iters = 200

# context manager tells pytorch that w/e happens in the function, we wont call backward() on it
# so it doesnt need to store the intermediate values for backprop, is more memory efficient this way
@torch.no_grad()
def estimate_loss():
  out = {}
  # set model to eval mode
  m.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = m(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  # set it back to training mode
  m.train()
  return out

In [44]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
eval_interval = 300
batch_size = 32
for iter in range(5000):
  xb, yb = get_batch('train')

  if iter % eval_interval == 0:
    losses=estimate_loss()
    print(f"step {iter}: train loss: {losses['train']:.4f}, val loss: {losses['val']:.4f}")

  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

step 0: train loss: 4.3412, val loss: 4.3334
step 300: train loss: 2.5476, val loss: 2.5511
step 600: train loss: 2.4192, val loss: 2.4428
step 900: train loss: 2.3592, val loss: 2.3643
step 1200: train loss: 2.2943, val loss: 2.3191
step 1500: train loss: 2.2700, val loss: 2.2759
step 1800: train loss: 2.2323, val loss: 2.2484
step 2100: train loss: 2.2009, val loss: 2.2340
step 2400: train loss: 2.2004, val loss: 2.2178
step 2700: train loss: 2.1743, val loss: 2.1946
step 3000: train loss: 2.1599, val loss: 2.1772
step 3300: train loss: 2.1579, val loss: 2.1635
step 3600: train loss: 2.1219, val loss: 2.1647
step 3900: train loss: 2.1266, val loss: 2.1690
step 4200: train loss: 2.1054, val loss: 2.1467
step 4500: train loss: 2.1105, val loss: 2.1391
step 4800: train loss: 2.1118, val loss: 2.1445


step 0: train loss: 4.3239, val loss: 4.3302
step 300: train loss: 0.7795, val loss: 0.8203
step 600: train loss: 0.4255, val loss: 0.4284
step 900: train loss: 0.3537, val loss: 0.3594
step 1200: train loss: 0.3383, val loss: 0.3412
step 1500: train loss: 0.3232, val loss: 0.3228
step 1800: train loss: 0.3167, val loss: 0.3170
step 2100: train loss: 0.3116, val loss: 0.3131
step 2400: train loss: 0.3064, val loss: 0.3080
step 2700: train loss: 0.3088, val loss: 0.3046
step 3000: train loss: 0.2941, val loss: 0.2984
step 3300: train loss: 0.3036, val loss: 0.2974
step 3600: train loss: 0.2936, val loss: 0.2955
step 3900: train loss: 0.2906, val loss: 0.2924
step 4200: train loss: 0.2931, val loss: 0.2964
step 4500: train loss: 0.2850, val loss: 0.2896
step 4800: train loss: 0.2882, val loss: 0.2882


In [49]:
idx1 = torch.zeros((1,1), dtype=torch.long)
output1 = m.generate(idx1, max_new_tokens=250)
print(decode(output1[0].tolist()))


ENRIYILIET:
Toe whil lyeft horguwn to sorrint. I dertle
Sel:
When, with my chot
Hachasim liod sont be of lay striue, which his bleaiep, num:
The plas ard oks lold st
And tame'dain with his of sany-ron' wath I me
Dold Sorcanght hand shous if I cecher.


In [None]:
# self attention
# before going to proper attention, lets do a naive implementation
# just take the average of all token embeddings before the current token and store it as the new embedding for current token

B,T,C = 1,8,2
x1 = torch.randn(B,T,C)
x1.shape

torch.Size([1, 8, 2])

In [None]:
x1[0]

tensor([[-0.6050, -0.1067],
        [-0.4457,  0.8633],
        [ 0.1946, -0.2494],
        [-1.4561, -0.4149],
        [ 1.2283,  0.4399],
        [ 0.6590, -0.8875],
        [-0.1064, -1.4840],
        [-0.5361,  0.7781]])

In [None]:
# this does what we want but it is a nested loop and inefficient
xbow = torch.zeros(B,T,C)
for b in range(B):
  for t in range(T):
    xbow[b, t] = torch.mean(x1[b, :t+1], 0)

In [None]:
xbow[0]

tensor([[-0.6050, -0.1067],
        [-0.5254,  0.3783],
        [-0.2854,  0.1691],
        [-0.5781,  0.0231],
        [-0.2168,  0.1065],
        [-0.0708, -0.0592],
        [-0.0759, -0.2627],
        [-0.1334, -0.1326]])

In [None]:
# we can make use of matrix multiplications to parallelize the job
avg_mat = torch.tril(torch.ones(T,T))
avg_mat = avg_mat / torch.sum(avg_mat, 1, keepdim=True)

In [None]:
xbow2 = avg_mat @ x1

In [None]:
xbow2[0]

tensor([[-0.6050, -0.1067],
        [-0.5254,  0.3783],
        [-0.2854,  0.1691],
        [-0.5781,  0.0231],
        [-0.2168,  0.1065],
        [-0.0708, -0.0592],
        [-0.0759, -0.2627],
        [-0.1334, -0.1326]])

In [None]:
x1.shape

torch.Size([1, 8, 2])

In [None]:
avg_mat.shape

torch.Size([8, 8])

In [None]:
torch.allclose(xbow, xbow2)

True

In [None]:
# we can get the same avg_mat using softmax
tril = torch.tril(torch.ones(T,T))
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [None]:
# weights
wei = torch.zeros(T,T)
wei

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
wei = wei.masked_fill(tril==0, float('-inf'))
wei

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
wei = F.softmax(wei, dim=1)

In [None]:
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [None]:
xbow3 = wei @ x1

In [None]:
torch.allclose(xbow3, xbow2)

True

In [1]:
torch.manual_seed(1337)
# self attention
B, T, C = 4, 8, 32
x2 = torch.randn(B,T,C)

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x2) # B, T, 16
q = query(x2) # B, T, 16

# scaled dot product attention. i.e divide by sqrt of head_size
wei = q @ k.transpose(-2, -1) * head_size**0.5 # B,T,T

tril = torch.tril(torch.ones(T,T))
# wei = wei.masked_fill(tril==0, float("-inf"))
wei = F.softmax(wei, dim=1)

v = value(x2)

out = wei @ v

out.shape

NameError: name 'torch' is not defined

In [None]:
wei.shape

torch.Size([4, 8, 8])

In [None]:
x2.shape

torch.Size([4, 8, 32])

In [None]:
wei.var()

tensor(0.0921, grad_fn=<VarBackward0>)

In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [6]:
def readucr(filename):
    data = np.loadtxt(filename, delimiter="\t")
    y = data[:, 0]
    x = data[:, 1:]
    return x, y.astype(int)


root_url = "https://raw.githubusercontent.com/hfawaz/cd-diagram/master/FordA/"

x_train, y_train = readucr(root_url + "FordA_TRAIN.tsv")
x_test, y_test = readucr(root_url + "FordA_TEST.tsv")

x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], 1))

n_classes = len(np.unique(y_train))

idx = np.random.permutation(len(x_train))
x_train = x_train[idx]
y_train = y_train[idx]

y_train[y_train == -1] = 0
y_test[y_test == -1] = 0

In [7]:
x_train = torch.tensor(x_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
x_test = torch.tensor(x_test, dtype=torch.int64)
y_test = torch.tensor(y_test, dtype=torch.int64)

In [166]:
class Head(nn.Module):
  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(ctx_len, ctx_len)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x)
    q = self.query(x)

    wei = q @ k.transpose(-2, -1) * C ** -0.5
    # wei = wei.masked_fill(self.tril[:T, :T]==0, float("-inf"))
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)

    v=self.value(x)
    out = wei @ v

    return out

In [217]:
class TimeSeriesTransformerModel(nn.Module):
    def __init__(self, n_embd, ctx_len, dropout, num_classes):
        super().__init__()
        self.input_projection = nn.Linear(1, n_embd)  # Project single feature to embedding dimension
        self.position_embedding_table = nn.Embedding(ctx_len, n_embd)
        self.blocks = nn.Sequential(
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            nn.LayerNorm(n_embd),
        )
        self.classification_head = nn.Linear(n_embd, num_classes)

    def forward(self, x, y=None):
        B, T, D = x.shape  # B is batch size, T is sequence length

        tok_emb = self.input_projection(x)  # B, T, n_embd tensor
        pos_emb = self.position_embedding_table(torch.arange(T, device=x.device))  # T, n_embd
        emb = tok_emb + pos_emb  # B, T, n_embd
        emb = self.blocks(emb)
        emb = emb.mean(dim=1)  # Pooling: mean of the sequence (B, n_embd)
        logits = self.classification_head(emb)  # B, num_classes

        if y is None:
            loss = None
        else:
            loss = F.cross_entropy(logits, y)  # logits is B, num_classes and y is B

        return logits, loss



In [218]:
# Hyperparameters
n_embd = 128  # Embedding dimension
ctx_len = 500  # Context length, the length of the time series data
dropout = 0.1  # Dropout rate
num_classes = 2  # Number of classes for classification

# Initialize model
ts_model = TimeSeriesTransformerModel(n_embd, ctx_len, dropout, num_classes)


In [206]:
input_dim = 1  # Number of input features per time step
n_embd = 128  # Embedding dimension
ctx_len = 500  # Length of the input sequence
n_head = 4 #8  # Number of attention heads
n_layer = 2 #3  # Number of transformer blocks

ts_model = TimeSeriesTransformerModel(input_dim, n_embd, ctx_len, n_head, n_layer)

# logits, loss = ts_model(torch.tensor(x_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))

In [219]:
eval_iters = 200
# context manager tells pytorch that w/e happens in the function, we wont call backward() on it
# so it doesnt need to store the intermediate values for backprop, is more memory efficient this way
@torch.no_grad()
def estimate_loss2():
  out = {}
  # set model to eval mode
  ts_model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch2(split)
      logits, loss = ts_model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  # set it back to training mode
  ts_model.train()
  return out

In [232]:
eval_interval = 100
batch_size = 1

In [226]:
optimizer = torch.optim.AdamW(ts_model.parameters(), lr=1e-3)

for iter in range(1000):
  xb, yb = get_batch2('train')

  # if iter % eval_interval == 0:
  #   losses=estimate_loss2()
  #   print(f"step {iter}: train loss: {losses['train']:.4f}, val loss: {losses['val']:.4f}")

  logits, loss = ts_model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  # if iter % eval_interval == 0:
  loss.backward()
  optimizer.step()

  ts_model.eval()
  with torch.no_grad():
      logits, _ = ts_model(xb)
      predictions = torch.argmax(logits, dim=1)
      accuracy = (predictions == yb).float().mean().item()
      print(f'Accuracy: {accuracy * 100:.2f}%')

  y = torch.tensor(torch.stack([labels[i] for i in ix]), dtype=torch.int64)


Accuracy: 68.75%
Accuracy: 68.75%
Accuracy: 59.38%
Accuracy: 53.12%
Accuracy: 65.62%
Accuracy: 65.62%
Accuracy: 37.50%
Accuracy: 34.38%
Accuracy: 59.38%
Accuracy: 50.00%
Accuracy: 40.62%
Accuracy: 65.62%
Accuracy: 40.62%
Accuracy: 40.62%
Accuracy: 46.88%
Accuracy: 40.62%
Accuracy: 59.38%
Accuracy: 59.38%
Accuracy: 53.12%
Accuracy: 62.50%
Accuracy: 68.75%
Accuracy: 46.88%
Accuracy: 50.00%
Accuracy: 59.38%
Accuracy: 40.62%
Accuracy: 46.88%
Accuracy: 50.00%
Accuracy: 50.00%


KeyboardInterrupt: 

In [233]:
xb, yb = get_batch2('train')

  y = torch.tensor(torch.stack([labels[i] for i in ix]), dtype=torch.int64)


In [235]:
xb.shape

torch.Size([1, 500, 1])

In [240]:
res, loss = ts_model(xb)

In [246]:
res

tensor([[ 0.0811, -0.1454]], grad_fn=<AddmmBackward0>)

In [248]:
yb

tensor([0])

In [244]:
F.cross_entropy(res, yb)

tensor(0.5863, grad_fn=<NllLossBackward0>)

In [None]:
|for epoch in range(epochs):
    ts_model.train()
    logits, loss = ts_model(x, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}')

    # Evaluate the model (optional)
    model.eval()
    with torch.no_grad():
        logits, _ = model(x)
        predictions = torch.argmax(logits, dim=1)
        accuracy = (predictions == y).float().mean().item()
        print(f'Accuracy: {accuracy * 100:.2f}%')

In [None]:
logits

In [154]:
torch.catx_test[0]

torch.Size([500, 1])

In [177]:
res1 = ts_model(x_test[5].unsqueeze(0).to(torch.float32))
res1

(tensor([[ 0.0451, -0.1226]], grad_fn=<AddmmBackward0>), None)

In [178]:
y_test[5]

tensor(1)

In [54]:
loss

tensor(1.1010, grad_fn=<NllLossBackward0>)

In [22]:
def get_batch2(split):
  data = x_train if split == "train" else x_test
  labels = y_train if split == "train" else y_test
  ix = torch.randint(len(data)-ctx_len, (batch_size,)) # list of 4 random integers that are used as indexes to get data
  x = torch.stack([data[i] for i in ix])
  y = torch.tensor(torch.stack([labels[i] for i in ix]), dtype=torch.int64)
  return x, y

In [117]:
ee, fee = get_batch2('train')

torch.Size([32])


  y = torch.tensor(torch.stack([labels[i] for i in ix]), dtype=torch.int64)


In [118]:
ee.shape

torch.Size([32, 500, 1])

In [119]:
fee.shape

torch.Size([32])

In [120]:
fee.dtype

torch.int64

In [121]:
ee.dtype

torch.float32

In [122]:
y_test.dtype

torch.int64

In [105]:
fee

tensor([0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1.,
        0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1.])