In [1]:
# let's now encode the entire text dataset and store it into a torch.Tensor
import torch # we use PyTorch: https://pytorch.org
import torch.nn as nn
from torch.nn import functional as F
from google.colab import drive

In [2]:
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# read it in to inspect it
with open('drive/MyDrive/Colab Notebooks/Dataset/scripts_python_light.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  5479867


In [5]:
# let's look at the first 1000 characters
print(text[:1000])

#!/usr/bin/env python
# -*- coding: utf-8 -*-

""" Provides ``ManifestedStaticURLGenerator``, an implementation of
  :py:class:`~weblayer.interfaces.IStaticURLGenerator` that uses an `Assetgen`_
  manifest file to generate static urls for use in templates.
  
  _`Assetgen`: http://pypi.python.org/pypi/assetgen
  
"""

__all__ = [
    'ManifestedStaticURLGenerator'
]

from itertools import cycle

from zope.component import adapts
from zope.interface import implements

from weblayer.interfaces import IRequest, ISettings, IStaticURLGenerator
from weblayer.settings import require_setting

require_setting('static_url_prefix', default=u'/static/')
require_setting('assetgen_manifest')

class ManifestedStaticURLGenerator(object):
    """ Adapter to generate static URLs using an `Assetgen`_ manifest file.
      
      _`Assetgen`: http://pypi.python.org/pypi/assetgen
      
    """
    
    adapts(IRequest, ISettings)
    implements(IStaticURLGenerator)
    
    def __init__(self, request, sett

In [6]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~åç“”入写名处失如密志户接新日果理用码败连重
120


In [7]:
# create a mapping from characters to integers
string_to_interger = { ch:i for i,ch in enumerate(chars) }
interger_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_interger[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([interger_to_string[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("hii there"))
print(decode(encode("hii there")))

[74, 75, 75, 2, 86, 74, 71, 84, 71]
hii there


In [8]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # the 1000 characters we looked at earier will to the GPT look like this

torch.Size([5479867]) torch.int64
tensor([ 5,  3, 17, 87, 85, 84, 17, 68, 75, 80, 17, 71, 80, 88,  2, 82, 91, 86,
        74, 81, 80,  1,  5,  2, 15, 12, 15,  2, 69, 81, 70, 75, 80, 73, 28,  2,
        87, 86, 72, 15, 26,  2, 15, 12, 15,  1,  1,  4,  4,  4,  2, 50, 84, 81,
        88, 75, 70, 71, 85,  2, 66, 66, 47, 67, 80, 75, 72, 71, 85, 86, 71, 70,
        53, 86, 67, 86, 75, 69, 55, 52, 46, 41, 71, 80, 71, 84, 67, 86, 81, 84,
        66, 66, 14,  2, 67, 80,  2, 75, 79, 82, 78, 71, 79, 71, 80, 86, 67, 86,
        75, 81, 80,  2, 81, 72,  1,  2,  2, 28, 82, 91, 28, 69, 78, 67, 85, 85,
        28, 66, 96, 89, 71, 68, 78, 67, 91, 71, 84, 16, 75, 80, 86, 71, 84, 72,
        67, 69, 71, 85, 16, 43, 53, 86, 67, 86, 75, 69, 55, 52, 46, 41, 71, 80,
        71, 84, 67, 86, 81, 84, 66,  2, 86, 74, 67, 86,  2, 87, 85, 71, 85,  2,
        67, 80,  2, 66, 35, 85, 85, 71, 86, 73, 71, 80, 66, 65,  1,  2,  2, 79,
        67, 80, 75, 72, 71, 85, 86,  2, 72, 75, 78, 71,  2, 86, 81,  2, 73, 71,
      

In [9]:
# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [10]:
block_size = 8
train_data[:block_size+1]

tensor([ 5,  3, 17, 87, 85, 84, 17, 68, 75])

In [11]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([5]) the target: 3
when input is tensor([5, 3]) the target: 17
when input is tensor([ 5,  3, 17]) the target: 87
when input is tensor([ 5,  3, 17, 87]) the target: 85
when input is tensor([ 5,  3, 17, 87, 85]) the target: 84
when input is tensor([ 5,  3, 17, 87, 85, 84]) the target: 17
when input is tensor([ 5,  3, 17, 87, 85, 84, 17]) the target: 68
when input is tensor([ 5,  3, 17, 87, 85, 84, 17, 68]) the target: 75


In [12]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[ 2,  2,  2,  2,  2, 85, 71, 78],
        [47,  2,  2,  7, 70, 16,  7, 79],
        [75, 72, 72, 71, 84, 71, 80, 86],
        [85, 81, 80, 14,  2,  9, 85, 86]])
targets:
torch.Size([4, 8])
tensor([[ 2,  2,  2,  2, 85, 71, 78, 72],
        [ 2,  2,  7, 70, 16,  7, 79, 16],
        [72, 72, 71, 84, 71, 80, 86,  2],
        [81, 80, 14,  2,  9, 85, 86, 67]])
----
when input is [2] the target: 2
when input is [2, 2] the target: 2
when input is [2, 2, 2] the target: 2
when input is [2, 2, 2, 2] the target: 2
when input is [2, 2, 2, 2, 2] the target: 85
when input is [2, 2, 2, 2, 2, 85] the target: 71
when input is [2, 2, 2, 2, 2, 85, 71] the target: 78
when input is [2, 2, 2, 2, 2, 85, 71, 78] the target: 72
when input is [47] the target: 2
when input is [47, 2] the target: 2
when input is [47, 2, 2] the target: 7
when input is [47, 2, 2, 7] the target: 70
when input is [47, 2, 2, 7, 70] the target: 16
when input is [47, 2, 2, 7, 70, 16] the target: 7
when

In [13]:
print(xb) # our input to the transformer

tensor([[ 2,  2,  2,  2,  2, 85, 71, 78],
        [47,  2,  2,  7, 70, 16,  7, 79],
        [75, 72, 72, 71, 84, 71, 80, 86],
        [85, 81, 80, 14,  2,  9, 85, 86]])


In [14]:
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


torch.Size([32, 120])
tensor(5.5387, grad_fn=<NllLossBackward0>)
	Sq3r}+Qy q=1失4m}}]6pX处a6\G 5`v,P6%};志h &+U志X	f^C}	户Y!x<K$lziEu4f“jv;*^IZ+PYMi处DCWeU4%D%(Swq4y志
/-4"#


In [15]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [16]:
batch_size = 32
for steps in range(1000): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())


4.314149856567383


In [17]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))

	n名
2PcM -3处R8tLKfsItkI`}b_,*D1r[码用l']连;nM
失\B3R日kwoPçS73“}COr~名ey重q<用接])":s2B密xr'MQCO]q3b\X志x O1J_lySk@处/XG#处处连户i,Tpa
qyotpCSi4STrbt<uF A0t_to3l869:c![入新YX`用qZ户/`z&tTO0_<新0,K$@入r1“Jtp=\`N}W
 N重$重mE"}TIM<meK;M q[1名Z:XçZ”6q.^U61”;$码接)H;[/ca k新	-qQkB0P0ç密新)u*理d>pc处D=`:+3+<}]Cm3处7+kY-)Y志%
d("mc[4B0T用重s^aN'{2çQyxB+果/{理st~6ienn败F入qAPU4Oke)M=js8Qye unTu|日aacrmRE*i(dt| 
	P6(果U户Ys重处mo6$“c.R^MRE e *理B'入日yå#日W3
hv(9U+/F入i入$用.~重T!$n名SLS"	1Be写= ]{果çZ2Fo!k=Su&B;e>)”åDNy nz5Gws
+cR9!	\hvf""nd-\58Dç^写8)fC&BIV-B


In [18]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [20]:
# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 128 # what is the maximum context length for predictions?
max_iters = 10000
eval_interval = 500
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------


with open('drive/MyDrive/Colab Notebooks/Dataset/scripts_python_light.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


0.222968 M parameters
step 0: train loss 4.8419, val loss 4.8367
step 500: train loss 2.3821, val loss 2.4278
step 1000: train loss 2.0470, val loss 2.1371
step 1500: train loss 1.8107, val loss 1.9681
step 2000: train loss 1.6524, val loss 1.8670
step 2500: train loss 1.5848, val loss 1.8108
step 3000: train loss 1.5206, val loss 1.7642
step 3500: train loss 1.4705, val loss 1.7229
step 4000: train loss 1.4253, val loss 1.6979
step 4500: train loss 1.3979, val loss 1.6803
step 5000: train loss 1.3704, val loss 1.6536
step 5500: train loss 1.3528, val loss 1.6427
step 6000: train loss 1.3272, val loss 1.6454
step 6500: train loss 1.3250, val loss 1.6332
step 7000: train loss 1.3122, val loss 1.6352
step 7500: train loss 1.2941, val loss 1.6319
step 8000: train loss 1.2858, val loss 1.5908
step 8500: train loss 1.2648, val loss 1.6053
step 9000: train loss 1.2497, val loss 1.5994
step 9500: train loss 1.2618, val loss 1.5853
step 9999: train loss 1.2541, val loss 1.5822
	a)
        self