In [135]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import re
device = 'cpu'

print(device)

blocksize = 10
batchsize = 8
max_iters = 20000
learning_rate = 4e-3

with open ('wizard_of_oz.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

cpu


In [136]:
print(len(text))

207805


In [137]:
# chars = sorted(set(text))
split_text = re.split(r'([,.:;?_!"()\']|--|\s)', text)
words = sorted(set(split_text))
vocab_size = len(words)

In [138]:
words_to_int = {word:i for i, word in enumerate(words)}
int_to_words = {i:word for i, word in enumerate(words)}

encode = lambda s:[words_to_int[word] for word in s]
decode = lambda l: " ".join([int_to_words[i] for i in l])

In [139]:
words_to_int

{'': 0,
 '\n': 1,
 ' ': 2,
 '!': 3,
 '&': 4,
 '(': 5,
 ')': 6,
 ',': 7,
 '.': 8,
 '1900': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'After': 14,
 'Afterward': 15,
 'Again': 16,
 'All': 17,
 'Almost': 18,
 'Among': 19,
 'An': 20,
 'And': 21,
 'Andersen': 22,
 'Any': 23,
 'April': 24,
 'Art': 25,
 'As': 26,
 'At': 27,
 'Attacked': 28,
 'Aunt': 29,
 'Away': 30,
 'B': 31,
 'Ball': 32,
 'Balloon': 33,
 'Banks': 34,
 'Baum': 35,
 'Beast': 36,
 'Beasts': 37,
 'Beautiful': 38,
 'Becomes': 39,
 'Before': 40,
 'Behind': 41,
 'Besides': 42,
 'Boq': 43,
 'Both': 44,
 'Brains': 45,
 'But': 46,
 'By': 47,
 'CITY': 48,
 'Can': 49,
 'Cap': 50,
 'Castle': 51,
 'Chapter': 52,
 'Chicago': 53,
 'China': 54,
 'City': 55,
 'Clown': 56,
 'Come': 57,
 'Contents': 58,
 'Council': 59,
 'Country': 60,
 'Cowardly': 61,
 'Crow': 62,
 'Cyclone': 63,
 'DOROTHY': 64,
 'Dainty': 65,
 'Deadly': 66,
 'Discovery': 67,
 'Do': 68,
 'Don’t': 69,
 'Dorothy': 70,
 'Dorothy’s': 71,
 'Down': 72,
 'During': 73,
 'EMERALDS”': 7

In [140]:
data = torch.tensor(encode(split_text), dtype = torch.long)

In [141]:
print(data[:100])

tensor([   0,    1,  256,    2,  321,    2,  319,    2, 2019,    2,  204,    1,
           0,    1,  703,    2,  160,    8,    0,    2,  100,    2,   35,    1,
           0,    1,    0,    1,  267,    2,  606,    2, 1643,    2,  983,    2,
        2910,    2, 1953,    2, 1417,    2, 1333,    2,    4,    2,  853,    1,
         183,    2,  307,    1,  160,    8,   86,    8,   31,    8,    0,    1,
           0,    1,    0,    1,   58,    1,    0,    1,    0,    2,  143,    1,
           0,    2,   52,    2,  134,    8,    0,    2,  256,    2,   63,    1,
           0,    2,   52,    2,  135,    8,    0,    2,  256,    2,   59,    2,
        3165,    2, 2844,    2])


In [142]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split =='train' else val_data
    ix = torch.randint(len(data) - blocksize, (batchsize,))
    x = torch.stack([data[i:i+blocksize] for i in ix])
    y = torch.stack([data[i+1:i+blocksize+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')

print(x)
print(y)

tensor([[   0,    2, 2419,    2, 1646,    1, 3059,    2, 1813,    2],
        [ 134,    2, 1511,    2, 1953,    2, 2027,    7,    0,    2],
        [1590,    2, 2846,    2,  393,    7,    0,    2, 1162,    2],
        [1059,    2,  412,    2, 1791,    2,  425,    2,  461,    2],
        [   7,    0,    2,  375,    2,  638,    7,    0,    2, 2844],
        [3221,    2, 2217,    2, 2910,    2, 1384,    2, 1854,    2],
        [   0,    2,  256,    2, 1728,    2, 2019,    2, 2844,    2],
        [2881,    8,    0,    2,  258,    2, 2844,    2,  165,    2]])
tensor([[   2, 2419,    2, 1646,    1, 3059,    2, 1813,    2, 1342],
        [   2, 1511,    2, 1953,    2, 2027,    7,    0,    2,  412],
        [   2, 2846,    2,  393,    7,    0,    2, 1162,    2, 2844],
        [   2,  412,    2, 1791,    2,  425,    2,  461,    2, 2844],
        [   0,    2,  375,    2,  638,    7,    0,    2, 2844,    2],
        [   2, 2217,    2, 2910,    2, 1384,    2, 1854,    2,  627],
        [   2,  256

In [143]:
class BigramLanguageModule(nn.Module):
    def __init__(self, vocab_size):
        # Call the constructor of the parent class nn.Module
        super().__init__()

        # Initialize an embedding layer that maps each token in the vocabulary to a vector.
        # Here, both the input and output dimensions are set to vocab_size,
        # implying that this embedding layer will directly output logits for each token.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets):
        # Pass the input 'index' through the embedding table to obtain the logits.
        # 'logits' will have a shape of (batch_size, sequence_length, vocab_size).
        logits = self.token_embedding_table(index)
        
        # If 'targets' are not provided, set loss to None.
        if targets is None:
            loss = None
        else:
            # Get the batch size, sequence length, and vocabulary size from the logits shape
            Batch, Time, Channels = logits.shape

            # Flatten the logits and targets to calculate the cross-entropy loss.
            # This reshaping makes them compatible with the loss function.
            logits = logits.view(Batch * Time, Channels)
            targets = targets.view(Batch * Time)

            # Compute the cross-entropy loss between the logits and the targets.
            # This loss measures how well the predicted logits match the actual targets.
            loss = F.cross_entropy(logits, targets)

        # Return the logits and the loss (if calculated).
        return logits, loss

    def generate(self, index, max_new_tokens):
        # Loop to generate a specified number of new tokens.
        for _ in range(max_new_tokens):
            # Call the forward method to get the logits for the current input 'index'.
            logits, _ = self.forward(index, targets=None)

            # Take the logits of the last time step (i.e., the most recent token in the sequence).
            # The shape of logits here is (batch_size, vocab_size).
            logits = logits[:, -1, :]

            # Apply softmax to convert logits to probabilities.
            # 'probs' will have the shape (batch_size, vocab_size) and represent
            # the probability distribution over the vocabulary for the next token.
            probs = F.softmax(logits, dim=-1)

            # Sample the next token from the probability distribution using multinomial sampling.
            # This sampling method selects the next token based on the probabilities.
            index_next = torch.multinomial(probs, num_samples=1)

            # Concatenate the newly sampled token index to the current sequence 'index'.
            # This extends the sequence by one token for each batch.
            index = torch.cat((index, index_next), dim=-1)

        # Return the extended sequence with the new tokens.
        return index



In [144]:
model  = BigramLanguageModule(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype = torch.long, device = 'cpu')
generated_chars = decode(m.generate(context, max_new_tokens = 50)[0].tolist())
print(generated_chars)

 babies bend comrade remarked deeds happily like following soundly bridges buckles pains add beginning mane cowardly sweep discomfort I’ll danger—that brocaded passed “Truly plague hand “No hundred “True indignantly “What size pricked within clinging wider hearty solely boots carefully audience “Be tip cheerful away “come “‘That’s settee exactly squirrels drink


In [145]:
context

tensor([[0]])

In [146]:
# Initialize the optimizer using the AdamW algorithm.
# This optimizer updates the model's parameters using the gradients computed during backpropagation.
# model.parameters() specifies the parameters to be optimized, and lr sets the learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Start the training loop. This loop will run for 'max_iters' iterations.
# max_iters is a variable that should be defined elsewhere in the code.
for iter in range(max_iters):

    # Retrieve a batch of training data.
    # 'xb' represents the input features, and 'yb' represents the corresponding target labels.
    # The function 'get_batch' is used to fetch a batch of data from the training set.
    xb, yb = get_batch('train')

    # Perform a forward pass through the model using the current batch of input data 'xb'.
    # The model computes the 'logits' (predictions) and the 'loss' based on the difference
    # between the predicted logits and the actual targets 'yb'.
    logits, loss = model.forward(xb, yb)

    # Zero the gradients of the model's parameters.
    # This step clears the gradients from the previous iteration to prevent accumulation.
    # set_to_none=True is used for efficiency in clearing the gradients.
    optimizer.zero_grad(set_to_none=True)

    # Perform backpropagation to compute the gradients of the loss with respect to
    # the model's parameters. These gradients are used to update the model's weights.
    loss.backward()

    # Update the model's parameters using the optimizer. The optimizer uses the gradients
    # computed in the 'loss.backward()' step to adjust the parameters in the direction
    # that minimizes the loss.
    optimizer.step()

    # Print the current loss value to monitor the training progress.
    # loss.item() converts the loss tensor to a scalar value, making it easier to read.
    print(loss.item())


8.665544509887695
8.615714073181152
8.577569007873535
8.514060020446777
8.664533615112305
8.7116060256958
8.566559791564941
8.642803192138672
8.60557746887207
8.347338676452637
8.61160945892334
8.655500411987305
8.419084548950195
8.662138938903809
8.549721717834473
8.572896957397461
8.496797561645508
8.745234489440918
8.639669418334961
8.472150802612305
8.478734016418457
8.522078514099121
8.775167465209961
8.51718521118164
8.36693286895752
8.5241060256958
8.435361862182617
8.531637191772461
8.592203140258789
8.648248672485352
8.587828636169434
8.509467124938965
8.630449295043945
8.33480167388916
8.48690414428711
8.536181449890137
8.516043663024902
8.434967994689941
8.480988502502441
8.554608345031738
8.187355041503906
8.422029495239258
8.543534278869629
8.615339279174805
8.480912208557129
8.653196334838867
8.754376411437988
8.3389253616333
8.42897891998291
8.398966789245605
8.416764259338379
8.431077003479004
8.583803176879883
8.609125137329102
8.431195259094238
8.473126411437988
8.420

In [147]:
context = torch.zeros((1,1), dtype = torch.long, device = device)
generated_chars = decode(m.generate(context , max_new_tokens = 50)[0].tolist())

In [148]:
print(generated_chars)

 
 dented marvelous “Here jaw although   breakfast ,  
 “Perhaps Frank presence served dew steeples wardrobe patiently threatened kak-ke relief Munchkins   and   so   he   in   by   were   was   could   The   little   of   your   are .


In [149]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [150]:
context = torch.zeros((1,1), dtype = torch.long, device = device)
generated_chars = decode(m.generate(context , max_new_tokens = 50)[0].tolist())

In [151]:
print(generated_chars)

   see 
 companions halves well-grown fine sewed burned As   streets sack places nodded myths Scattered get   is   the   But   only   was   breast bade   I   turn red husky Growing wondering feet ,    Lion . ” 
 peace Banks pleases
