In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


In [3]:
# Hyperparameters
batch_size = 16
block_size = 32
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0


In [4]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-10-24 05:07:12--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2023-10-24 05:07:12 (99.7 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [5]:
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [6]:



# Create a sorted list of unique characters in the text
chars = sorted(list(set(text)))

# Calculate the vocabulary size
vocab_size = len(chars)

# Create a dictionary to map characters to their indices
stoi = {ch: i for i, ch in enumerate(chars)}

# Create a dictionary to map indices to characters
itos = {i: ch for i, ch in enumerate(chars)}

# Encoding the data using the character-to-index mapping
data = [stoi[c] for c in text]


# I've added comments to explain each step and improved variable names for better clarity and readability.

In [7]:
# Calculate the split index for train and validation data
split_ratio = 0.8
split_index = int(split_ratio * len(data))

# Create TensorFlow tensors for train and validation data
train_data_tensor = tf.constant(data[:split_index], dtype=tf.int32)
val_data_tensor = tf.constant(data[split_index:], dtype=tf.int32)

In [8]:
def get_batch(data_tensor, batch_size, block_size):
    # Generate random start indices for batches
    start_indices = tf.random.uniform((batch_size,), 0, len(data_tensor) - block_size, dtype=tf.int64)

    # Create input (x) and target (y) batches
    x_batch = tf.stack([data_tensor[start:start + block_size] for start in start_indices])
    y_batch = tf.stack([data_tensor[start + 1:start + block_size + 1] for start in start_indices])

    return x_batch, y_batch


In [9]:
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_size, heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        self.values = layers.Dense(self.head_dim, use_bias=False)
        self.keys = layers.Dense(self.head_dim, use_bias=False)
        self.queries = layers.Dense(self.head_dim, use_bias=False)
        self.fc_out = layers.Dense(embed_size)

    def call(self, values, keys, query):
        N, seq_length, _ = query.shape
        value_len, key_len = values.shape[1], keys.shape[1]

        # Split embedding into self.head pieces
        values = tf.reshape(values, (N, value_len, self.heads, self.head_dim))
        keys = tf.reshape(keys, (N, key_len, self.heads, self.head_dim))
        queries = tf.reshape(query, (N, seq_length, self.heads, self.head_dim))

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # Scaled dot-product attention
        attention = tf.einsum("nqhd,nkhd->nhqk", queries, keys)
        attention = attention / tf.math.sqrt(float(self.head_dim))
        attention = tf.nn.softmax(attention, axis=-1)

        out = tf.einsum("nhql,nlhd->nqhd", attention, values)
        out = tf.reshape(out, (N, seq_length, self.embed_size))
        out = self.fc_out(out)
        return out

class TransformerBlock(layers.Layer):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadSelfAttention(embed_size, heads)
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)

        self.feed_forward = keras.Sequential(
            [
                layers.Dense(forward_expansion * embed_size, activation="relu"),
                layers.Dense(embed_size),
            ]
        )

        self.dropout = layers.Dropout(dropout)

    def call(self, value, key, query):
        attention = self.attention(value, key, query)
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out



In [10]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class BigramLanguageModel(keras.Model):
    def __init__(self, vocab_size, embed_size, heads, n_layers, max_length, forward_expansion, dropout):
        super(BigramLanguageModel, self).__init__()

        # Embedding layers
        self.embedding = layers.Embedding(vocab_size, embed_size)
        self.positional_embedding = layers.Embedding(max_length, embed_size)

        # Transformer blocks
        self.transformer_blocks = [
            TransformerBlock(embed_size, heads, dropout, forward_expansion)
            for _ in range(n_layers)
        ]

        # Dropout layer
        self.dropout = layers.Dropout(dropout)

        # Output fully connected layer
        self.fc_out = layers.Dense(vocab_size)

    def call(self, x):
        N, seq_length = x.shape

        # Generate positional encodings
        positions = tf.range(start=0, limit=seq_length, delta=1)

        # Embedding and positional encoding
        out = self.embedding(x)
        out += self.positional_embedding(positions)

        # Pass through transformer blocks
        for block in self.transformer_blocks:
            out = block(out, out, out)

        # Apply dropout
        out = self.dropout(out)

        # Output layer
        out = self.fc_out(out)
        return out


In [11]:
# Create the BigramLanguageModel
model = BigramLanguageModel(
    vocab_size=vocab_size,
    embed_size=n_embd,
    heads=n_head,
    n_layers=n_layer,
    max_length=block_size,
    forward_expansion=n_embd * 4,
    dropout=dropout
)

# Create an Adam optimizer
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

# Define the loss function
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# List to store generated text
generated_text = []

# Training loop
for iteration in range(max_iters):
    # Get a batch of data
    x_batch, y_batch = get_batch(train_data_tensor, batch_size, block_size)

    # Calculate the loss and gradients
    with tf.GradientTape() as tape:
        logits = model(x_batch)
        loss = loss_fn(y_batch, logits)
    grads = tape.gradient(loss, model.trainable_variables)

    # Apply gradients using the optimizer
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Print loss at specified intervals
    if iteration % eval_interval == 0:
        print(f"Iteration {iteration}, Loss: {loss.numpy()}")




Iteration 0, Loss: 4.474418640136719
Iteration 100, Loss: 3.346715211868286
Iteration 200, Loss: 3.2920451164245605
Iteration 300, Loss: 3.2201244831085205
Iteration 400, Loss: 3.251173496246338
Iteration 500, Loss: 3.2371301651000977
Iteration 600, Loss: 3.2259039878845215
Iteration 700, Loss: 3.2018816471099854
Iteration 800, Loss: 3.001217842102051
Iteration 900, Loss: 2.764075756072998
Iteration 1000, Loss: 2.6083405017852783
Iteration 1100, Loss: 2.532355546951294
Iteration 1200, Loss: 2.545006275177002
Iteration 1300, Loss: 2.563915967941284
Iteration 1400, Loss: 2.474588394165039
Iteration 1500, Loss: 2.484832525253296
Iteration 1600, Loss: 2.4055845737457275
Iteration 1700, Loss: 2.4342117309570312
Iteration 1800, Loss: 2.3724796772003174
Iteration 1900, Loss: 2.409709930419922
Iteration 2000, Loss: 2.3014073371887207
Iteration 2100, Loss: 2.3063302040100098
Iteration 2200, Loss: 2.370326280593872
Iteration 2300, Loss: 2.1841650009155273
Iteration 2400, Loss: 2.403679370880127


In [12]:
def generate_text(model, start_string, max_generate_length=2000):
    # Convert start_string to tensor
    input_eval = [stoi[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    generated_text = []

    model.reset_states()
    for i in range(max_generate_length):
        logits = model(input_eval)
        # Use a multinomial distribution to predict the token returned by the model
        predicted_id = tf.random.categorical(logits[:, 0, :], num_samples=1)[-1,0].numpy()


        # Append the predicted token to the input string and the generated text
        input_eval = tf.expand_dims([predicted_id], 0)
        generated_text.append(itos[predicted_id])

    return ''.join(generated_text)

start_string = " "  # You can use a space, or any other starting token
print(generate_text(model, start_string))

-sjwssseooedcseeeeeeeeiiiia    iiaa iaa  iaeeesfmpe
aaassss  oeeeeeeeep;cckiiaaa Aoredeeeeeeeeeee ----nedbtcuoenrvhss  a  iaa  A a  aahchscly ie a 
aoe i-dddeeeeoi--iiiiaaa
 -----whsia dmeechhcoeeeee ia
aa a ia;;S,seee'neee
i    itiieee iaaa ia
aaaaeeeeeiiiecooeeeeene aAoeoo aaa i rti------- iCreettteeeeeedesnneee 
 aei -ooa tt  aaoeeeeeeiiii---   
iwsse  a A hsiia Aereieoech''uoeeede  eia:tttbuoee aeeeeeeeeeeeeedsdccue - iieeemhspse a i-hsssdcthsc aeeeiii Ao iaaa aaa eedde munell eei iesmcheeeeeii- A;GO i
 eeereeeeeee aadsssbu i-b i
 
aaaa a idrrpeeeeeedchmbhshp -wh aaeeeeeeeeeeeeedvueiA
a DN i ---aaa aoeeee ii   aa aoeeeeeec iii
 ee a eeeeddeeesvlbeeei---
 -Mysreeee Afschssplpueea  a aa trcu   -iidddbyy'eds
eeeeedeeem edeei---ssvuoeelreeiiA
aaaaaaaa
a  ---------aeeeeeeed'oeeeeedii
  ----EE aeeeeeeeee 
.';   eeeeeeei a A;uuoedeeee aheedssssmcchfues -- hss  ivreeeeeeee aoeeeessbsddeoeeeeeeeemiibeeeeneirpurnnsssvhhss  aa ia eee,hddd
  aoeeessddc aaeeeeeedeeeeedsssssdchhssssiiiei---iwc  