In [2]:
import torch
from torch import nn
from torch.nn import functional as F
from tqdm import tqdm

# Seed PyTorch's global random number generator so runs are repeatable.
torch.manual_seed(1234)
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu' 

#### In this project, we will attempt to build a character-level GPT language model which learns to add two non-negative integers, i.e. given the input string "a+b=c", the model will be trained to predict the next character following a sliding context window.

#### This is a simple next-character prediction task. We will attempt two different versions of this task: 1) the digits of "c" are predicted left-to-right; 2) the digits of "c" are predicted right-to-left (i.e. backward), which is typically how humans compute additions.


In [67]:
# Token vocabulary for the addition task.
# Two special tokens are included: '<*>' marks the beginning or end of a
# sequence, and '<PAD>' is used to pre-pad sequences to a fixed length.
# NOTE(review): '10' is a two-character token; the problem strings built from
# single characters never seem to contain it -- confirm it is intentional
# before removing, since dropping it would renumber every token id.
vocab = sorted([
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
    '+', '=', '<*>', '<PAD>',
])
vocab_size = len(vocab)
print(f"Vocabulary: {vocab}")
print(f"vocab_size = {vocab_size}")

# Lookup tables: token id -> token string, and token string -> token id.
itoc = dict(enumerate(vocab))
ctoi = {token: index for index, token in itoc.items()}


def encode(s):
    """Map each element of `s` (a string or a list of tokens) to its integer id."""
    return [ctoi[c] for c in s]


def decode(s):
    """Map each integer id in `s` back to its token; returns a list of token strings."""
    return [itoc[ix] for ix in s]


print(ctoi)

Vocabulary: ['+', '0', '1', '10', '2', '3', '4', '5', '6', '7', '8', '9', '<*>', '<PAD>', '=']
vocab_size = 15
{'+': 0, '0': 1, '1': 2, '10': 3, '2': 4, '3': 5, '4': 6, '5': 7, '6': 8, '7': 9, '8': 10, '9': 11, '<*>': 12, '<PAD>': 13, '=': 14}


#### Now let's implement the data loader which generates a batch of input-target pairs.

In [105]:
import random
# Seed Python's `random` module, which generate_problem uses to draw operands.
random.seed(1223)

max_digits = 5 # max number of digits for input integers 'a' and 'b'
batch_size = 1 # NOTE(review): not referenced in the cells shown here -- presumably the number of problems per batch; confirm
block_size = 15 # size of context window

# generates sliding-window (context, target) pairs for a single problem string "a+b=c"
def generate_problem(max_digits, block_size, backward=False):
    """Build one random addition problem and enumerate its training windows.

    Two integers ``a`` and ``b`` are drawn uniformly from
    ``[0, 10**max_digits - 1]`` using the global ``random`` module. The token
    sequence ``<*> a + b = c <*>`` is pre-padded with ``<PAD>`` to ``max_size``
    tokens and cut into overlapping windows of ``block_size`` tokens; each
    target is its context shifted one position to the right.

    Args:
        max_digits: maximum number of digits of the operands ``a`` and ``b``.
        block_size: length of each context/target window.
        backward: if True, the digits of ``c`` are emitted in reverse order
            (least-significant digit first).

    Returns:
        A tuple ``(max_size, x, y)``. ``max_size`` is the padded sequence
        length. ``x`` and ``y`` are currently ``None`` because tokenization
        and tensor creation are still disabled (see the TODO below).
    """
    # Longest possible sequence: '<*>' + a + '+' + b + '=' + c + '<*>', where
    # a and b have at most max_digits digits and c at most max_digits + 1.
    # The padded length must never be shorter than this, otherwise the padding
    # count goes negative and the trailing tokens are dropped from the windows.
    longest_problem = 3*max_digits + 5
    max_size = max(longest_problem, 2*block_size + 1)

    # randomly generate two integers
    a = random.randint(0, 10**max_digits - 1)
    b = random.randint(0, 10**max_digits - 1)
    c = a + b

    prompt = list(f"{a}+{b}=")
    answer = list(f"{c}")

    print(answer)
    if backward:
        # reverse the digits of "c"; slicing keeps `answer` a list so it can be
        # concatenated below (reversed() would return a one-shot iterator)
        answer = answer[::-1]

    print(f"prompt: {prompt}")
    print(f"answer: {answer}")

    # enclose with special token
    prompt = ['<*>'] + prompt
    answer = answer + ['<*>']
    problem = prompt + answer
    tot_len = len(problem)

    # pre-pad the problem string to make it max_size long
    problem = ['<PAD>']*(max_size - tot_len) + problem

    print(f"padded problem: {problem}")

    contexts = []
    targets = []
    for i in range(max_size - block_size):
        context = problem[i:i+block_size]
        target = problem[i+1:i+block_size+1]

        contexts.append(context)
        targets.append(target)
        print(f"context: {context} -- > target: {target}")

    # TODO: tokenize `contexts` / `targets` (e.g. with ctoi) and turn them into
    # tensors on `device`; until then the batch tensors are placeholders.
    x, y = None, None

    return max_size, x, y


In [107]:
# Generate one sample problem with the default forward digit order; the
# function prints the prompt, answer, padded sequence and every window.
max_size, x, y = generate_problem(max_digits, block_size)

['1', '2', '4', '7', '0', '2']
prompt: ['5', '7', '4', '5', '7', '+', '6', '7', '2', '4', '5', '=']
answer: ['1', '2', '4', '7', '0', '2']
padded problem: ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<*>', '5', '7', '4', '5', '7', '+', '6', '7', '2', '4', '5', '=', '1', '2', '4', '7', '0', '2', '<*>']
context: ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<*>', '5', '7', '4'] -- > target: ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<*>', '5', '7', '4', '5']
context: ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<*>', '5', '7', '4', '5'] -- > target: ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<*>', '5', '7', '4', '5', '7']
context: ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<*>', '5', '7', '4', '5', '7'] --