## Import the Datasets and the libraries

### Import the datasets

In [13]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random 

%matplotlib inline

SEED = 2147483647

# read all the names in a python list
with open(file="datasets/names.txt", mode="r") as namesTxt: 
    names = [line[:-1] for line in namesTxt.readlines()]

# shuffle the order of the names
random.seed(42)
random.shuffle(names)
n1 = int(0.8 * len(names))
n2 = int(0.9 * len(names))


### Build the datasets

In [22]:
# create the encoder for the letters
chars = ["."] + sorted(list(set("".join(names)))) 
stoi = {c:i for i,c in enumerate(chars)}
itos = {i:c for i,c in enumerate(chars)}
vocab_size = len(stoi)

# define the block size of the model
block_size = 3 # context length: how many characters to we consider to predict the next one

# helper function to encode the datasets
def buildDatasets(words):
    X, Y = [], []
    
    for word in words:
        context = [0] * block_size
        for ch in word + ".":
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    
    return (torch.tensor(X),torch.tensor(Y))

# build the datasets
xtrain, ytrain = buildDatasets(names[:n1]) # 80%
xtest, ytest = buildDatasets(names[n1:n2]) # 10%
xdev, ydev = buildDatasets(names[n2:]) # 10 %

In [24]:
print(f"size of xtrain = {xtrain.shape}")
print(f"size of xtest = {xtest.shape}")
print(f"size of xdev = {xdev.shape}")

size of xtrain = torch.Size([182624, 3])
size of xtest = torch.Size([22655, 3])
size of xdev = torch.Size([22866, 3])
