In [1]:
import torch
import torch.nn as nn
import numpy as np
import string
import json

In [2]:
def get_vocab():
    # Construct the character 'vocabulary'
    # we allow lowercase, uppercase and digits only, along with special characters:
    # "" - Empty string used to denote elements for the RNN to ignore
    # "<bos>" - Beginning of sequence token for the input the the RNN
    # "." - End of sequence token
    vocab = ["", "<bos>", "."] + list(string.ascii_lowercase + string.ascii_uppercase + string.digits + " ")
    id_to_char = {i: v for i, v in enumerate(vocab)} # maps from ids to characters
    char_to_id = {v: i for i, v in enumerate(vocab)} # maps from characters to ids
    return vocab, id_to_char, char_to_id

def load_data(filename):
    # read in the list of names
    data = json.load(open(filename, "r"))
    # append the end of sequence token to each name
    data = [v+'.' for v in data]
    return data

In [3]:
data = load_data("names_small.json")

# get the letter 'vocabulary'
vocab, id_to_char, char_to_id = get_vocab()
vocab_size = len(vocab)

In [4]:
print([char_to_id[c] for c in data[11][:20]]+5*[0])
print(data[11])

[47, 22, 7, 24, 7, 16, 65, 47, 10, 3, 16, 7, 65, 41, 5, 32, 17, 16, 3, 14, 0, 0, 0, 0, 0]
Steven Shane McDonald.


In [5]:
def seqs_to_ids(seqs, char_to_id, max_len=20):
    """Takes a list of names and turns them into a list of tokens ids.
    Responsible for padding sequences shorter than max_len with 0 so that all sequences are max_len.
    Also truncates names that are longer than max_len.
    Should also skip empty sequences if there are any.

    Args:
        seqs (list(str)): A list of names as strings.
        char_to_id (dict(str : int)): The mapping for characters to token ids
        max_len (int, optional): The maximum length of the ouput sequence. Defaults to 20.

    Returns:
        np.array: the names represented using token ids as 2d numpy array, 
            where each row corresponds to a name. The size of the array should be N * max_len
            where N is the number of non-empty sequences input. Padded with zeros if needed.
    """
    all_seqs = []
    # TODO: implement this function to turn a list of names into a 2d padded array of token ids
    for name in seqs:
        truncated = name[:max_len]
        name_sequence = [char_to_id[c] for c in truncated]
        if (len(name_sequence))<max_len:
            name_sequence=name_sequence+(max_len-len(name_sequence))*[0]
        all_seqs.append(name_sequence)    
    return np.array(all_seqs)

In [6]:
Y = seqs_to_ids(data, char_to_id)
X = np.concatenate([np.ones((Y.shape[0], 1)), Y[:, :-1]], axis=1)



In [7]:
train_frac = 0.9
num_train = int(X.shape[0]*train_frac)

Xtrain = torch.tensor(X[:num_train], dtype=torch.long)
Ytrain = torch.tensor(Y[:num_train], dtype=torch.long)
Xval = torch.tensor(X[num_train:], dtype=torch.long)
Yval = torch.tensor(Y[num_train:], dtype=torch.long)

In [28]:
class RNNLM(nn.Module):
    def __init__(self, vocab_size, emb_size = 32, gru_size=32):
        super(RNNLM, self).__init__()

        # store layer sizes
        self.emb_size = emb_size
        self.gru_size = gru_size

        # for embedding characters (ignores those with value 0: the padded values)
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        # GRU layer
        self.gru = nn.GRU(emb_size, gru_size, batch_first=True)
        # linear layer for output
        self.linear = nn.Linear(gru_size, vocab_size)
    
    def forward(self, x, h_last=None):
        """Takes a batch of names/sequences expressed as token ids and passes them through the GRU.
            The output is the predicted (un-normalized) probabilities of 
            the next token for all prefixes of the input sequences.

        Args:
            x (torch.tensor): A 2d tensor of longs giving the token ids for each batch. Shape B * S
                where B is the batch size (any batch size >= 1 is permitted), S is the length of the sequence.
            h_last (torch.tensor, optional): A 2d float tensor of size B * G where B is the batch size and G
                is the dimensionality of the GRU hidden state. The hidden state from the previous step, provide only if 
                generating sequences iteratively. Defaults to None.

        Returns:
            tuple(torch.tensor, torch.tensor): first element of the tuple is the B * S * V where V is the vocabulary size.
                This is the logit output of the RNNLM. The second element is the hidden state of the final step
                of the GRU it should be B * G dimensional.
        """

        # TODO: implement this function which does the forward pass of the RNNLM network
        embedded = self.emb(x)
        output, h = self.gru(embedded,h_last)
        #out = self.linear(output[:,-1,:])
        out = self.linear(output)
        #out = torch.nn.functional.softmax(out,dim=1)
        return out, h[-1]


In [29]:
def train_model(model, Xtrain, Ytrain, Xval, Yval, id_to_char, max_epoch):
    """Train the RNNLM model using the Xtrain and Ytrain examples.
    Uses batch stochastic gradient descent with the Adam optimizer on 
    the mean cross entropy loss. Prints out the validation loss
    after each epoch using calc_val_loss.

    Args:
        model (RNNLM): the RNNLM model.
        Xtrain (torch.tensor): The training data input sequence of size Nt * S. 
            Nt is the number of training examples, S is the sequence length. 
            The sequences always start with the <bos> token id.
            The rest of the sequence is just Ytrain shifted to the right one position.
            The sequence is zero padded.
        Ytrain (torch.tensor): The expected output sequence of size Nt * S. 
            Does not start with the <bos> token.
        Xval (torch.tensor): The validation data input sequence of size Nv * S. 
            Nv is the number of validation examples, S is the sequence length. 
            The sequences always start with the <bos> token id.
            The rest of sequence is just Yval shifted to the right one position.
            The sequence is zero padded.
        Yval (torch.tensor): The expected output sequence for the validation data of size Nv * S. 
            Does not start with the <bos> token. Is zero padded.
        id_to_char (dict(int : str)): A mapping from ids to tokens.
        max_epoch (int): the maximum number of epochs to train for.
    """
    # construct the adam optimizer
    optim = torch.optim.Adam(model.parameters(), lr=0.0001)
    # construct the cross-entropy loss function
    # we want to ignore padding cells with value == 0
    lossfn = nn.CrossEntropyLoss(ignore_index=0)

    # calculate number of batches
    batch_size = 32
    num_batches = int(Xtrain.shape[0] / batch_size)
    average_loss = 0 
    # run the main training loop over many epochs
    for e in range(max_epoch):
        total_loss = 0
        total_chars = 0
        for t in range(0,num_batches-1):
            batch_X = get_batch(t,Xtrain,batch_size)
            batch_Y = get_batch(t,Ytrain,batch_size)
            model.zero_grad()
            out, h = model(batch_X)
            #out= torch.nn.functional.softmax(out,dim=1)
            loss=lossfn(out.permute(0, 2, 1), batch_Y)
            loss.backward()
            optim.step()
        val_loss = calc_val_loss(model, Xval, Yval)
        print(val_loss)
        # TODO: implement the training loop of the RNNLM model
def get_batch(t,data,batch_size):
    return data[t*batch_size:(t+1)*batch_size]        







In [None]:
#     for n in range(num_batches):

#         # calculate batch start end idxs 
#         s = n * batch_size
#         e = (n+1)*batch_size
#         if e > Xval.shape[0]:
#             e = Xval.shape[0]

#         # compute output of model        
#         out,_ = model(Xval[s:e])

#         # compute loss and store
#         loss = lossfn(out.permute(0, 2, 1), Yval[s:e]).detach().cpu().numpy()
#         total_loss += loss

#         char_count = torch.count_nonzero(Yval[s:e].flatten())
#         total_chars += char_count.detach().cpu().numpy()

#     # compute average loss per character
#     total_loss /= total_chars
    
#     # set the model back to training mode in case we need gradients later
#     model.train()


In [30]:
def calc_val_loss(model, Xval, Yval):
    """Calculates the validation loss in average nats per character.

    Args:
        model (RNNLM): the RNNLM model.
        Xval (torch.tensor): The validation data input sequence of size B * S. 
            B is the batch size, S is the sequence length. The sequences always start with the <bos> token id.
            The rest of sequence is just Yval shifted to the right one position.
            The sequence is zero padded.
        Yval (torch.tensor): The expected output sequence for the validation data of size B * S. 
            Does not start with the <bos> token. Is zero padded.

    Returns:
        float: validation loss in average nats per character.
    """

    # use cross entropy loss
    lossfn = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')

    # put the model into eval mode because we don't need gradients
    model.eval()

    # calculate number of batches, we need to be precise this time
    batch_size = 32
    num_batches = int(Xval.shape[0] / batch_size)
    if Xval.shape[0] % batch_size != 0:
        num_batches += 1

    # sum up the total loss
    total_loss = 0
    total_chars = 0
    for n in range(num_batches):

        # calculate batch start end idxs 
        s = n * batch_size
        e = (n+1)*batch_size
        if e > Xval.shape[0]:
            e = Xval.shape[0]

        # compute output of model        
        out,_ = model(Xval[s:e])

        # compute loss and store
        loss = lossfn(out.permute(0, 2, 1), Yval[s:e]).detach().cpu().numpy()
        total_loss += loss

        char_count = torch.count_nonzero(Yval[s:e].flatten())
        total_chars += char_count.detach().cpu().numpy()

    # compute average loss per character
    total_loss /= total_chars
    
    # set the model back to training mode in case we need gradients later
    model.train()

    return total_loss


In [137]:
model = RNNLM(vocab_size)
train_model(model, Xtrain, Ytrain, Xval, Yval, id_to_char, max_epoch=10)

2.824993203126884
2.58618413380023
2.51347909897662
2.475872794491589
2.4512298953839267
2.432051269105194
2.4165737255937305
2.403743439609211
2.3930218885134593
2.383809648055891


In [32]:
import random 
 
def random_pick(some_list, probabilities):
    x = random.uniform(0,1)
    cumulative_probability = 0.0
    for item, item_probability in zip(some_list, probabilities):
        cumulative_probability += item_probability
        if  x < cumulative_probability:
            break
    return item 

In [24]:
random_pick([1,2,3], [0.2,0.75,0.05])

2

In [194]:
def gen_string(model, id_to_char, max_len=20, sample=True):
    """Generate a name using the model. The generation process should finish once
    the end token is seen. We either sample from the model, where the next token is
    chosen randomly according to the categorical probability distribution produced by softmax,
    or we use argmax decoding where the most likely token is chosen at every generation step.

    Args:
        model (RNNLM): The trained RNNLM model.
        id_to_char (dict(int, str)): A mapping from token ids to token strings.
        max_len (int, optional): The maximum length of the output senquence. Defaults to 20.
        sample (bool, optional): If True then generate samples. If False then use argmax decoding. 
            Defaults to True.

    Returns:
        str: The generated name as a string.
    """
    # put the model into eval mode because we don't need gradients
    model.eval()

    # setup the initial input to the network
    # we will use a batch size of one for generation
    h = torch.zeros((1,1,model.gru_size), dtype=torch.float) # h0 is all zeros
    x = torch.ones((1, 1), dtype=torch.long) # x is the <bos> token id which = 1
    out_str = ""
    # generate the sequence step by step
    for i in range(max_len):

        # TODO: implement the generation loop of the RNNLM model
        #       this should generate a name from the model
        #       using either sampling or argmax decoding 
        if sample==True:
            if out_str=="":
                out,h_out = model(x,h)
                probability = torch.nn.functional.softmax(out[0],dim=1)[0] 
                character_random = random_pick(vocab, probability)
                index_char = char_to_id[character_random]
                out_str+=character_random
            
            while len(out_str)<=20:
                predicted_x =torch.tensor(np.array([[char_to_id[out_str[-1]]]]), dtype=torch.long)
                index_char = char_to_id[character_random]
                out,h_out = model(predicted_x,h)
                p = torch.nn.functional.softmax(out[0],dim=1)[0] 
                character_random = random_pick(vocab, p)
                out_str+=character_random
            
        if sample==False:
            if out_str=="":
                out,h_out = model(x,h)
                probability = torch.nn.functional.softmax(out[0],dim=1)[0]
                max_index = np.argmax(list(probability))
                max_char = id_to_char[max_index]
                out_str+=max_char
            
            while len(out_str)<=20:
                #predicted_x = torch.tensor(np.array([[char_to_id[out_str[-1]]]]), dtype=torch.long)
                x = x.fill_(max_index)
                print('x',x)
                out,h_out = model(x,h)
                p = torch.nn.functional.softmax(out[0],dim=1)[0] 
                max_index = np.argmax(list(p))
                max_char = id_to_char[max_index]
                out_str+=max_char

                

    # set the model back to training mode in case we need gradients later
    model.train()

    return out_str


In [230]:
def gen_string(model, id_to_char, max_len=20, sample=True):
    """Generate a name using the model. The generation process should finish once
    the end token is seen. We either sample from the model, where the next token is
    chosen randomly according to the categorical probability distribution produced by softmax,
    or we use argmax decoding where the most likely token is chosen at every generation step.

    Args:
        model (RNNLM): The trained RNNLM model.
        id_to_char (dict(int, str)): A mapping from token ids to token strings.
        max_len (int, optional): The maximum length of the output senquence. Defaults to 20.
        sample (bool, optional): If True then generate samples. If False then use argmax decoding. 
            Defaults to True.

    Returns:
        str: The generated name as a string.
    """
    # put the model into eval mode because we don't need gradients
    model.eval()

    # setup the initial input to the network
    # we will use a batch size of one for generation
    h = torch.zeros((1,1,model.gru_size), dtype=torch.float) # h0 is all zeros
    x = torch.ones((1, 1), dtype=torch.long) # x is the <bos> token id which = 1
    out_str = ""
    # generate the sequence step by step
    for i in range(max_len):
        out,h = model(x,h)
        probability = torch.nn.functional.softmax(out[0],dim=1)[0]
        predicted_X = random_pick(list(id_to_char.keys()), probability)  if sample else np.argmax(list(probability))
        char_predict = id_to_char[predicted_X]
        x = x.fill_(predicted_X)
        if char_predict=='.':
            break
        out_str+=char_predict

    # set the model back to training mode in case we need gradients later
    model.train()

    return out_str


In [231]:
print("Argmax: ", gen_string(model, id_to_char, sample=False))

RuntimeError: Expected hidden size (1, 1, 32), got [1, 32]

In [226]:
print("Random:")
for i in range(10):
    gstr = gen_string(model, id_to_char)
    print(gstr)

Random:
SGIscCd6<bos>RiN95LCpUJh
M D AwickameMepcKKAK
N3wOphulenYy NikeyFl
ImiehulicAFwmpWadAJy
Awkergakaxz
GbexFycGhanilbetyngF
JoFedu61
CaByAthvihaxirx
K2Vt JAFuNacKencKutr
WAs Kmy TNkakiJidoll


In [98]:
out,h_out = model(x,h)
probability = torch.nn.functional.softmax(out[0],dim=1)[0]
max_index = np.argmax(list(prpbability))
max_char = id_to_char[max_index]

In [99]:
out_str=""
out_str+=max_char
index_array = np.array([[max_index]])
x = torch.tensor(index_array, dtype=torch.long)

In [183]:
out,h_out = model(x,h)
probability = torch.nn.functional.softmax(out[0],dim=1)[0] 
x

tensor([[38]])

In [101]:
probability

tensor([0.0046, 0.0039, 0.0274, 0.0398, 0.0073, 0.0166, 0.0031, 0.0658, 0.0263,
        0.0113, 0.1050, 0.0276, 0.0107, 0.0144, 0.0019, 0.0403, 0.0260, 0.1395,
        0.0109, 0.0195, 0.0122, 0.0255, 0.0208, 0.0338, 0.0149, 0.0202, 0.0025,
        0.0201, 0.0157, 0.0030, 0.0017, 0.0030, 0.0029, 0.0050, 0.0044, 0.0048,
        0.0048, 0.0070, 0.0034, 0.0028, 0.0038, 0.0032, 0.0053, 0.0032, 0.0032,
        0.0053, 0.0044, 0.0032, 0.0040, 0.0074, 0.0035, 0.0027, 0.0073, 0.0057,
        0.0048, 0.0121, 0.0076, 0.0073, 0.0087, 0.0082, 0.0061, 0.0050, 0.0102,
        0.0090, 0.0122, 0.0366], grad_fn=<SelectBackward>)

In [103]:
id_to_char[np.argmax(list(probability))]


'o'

In [None]:
prpbability

In [40]:
h = torch.zeros((1,1,model.gru_size), dtype=torch.float) # h0 is all zeros
x = torch.ones((1, 1), dtype=torch.long) # x is the <bos> token id which = 1
out,h_out = model(x,h)

In [45]:
out.shape

torch.Size([1, 1, 66])

In [50]:

prpbability = torch.nn.functional.softmax(out[0],dim=1)[0]

In [51]:
prpbability

tensor([2.3214e-04, 2.0970e-04, 3.6157e-04, 2.1848e-03, 3.9716e-04, 7.0805e-04,
        2.7473e-03, 8.4021e-04, 1.0081e-03, 5.4722e-04, 5.1366e-04, 1.2428e-03,
        5.5267e-04, 6.0660e-04, 6.6937e-04, 4.7224e-04, 1.3742e-03, 1.2534e-03,
        5.5245e-04, 4.6272e-04, 1.6006e-03, 9.1939e-04, 9.7934e-04, 5.6364e-04,
        9.7742e-04, 2.8476e-04, 2.8139e-04, 9.2276e-04, 4.3837e-04, 5.6948e-02,
        6.9449e-02, 5.4001e-02, 7.9029e-02, 3.1267e-02, 1.9711e-02, 3.9416e-02,
        2.3743e-02, 6.0535e-03, 1.3475e-01, 3.6693e-02, 3.2935e-02, 8.9798e-02,
        1.8913e-02, 8.1329e-03, 3.7928e-02, 1.0127e-03, 7.5102e-02, 5.9173e-02,
        5.3103e-02, 2.4374e-03, 1.1766e-02, 1.8786e-02, 5.5004e-04, 7.5721e-03,
        3.5901e-03, 2.9004e-04, 2.8214e-04, 3.1374e-04, 2.3527e-04, 1.9841e-04,
        1.8452e-04, 1.5880e-04, 1.3005e-04, 2.0600e-04, 3.0708e-04, 1.9284e-03],
       grad_fn=<SelectBackward>)

In [53]:
random_pick(vocab, prpbability)

'M'

In [73]:
np.argmax(list(prpbability))
id_to_char[38]

'J'

In [60]:
x = torch.zeros((1, 1), dtype=torch.long)
x

tensor([[0]])

In [65]:
y_array = np.array([[0]])
y = torch.tensor(y_array, dtype=torch.long)

In [66]:
y==x

tensor([[True]])

In [21]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
import random
for epoch in range(1, 60):
    model.fit(x, y, batch_size=128)
    start_index = random.randint(0, len(text) - maxlen - 1)
    generated_text = x[start_index: start_index + maxlen]
    for i in range(400): #generates 400 length string
        preds = model.predict(generated_text)[0]
        next_index = sample(preds, temperature)
        next_char = chars[next_index]
        generated_text += next_char
        generated_text = generated_text[1:]

In [None]:
batch_X = Xtrain[:32]
batch_Y = Ytrain[:32]

In [None]:
out, h = model(batch_X)

In [None]:
import torch.nn 

In [None]:
res = torch.nn.functional.softmax(out,dim=1)

In [229]:
seqs_to_ids(['Bec.', 'Hannah.', 'Siqi.'], char_to_id, max_len=6)

array([[30,  7,  5,  2,  0,  0],
       [36,  3, 16, 16,  3, 10],
       [47, 11, 19, 11,  2,  0]])

In [None]:
def seqs_to_ids(seqs, char_to_id, max_len=20):
    """Takes a list of names and turns them into a list of tokens ids.
    Responsible for padding sequences shorter than max_len with 0 so that all sequences are max_len.
    Also truncates names that are longer than max_len.
    Should also skip empty sequences if there are any.

    Args:
        seqs (list(str)): A list of names as strings.
        char_to_id (dict(str : int)): The mapping for characters to token ids
        max_len (int, optional): The maximum length of the ouput sequence. Defaults to 20.

    Returns:
        np.array: the names represented using token ids as 2d numpy array, 
            where each row corresponds to a name. The size of the array should be N * max_len
            where N is the number of non-empty sequences input. Padded with zeros if needed.
    """
    all_seqs = []
    # TODO: implement this function to turn a list of names into a 2d padded array of token ids
    for name in seqs:
        truncated = name[:max_len]
        
        name_sequence = [char_to_id[c] for c in truncated]
        if (len(name_sequence))<max_len:
            name_sequence=name_sequence+(max_len-len(name_sequence))*[0]
        all_seqs.append(name_sequence)    
    return np.array(all_seqs)