In [6]:
import numpy as np

In [11]:
# standard rnn

# hyperparameters
hidden_size = 100 # number of hidden units
vocab_size = 5000 # number of unique words in training data

# initialize weights
np.random.seed(42)
W_hh = np.random.randn(hidden_size, hidden_size) * 0.01  # Hidden to hidden
W_hx = np.random.randn(hidden_size, vocab_size) * 0.01  # Input to hidden
W_s = np.random.randn(vocab_size, hidden_size) * 0.01  # Hidden to output

b_h = np.zeros((hidden_size, 1))  # Bias for hidden layer
b_s = np.zeros((vocab_size, 1))  # Bias for output layer

corpus = "Napoleon was the Emperor of France"

# tokenization - split corpus into words
tokens = corpus.lower().split()

# create vocab
vocab = list(set(tokens))
vocab_size = len(vocab)

# word to index
word_to_index = {w: idx for idx, w in enumerate(vocab)}
index_to_word = {idx: w for idx, w in enumerate(vocab)}

# word -> one-hot vectors
def word_to_one_hot(word):
    one_hot_vector = np.zeros(vocab_size)
    one_hot_vector[word_to_index[word]] = 1
    return one_hot_vector

def sentence_to_one_hot_vectors(sentence):
    # sentence into tokens
    tokens = sentence.lower().split()
    # convert each word in sentence to a one-hot vector
    one_hot_vectors = [ word_to_one_hot(word) for word in tokens]
    return one_hot_vectors

one_hot_vectors = sentence_to_one_hot_vectors(corpus)
one_hot_vectors

[array([0., 0., 0., 0., 1., 0.]),
 array([0., 0., 0., 0., 0., 1.]),
 array([1., 0., 0., 0., 0., 0.]),
 array([0., 0., 1., 0., 0., 0.]),
 array([0., 0., 0., 1., 0., 0.]),
 array([0., 1., 0., 0., 0., 0.])]

In [46]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)
logits = np.array([2.0, 1.0, 0.1])
probs = softmax(logits)
# print(probs)


def forward_pass(inputs, h_prev, W_hh, W_hx, W_s, b_h, b_s):
    """
    inputs: List of one-hot encoded words (as numpy arrays)
    h_prev: Previous hidden state
    W_hh: Hidden to hidden weight matrix
    W_hx: Input to hidden weight matrix
    W_s: Hidden to output weight matrix
    b_h: Hidden layer bias
    b_s: Output layer bias
    """
    outputs, h_states = [], []

    # empty list to store the outputs and hidden states at each time step
    for x in inputs:
        # calc new hidden state for current time step
        print(W_hx.shape, x.shape, W_hh.shape, h_prev.shape)
        h = np.tanh(np.dot(W_hx, x) + np.dot(W_hh, h_prev) + b_h)

        # calc unormalized log prob for next words
        y = np.dot(W_s, h) + b_s

        # apply softmax to get probs
        p = softmax(y)

        outputs.append(p)
        h_states.append(h)

        # Update hidden state to current state
        h_prev = h
    return outputs, h_states, h_prev
# Example parameters
vocab_size, hidden_size = 3, 2
W_hh = np.random.randn(hidden_size, hidden_size)
W_hx = np.random.randn(hidden_size, vocab_size)
W_s = np.random.randn(vocab_size, hidden_size)
b_h = np.zeros((hidden_size, 1))
b_s = np.zeros((vocab_size, 1))

# Initial hidden state
h_prev = np.zeros((hidden_size, 1))

# Example one-hot encoded input for the word "cat" in a 3-word vocabulary
x = np.array([[1], [0], [0]])

# Forward pass
# y_logits, h_next = forward_pass(x, h_prev, W_hh, W_hx, W_s, b_h, b_s)
# print("Output logits:", y_logits)

def predict_output(probabilities):
    # predicted word index with the one with max probability
    return np.argmax(probabilities, axis = 0)
print(predict_output(probs))
def cross_entropy_loss(y_true, y_pred):
    """
    y_true: The one-hot encoded vector of the true next word
    y_pred: The predicted probability distribution for the next word
    """
    # Multiply the true distribution with the log of predicted, sum it up, and negate the value
    loss = -np.sum(y_true * np.log(y_pred))
    return loss

y_true = np.array([0, 1, 0])  
loss = cross_entropy_loss(y_true, probs)
print("Cross-entropy loss:", loss)

0
Cross-entropy loss: 1.4170300162778335


In [13]:
preds = []
loss = 0

for true_word, output_prob in zip(true_words, outputs):
    pred_word_index = predict_output(output_prob)
    predictions.append(pred_word_index)

    # Calculate and accumulate the loss
    loss += cross_entropy_loss(true_word, output_prob)

# To get average loss per time step
average_loss = loss / len(true_words)

NameError: name 'true_words' is not defined

min_char_rnn

In [5]:
# Initialize weight matrices U, V, W from random distribution and bias b, c with zeros

# data loading
data = open('input.txt', 'r').read()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars) # number of unique charactors
print("data has %d characters, %d unique." % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
# print(char_to_ix)
ix_to_char = { i:ch for i,ch in enumerate(chars) }
# print(ix_to_char)

# hyperparams
hidden_size = 100
seq_length = 25
learning_rate = 1e-1

# model params
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input -> hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden -> hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01  # hidden -> output

bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

# training

n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0

    

['o', ' ', 'r', 'e', 'h', 't', 'l']
data has 11 characters, 7 unique.
{'o': 0, ' ': 1, 'r': 2, 'e': 3, 'h': 4, 't': 5, 'l': 6}
{0: 'o', 1: ' ', 2: 'r', 3: 'e', 4: 'h', 5: 't', 6: 'l'}
