<a href="https://colab.research.google.com/github/woodRock/grokking-deep-learning/blob/main/chapter_12_neural_networks__that_write_like_shakespeare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 12 | Neural networks that write like Shakespeare

In [1]:
# Download reviews.txt and labels.txt from here: https://github.com/udacity/deep-learning/tree/master/sentiment-network

def pretty_print_review_and_label(i):
   print(labels[i] + "\t:\t" + reviews[i][:80] + "...")

g = open('reviews.txt','r') # What we know!
reviews = list(map(lambda x:x[:-1],g.readlines()))
g.close()

g = open('labels.txt','r') # What we WANT to know!
labels = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()


# Preprocess dataset:

import sys

f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('labels.txt')
raw_labels = f.readlines()
f.close()

# Take only the first 1,000 reviews
raw_reviews = raw_reviews[:1000]
raw_labels = raw_labels[:1000]

tokens = list(map(lambda x:set(x.split(" ")),raw_reviews))

vocab = set()
for sent in tokens:
    for word in sent:
        if(len(word)>0):
            vocab.add(word)
vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))

target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)

In [31]:
import random
import math
from collections import Counter
import numpy as np

# Freeze the seed for reprodicablity.
np.random.seed(1)

# Activation functions
def sigmoid(x):
    return 1/(1 + np.exp(-x))

def sigmoid2deriv(output):
    return output * (1 - output)

def tanh(x):
    return np.tanh(x)

def tanh2deriv(output):
    return 1 - (output ** 2)

def softmax(x):
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

# Hyperparameters
epochs = 100
batch_size = 100
alpha = 2,
hidden_size = 400
input_dim = len(vocab)
output_dim = 2

# Convert the text to vector embeddings
vocab_size = len(vocab)
one_hot_labels = np.zeros((len(input_dataset),vocab_size), dtype=np.int32)
for i, sentence in enumerate(input_dataset):
    for word_idx in sentence:
        one_hot_labels[i][word_idx] = 1
text = one_hot_labels

# Convert the class labels to one hot enocding
one_hot_labels = np.zeros((len(target_dataset), output_dim), dtype=np.int32)
for i, l in enumerate(target_dataset):
    one_hot_labels[i][l] = 1
labels = one_hot_labels

# Initialize the neural network
weights_0_1 = 0.02 * np.random.random((len(vocab),hidden_size)) - 0.01
weights_1_2 =  0.2 * np.random.random((hidden_size,output_dim)) -  0.1

def similar(target='beautiful'):
    target_index = word2index[target]

    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

# Activation function
def sigmoid(x):
    return 1/(1 + np.exp(-x))

def sigmoid2deriv(output):
    return output * (1 - output)

# Training loop
for j in range(epochs):
    correct_cnt = 0
    for i in range(int(len(text) / batch_size)):
        batch_start, batch_end = ((i * batch_size), ((i+1) * batch_size))
        input, target = text[batch_start:batch_end], labels[batch_start:batch_end]

        # Forward pass
        layer_0 = input
        layer_1 = tanh(np.dot(layer_0, weights_0_1))
        dropout_mask = np.random.randint(2, size=layer_1.shape)
        layer_1 *= dropout_mask * 2
        layer_2 = softmax(np.dot(layer_1, weights_1_2))
        prediction = layer_2

        for k in range(batch_size):
            pred_label = prediction[k:k+1]
            true_label = labels[batch_start+k:batch_start+k+1]
            correct_cnt += int(np.argmax(pred_label) == np.argmax(true_label))

        # Back propagation
        layer_2_delta = (target - prediction) / (batch_size * layer_2.shape[0])
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * tanh2deriv(layer_1)
        layer_1_delta *= dropout_mask

        # Update the weights.
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    # Every 10 reviews
    if (j % 10 == 0 or j == epochs - 1):
        progress = j/float(len(text))
        accuracy = correct_cnt / float(len(labels))
        print(f"Epoch: {j} \t Progress: {progress:.4f} \t accuracy: {accuracy:.4f} \t similar: {similar('terrible')}")

print(similar('terrible'))

Epoch: 0 	 Progress: 0.0000 	 accuracy: 0.5370 	 similar: [('terrible', -0.0), ('devil', -0.14585044334391642), ('bewitched', -0.14750447714964798), ('excels', -0.14787151798216422), ('wafer', -0.14843285634264014), ('specified', -0.14878658646887943), ('playboy', -0.14919392139629584), ('proved', -0.14930569558805984), ('standing', -0.14955506388014483), ('antagonists', -0.14991448569655494)]
Epoch: 10 	 Progress: 0.0100 	 accuracy: 0.8930 	 similar: [('terrible', -0.0), ('specified', -0.1496393894006238), ('devil', -0.14971996478357716), ('excels', -0.15045022465034433), ('standing', -0.151404078416331), ('wafer', -0.15174087615117907), ('bewitched', -0.151881552634404), ('proved', -0.15213647413522108), ('firebird', -0.15216852759098515), ('warburton', -0.1523734015192894)]
Epoch: 20 	 Progress: 0.0200 	 accuracy: 0.9220 	 similar: [('terrible', -0.0), ('specified', -0.15273528455672525), ('excels', -0.15465105932684167), ('there', -0.15488043921083336), ('warburton', -0.15539855398

In [32]:
def analogy(positive=["terrible", "good"], negative = ["bad"]):
    norms = np.sum(weights_0_1 * weights_0_1, axis=1)
    norms.resize(norms.shape[0], 1)

    normed_weights = weights_0_1 * norms

    query_vector = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vector += normed_weights[word2index[word]]
    for word in negative:
        query_vector -= normed_weights[word2index[word]]

    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - query_vector
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))

    # Ignore the first word, as it will be the positive query.
    return scores.most_common(10)[1:]

print(f'terrible, good (+) bad (-): {analogy(["terrible", "good"], ["bad"])}')
print(f'elizabeth, he (+) she (-): {analogy(["elizabeth", "he"], ["she"])}')
print(f'king, woman (+) man (-): {analogy(["king", "woman"], ["man"])}')
print(f'elizabeth, he (+) she (-): {analogy(["elizabeth", "he"], ["she"])}')
print(f'man, woman (+) king (-): {analogy(["man", "woman"], ["king"])}')

terrible, good (+) bad (-): [('wizards', -0.10692333205332843), ('admission', -0.1070622760446349), ('subtitles', -0.1077462892312441), ('underneath', -0.10780070899988253), ('reminds', -0.1078994709849344), ('cronies', -0.1080483996658244), ('fudoh', -0.10843258757640356), ('terrifying', -0.10852476507161836), ('ultraviolent', -0.10858590871986315)]
elizabeth, he (+) she (-): [('eurail', -0.10633683960172827), ('survivial', -0.10658863713010631), ('wry', -0.10680218224664625), ('losses', -0.10694106329941419), ('smashes', -0.10701497643476447), ('wizards', -0.10709894589216934), ('topping', -0.10716170296396138), ('problems', -0.10720804063097321), ('tsuyako', -0.10721238916544781)]
king, woman (+) man (-): [('eurail', -0.10639393035597863), ('survivial', -0.10650613768863797), ('smashes', -0.10669524182794579), ('wry', -0.10679103677355847), ('problems', -0.10691569328894408), ('losses', -0.10696007879276626), ('bigger', -0.10716130144318609), ('noise', -0.1071750717249476), ('exitin

In [33]:
import numpy as np
from collections import Counter

norms = np.sum(weights_0_1 * weights_0_1, axis=1)
norms.resize(norms.shape[0],1)
normed_weights = weights_0_1 * norms

def make_sentence_vector(words):
    indices = list(map(lambda x: word2index[x],\
                       filter(lambda x: x in word2index, words)))
    return np.mean(normed_weights[indices], axis=0)

reviews2vectors = list()
for review in tokens:
    sentence_vector = make_sentence_vector(review)
    reviews2vectors.append(sentence_vector)
reviews2vectors = np.array(reviews2vectors)

def most_similar_reviews(review):
    v = make_sentence_vector(review)
    scores = Counter()
    for i, val in enumerate(reviews2vectors.dot(v)):
        scores[i] = val
    most_similar = list()

    for idx, score in scores.most_common(3):
        most_similar.append(raw_reviews[idx][0:40])
    return most_similar

most_similar_reviews(["boring","awful"])

['this piece ain  t really worth a comment',
 'just a few words . . . . this movie real',
 'this movie was terrible  i rented it not']

In [35]:
import numpy as np

a = np.array([1,2,3])
b = np.array([0.1,0.2,0.3])
c = np.array([-1,-0.5,0])
d = np.array([0,0,0])

identity = np.eye(3)
print(identity)

print(a.dot(identity))
print(b.dot(identity))
print(c.dot(identity))
print(d.dot(identity))

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
[1. 2. 3.]
[0.1 0.2 0.3]
[-1.  -0.5  0. ]
[0. 0. 0.]


In [38]:
this = np.array([2,4,6])
movie = np.array([10,10,10])
rocks = np.array([1,1,1])

print(this + movie + rocks)
print((this.dot(identity) + movie).dot(identity) + rocks)

[13 15 17]
[13. 15. 17.]


In [95]:
import numpy as np

# Freeze the seed for reprodicability.
np.random.seed(1)

# Activation function
def softmax(x_):
  x = np.atleast_2d(x_)
  temp = np.exp(x)
  return temp / np.sum(temp, axis=1, keepdims=True)

# Initialize the network
word_vectors = {}
word_vectors['yankees'] = np.array([[0.,0.,0.]])
word_vectors['bears'] = np.array([[0.,0.,0.]])
word_vectors['braves'] = np.array([[0.,0.,0.]])
word_vectors['red'] = np.array([[0.,0.,0.]])
word_vectors['sox'] = np.array([[0.,0.,0.]])
word_vectors['lose'] = np.array([[0.,0.,0.]])
word_vectors['defeat'] = np.array([[0.,0.,0.]])
word_vectors['beat'] = np.array([[0.,0.,0.]])
word_vectors['tie'] = np.array([[0.,0.,0.]])
sentence2output = np.random.rand(3, len(word_vectors))
identity = np.eye(3)

# Hyperparmeters
epochs = 100
alpha = 0.01

for _ in range(epochs):
    # Foward pass
    layer_0 = word_vectors['red']
    layer_1 = layer_0.dot(identity) + word_vectors['sox']
    layer_2 = layer_1.dot(identity) + word_vectors['defeat']

    pred = softmax(layer_2.dot(sentence2output))
    # print(f"pred: {pred}")

    # Back propagation
    y = np.array([1,0,0,0,0,0,0,0,0]) # One hot vector for Yankees

    pred_delta = pred - y
    layer_2_delta = pred_delta.dot(sentence2output.T)
    defeat_delta =  layer_2_delta * 1
    layer_1_delta = layer_2_delta.dot(identity.T)
    sox_delta = layer_1_delta * 1
    layer_0_delta = layer_1_delta.dot(identity.T)
    alpha = 0.01

    word_vectors['red'] -= layer_0_delta * alpha
    word_vectors['sox'] -= sox_delta * alpha
    word_vectors['defeat'] -= defeat_delta * alpha

    identity -= np.outer(layer_0, layer_1_delta) * alpha
    identity -= np.outer(layer_1, layer_2_delta) * alpha
    sentence2output -= np.outer(layer_2, pred_delta) * alpha

# Foward pass
layer_0 = word_vectors['red']
layer_1 = layer_0.dot(identity) + word_vectors['sox']
layer_2 = layer_1.dot(identity) + word_vectors['defeat']
pred = softmax(layer_2.dot(sentence2output))

assert np.argmax(pred) == np.argmax(y), "Prediction does not match target"

Download the dataset

In [54]:
! wget https://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz
! tar -xvf tasks_1-20_v1-2.tar.gz

--2024-08-24 10:14:08--  https://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz
Resolving www.thespermwhale.com (www.thespermwhale.com)... 50.31.160.191
Connecting to www.thespermwhale.com (www.thespermwhale.com)|50.31.160.191|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15719851 (15M) [application/x-gzip]
Saving to: ‘tasks_1-20_v1-2.tar.gz’


2024-08-24 10:14:10 (9.57 MB/s) - ‘tasks_1-20_v1-2.tar.gz’ saved [15719851/15719851]

tasks_1-20_v1-2/
tasks_1-20_v1-2/hn/
tasks_1-20_v1-2/hn/qa16_basic-induction_train.txt
tasks_1-20_v1-2/hn/qa13_compound-coreference_train.txt
tasks_1-20_v1-2/hn/qa13_compound-coreference_test.txt
tasks_1-20_v1-2/hn/qa14_time-reasoning_test.txt
tasks_1-20_v1-2/hn/qa5_three-arg-relations_test.txt
tasks_1-20_v1-2/hn/qa17_positional-reasoning_train.txt
tasks_1-20_v1-2/hn/qa9_simple-negation_train.txt
tasks_1-20_v1-2/hn/qa12_conjunction_train.txt
tasks_1-20_v1-2/hn/qa6_yes-no-questions_train.txt
tasks_1-20_v1-2/hn/qa2_two-su

In [92]:
import sys,random,math
from collections import Counter
import numpy as np

f = open('tasks_1-20_v1-2/en/qa1_single-supporting-fact_train.txt','r')
raw = f.readlines()
f.close()

tokens = list()
for line in raw[0:1000]:
    tokens.append(line.lower().replace("\n","").split(" ")[1:])

print(tokens[0:3])

[['mary', 'moved', 'to', 'the', 'bathroom.'], ['john', 'went', 'to', 'the', 'hallway.'], ['where', 'is', 'mary?', '\tbathroom\t1']]


In [93]:
# Preprocess the dataset.
vocab = set()
for sentence in tokens:
    for word in sentence:
        vocab.add(word)

vocab = list(vocab)

word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

def words2indices(sentence):
    idx = list()
    for word in sentence:
        idx.append(word2index[word])
    return idx

# Activation function
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

# Freeze the seed for reproducability.
np.random.seed(1)
embed_size = 10

# word embeddings
embed = (np.random.rand(len(vocab),embed_size) - 0.5) * 0.1

# embedding -> embedding (initially the identity matrix)
recurrent = np.eye(embed_size)

# sentence embedding for empty sentence
start = np.zeros(embed_size)

# embedding -> output weights
decoder = (np.random.rand(embed_size, len(vocab)) - 0.5) * 0.1

# one hot lookups (for loss function)
one_hot = np.eye(len(vocab))

# Forwawrd oass
def predict(sentence):

    layers = list()
    layer = {}
    layer['hidden'] = start
    layers.append(layer)

    loss = 0
    # forward propagate
    preds = list()
    for target_i in range(len(sentence)):
        layer = {}
        # try to predict the next term
        layer['pred'] = softmax(layers[-1]['hidden'].dot(decoder))
        loss += -np.log(layer['pred'][sentence[target_i]])
        # generate the next hidden state
        layer['hidden'] = layers[-1]['hidden'].dot(recurrent) + embed[sentence[target_i]]
        layers.append(layer)

    return layers, loss

# Training loop
for j in range(epochs):
    alpha = 0.001
    sentence = words2indices(tokens[j % len(tokens)][1:])
    layers,loss = predict(sentence)

    # Forward pass
    for layer_idx in reversed(range(len(layers))):
        layer = layers[layer_idx]
        target = sentence[layer_idx-1]

        if(layer_idx > 0):  # if not the first layer
            layer['output_delta'] = layer['pred'] - one_hot[target]
            new_hidden_delta = layer['output_delta'].dot(decoder.transpose())

            # if the last layer - don't pull from a later one becasue it doesn't exist
            if(layer_idx == len(layers)-1):
                layer['hidden_delta'] = new_hidden_delta
            else:
                layer['hidden_delta'] = new_hidden_delta + layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())
        else: # if the first layer
            layer['hidden_delta'] = layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())

    # Back progpation
    start -= layers[0]['hidden_delta'] * alpha / float(len(sentence))
    for layer_idx, layer in enumerate(layers[1:]):
        decoder -= np.outer(layers[layer_idx]['hidden'], layer['output_delta']) * alpha / float(len(sentence))

        embed_idx = sentence[layer_idx]
        embed[embed_idx] -= layers[layer_idx]['hidden_delta'] * alpha / float(len(sentence))

        recurrent -= np.outer(layers[layer_idx]['hidden'], layer['hidden_delta']) * alpha / float(len(sentence))

    if (j % 1000 == 0 or j == epochs - 1):
        perplexity = np.exp(loss / len(sentence))
        print(f"Epoch: {j} \t Perplexitty: {perplexity}")



Epoch: 0 	 Perplexitty: 82.1100559632168
Epoch: 1000 	 Perplexitty: 81.99090217554809
Epoch: 2000 	 Perplexitty: 81.85525912134636
Epoch: 3000 	 Perplexitty: 81.63645524243188
Epoch: 4000 	 Perplexitty: 81.21583375974387
Epoch: 5000 	 Perplexitty: 80.33159335718055
Epoch: 6000 	 Perplexitty: 78.27503632847272
Epoch: 7000 	 Perplexitty: 72.33614108276905
Epoch: 8000 	 Perplexitty: 45.30273062756402
Epoch: 9000 	 Perplexitty: 24.562240422415016
Epoch: 10000 	 Perplexitty: 19.791584022300416
Epoch: 11000 	 Perplexitty: 18.278806376933236
Epoch: 12000 	 Perplexitty: 16.59387367652661
Epoch: 13000 	 Perplexitty: 14.088323498980094
Epoch: 14000 	 Perplexitty: 10.822503000255418
Epoch: 15000 	 Perplexitty: 8.202594684975473
Epoch: 16000 	 Perplexitty: 6.855140486994354
Epoch: 17000 	 Perplexitty: 6.020045586130928
Epoch: 18000 	 Perplexitty: 5.430417275951099
Epoch: 19000 	 Perplexitty: 5.040622868602438
Epoch: 20000 	 Perplexitty: 4.8132115025067765
Epoch: 21000 	 Perplexitty: 4.671143945581

In [94]:
sent_index = 4
l,_ = predict(words2indices(tokens[sent_index]))

print(tokens[sent_index])

for i,each_layer in enumerate(l[1:-1]):
    input = tokens[sent_index][i]
    true = tokens[sent_index][i+1]
    pred = vocab[each_layer['pred'].argmax()]
    print("Prev Input:" + input + (' ' * (12 - len(input))) +\
          "True:" + true + (" " * (15 - len(true))) + "Pred:" + pred)

['sandra', 'moved', 'to', 'the', 'garden.']
Prev Input:sandra      True:moved          Pred:is
Prev Input:moved       True:to             Pred:to
Prev Input:to          True:the            Pred:the
Prev Input:the         True:garden.        Pred:bedroom.
