<a href="https://colab.research.google.com/github/woodRock/grokking-deep-learning/blob/main/chapter_11_neural_networks_that_understand_language.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 11 | Neural networks that understand language

# Bag of words

In [3]:
import numpy as np

onehots = {}
onehots['cat'] = np.array([1,0,0,0])
onehots['the'] = np.array([0,1,0,0])
onehots['dog'] = np.array([0,0,1,0])
onehots['sat'] = np.array([0,0,0,1])

sentence = ['the', 'cat', 'sat']
x = onehots[sentence[0]] + onehots[sentence[1]] + onehots[sentence[2]]
print(f"Sentence encoding: {x}")

Sentence encoding: [1 1 0 1]


# Preprocessing

In [4]:
# Read the reviews.txt file
f = open('reviews.txt')
raw_reviews = f.readlines()
f.close

f = open('labels.txt')
raw_labels = f.readlines()
f.close

# Tokenize the reviews
tokens = list(map(lambda x: set(x.split(" ")), raw_reviews))

vocab = set()
for sentence in tokens:
    for word in sentence:
        if (len(word) > 0):
            vocab.add(word)
vocab = list(vocab)

word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

input_dataset = list()
for sentence in tokens:
    sentence_indices = list()
    for word in sentence:
        try:
            sentence_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sentence_indices)))

target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)

size = 1_000
test_input_dataset = input_dataset[size:size+size]
test_target_dataset = target_dataset[size:size+size]
input_dataset = input_dataset[:size]
target_dataset = target_dataset[:size]

In [5]:
# Convert the class labels to onehot encoding
output_dim = 2
one_hot_labels = np.zeros((len(target_dataset),output_dim), dtype=np.float32)
for i,l in enumerate(target_dataset):
    one_hot_labels[i][l] = 1
labels = one_hot_labels
labels[:10]

# Test dataset
output_dim = 2
one_hot_labels = np.zeros((len(test_target_dataset),output_dim), dtype=np.float32)
for i,l in enumerate(target_dataset):
    one_hot_labels[i][l] = 1
test_labels = one_hot_labels
test_labels[:10]

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [6]:
# Convert the class labels to onehot encoding
vocab_size = len(vocab)
one_hot_labels = np.zeros((len(input_dataset),vocab_size), dtype=np.float32)
for i, sentence in enumerate(input_dataset):
    for word_idx in sentence:
        one_hot_labels[i][word_idx] = 1
text = one_hot_labels
text[:10]

# Test dataset
one_hot_labels = np.zeros((len(test_input_dataset),vocab_size), dtype=np.float32)
for i, sentence in enumerate(test_input_dataset):
    for word_idx in sentence:
        one_hot_labels[i][word_idx] = 1
test_text = one_hot_labels
test_text[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [7]:
# Activation functions
def tanh(x):
    return np.tanh(x)

def tanh2deriv(output):
    return 1 - (output ** 2)

def softmax(x):
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

# Hyperparameters
alpha = 2
iterations = 100
hidden_size = 100
input_dim = len(vocab)
output_dim = 2
batch_size = 100

# Initialize the neural network.
weights_0_1 = 0.02 * np.random.random((input_dim, hidden_size)) - 0.01
weights_1_2 = 0.2 * np.random.random((hidden_size, output_dim)) - 0.1

# Training loop
for j in range(iterations):
    correct_cnt = 0
    for i in range(int(len(text) / batch_size)):
        batch_start, batch_end = ((i * batch_size), ((i+1)*batch_size))
        input, target = text[batch_start:batch_end], labels[batch_start:batch_end]

        # Foward pass
        layer_0 = input
        layer_1 = tanh(np.dot(layer_0, weights_0_1))
        dropout_mask = np.random.randint(2, size=layer_1.shape)
        layer_1 *= dropout_mask * 2
        layer_2 = softmax(np.dot(layer_1, weights_1_2))
        prediction = layer_2

        for k in range(batch_size):
            pred_label = prediction[k:k+1]
            true_label = labels[batch_start+k:batch_start+k+1]
            correct_cnt += int(np.argmax(pred_label) == np.argmax(true_label))

        # Back propagation
        layer_2_delta = (target - prediction) / (batch_size * prediction.shape[0])
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)* tanh2deriv(layer_1)
        layer_1_delta *= dropout_mask

        # Update the weights
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    if (j % 10 == 0 or j == iterations - 1):

        # Evaluate on the test set.
        test_correct_cnt = 0

        for i in range(len(test_text)):
            input, target = test_text[i:i+1], test_labels[i:i+1]
            # Foward pass
            layer_0 = input
            layer_1 = tanh(np.dot(layer_0, weights_0_1))
            layer_2 = softmax(np.dot(layer_1, weights_1_2))
            prediction = layer_2

            test_correct_cnt += int(np.argmax(prediction) == np.argmax(target))

        print(f"I: {j}\tTraining accuracy: {correct_cnt/float(len(labels))} Test accuracy: {test_correct_cnt / float(len(test_labels))}")

I: 0	Training accuracy: 0.459 Test accuracy: 0.494
I: 10	Training accuracy: 0.828 Test accuracy: 0.726
I: 20	Training accuracy: 0.884 Test accuracy: 0.757
I: 30	Training accuracy: 0.909 Test accuracy: 0.768
I: 40	Training accuracy: 0.93 Test accuracy: 0.779
I: 50	Training accuracy: 0.952 Test accuracy: 0.785
I: 60	Training accuracy: 0.972 Test accuracy: 0.794
I: 70	Training accuracy: 0.983 Test accuracy: 0.8
I: 80	Training accuracy: 0.993 Test accuracy: 0.799
I: 90	Training accuracy: 0.995 Test accuracy: 0.798
I: 99	Training accuracy: 0.995 Test accuracy: 0.8


# Intro to embedding layer

In [8]:
import numpy as np
from collections import Counter
import math
from sklearn.model_selection import train_test_split

# Hyperparameters
alpha = 0.05
iterations = 10
hidden_size = 100
input_dim = len(vocab)  # Assuming vocab is defined elsewhere
output_dim = 2
batch_size = 10

# Initialize the neural network
weights_0_1 = 0.2 * np.random.random((input_dim, hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, output_dim)) - 0.1

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - weights_0_1[target_index]
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(np.sum(squared_difference))
    return scores.most_common(10)

# Activation function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Derivative of sigmoid
def sigmoid_derivative(x):
    return x * (1 - x)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.2, random_state=42)

def evaluate(X, y):
    correct_cnt = 0
    for i in range(len(X)):
        input_data = np.array([X[i]], dtype=np.int32)
        layer_1 = sigmoid(weights_0_1[input_data].sum(axis=1))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
        prediction = np.argmax(layer_2)
        if prediction == np.argmax(y[i]):
            correct_cnt += 1
    return correct_cnt / len(X)

# Training loop
for j in range(iterations):
    correct_cnt = 0
    for i in range(0, len(X_train), batch_size):
        batch_end = min(i + batch_size, len(X_train))
        input_batch, target = X_train[i:batch_end], y_train[i:batch_end]

        # Convert input to an integer array.
        input_batch = np.array(input_batch, dtype=np.int32)

        # Forward pass
        layer_0 = weights_0_1[input_batch]
        layer_1 = sigmoid(layer_0.sum(axis=1))
        dropout_mask = np.random.randint(2, size=layer_1.shape)
        layer_1 *= dropout_mask * 2
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

        prediction = layer_2

        # Count correct predictions
        correct_cnt += np.sum(np.argmax(prediction, axis=1) == np.argmax(target, axis=1))

        # Back propagation
        layer_2_delta = (prediction - target) * sigmoid_derivative(layer_2)
        layer_1_delta = np.dot(layer_2_delta, weights_1_2.T) * sigmoid_derivative(layer_1) * dropout_mask

        # Update weights_1_2
        weights_1_2 -= alpha * np.dot(layer_1.T, layer_2_delta)

        # Update weights_0_1 (word embeddings)
        for idx, word_idx in enumerate(input_batch):
            weights_0_1[word_idx] -= alpha * layer_1_delta[idx]

    # Calculate and display accuracies
    train_accuracy = correct_cnt / len(X_train)
    test_accuracy = evaluate(X_test, y_test)

    print(f"Epoch {j+1}/{iterations}")
    print(f"Training Accuracy: {train_accuracy:.2%}")
    print(f"Test Accuracy: {test_accuracy:.2%}")
    print(f"terrible: {similar('terrible')}\n")

print("Final evaluation:")
print(f"Training Accuracy: {evaluate(X_train, y_train):.2%}")
print(f"Test Accuracy: {evaluate(X_test, y_test):.2%}")
print(f"terrible: {similar('terrible')}")

  return 1 / (1 + np.exp(-x))


Epoch 1/10
Training Accuracy: 49.38%
Test Accuracy: 48.50%
terrible: [('terrible', -0.0), ('armies', -0.652016667004317), ('chevalier', -0.6575342307845361), ('reanimating', -0.6623769679687228), ('trendier', -0.6635086976859672), ('argh', -0.6739745311236714), ('mesmerized', -0.6796900501314465), ('caudill', -0.6797998322583804), ('apostle', -0.6818014590247808), ('unbounded', -0.6821421395801206)]

Epoch 2/10
Training Accuracy: 46.75%
Test Accuracy: 51.50%
terrible: [('terrible', -0.0), ('armies', -0.652016667004317), ('chevalier', -0.6575342307845361), ('reanimating', -0.6623769679687228), ('trendier', -0.6635086976859672), ('argh', -0.6739745311236714), ('mesmerized', -0.6796900501314465), ('caudill', -0.6797998322583804), ('apostle', -0.6818014590247808), ('unbounded', -0.6821421395801206)]

Epoch 3/10
Training Accuracy: 50.25%
Test Accuracy: 48.50%
terrible: [('terrible', -0.0), ('armies', -0.652016667004317), ('chevalier', -0.6575342307845361), ('reanimating', -0.662376967968722

In [9]:
from collections import Counter
import math

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] =  -math.sqrt(sum(squared_difference))

    return scores.most_common(10)

print(f"Words similar to beautiful:\n{similar('beautiful')}")
print(f"Words similar to terrible:\n{similar('terrible')}")
print(f"Words similar to good:\n{similar('good')}")

Words similar to beautiful:
[('beautiful', -0.0), ('kungfu', -0.6078559835763252), ('dorna', -0.6237234379636337), ('dnouement', -0.6276479094807271), ('pores', -0.630501691680436), ('butthorn', -0.6316024597393896), ('mozes', -0.6321821316026058), ('streamlining', -0.6340312414245621), ('ummm', -0.635353225069645), ('historic', -0.6367960620722979)]
Words similar to terrible:
[('terrible', -0.0), ('armies', -0.652016667004317), ('chevalier', -0.6575342307845361), ('reanimating', -0.6623769679687227), ('trendier', -0.6635086976859673), ('argh', -0.6739745311236714), ('mesmerized', -0.6796900501314466), ('caudill', -0.6797998322583804), ('apostle', -0.6818014590247808), ('unbounded', -0.6821421395801207)]
Words similar to good:
[('good', -0.0), ('cos', -0.6324252411684389), ('legislative', -0.6341788617695822), ('habitacin', -0.6361720386875899), ('tulsa', -0.6409584005626229), ('rino', -0.6446638067154833), ('frech', -0.6451918198712174), ('arrrrgh', -0.6463047995131386), ('naushads', 

# Filling in the blank

In [10]:
import random
import math
from collections import Counter
import numpy as np

# Freeze the random seed for reproducability.
np.random.seed(1)
random.seed(1)

f = open("reviews.txt")
raw_reviews = f.readlines()
f.close()

tokens = list(map(lambda x: (x.split(" ")), raw_reviews))
word_count = Counter()
for sentence in tokens:
    for word in sentence:
        word_count[word] -= 1
vocab = list(set(map(lambda x:x[0], word_count.most_common())))

word2index = {}
for i, word in enumerate(vocab):
  word2index[word] = i

concatenated = list()
input_dataset = list()
for sentence in tokens:
    sentence_indices = list()
    for word in sentence:
        try:
            sentence_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sentence_indices)
concatenated = np.array(concatenated)
random.shuffle(input_dataset)

In [11]:
# Hyperparameters
epochs = 10
alpha = 0.05,
iterations = 2
hidden_size = 50
window = 2
negative = 5

# Initialize the neural network
weights_0_1 = (np.random.rand(len(vocab),hidden_size) - 0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab),hidden_size) * 0

layer_2_target = np.zeros(negative+1)
layer_2_target[0] = 1

def similar(target='beautiful'):
    target_index = word2index[target]

    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

# Activation function
def sigmoid(x):
    return 1/(1 + np.exp(-x))

# Training loop
for rev_i,review in enumerate(input_dataset * iterations):
    for target_i in range(len(review) - 1):
        # since it's really expensive to predict every vocabulary
        # we're only going to predict a random subset
        target_samples = [review[target_i]]+list(concatenated\
        [(np.random.rand(negative)*len(concatenated)).astype('int').tolist()])

        left_context = review[max(0,target_i-window):target_i]
        right_context = review[target_i+1:min(len(review),target_i+window)]

        # Forward pass
        layer_1 = np.mean(weights_0_1[left_context+right_context],axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))

        # Back propagation
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])

        # Update the weights
        weights_0_1[left_context+right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta,layer_1) * alpha

    # Every 250 reviews
    if(rev_i % 250 == 0):
        progress = rev_i/float(len(input_dataset)*iterations)
        print(f"Progress: {progress} similar: {similar('terrible')}")

print(similar('terrible'))

Progress: 0.0 similar: [('terrible', -0.0), ('dropout', -0.3535710262631364), ('rogge', -0.36796044919786497), ('vindictive', -0.37802728614605563), ('nets', -0.38246957704222073), ('desperadoes', -0.3825013857244422), ('company', -0.3845972963181407), ('nepotists', -0.38473029376866913), ('sanskrit', -0.3882094781788426), ('towelheads', -0.38932866227646384)]
Progress: 0.005 similar: [('terrible', -0.0), ('erect', -0.4722613200948294), ('weapon', -0.4817547765546732), ('disinterred', -0.4945705426075276), ('dissing', -0.496218843266514), ('pothole', -0.5004740338798189), ('amateur', -0.5024309511504912), ('myles', -0.5025487153075878), ('collectors', -0.5041483516844909), ('vining', -0.5064871896851985)]
Progress: 0.01 similar: [('terrible', -0.0), ('adam', -0.5502458709268596), ('bronte', -0.5550456181533929), ('seasoned', -0.5650011105768041), ('jack', -0.5676925749089833), ('contrived', -0.5721254258685475), ('penetrator', -0.5726492252894031), ('meant', -0.574218378523698), ('kell

In [13]:
def analogy(positive=["terrible", "good"], negative = ["bad"]):
    norms = np.sum(weights_0_1 * weights_0_1, axis=1)
    norms.resize(norms.shape[0], 1)

    normed_weights = weights_0_1 * norms

    query_vector = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vector += normed_weights[word2index[word]]
    for word in negative:
        query_vector -= normed_weights[word2index[word]]

    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - query_vector
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))

    # Ignore the first word, as it will be the positive query.
    return scores.most_common(10)[1:]

print(f'terrible, good (+) bad (-): {analogy(["terrible", "good"], ["bad"])}')
print(f'elizabeth, he (+) she (-): {analogy(["elizabeth", "he"], ["she"])}')
print(f'king, woman (+) man (-): {analogy(["king", "woman"], ["man"])}')

terrible, good (+) bad (-): [('nice', -175.18314140475408), ('fine', -175.2732277367962), ('great', -175.324684520726), ('terrific', -175.3998157201147), ('worth', -175.54903408526687), ('worthy', -175.6249046287322), ('superb', -175.66809849930374), ('decent', -175.71252005002424), ('solid', -175.76833592514976)]
elizabeth, he (+) she (-): [('she', -246.82780685905263), ('it', -246.83140804694918), ('there', -246.8891755572739), ('been', -247.04855676522277), ('they', -247.14552423966597), ('you', -247.37944334650362), ('michael', -247.3984602711197), ('ms', -247.48457871466053), ('alan', -247.50913936933864)]
king, woman (+) man (-): [('king', -346.07052790806455), ('\n', -346.30663327184016), ('course', -346.8401729700213), ('woman', -346.8678352636859), ('rest', -346.97124426819965), ('town', -347.35893294942343), ('son', -347.3619178395633), ('end', -347.38989153592905), ('girl', -347.4213425998364)]
