<a href="https://colab.research.google.com/github/woodRock/grokking-deep-learning/blob/main/chapter_11_neural_networks_that_understand_language.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 11 | Neural networks that understand language

# Bag of words

In [4]:
import numpy as np

onehots = {}
onehots['cat'] = np.array([1,0,0,0])
onehots['the'] = np.array([0,1,0,0])
onehots['dog'] = np.array([0,0,1,0])
onehots['sat'] = np.array([0,0,0,1])

sentence = ['the', 'cat', 'sat']
x = onehots[sentence[0]] + onehots[sentence[1]] + onehots[sentence[2]]
print(f"Sentence encoding: {x}")

Sentence encoding: [1 1 0 1]


# Preprocessing

In [54]:
# Read the reviews.txt file
f = open('reviews.txt')
raw_reviews = f.readlines()
f.close

f = open('labels.txt')
raw_labels = f.readlines()
f.close

# Tokenize the reviews
tokens = list(map(lambda x: set(x.split(" ")), raw_reviews))

vocab = set()
for sentence in tokens:
    for word in sentence:
        if (len(word) > 0):
            vocab.add(word)
vocab = list(vocab)

word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

input_dataset = list()
for sentence in tokens:
    sentence_indices = list()
    for word in sentence:
        try:
            sentence_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sentence_indices)))

target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)

size = 1_000
test_input_dataset = input_dataset[size:size+size]
test_target_dataset = target_dataset[size:size+size]
input_dataset = input_dataset[:size]
target_dataset = target_dataset[:size]

In [55]:
# Convert the class labels to onehot encoding
output_dim = 2
one_hot_labels = np.zeros((len(target_dataset),output_dim), dtype=np.float32)
for i,l in enumerate(target_dataset):
    one_hot_labels[i][l] = 1
labels = one_hot_labels
labels[:10]

# Test dataset
output_dim = 2
one_hot_labels = np.zeros((len(test_target_dataset),output_dim), dtype=np.float32)
for i,l in enumerate(target_dataset):
    one_hot_labels[i][l] = 1
test_labels = one_hot_labels
test_labels[:10]

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [56]:
# Convert the class labels to onehot encoding
vocab_size = len(vocab)
one_hot_labels = np.zeros((len(input_dataset),vocab_size), dtype=np.float32)
for i, sentence in enumerate(input_dataset):
    for word_idx in sentence:
        one_hot_labels[i][word_idx] = 1
text = one_hot_labels
text[:10]

# Test dataset
one_hot_labels = np.zeros((len(test_input_dataset),vocab_size), dtype=np.float32)
for i, sentence in enumerate(test_input_dataset):
    for word_idx in sentence:
        one_hot_labels[i][word_idx] = 1
test_text = one_hot_labels
test_text[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [57]:
# Activation functions
def tanh(x):
    return np.tanh(x)

def tanh2deriv(output):
    return 1 - (output ** 2)

def softmax(x):
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

# Hyperparameters
alpha = 2
iterations = 100
hidden_size = 100
input_dim = len(vocab)
output_dim = 2
batch_size = 100

# Initialize the neural network.
weights_0_1 = 0.02 * np.random.random((input_dim, hidden_size)) - 0.01
weights_1_2 = 0.2 * np.random.random((hidden_size, output_dim)) - 0.1

# Training loop
for j in range(iterations):
    correct_cnt = 0
    for i in range(int(len(text) / batch_size)):
        batch_start, batch_end = ((i * batch_size), ((i+1)*batch_size))
        input, target = text[batch_start:batch_end], labels[batch_start:batch_end]

        # Foward pass
        layer_0 = input
        layer_1 = tanh(np.dot(layer_0, weights_0_1))
        dropout_mask = np.random.randint(2, size=layer_1.shape)
        layer_1 *= dropout_mask * 2
        layer_2 = softmax(np.dot(layer_1, weights_1_2))
        prediction = layer_2

        for k in range(batch_size):
            pred_label = prediction[k:k+1]
            true_label = labels[batch_start+k:batch_start+k+1]
            correct_cnt += int(np.argmax(pred_label) == np.argmax(true_label))

        # Back propagation
        layer_2_delta = (target - prediction) / (batch_size * prediction.shape[0])
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)* tanh2deriv(layer_1)
        layer_1_delta *= dropout_mask

        # Update the weights
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    if (j % 10 == 0 or j == iterations - 1):

        # Evaluate on the test set.
        test_correct_cnt = 0

        for i in range(len(test_text)):
            input, target = test_text[i:i+1], test_labels[i:i+1]
            # Foward pass
            layer_0 = input
            layer_1 = tanh(np.dot(layer_0, weights_0_1))
            layer_2 = softmax(np.dot(layer_1, weights_1_2))
            prediction = layer_2

            test_correct_cnt += int(np.argmax(prediction) == np.argmax(target))

        print(f"I: {j}\tTraining accuracy: {correct_cnt/float(len(labels))} Test accuracy: {test_correct_cnt / float(len(test_labels))}")

I: 0	Training accuracy: 0.494 Test accuracy: 0.509
I: 10	Training accuracy: 0.78 Test accuracy: 0.502
I: 20	Training accuracy: 0.847 Test accuracy: 0.486
I: 30	Training accuracy: 0.874 Test accuracy: 0.48
I: 40	Training accuracy: 0.904 Test accuracy: 0.489
I: 50	Training accuracy: 0.926 Test accuracy: 0.493
I: 60	Training accuracy: 0.95 Test accuracy: 0.498
I: 70	Training accuracy: 0.966 Test accuracy: 0.505
I: 80	Training accuracy: 0.978 Test accuracy: 0.508
I: 90	Training accuracy: 0.987 Test accuracy: 0.51
I: 99	Training accuracy: 0.991 Test accuracy: 0.506


# Intro to embedding layer

In [44]:
import numpy as np
from collections import Counter
import math
from sklearn.model_selection import train_test_split

# Hyperparameters
alpha = 0.05
iterations = 10
hidden_size = 100
input_dim = len(vocab)  # Assuming vocab is defined elsewhere
output_dim = 2
batch_size = 10

# Initialize the neural network
weights_0_1 = 0.2 * np.random.random((input_dim, hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, output_dim)) - 0.1

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - weights_0_1[target_index]
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(np.sum(squared_difference))
    return scores.most_common(10)

# Activation function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Derivative of sigmoid
def sigmoid_derivative(x):
    return x * (1 - x)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.2, random_state=42)

def evaluate(X, y):
    correct_cnt = 0
    for i in range(len(X)):
        input_data = np.array([X[i]], dtype=np.int32)
        layer_1 = sigmoid(weights_0_1[input_data].sum(axis=1))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
        prediction = np.argmax(layer_2)
        if prediction == np.argmax(y[i]):
            correct_cnt += 1
    return correct_cnt / len(X)

# Training loop
for j in range(iterations):
    correct_cnt = 0
    for i in range(0, len(X_train), batch_size):
        batch_end = min(i + batch_size, len(X_train))
        input_batch, target = X_train[i:batch_end], y_train[i:batch_end]

        # Convert input to an integer array.
        input_batch = np.array(input_batch, dtype=np.int32)

        # Forward pass
        layer_0 = weights_0_1[input_batch]
        layer_1 = sigmoid(layer_0.sum(axis=1))
        dropout_mask = np.random.randint(2, size=layer_1.shape)
        layer_1 *= dropout_mask * 2
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

        prediction = layer_2

        # Count correct predictions
        correct_cnt += np.sum(np.argmax(prediction, axis=1) == np.argmax(target, axis=1))

        # Back propagation
        layer_2_delta = (prediction - target) * sigmoid_derivative(layer_2)
        layer_1_delta = np.dot(layer_2_delta, weights_1_2.T) * sigmoid_derivative(layer_1) * dropout_mask

        # Update weights_1_2
        weights_1_2 -= alpha * np.dot(layer_1.T, layer_2_delta)

        # Update weights_0_1 (word embeddings)
        for idx, word_idx in enumerate(input_batch):
            weights_0_1[word_idx] -= alpha * layer_1_delta[idx]

    # Calculate and display accuracies
    train_accuracy = correct_cnt / len(X_train)
    test_accuracy = evaluate(X_test, y_test)

    print(f"Epoch {j+1}/{iterations}")
    print(f"Training Accuracy: {train_accuracy:.2%}")
    print(f"Test Accuracy: {test_accuracy:.2%}")
    print(f"terrible: {similar('terrible')}\n")

print("Final evaluation:")
print(f"Training Accuracy: {evaluate(X_train, y_train):.2%}")
print(f"Test Accuracy: {evaluate(X_test, y_test):.2%}")
print(f"terrible: {similar('terrible')}")

  return 1 / (1 + np.exp(-x))


Epoch 1/10
Training Accuracy: 48.00%
Test Accuracy: 48.50%
terrible: [('terrible', -0.0), ('yb', -0.6271164376013738), ('lyrically', -0.6317240614076646), ('shamble', -0.636646820823081), ('oxy', -0.6372006103711703), ('yvette', -0.6387138586459286), ('obscuring', -0.6390080214657136), ('buckley', -0.6393306246892859), ('disservices', -0.6397892262824619), ('towelheads', -0.6426212275558818)]

Epoch 2/10
Training Accuracy: 44.88%
Test Accuracy: 48.50%
terrible: [('terrible', -0.0), ('yb', -0.6271164376013738), ('lyrically', -0.6317240614076646), ('shamble', -0.636646820823081), ('oxy', -0.6372006103711703), ('yvette', -0.6387138586459286), ('obscuring', -0.6390080214657136), ('buckley', -0.6393306246892859), ('disservices', -0.6397892262824619), ('towelheads', -0.6426212275558818)]

Epoch 3/10
Training Accuracy: 48.25%
Test Accuracy: 48.50%
terrible: [('terrible', -0.0), ('yb', -0.6271164376013738), ('lyrically', -0.6317240614076646), ('shamble', -0.636646820823081), ('oxy', -0.6372006

KeyboardInterrupt: 

In [58]:
from collections import Counter
import math

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] =  -math.sqrt(sum(squared_difference))

    return scores.most_common(10)

print(f"Words similar to beautiful:\n{similar('beautiful')}")
print(f"Words similar to terrible:\n{similar('terrible')}")
print(f"Words similar to good:\n{similar('good')}")

Words similar to beautiful:
[('beautiful', -0.0), ('his', -0.07071780387973033), ('fine', -0.07366165803828469), ('god', -0.07371790999687042), ('ever', -0.07402884728794429), ('perhaps', -0.07475378824368628), ('mostly', -0.0749826641492825), ('come', -0.07519590679756649), ('more', -0.07530645237090662), ('carrey', -0.07563617098476084)]
Words similar to terrible:
[('terrible', -0.0), ('contortion', -0.0652114763026173), ('cortney', -0.06565973471827609), ('rediscoveries', -0.06580153703393334), ('realised', -0.06693199209336141), ('rudest', -0.066963514140267), ('lan', -0.06698015279419733), ('svendsen', -0.06698044295236112), ('charm', -0.06705438246572451), ('insomniacs', -0.06707230338230676)]
Words similar to good:
[('good', -0.0), ('message', -0.06830673830453966), ('ron', -0.06992446680197126), ('fi', -0.07092333385227206), ('taylor', -0.07115919248803718), ('seasons', -0.07256386495132805), ('son', -0.07257995515395603), ('accident', -0.07267769971425227), ('then', -0.0728981

# Filling in the blank

In [47]:
import random
import math
from collections import Counter
import numpy as np

# Freeze the random seed for reproducability.
np.random.seed(1)
random.seed(1)

f = open("reviews.txt")
raw_reviews = f.readlines()
f.close()

tokens = list(map(lambda x: (x.split(" ")), raw_reviews))
word_count = Counter()
for sentence in tokens:
    for word in sentence:
        word_count[word] -= 1
vocab = list(set(map(lambda x:x[0], word_count.most_common())))

word2index = {}
for i, word in enumerate(vocab):
  word2index[word] = i

concatenated = list()
input_dataset = list()
for sentence in tokens:
    sentence_indices = list()
    for word in sentence:
        try:
            sentence_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sentence_indices)
concatenated = np.array(concatenated)
random.shuffle(input_dataset)

In [70]:
# Hyperparameters
epochs = 10
alpha = 0.05,
iterations = 2
hidden_size = 50
window = 2
negative = 5

# Initialize the neural network
weights_0_1 = (np.random.rand(len(vocab),hidden_size) - 0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab),hidden_size)*0

layer_2_target = np.zeros(negative+1)
layer_2_target[0] = 1

def similar(target='beautiful'):
    target_index = word2index[target]

    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

# Activation function
def sigmoid(x):
    return 1/(1 + np.exp(-x))

# Training loop
for rev_i,review in enumerate(input_dataset * iterations):
    for target_i in range(len(review) - 1):
        # since it's really expensive to predict every vocabulary
        # we're only going to predict a random subset
        target_samples = [review[target_i]]+list(concatenated\
        [(np.random.rand(negative)*len(concatenated)).astype('int').tolist()])

        left_context = review[max(0,target_i-window):target_i]
        right_context = review[target_i+1:min(len(review),target_i+window)]

        # Forward pass
        layer_1 = np.mean(weights_0_1[left_context+right_context],axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))

        # Back propagation
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])

        # Update the weights
        weights_0_1[left_context+right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta,layer_1)*alpha

    # Every 250 reviews
    if(rev_i % 250 == 0):
        progress = rev_i/float(len(input_dataset)*iterations)
        print(f"Progress: {progress} similar: {similar('terrible')}")

print(similar('terrible'))

Progress: 0.0 similar: [('terrible', -0.0), ('awoke', -0.3651169249947063), ('masquerades', -0.39108873398737865), ('cheapo', -0.3945828793589487), ('campers', -0.3953999868256407), ('tie', -0.39564375285482867), ('saaad', -0.4013687810025308), ('montoss', -0.4034894640474157), ('coherant', -0.40581171112337355), ('woulda', -0.40705455528722234)]
Progress: 0.125 similar: [('terrible', -0.0), ('herself', -0.43816118022724215), ('vijay', -0.4782528014059888), ('taken', -0.48498883756715705), ('perfectly', -0.4881200559183418), ('street', -0.49215364007755397), ('bobby', -0.49961749701662234), ('clear', -0.49963937187159785), ('named', -0.5057191275149554), ('storyline', -0.5079262287364211)]
Progress: 0.25 similar: [('terrible', -0.0), ('taken', -0.47195914291046165), ('basically', -0.47977884871181775), ('street', -0.4864328173243087), ('perfectly', -0.48759282558187605), ('storyline', -0.4881298353554536), ('named', -0.4958455232887505), ('co', -0.4962125663149723), ('hilarious', -0.50

In [71]:
def analogy(positive=["terrible", "good"], negative = ["bad"]):
    norms = np.sum(weights_0_1 * weights_0_1, axis=1)
    norms.resize(norms.shape[0], 1)

    normed_weights = weights_0_1 * norms

    query_vector = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vector += normed_weights[word2index[word]]
    for word in negative:
        query_vector -= normed_weights[word2index[word]]

    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - query_vector
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))

    # Ignore the first word, as it will be the positive query.
    return scores.most_common(10)[1:]

print(f'terrible, good (+) bad (-): {analogy(["terrible", "good"], ["bad"])}')
print(f'elizabeth, he (+) she (-): {analogy(["elizabeth", "he"], ["she"])}')
print(f'king, woman (+) man (-): {analogy(["king", "woman"], ["man"])}')

terrible, good (+) bad (-): [('have', -390.15019358970096), ('in', -390.8835587470175), ('\n', -391.1089229220066), ('but', -391.1855549494826), ('.', -391.4678701283952), ('good', -391.5047402353443), ('to', -391.5465135186223), ('by', -391.64278036853506), ('this', -391.77344893810215)]
elizabeth, he (+) she (-): [('but', -224.79554527807562), ('to', -225.01071145951823), ('\n', -225.1433560147473), ('this', -225.167789375941), ('by', -225.4759646703603), ('of', -225.8909616565375), ('a', -225.94601938379344), ('the', -225.94690882701872), ('is', -225.98890858036214)]
king, woman (+) man (-): [('undercuts', -124.47271859929674), ('yoko', -124.64956857659786), ('flit', -124.6860944853715), ('nemico', -124.69295870741928), ('palde', -124.69518562145726), ('oogling', -124.69656809780903), ('divali', -124.70321163964871), ('kik', -124.70383134273686), ('temperamental', -124.7039864286581)]
