In [1]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from collections import Counter

# Download NLTK's 'punkt' tokenizer package (presumably what word_tokenize
# needs at runtime - confirm against the NLTK documentation).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Add the current working directory to the list of locations NLTK searches
# for its data packages.
nltk.data.path.append('.')

In [5]:
# Clone the dataset repository; a later cell reads NLP_Dataset/shakespeare.txt from it.
!git clone https://github.com/yeesem/NLP_Dataset

Cloning into 'NLP_Dataset'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (36/36), done.[K
remote: Total 36 (delta 7), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (36/36), 21.40 MiB | 4.77 MiB/s, done.
Resolving deltas: 100% (7/7), done.


In [6]:
# Load, tokenize and process the data
import re

# Read the whole corpus. The encoding is spelled out so that decoding does not
# depend on the platform's default locale.
with open("/content/NLP_Dataset/shakespeare.txt", encoding="utf-8") as f:
  raw_text = f.read()

# Collapse the punctuation marks , ! ? ; - into '.', so that '.' is the only
# punctuation token that survives the filter below.
normalized_text = re.sub(r'[,!?;-]', '.', raw_text)
tokens = nltk.word_tokenize(normalized_text)

# Keep purely alphabetic tokens (lower-cased) and the '.' markers; drop the rest.
# Each stage has its own name, so re-running this cell always starts from the file.
data = [token.lower() for token in tokens if token.isalpha() or token == '.']
print("Number of tokens : ", len(data), "\n", data[:15])

Number of tokens :  60976 
 ['o', 'for', 'a', 'muse', 'of', 'fire', '.', 'that', 'would', 'ascend', 'the', 'brightest', 'heaven', 'of', 'invention']


In [7]:
# Count how many times each token occurs; FreqDist consumes the token list directly
fdist = nltk.FreqDist(data)
# Number of distinct tokens, followed by the 20 most frequent (token, count) pairs
print("Size of vocabulary: ", len(fdist))
print("Most of frequent tokens : ", fdist.most_common(20))

Size of vocabulary:  5775
Most of frequent tokens :  [('.', 9630), ('the', 1521), ('and', 1394), ('i', 1257), ('to', 1159), ('of', 1093), ('my', 857), ('that', 781), ('in', 770), ('a', 752), ('you', 748), ('is', 630), ('not', 559), ('for', 467), ('it', 460), ('with', 441), ('his', 434), ('but', 417), ('me', 417), ('your', 397)]


In [10]:
def get_dict(data):
  """Build lookup tables between distinct tokens and integer indices.

  Input:
    data: iterable of hashable, mutually comparable tokens (duplicates allowed)
  Output:
    word2Ind: dict mapping each distinct token to its position in sorted order
    Ind2word: dict mapping that position back to the token
  """
  vocabulary = sorted(set(data))
  word2Ind = {token: position for position, token in enumerate(vocabulary)}
  Ind2word = dict(enumerate(vocabulary))
  return word2Ind, Ind2word

In [11]:
# Build the word <-> index lookup tables from the token list; V is the number
# of distinct tokens.
word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
print("Size of vocabulary: ", V)

Size of vocabulary:  5775


In [12]:
# Example of the mapping in both directions: a word looked up in word2Ind,
# then an index looked up in Ind2word.
print("Index of the word 'king' : ", word2Ind['king'])
print("Word which has index 2743 : ", Ind2word[2743])

Index of the word 'king' :  2744
Word which has index 2743 :  kinds


# Training the Model

### Initializing the model

In [13]:
def initialize_model(N, V, random_seed = 1):
  """Create the network parameters with entries drawn uniformly from [0, 1).

  Input:
    N: number of rows of W1 / columns of W2
    V: number of columns of W1 / rows of W2
    random_seed: value used to seed NumPy's global random generator
  Output:
    W1: array of shape (N, V)
    W2: array of shape (V, N)
    b1: array of shape (N, 1)
    b2: array of shape (V, 1)
  """
  # Seeds the global generator, so later np.random calls are affected as well.
  np.random.seed(random_seed)

  # The draw order (W1, W2, b1, b2) determines the values, so it must be kept.
  shapes = [(N, V), (V, N), (N, 1), (V, 1)]
  W1, W2, b1, b2 = [np.random.uniform(0, 1, shape) for shape in shapes]

  return W1, W2, b1, b2

### Activation function

In [14]:
def softmax(z):
  """Softmax taken along axis 0 (each column is normalised independently).

  Input:
    z: array-like of scores; for a 2-D input every column is one example
  Output:
    yhat: array of the same shape as z whose entries along axis 0 sum to 1
  """
  z = np.asarray(z)
  # Subtracting the per-column maximum does not change the result mathematically,
  # but it keeps np.exp from overflowing to inf (inf / inf would give nan).
  shifted = z - np.max(z, axis=0, keepdims=True)
  exp_scores = np.exp(shifted)
  yhat = exp_scores / np.sum(exp_scores, axis=0)
  return yhat

### Forward propagation

In [16]:
def forward_prop(x, W1, W2, b1, b2):
  """Run the forward pass of the one-hidden-layer network.

  Input:
    x: input array, multiplied on the left by W1
    W1, b1: weights and bias of the hidden layer
    W2, b2: weights and bias of the output layer
  Output:
    z: output-layer scores, W2 . h + b2 (no softmax applied here)
    h: hidden-layer activations after ReLU
  """
  hidden_pre_activation = np.dot(W1, x) + b1

  # ReLU: negative pre-activations are clamped to zero
  h = np.maximum(hidden_pre_activation, 0)

  z = np.dot(W2, h) + b2
  return z, h

### Cost function

In [17]:
def compute_cost(y, yhat, batch_size):
  """Cross-entropy cost: minus the sum of y * log(yhat), divided by batch_size.

  Input:
    y: target array (same shape as yhat)
    yhat: predicted probabilities
    batch_size: divisor applied to the summed log-probabilities
  Output:
    cost: the scalar cost, passed through np.squeeze
  """
  total_log_likelihood = np.sum(np.log(yhat) * y)
  scale = -1 / batch_size
  return np.squeeze(scale * total_log_likelihood)

### Training the Model - Backpropagation

In [18]:
def back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size):
  """Gradients of the cross-entropy cost for the ReLU hidden layer + softmax output.

  Input:
    x: input array whose columns are the examples of the batch
    yhat: softmax output of the forward pass
    y: target array (same shape as yhat)
    h: hidden-layer activations after ReLU, as returned by the forward pass
    W1, b1, b2: unused here; kept so the call signature stays unchanged
    W2: output-layer weights
    batch_size: divisor used to average the gradients
  Output:
    grad_W1, grad_W2, grad_b1, grad_b2: gradients with the shapes of W1, W2, b1, b2
  """
  # Gradient of the cost with respect to the output scores (softmax + cross-entropy).
  output_error = yhat - y

  # Back-propagate to the hidden layer. The derivative of ReLU is 1 where the unit
  # was active in the forward pass and 0 elsewhere, so the mask comes from h
  # (h > 0 exactly when the pre-activation was positive), not from the sign of
  # the back-propagated signal itself.
  l1 = np.dot(W2.T, output_error) * (h > 0)

  # compute the gradient for W1
  grad_W1 = 1/batch_size * np.dot(l1, x.T)

  # Compute gradient of W2
  grad_W2 = 1/batch_size * np.dot(output_error, h.T)

  # Bias gradients: sum over the example axis. Summing directly (instead of
  # multiplying by a ones vector of length batch_size) also works when the batch
  # actually holds a different number of columns than batch_size.
  grad_b1 = 1/batch_size * np.sum(l1, axis=1, keepdims=True)
  grad_b2 = 1/batch_size * np.sum(output_error, axis=1, keepdims=True)

  return grad_W1, grad_W2, grad_b1, grad_b2

### Gradient Descent

In [None]:
def gradient_descent(data, word2Ind, N, V, num_iters, alpha=0.03,
                     random_seed=282, initialize_model=initialize_model,
                     get_batches=get_batches, forward_prop=forward_prop,
                     softmax=softmax, compute_cost=compute_cost,
                     back_prop=back_prop, batch_size=128, C=2):
  """Train the network with mini-batch gradient descent.

  Input:
    data: token sequence handed to get_batches
    word2Ind: word -> index mapping handed to get_batches
    N, V: dimensions passed to initialize_model
    num_iters: number of batches to process before stopping
    alpha: initial learning rate; multiplied by 0.66 after every 100 iterations
    random_seed: seed passed to initialize_model
    initialize_model, get_batches, forward_prop, softmax, compute_cost, back_prop:
      the building blocks, injectable so alternatives can be swapped in.
      NOTE(review): get_batches must already be defined when this cell runs and
      is expected to yield (x, y) pairs; its definition is not in this part of
      the notebook - confirm.
    batch_size: batch size passed to get_batches and used to average cost/gradients
      (previously a hard-coded local of 128)
    C: fourth argument of get_batches (previously a hard-coded local of 2);
      presumably the context half-window size - confirm against get_batches
  Output:
    W1, W2, b1, b2: the parameters after the last update
  """
  W1, W2, b1, b2 = initialize_model(N, V, random_seed)
  iters = 0

  for x, y in get_batches(data, word2Ind, V, C, batch_size):
    z, h = forward_prop(x, W1, W2, b1, b2)
    yhat = softmax(z)

    # Cost is computed on the parameters *before* this iteration's update.
    cost = compute_cost(y, yhat, batch_size)
    if (iters + 1) % 10 == 0:
      print(f"iters: {iters + 1} cost: {cost:.6f}")

    grad_W1, grad_W2, grad_b1, grad_b2 = back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size)

    # Gradient-descent step
    W1 = W1 - alpha * grad_W1
    W2 = W2 - alpha * grad_W2
    b1 = b1 - alpha * grad_b1
    b2 = b2 - alpha * grad_b2

    iters += 1
    if iters == num_iters:
      break
    # Learning-rate decay; checked after the stop test, so no decay on the final iteration.
    if iters % 100 == 0:
      alpha *= 0.66

  return W1, W2, b1, b2

In [None]:
# Settings for this training run.
C = 2  # NOTE(review): defined here but not passed in the gradient_descent call below
N = 50  # passed to gradient_descent, which hands it to initialize_model as N
# Rebuild the lookup tables so this cell does not depend on the earlier vocabulary cell.
word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
num_iters = 150  # gradient_descent stops after this many batches
print("Call gradient descent")
W1, W2, b1, b2 = gradient_descent(data, word2Ind, N, V, num_iters)

# Visualizing the word vectors

In [None]:
from matplotlib import pyplot