In [1]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from collections import Counter

# NLTK >= 3.8.2 loads the Punkt sentence tokenizer from the pickle-free
# 'punkt_tab' resource, so word_tokenize raises LookupError if only the legacy
# 'punkt' package is present. Fetch both so the notebook runs on old and new
# NLTK releases alike.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Let NLTK also search the current directory for data packages.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate '.' entry to the search path each time.
if '.' not in nltk.data.path:
  nltk.data.path.append('.')

In [5]:
!git clone https://github.com/yeesem/NLP_Dataset

Cloning into 'NLP_Dataset'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (36/36), done.[K
remote: Total 36 (delta 7), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (36/36), 21.40 MiB | 4.77 MiB/s, done.
Resolving deltas: 100% (7/7), done.


In [6]:
# Load, tokenize and process the data
import re

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open("/content/NLP_Dataset/shakespeare.txt", encoding="utf-8") as f:
  corpus_text = f.read()

# Replace the listed punctuation marks (, ! ? ; -) with '.' so that a single
# token stands in for all of them.
corpus_text = re.sub(r'[,!?;-]', '.', corpus_text)

# Split the text into word and punctuation tokens.
raw_tokens = nltk.word_tokenize(corpus_text)

# Keep only purely alphabetic tokens (lower-cased) and the '.' marker.
data = [token.lower() for token in raw_tokens if token.isalpha() or token == '.']
print("Number of tokens : ", len(data), "\n", data[:15])

Number of tokens :  60976 
 ['o', 'for', 'a', 'muse', 'of', 'fire', '.', 'that', 'would', 'ascend', 'the', 'brightest', 'heaven', 'of', 'invention']


In [7]:
# Count how many times each token occurs in the processed corpus.
# FreqDist consumes any iterable of tokens, so the list is passed directly.
fdist = nltk.FreqDist(data)

# Number of distinct tokens, followed by the 20 most frequent ones with counts.
print("Size of vocabulary: ", len(fdist))
print("Most of frequent tokens : ", fdist.most_common(20))

Size of vocabulary:  5775
Most of frequent tokens :  [('.', 9630), ('the', 1521), ('and', 1394), ('i', 1257), ('to', 1159), ('of', 1093), ('my', 857), ('that', 781), ('in', 770), ('a', 752), ('you', 748), ('is', 630), ('not', 559), ('for', 467), ('it', 460), ('with', 441), ('his', 434), ('but', 417), ('me', 417), ('your', 397)]


In [10]:
def get_dict(data):
  """Build forward and reverse vocabulary lookups from a token sequence.

  Args:
    data: iterable of hashable, mutually comparable tokens; duplicates are
      allowed and collapsed.

  Returns:
    A tuple (word2Ind, Ind2word). word2Ind maps every distinct token to its
    0-based position in sorted order; Ind2word is the inverse mapping from
    position back to token.
  """
  vocabulary = sorted(set(data))
  word2Ind = {token: position for position, token in enumerate(vocabulary)}
  Ind2word = dict(enumerate(vocabulary))
  return word2Ind, Ind2word

In [11]:
# Build the token -> index and index -> token tables from the corpus tokens.
word2Ind, Ind2word = get_dict(data)

# V is the number of distinct tokens; both tables hold exactly one entry each.
V = len(Ind2word)
print("Size of vocabulary: ", V)

Size of vocabulary:  5775


In [12]:
# Spot-check the vocabulary tables in both directions:
# token -> index first, then index -> token.
king_index = word2Ind['king']
print("Index of the word 'king' : ", king_index)

word_at_2743 = Ind2word[2743]
print("Word which has index 2743 : ", word_at_2743)

Index of the word 'king' :  2744
Word which has index 2743 :  kinds


# Training the Model

### Initializing the model

In [13]:
def initialize_model(N, V, random_seed = 1):
  """Create reproducible, uniformly random weight and bias arrays.

  Seeds NumPy's global random generator with random_seed and then draws every
  entry from the uniform distribution on [0, 1).

  Args:
    N: size of the first axis of W1 and b1 (presumably the embedding / hidden
      dimension -- confirm against the caller).
    V: size of the first axis of W2 and b2 (presumably the vocabulary size --
      confirm against the caller).
    random_seed: seed passed to np.random.seed.

  Returns:
    (W1, W2, b1, b2) with shapes (N, V), (V, N), (N, 1) and (V, 1).
  """
  np.random.seed(random_seed)

  # The draw order W1, W2, b1, b2 fixes which values each array receives for a
  # given seed, so the shapes are listed in exactly that order.
  shapes = [(N, V), (V, N), (N, 1), (V, 1)]
  W1, W2, b1, b2 = [np.random.uniform(low=0, high=1, size=shape) for shape in shapes]

  return W1, W2, b1, b2

### Activation function

In [14]:
def softmax(z):
  """Numerically stable softmax taken along axis 0.

  Each column of z (each slice along axis 0) is mapped to non-negative values
  that sum to 1.

  Args:
    z: array-like of scores with at least one entry along axis 0.

  Returns:
    Array with the same shape as z holding the softmax of z along axis 0.
  """
  z = np.asarray(z)

  # Softmax is invariant to adding a constant per column. Subtracting the
  # column maximum makes the largest exponent 0, so np.exp cannot overflow to
  # inf (which previously produced NaN for large scores).
  shifted = z - np.max(z, axis = 0, keepdims = True)

  # Exponentiate once and reuse it for numerator and denominator.
  exp_shifted = np.exp(shifted)
  yhat = exp_shifted / np.sum(exp_shifted, axis = 0, keepdims = True)
  return yhat

### Forward propagation