In [1]:
pip install emoji

Collecting emoji
  Downloading emoji-2.12.1-py3-none-any.whl.metadata (5.4 kB)
Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/431.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/431.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m430.1/431.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.12.1


In [19]:
import sys

import re
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
import emoji

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Data preparation

### Cleaning and tokenization

In [3]:
corpus = 'Who ❤️ "word embeddings" in 2020? I do!!!'

In [4]:
print(f'Corpus:  {corpus}')
data = re.sub(r'[,!?;-]', '.', corpus)
print(f"After clearning punctuation : ", {data})

Corpus:  Who ❤️ "word embeddings" in 2020? I do!!!
After clearning punctuation :  {'Who ❤️ "word embeddings" in 2020. I do...'}


In [5]:
print(f"Initial string : {data}")
data = nltk.word_tokenize(data)
print(f"After tokenization: {data}")

Initial string : Who ❤️ "word embeddings" in 2020. I do...
After tokenization: ['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'I', 'do', '...']


In [6]:
print(f'Initial list of tokens: {data}')
data = [ch.lower() for ch in data
        if ch.isalpha()
        or ch == "."
        or bool(emoji.emoji_list(ch))
      ]
print(f"After cleaning : {data}")

Initial list of tokens: ['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'I', 'do', '...']
After cleaning : ['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do']


In [7]:
def tokenize(corpus):
  data = re.sub(r'[,!?;-]+', '.', corpus)
  data = nltk.word_tokenize(data)
  data = [ch.lower() for ch in data
          if ch.isalpha()
          or ch == '.'
          or bool(emoji.emoji_list(ch))
          ]
  return data

In [13]:
corpus = 'I am happy because I am learning'
print(f'Corpus:  {corpus}')
words = tokenize(corpus)
print(f'Words (tokens):  {words}')

Corpus:  I am happy because I am learning
Words (tokens):  ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']


In [8]:
tokenize("Hi John, my name is Peter. How are you?")

['hi', 'john', '.', 'my', 'name', 'is', 'peter', '.', 'how', 'are', 'you', '.']

# Sliding window of words

In [9]:
def get_windows(words, C):
  i = C
  while i < len(words) - C:
    center_word = words[i]
    context_words = words[(i - C) : i] + words[(i + 1) : (i + C + 1)]
    yield context_words, center_word
    i += 1

In [10]:
for x,y in get_windows(['i', 'am', 'happy', 'because', 'i', 'am', 'learning'], 2):
  print(f"{x}\t{y}")

['i', 'am', 'because', 'i']	happy
['am', 'happy', 'i', 'am']	because
['happy', 'because', 'am', 'learning']	i


# Transforming words into vectors for the training set

### Mapping words to indices and indices to words

In [11]:
def get_dict(words):
  words = sorted(list(set(words)))
  idx = 0
  word2Ind = {}
  Ind2word = {}
  for word in words:
    word2Ind[word] = idx
    Ind2word[idx] = word
    idx += 1
  return word2Ind, Ind2word

In [14]:
word2Ind, Ind2word = get_dict(words)

In [15]:
word2Ind

{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}

In [16]:
print("Index of the word 'i':  ",word2Ind['i'])

Index of the word 'i':   3


In [17]:
V = len(word2Ind)
print("Size of vocabulary: ", V)

Size of vocabulary:  5


# Getting one-hot word vectors

In [18]:
n = word2Ind['happy']
n

2

In [20]:
center_word_vector = np.zeros(V)
center_word_vector

array([0., 0., 0., 0., 0.])

In [21]:
len(center_word_vector) == V

True

In [22]:
center_word_vector[n] = 1

In [23]:
center_word_vector

array([0., 0., 1., 0., 0.])

In [24]:
def word_to_one_hot_vector(word, word2Ind, V):
  one_hot_vector = np.zeros(V)
  one_hot_vector[word2Ind[word]] = 1
  return one_hot_vector

In [25]:
word_to_one_hot_vector('happy', word2Ind, V)

array([0., 0., 1., 0., 0.])

In [26]:
word_to_one_hot_vector('learning', word2Ind, V)

array([0., 0., 0., 0., 1.])

# Getting context word vectors

In [27]:
context_words = ['i', 'am', 'because', 'i']

In [28]:
context_word_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
context_word_vectors

[array([0., 0., 0., 1., 0.]),
 array([1., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0.]),
 array([0., 0., 0., 1., 0.])]

In [29]:
np.mean(context_word_vectors, axis = 0)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [30]:
def context_words_to_vector(context_words, word2Ind, V):
  context_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
  context_words_vectors = np.mean(context_word_vectors, axis = 0)
  return context_words_vectors

In [31]:
context_words_to_vector(['i', 'am', 'because', 'i'], word2Ind, V)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [32]:
context_words_to_vector(['am', 'happy', 'i', 'am'], word2Ind, V)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

# Building the training set

In [33]:
words

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

In [35]:
for context_words, center_word in get_windows(words, 2):
  print(f"Context words: {context_words} -> {context_words_to_vector(context_words, word2Ind, V)}")
  print(f"Center words : {center_word}   -> {word_to_one_hot_vector(center_word, word2Ind, V)}")
  print()

Context words: ['i', 'am', 'because', 'i'] -> [0.25 0.25 0.   0.5  0.  ]
Center words : happy   -> [0. 0. 1. 0. 0.]

Context words: ['am', 'happy', 'i', 'am'] -> [0.25 0.25 0.   0.5  0.  ]
Center words : because   -> [0. 1. 0. 0. 0.]

Context words: ['happy', 'because', 'am', 'learning'] -> [0.25 0.25 0.   0.5  0.  ]
Center words : i   -> [0. 0. 0. 1. 0.]



In [36]:
def get_training_example(words, C, word2Ind, V):
  for context_words, center_word in get_windows(words, C):
    yield context_words_to_vector(context_words, word2Ind, V), word_to_one_hot_vector(center_word, word2Ind, V)

In [37]:
for context_words_vector, center_word_vector in get_training_example(words, 2, word2Ind, V):
  print(f"Context words vector: {context_words_vector}")
  print(f"Center word vector  : {center_word_vector}")
  print()

Context words vector: [0.25 0.25 0.   0.5  0.  ]
Center word vector  : [0. 0. 1. 0. 0.]

Context words vector: [0.25 0.25 0.   0.5  0.  ]
Center word vector  : [0. 1. 0. 0. 0.]

Context words vector: [0.25 0.25 0.   0.5  0.  ]
Center word vector  : [0. 0. 0. 1. 0.]



# Continuous bag-of-words model

### ReLU activation function

In [40]:
np.random.seed(10)
z_1 = 10 * np.random.rand(5, 1) - 5
z_1

array([[ 2.71320643],
       [-4.79248051],
       [ 1.33648235],
       [ 2.48803883],
       [-0.01492988]])

In [41]:
h = z_1.copy()

In [42]:
h < 0

array([[False],
       [ True],
       [False],
       [False],
       [ True]])

In [43]:
h[h < 0] = 0

In [44]:
h

array([[2.71320643],
       [0.        ],
       [1.33648235],
       [2.48803883],
       [0.        ]])

In [45]:
def relu(z):
  result = z.copy()
  result[result < 0] = 0
  return result

In [46]:
z = np.array([[-1.25459881], [ 4.50714306], [ 2.31993942], [ 0.98658484], [-3.4398136 ]])
relu(z)

array([[0.        ],
       [4.50714306],
       [2.31993942],
       [0.98658484],
       [0.        ]])

### Softmax activation function

In [47]:
z = np.array([9, 8, 11, 10, 8.5])
z

array([ 9. ,  8. , 11. , 10. ,  8.5])

In [48]:
e_z = np.exp(z)
e_z

array([ 8103.08392758,  2980.95798704, 59874.1417152 , 22026.46579481,
        4914.7688403 ])

In [49]:
sum_e_z = np.sum(e_z)
sum_e_z

97899.41826492078

In [50]:
e_z[0] / sum_e_z

0.08276947985173956

In [51]:
def softmax(z):
  e_z = np.exp(z)
  sum_e_z = np.sum(e_z)
  return e_z / sum_e_z

In [52]:
softmax([9, 8, 11, 10, 8.5])

array([0.08276948, 0.03044919, 0.61158833, 0.22499077, 0.05020223])

### Dimensions: 1-D arrays vs 2-D column vectors

In [53]:
x_array = np.zeros(V)
x_array

array([0., 0., 0., 0., 0.])

In [54]:
x_array.shape

(5,)

In [55]:
x_column_vector = x_array.copy()
x_column_vector.shape = (V,1)
x_column_vector

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [56]:
x_column_vector.shape

(5, 1)

### Forward propagation

In [57]:
N = 3

In [58]:
W1 = np.array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
               [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
               [ 0.26637602, -0.23846886, -0.37770863, -0.11399446,  0.34008124]])

W2 = np.array([[-0.22182064, -0.43008631,  0.13310965],
               [ 0.08476603,  0.08123194,  0.1772054 ],
               [ 0.1871551 , -0.06107263, -0.1790735 ],
               [ 0.07055222, -0.02015138,  0.36107434],
               [ 0.33480474, -0.39423389, -0.43959196]])

b1 = np.array([[ 0.09688219],
               [ 0.29239497],
               [-0.27364426]])

b2 = np.array([[ 0.0352008 ],
               [-0.36393384],
               [-0.12775555],
               [-0.34802326],
               [-0.07017815]])

In [59]:
print(f'V (vocabulary size): {V}')
print(f'N (embedding size / size of the hidden layer): {N}')
print(f'size of W1: {W1.shape} (NxV)')
print(f'size of b1: {b1.shape} (Nx1)')
print(f'size of W2: {W2.shape} (VxN)')
print(f'size of b2: {b2.shape} (Vx1)')

V (vocabulary size): 5
N (embedding size / size of the hidden layer): 3
size of W1: (3, 5) (NxV)
size of b1: (3, 1) (Nx1)
size of W2: (5, 3) (VxN)
size of b2: (5, 1) (Vx1)


In [60]:
training_examples = get_training_example(words, 2, word2Ind, V)

In [61]:
x_array, y_array = next(training_examples)

In [62]:
x_array

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [63]:
y_array

array([0., 0., 1., 0., 0.])

In [64]:
x = x_array.copy()
x.shape = (V,1)
print('x')
print(x)

print()

y = y_array.copy()
y.shape = (V,1)
print('y')
print(y)

x
[[0.25]
 [0.25]
 [0.  ]
 [0.5 ]
 [0.  ]]

y
[[0.]
 [0.]
 [1.]
 [0.]
 [0.]]


### Values of the hidden layer

In [65]:
z1 = np.dot(W1, x) + b1

In [66]:
z1

array([[ 0.36483875],
       [ 0.63710329],
       [-0.3236647 ]])

In [67]:
h = relu(z1)

### Value of the output layer

In [68]:
z2 = np.dot(W2, h) + b2
z2

array([[-0.31973737],
       [-0.28125477],
       [-0.09838369],
       [-0.33512159],
       [-0.19919612]])

In [69]:
y_hat = softmax(z2)
y_hat

array([[0.18519074],
       [0.19245626],
       [0.23107446],
       [0.18236353],
       [0.20891502]])

### Cross-entropy loss