In [None]:
pip install emoji

Collecting emoji
  Downloading emoji-2.12.1-py3-none-any.whl.metadata (5.4 kB)
Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/431.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m430.1/431.4 kB[0m [31m34.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.12.1


In [None]:
import re
import nltk

# Download the Punkt tokenizer data (needed by nltk's word_tokenize)
nltk.download('punkt')

import numpy as np
from nltk.tokenize import word_tokenize
import emoji

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Data Preparation

### Cleaning and tokenization

In [None]:
# Define corpus
corpus = 'Who ❤️ "word embeddings" in 2020? I do!!!"'

In [None]:
# Print original corpus
print(f"Corpus : {corpus}")

# Do the substitution
data = re.sub(r'[,!?;-]', '.', corpus)

# Print cleaned corpus
print(f"After cleaning punctuation : {data}")

Corpus : Who ❤️ "word embeddings" in 2020? I do!!!"
After cleaning punctuation : Who ❤️ "word embeddings" in 2020. I do..."


In [None]:
# Print the cleaned corpus
print(f"Initial string : {data}")

# Tokenize the cleaned corpus
data = nltk.word_tokenize(data)

# Print the tokenized version of the corpus
print(f"After tokenization : {data}")

Initial string : Who ❤️ "word embeddings" in 2020. I do..."
After tokenization : ['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'I', 'do', '...', "''"]


In [None]:
# Print the tokenized version of the corpus
print(f"Initial list of tokens : {data}")

# Filter tokenized corpus using list comprehension.
# A token is kept only if it is purely alphabetic, is exactly '.', or
# emoji.emoji_list finds at least one emoji in it; every kept token is lowercased.
# For this corpus that removes '2020', the quote tokens and '...'.
data  = [
    ch.lower() for ch in data  # 'ch' is a whole token here, not a single character
    if ch.isalpha()
    or ch == '.'
    or bool(emoji.emoji_list(ch))
]

# Print the tokenized and filtered version of the corpus
print(f"After cleaning : {data}")

Initial list of tokens : ['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'I', 'do', '...', "''"]
After cleaning : ['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do']


In [None]:
def tokenize(corpus):
  """Turn raw text into a list of lowercase tokens.

  Every run of ',', '!', '?', ';' or '-' is first collapsed into a single '.'.
  The text is then split with nltk.word_tokenize, and a token is kept
  (lowercased) only if it is alphabetic, is exactly '.', or contains an emoji.
  """
  def keep(token):
    # Words, sentence separators and emoji survive; everything else is dropped.
    return token.isalpha() or token == '.' or bool(emoji.emoji_list(token))

  normalized = re.sub(r'[,!?;-]+', '.', corpus)
  return [token.lower() for token in nltk.word_tokenize(normalized) if keep(token)]

In [None]:
# Define new corpus
corpus = "I am happy because I am learning"

# Print new corpus
print(f"Corpus : {corpus}")

# Save tokenized version of corpus into 'words' variable
words = tokenize(corpus)

# Print the tokenized version of the corpus
print(f"Words (tokens) : {words}")

Corpus : I am happy because I am learning
Words (tokens) : ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']


In [None]:
# Run this with any sentence
tokenize("My name is John. How are you?")

['my', 'name', 'is', 'john', '.', 'how', 'are', 'you', '.']

# Sliding window of words

In [None]:
# Define the 'get_windows' function
def get_windows(words, C):
  """Yield (context_words, center_word) pairs from a sliding window over words.

  words: sequence of tokens (must support slicing and concatenation).
  C: context half-size, i.e. how many tokens are taken on each side of the center.

  Only positions that have C tokens on both sides are used as centers, so the
  first and last C tokens never appear as a center word.
  """
  for center in range(C, len(words) - C):
    left = words[center - C : center]
    right = words[center + 1 : center + C + 1]
    yield left + right, words[center]

In [None]:
for x,y in get_windows(['i', 'am', 'happy', 'because', 'i', 'am', 'learning'], 2):
  print(f"{x}\t{y}")

['i', 'am', 'because', 'i']	happy
['am', 'happy', 'i', 'am']	because
['happy', 'because', 'am', 'learning']	i


In [None]:
# Print 'context_words' and 'center_word' for any sentence with a 'context half-size' of 1
for x, y in get_windows(tokenize("My name is John. How are you?"), 1):
    print(f'{x}\t{y}')

['my', 'is']	name
['name', 'john']	is
['is', '.']	john
['john', 'how']	.
['.', 'are']	how
['how', 'you']	are
['are', '.']	you


# Transforming words into vectors for the training set

In [None]:
def get_dict(data):
  """Build the two vocabulary lookup tables for a tokenized corpus.

  data: iterable of tokens (duplicates allowed).

  Returns (word2Ind, Ind2word): word2Ind maps each distinct token to its
  position in the sorted vocabulary, and Ind2word is the inverse mapping
  from position back to token.
  """
  # Sorting makes the index assignment independent of the order of tokens in data.
  vocabulary = sorted(set(data))
  word2Ind = {word: idx for idx, word in enumerate(vocabulary)}
  Ind2word = dict(enumerate(vocabulary))
  return word2Ind, Ind2word

In [None]:
# Get "word2Ind" and "Ind2word" dictionaries for the tokenized corpus
word2Ind, Ind2word = get_dict(words)

In [None]:
# Print 'word2Ind' dictionary
word2Ind

{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}

In [None]:
# Print value for the key 'i' within word2Ind dictionary
print("Index of the word 'i' : ", word2Ind['i'])

Index of the word 'i' :  3


In [None]:
# Print 'Ind2word' dictionary
Ind2word

{0: 'am', 1: 'because', 2: 'happy', 3: 'i', 4: 'learning'}

In [None]:
# Print value for the key 2 (an integer index) within Ind2word dictionary
print("Word which has index 2 : ", Ind2word[2])

Word which has index 2 :  happy


In [None]:
# Save length of word2Ind dictionary into the 'V' variable
V = len(word2Ind)

# Print length of word2Ind dictionary
print("Size of vacabulary : ", V)

Size of vacabulary :  5


# Getting one-hot word vectors

In [None]:
# Save index of word 'happy' into the 'n' variable
n = word2Ind['happy']

# Print index of word 'happy'
n

2

In [None]:
# Create vector with the same length as the vocabulary, filled with zeros
center_word_vector = np.zeros(V)

# Print vector
center_word_vector

array([0., 0., 0., 0., 0.])

In [None]:
# Check that the length of the vector is the same as the size of the vocabulary
len(center_word_vector) == V

True

In [None]:
# Replace element number 'n' with a 1
center_word_vector[n] = 1

# Print vector
center_word_vector

array([0., 0., 1., 0., 0.])

In [None]:
# Define the 'word_to_one_hot_vector' function
def word_to_one_hot_vector(word, word2Ind, V):
  """Return the one-hot encoding of `word` as a NumPy vector of length V.

  word2Ind maps words to integer positions. The entry at the position of
  `word` is set to 1 and all other entries stay 0. A word that is missing
  from word2Ind raises KeyError.
  """
  encoding = np.zeros(V)
  position = word2Ind[word]
  encoding[position] = 1
  return encoding

In [None]:
# Print output of 'word_to_one_hot_vector' function for word 'happy'
word_to_one_hot_vector('happy', word2Ind, V)

array([0., 0., 1., 0., 0.])

# Getting context word vectors

In [None]:
# Define list containing context words
context_words = ['i', 'am', 'because', 'i']

In [None]:
# Create one-hot vectors for each context word
context_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]

# Print context words vectors
context_words_vectors

[array([0., 0., 0., 1., 0.]),
 array([1., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0.]),
 array([0., 0., 0., 1., 0.])]

In [None]:
# Compute the mean of vectors using numpy
# Note the axis=0 parameter: it tells mean to average across the rows (one row
# per context word), element by element, giving a single vector of length V.
# With axis=1 you would instead get the mean of each individual row.
np.mean(context_words_vectors, axis = 0)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [None]:
# Define the 'context_words_to_vector' function
def context_words_to_vector(context_words, word2Ind, V):
  """Average the one-hot vectors of the context words into one length-V vector.

  context_words: iterable of words, each of which must be a key of word2Ind.
  word2Ind: mapping from word to its integer position in the vocabulary.
  V: vocabulary size, i.e. the length of every one-hot vector.

  Returns a NumPy vector of length V whose entry for each word is the fraction
  of context positions occupied by that word. An empty context returns the
  all-zero vector.
  """
  one_hot_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
  # np.mean over an empty list emits a RuntimeWarning and returns a scalar NaN,
  # which breaks the "vector of length V" contract; fall back to zeros instead.
  if not one_hot_vectors:
    return np.zeros(V)
  return np.mean(one_hot_vectors, axis=0)

In [None]:
# Print output of 'context_words_to_vector' function for context words: 'i', 'am', 'because', 'i'
context_words_to_vector(['i', 'am', 'because', 'i'], word2Ind, V)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

# Building the training set

In [None]:
# Print corpus
words

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

In [None]:
for context_words, center_word in get_windows(words, 2):
  print(f"Context words : {context_words} -> {context_words_to_vector(context_words, word2Ind, V)}")
  print(f'Center word:  {center_word} -> {word_to_one_hot_vector(center_word, word2Ind, V)}')
  print()

Context words : ['i', 'am', 'because', 'i'] -> [0.25 0.25 0.   0.5  0.  ]
Center word:  happy -> [0. 0. 1. 0. 0.]

Context words : ['am', 'happy', 'i', 'am'] -> [0.5  0.   0.25 0.25 0.  ]
Center word:  because -> [0. 1. 0. 0. 0.]

Context words : ['happy', 'because', 'am', 'learning'] -> [0.25 0.25 0.25 0.   0.25]
Center word:  i -> [0. 0. 0. 1. 0.]



In [None]:
# Define the generator function 'get_training_example'
def get_training_example(words, C, word2Ind, V):
  """Generate (context_vector, center_vector) training pairs for a token list.

  For every window produced by get_windows(words, C), yields the averaged
  one-hot vector of the context words together with the one-hot vector of
  the center word.
  """
  for context_words, center_word in get_windows(words, C):
    features = context_words_to_vector(context_words, word2Ind, V)
    target = word_to_one_hot_vector(center_word, word2Ind, V)
    yield features, target

In [None]:
# Print vectors associated to center and context words for corpus using the generator function
for context_words_vector, center_word_vector in get_training_example(words, 2, word2Ind, V):
    print(f'Context words vector:  {context_words_vector}')
    print(f'Center word vector:  {center_word_vector}')
    print()

Context words vector:  [0.25 0.25 0.   0.5  0.  ]
Center word vector:  [0. 0. 1. 0. 0.]

Context words vector:  [0.5  0.   0.25 0.25 0.  ]
Center word vector:  [0. 1. 0. 0. 0.]

Context words vector:  [0.25 0.25 0.25 0.   0.25]
Center word vector:  [0. 0. 0. 1. 0.]

