In [2]:
import re
import numpy as np
import matplotlib.pyplot as plt

### Make Encoder and Decoder functions

In [3]:
# Toy corpus: three short quotes, one string per sentence.
text = [
    'All that we are is the result of what we have thought',
    'To be or not to be is the question',
    'Be yourself everyone else is already taken',
]

# Flatten the corpus into a single lowercase word sequence (sentence
# boundaries are dropped), splitting on each whitespace character.
allwords = re.split(r"\s", " ".join(sentence.lower() for sentence in text))

# The vocabulary is the alphabetically sorted list of distinct words.
vocab = list(set(allwords))
vocab.sort()


In [4]:
# Lookup tables between words and integer token IDs; a word's ID is its
# position in the vocab list.
word2idx = dict(zip(vocab, range(len(vocab))))  # word -> token ID
idx2word = dict(enumerate(vocab))               # token ID -> word

In [5]:
# create encoder and decoder functions
def encoder(text):
  """Convert a string into a list of integer token IDs.

  The text is lowercased and split on runs of whitespace, so leading,
  trailing, or repeated whitespace does not produce empty tokens. An
  empty or whitespace-only string encodes to an empty list.

  Args:
    text: string to tokenize.

  Returns:
    List of token IDs (values looked up in word2idx), one per word, in
    the order the words appear.

  Raises:
    KeyError: if a word is not a key of word2idx.
  """
  # str.split() with no separator collapses whitespace runs; splitting on a
  # single whitespace character would emit '' tokens, which are not in the
  # vocabulary and would raise KeyError('').
  return [word2idx[w] for w in text.lower().split()]

def decoder(indices):
  """Turn a sequence of token IDs back into a space-separated string.

  Each ID is looked up in idx2word; an ID that is not a key raises
  KeyError. An empty sequence decodes to the empty string.
  """
  words = []
  for token_id in indices:
    words.append(idx2word[token_id])
  return " ".join(words)

In [6]:
# Round-trip check: encode a new sentence, then decode the IDs back to text.
newtext = ('we already are the result of what '
           'everyone else already thought')
newTokenIDs = encoder(newtext)

# Show the input, its token IDs, and the reconstruction, one per line.
print(newtext, newTokenIDs, decoder(newTokenIDs), sep="\n")

we already are the result of what everyone else already thought
[18, 1, 2, 15, 12, 9, 19, 5, 4, 1, 16]
we already are the result of what everyone else already thought


In [10]:
# Pick a word and find every position where it occurs in the corpus.
targetWord = 'to'
targetTokenID = word2idx[targetWord]  # KeyError if the word is not in the vocab
targetLocs = np.where(np.array(allwords)==targetWord)[0]
print(f'"{targetWord}" appears at indices {targetLocs}')

# Show each occurrence with one word of context on either side, both as
# token IDs and as text. allwords is the flattened corpus, so a window can
# span two adjacent sentences.
tokens = [word2idx[w] for w in allwords]
for t in targetLocs:
  # Clamp the window start at 0: for a match at position 0, t-1 is -1 and
  # the slice [-1:t+2] would be empty instead of showing the first words.
  start = max(t-1, 0)
  print(tokens[start:t+2])
  print(" ".join(allwords[start:t+2]))


"to" appears at indices [12 16]
[16, 17, 3]
thought to be
[8, 17, 3]
not to be


### One-hot Encoding

In [11]:
# One-hot encode the corpus: row r is all zeros except for a single 1 in the
# column given by the token ID of the r-th word in allwords.
# Indexing an identity matrix by token ID selects the matching one-hot row,
# giving a (len(allwords), len(vocab)) integer matrix.
word_matrix = np.eye(len(vocab), dtype=int)[[word2idx[w] for w in allwords]]

print(word_matrix)

[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 