# Word-level one-hot encoding

In [0]:
import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Vocabulary: every distinct whitespace-separated token gets the next free
# integer, starting at 1, so index 0 is never assigned to any token.
# Tokens are used exactly as split, so case and trailing punctuation matter
# ('The' and 'the' are different entries, 'mat.' keeps its period).
token_index = {}
for sentence in samples:
  for token in sentence.split():
    token_index.setdefault(token, len(token_index) + 1)

# Only the first max_length tokens of each sample are encoded.
max_length = 10

# results[sample, position, token id] is 1 where that token occurs and 0
# elsewhere; positions past the end of a short sample stay all-zero.
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for row, sentence in enumerate(samples):
  for position, token in enumerate(sentence.split()[:max_length]):
    results[row, position, token_index.get(token)] = 1

In [6]:
token_index

{'The': 1,
 'ate': 8,
 'cat': 2,
 'dog': 7,
 'homework.': 10,
 'mat.': 6,
 'my': 9,
 'on': 4,
 'sat': 3,
 'the': 5}

# Character-level one-hot encoding

In [0]:
import string

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# The vocabulary is every character in string.printable.
characters = string.printable

# Number the characters from 1 in their order within `characters`,
# so index 0 is not assigned to any character.
token_index = {char: position for position, char in enumerate(characters, start=1)}

In [15]:
# Only the first max_length characters of each sample are encoded.
max_length = 50
# results[sample, position, character id]; the last axis has one slot per
# vocabulary entry plus the unassigned index 0.
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))

for i, sample in enumerate(samples):
  # Walk the characters of the *current* sample (truncated to max_length),
  # not the list of samples.
  for j, character in enumerate(sample[:max_length]):
    index = token_index.get(character)
    if index is None:
      # Character is not in the vocabulary: leave this position all-zero.
      # Indexing with None would act as np.newaxis and set the whole row to 1.
      continue
    results[i, j, index] = 1

results.shape

(2, 50, 101)

# Using Keras

In [22]:
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# NOTE(review): num_words presumably caps the vocabulary to the most frequent
# words; confirm the exact cut-off against the Keras Tokenizer docs.
tokenizer = Tokenizer(num_words=1000)
# Build the word index from the sample texts.
tokenizer.fit_on_texts(samples)

# Each sample becomes a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)

# Matrix encoding of the same samples in 'binary' mode.
# NOTE(review): presumably one row per sample and one column per word index;
# confirm the shape against the Keras docs.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

sequences

[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]

# Word-level one-hot encoding with the hashing trick

In [0]:
import zlib

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Words are hashed into a fixed number of slots instead of keeping an
# explicit vocabulary; distinct words may collide in the same slot.
dim = 1000
# Only the first max_length words of each sample are encoded.
max_length = 10

# results[sample, position, hashed slot] is 1 for each encoded word.
results = np.zeros((len(samples), max_length, dim))
for i, sample in enumerate(samples):
  for j, word in enumerate(sample.split()[:max_length]):
    # Use a stable hash: the built-in hash() of a str is salted per
    # interpreter process, so its slots would differ from one session to the
    # next and the encoding could not be reused across runs.
    index = zlib.crc32(word.encode('utf-8')) % dim
    results[i, j, index] = 1