In [3]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
from keras.preprocessing import text

import pandas as pd
import itertools  
from keras.utils import np_utils
import numpy as np
import io
import os

from sklearn.metrics.pairwise import euclidean_distances

#### Types of WordEmbedding Algorithms:
        
        1. CBOW
        2. Skip-Gram with Negative Sampling 

#### Text Preprocessing Steps

1. Convert text to UTF-8 [Encode]
2. Remove Special Characters, Numbers, Punctuations, Stop Words, html urls etc. [This can be done using regex or NLTK python library]
3. Lemmatization & Stemming
        It is the process of converting a word to its base form, e.g., “caring” to “care”
4. Convert to lower case
5. Tokenize

In [4]:
file_path = '/Users/ukannika/work/personal/machine-learning/datasets/tweets.csv'
raw_df = pd.read_csv(file_path, encoding = "ISO-8859-1")

tweets = raw_df['Tweet Content']
tweets.head(5)

0    Pets change our lives &amp; become a part of o...
1    Another spot of our #morethanmedicine bus in #...
2    What a great team â¦@HealthSourceOHâ© â¦@Lo...
3    What a great team â¦@HealthSourceOHâ© â¦@Lo...
4    What a great team â¦@HealthSourceOHâ© â¦@Lo...
Name: Tweet Content, dtype: object

In [6]:
print("Shape: ", tweets.shape)

Shape:  (386,)


##### Tokenizer
Text tokenization utility class.

num_words: *the maximum number of words to keep, based
            on word frequency. Only the most common `num_words-1` words will
            be kept.*
            
filters: *a string where each element is a character that will be
            filtered from the texts. The default is all punctuation, plus
            tabs and line breaks, minus the `'` character.*
            
lower: *boolean. Whether to convert the texts to lowercase.*

split: *str. Separator for word splitting.*

char_level: *if True, every character will be treated as a token.*

oov_token: *if given, it will be added to word_index and used to
            replace out-of-vocabulary words during text_to_sequence calls*
        
**Methods :**
1. fit_on_texts => Updates internal vocabulary based on a list of texts. Should be used before texts_to_sequences

2. texts_to_sequences => Transforms each text in texts to a sequence of integers.


Step1 => Create a vocabulary(Each word assigned with unique number). This vocabulary get's created when 
tokenizer.fit_on_texts method called.

Step2 => Convert text to sequences. Each sentence vector contains numbers associated with that word.

In [7]:
oov_token = '<UNK>'
tokenizer = tf.keras.preprocessing.text.Tokenizer(
                        num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n>^https?:\/\/.*[\r\n]*[^\x00-\x7F]+', lower=True,
                        split=' ', char_level=False, oov_token=oov_token, document_count=0)

# Tokenize our training data 
tokenizer.fit_on_texts(tweets) 

# Generate Sequeneces.
sequences = tokenizer.texts_to_sequences(tweets)
print(tweets[0])
print(sequences[0])

Pets change our lives &amp; become a part of our families â¤ï¸
That's why our members offer many solutions to help you to enjoy a long-lasting bond with your happy &amp; healthy pet ð±ð¶
#MorethanMedicine #PetCare #PetsareFamily https://t.co/fZNIXge9a3
[3, 34, 563, 9, 112, 31, 427, 2, 90, 18, 9, 959, 428, 2, 143, 17, 19, 9, 83, 367, 166, 51, 14, 4, 25, 30, 4, 104, 2, 429, 205, 22, 310, 21, 42, 2, 19, 31, 11, 19, 3, 960, 6, 8, 3, 69, 3, 494, 5, 961]


Mappings between word to index and viceversa.

In [8]:
word_index = tokenizer.word_index
index_word = tokenizer.index_word

In [9]:
for x in list(word_index)[10:15]:
    print ("{}:{}".format(x,  word_index[x]))
    
print("\n")

for x in list(index_word)[10:15]:
    print ("{}:{}".format(x,  index_word[x]))

eal:11
and:12
er:13
ion:14
we:15


11:eal
12:and
13:er
14:ion
15:we


#### CBOW Implementation

##### Padding
Padding is a special form of masking where the masked steps are at the start or at the beginning of a sequence. Padding comes from the need to encode sequence data into contiguous batches: in order to make all sequences in a batch fit a given standard length, it is necessary to pad or truncate some sequences.

In [10]:
word_index['PAD'] = 0 
index_word[0] = 'PAD'

vocab_size = len(word_index)
print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word_index.items())[:10])

Vocabulary Size: 1125
Vocabulary Sample: [('<UNK>', 1), ('a', 2), ('e', 3), ('o', 4), ('co', 5), ('more', 6), ('i', 7), ('anmedicine', 8), ('our', 9), ('animal', 10)]


In [11]:
def generate_cbow_pairs(sequence, window_size):
    x = []
    y = []
    for i in range(0, len(sequence) - window_size, 1):
        x.append(sequence[i:i+window_size])
        y.append(sequence[i + window_size])
    
    return (x, y)

In [12]:
# Verify generate_cbow_pairs method. 
for sequence in sequences[0:1]:
    print(sequence, "\n")
    x, y = generate_cbow_pairs(sequence, 4)
    print(x, y)

[3, 34, 563, 9, 112, 31, 427, 2, 90, 18, 9, 959, 428, 2, 143, 17, 19, 9, 83, 367, 166, 51, 14, 4, 25, 30, 4, 104, 2, 429, 205, 22, 310, 21, 42, 2, 19, 31, 11, 19, 3, 960, 6, 8, 3, 69, 3, 494, 5, 961] 

[[3, 34, 563, 9], [34, 563, 9, 112], [563, 9, 112, 31], [9, 112, 31, 427], [112, 31, 427, 2], [31, 427, 2, 90], [427, 2, 90, 18], [2, 90, 18, 9], [90, 18, 9, 959], [18, 9, 959, 428], [9, 959, 428, 2], [959, 428, 2, 143], [428, 2, 143, 17], [2, 143, 17, 19], [143, 17, 19, 9], [17, 19, 9, 83], [19, 9, 83, 367], [9, 83, 367, 166], [83, 367, 166, 51], [367, 166, 51, 14], [166, 51, 14, 4], [51, 14, 4, 25], [14, 4, 25, 30], [4, 25, 30, 4], [25, 30, 4, 104], [30, 4, 104, 2], [4, 104, 2, 429], [104, 2, 429, 205], [2, 429, 205, 22], [429, 205, 22, 310], [205, 22, 310, 21], [22, 310, 21, 42], [310, 21, 42, 2], [21, 42, 2, 19], [42, 2, 19, 31], [2, 19, 31, 11], [19, 31, 11, 19], [31, 11, 19, 3], [11, 19, 3, 960], [19, 3, 960, 6], [3, 960, 6, 8], [960, 6, 8, 3], [6, 8, 3, 69], [8, 3, 69, 3], [3, 69,

##### Build train data

In [13]:
# Hyper parameters for CBOW model.
window_size = 4
embed_size = 300

In [14]:
train_features = []
train_labels = []

for sequence in sequences:
    x, y = generate_cbow_pairs(sequence, window_size)
    train_features.extend(x)
    train_labels.extend(y)

# Convert the list to numpy array.
x_train = np.asarray(train_features)
y_train = np.asarray(train_labels)

print("Features Shape: ", x_train.shape)
print("Labels Shape: ", y_train.shape)

Features Shape:  (14494, 4)
Labels Shape:  (14494,)


In [15]:
Y_labels = np_utils.to_categorical(y_train, vocab_size)
print(Y_labels.shape)
Y_labels

(14494, 1125)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

#### Build Model & Train

Embedding class Turns positive integers (indexes) into dense vectors of fixed size.

e.g. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]

This layer can only be used as the first layer in a model.

In [69]:
model = keras.Sequential([
        keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size),
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dense(vocab_size, activation='softmax')])

model.compile(optimizer=keras.optimizers.Adam(lr=1e-3), loss=keras.losses.CategoricalCrossentropy())

In [70]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 4, 300)            337500    
_________________________________________________________________
global_average_pooling1d_3 ( (None, 300)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 256)               77056     
_________________________________________________________________
dense_9 (Dense)              (None, 1125)              289125    
Total params: 703,681
Trainable params: 703,681
Non-trainable params: 0
_________________________________________________________________


In [74]:
history = model.fit(x_train, Y_labels, batch_size=2000, epochs=10)

Train on 14494 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Retrieve the learned embeddings

In [75]:
embedding_layer = model.layers[0]
weights = embedding_layer.get_weights()[0]
print(weights.shape) 
print(weights[0])

(1125, 300)
[ 0.02007321  0.04350071  0.01544568 -0.00936638 -0.03054251 -0.03378773
  0.0253515   0.01920177  0.04073009  0.04364152  0.0457342  -0.01785219
  0.03220594  0.04336016  0.03884926 -0.02602983  0.04449982 -0.02690039
 -0.00022074 -0.03083779 -0.0433033  -0.04465207  0.04261256 -0.02998441
  0.0375744   0.02662111 -0.0128968  -0.03417498  0.00962695  0.01021521
  0.04195645 -0.0374647  -0.02804193 -0.01547996 -0.01075705 -0.0181417
  0.03958631 -0.0018759   0.03126604 -0.01205534  0.03290215 -0.02777009
 -0.04677712 -0.04433953 -0.02434924  0.02098035 -0.03920626 -0.00438463
 -0.03070349 -0.03782425 -0.00810627  0.00963453  0.01537367  0.01646267
  0.00408176 -0.02522275  0.02311251 -0.02515165  0.03230816  0.01896275
 -0.0452767   0.04619545  0.04516533  0.01172287  0.01555493  0.00772208
 -0.04287213  0.01814878  0.03943956 -0.01290433  0.04728908 -0.00499083
 -0.01116442  0.03490582 -0.04606031  0.03758509 -0.00830249 -0.0215108
 -0.03080089  0.00321914 -0.00918034 -0.0

#### Find Similar Words

In [76]:
# Write weights to disk to view it on Embedding Projector. In practice,
# we can use Embedding Projector or project this data to 30 Dimensions and then Use t-Sne to visualize it.
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for num, word in enumerate(word_index):
    vec = weights[num - 1] # skip 0, it's padding.
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    
out_v.close()
out_m.close()

#### pre-trained word vectors

In [60]:
out_v = io.open('vecs_pretrained.tsv', 'w', encoding='utf-8')
out_m = io.open('meta_pretrained.tsv', 'w', encoding='utf-8')

embeddings_index = {}

with open(os.path.join('/Users/ukannika/Downloads/glove.6B/', 'glove.6B.100d.txt')) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs_vec = np.fromstring(coefs, 'f', sep=' ')
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in coefs_vec]) + "\n")   

# View these in Embedding Projector

##### Skip Gram Models With Negative Sampling

This function transforms a sequence of word indexes (list of integers) into tuples of words of the form:
    
    (word, word in the same window), with label 1 (positive samples).
    (word, random word from the vocabulary), with label 0 (negative samples).

In [83]:
x, y = tf.keras.preprocessing.sequence.skipgrams(
                sequences[0], vocab_size, window_size=1, negative_samples=0.6, shuffle=True,
                categorical=False, sampling_table=None, seed=None)

print(x, "\n")
print(y)

[[960, 6], [9, 348], [19, 11], [19, 31], [2, 90], [310, 22], [367, 221], [143, 871], [8, 3], [563, 533], [90, 499], [166, 367], [22, 205], [21, 42], [5, 399], [30, 766], [143, 434], [367, 83], [367, 57], [9, 667], [6, 8], [21, 310], [31, 810], [8, 6], [494, 5], [90, 242], [31, 1055], [83, 9], [17, 11], [34, 3], [104, 2], [42, 233], [961, 5], [427, 2], [18, 597], [112, 31], [4, 999], [42, 1094], [3, 19], [3, 359], [2, 19], [205, 429], [494, 3], [563, 34], [9, 19], [494, 981], [11, 276], [3, 494], [42, 21], [367, 166], [83, 367], [18, 499], [25, 30], [5, 494], [5, 961], [428, 959], [19, 17], [2, 922], [51, 712], [30, 25], [25, 408], [3, 27], [4, 1013], [14, 4], [166, 1085], [3, 8], [143, 17], [3, 69], [2, 104], [51, 166], [104, 977], [4, 952], [19, 174], [69, 3], [959, 9], [21, 308], [563, 9], [2, 428], [19, 677], [494, 800], [429, 205], [51, 14], [112, 1024], [2, 429], [69, 330], [960, 3], [205, 22], [960, 343], [3, 69], [42, 2], [17, 516], [19, 1000], [19, 2], [19, 549], [143, 2], [18,

In [84]:
# Build train data
x = []
y = []

for sequence in sequences:
    x_skipgm, y_skipgm = tf.keras.preprocessing.sequence.skipgrams(
                sequence, vocab_size, window_size=1, negative_samples=0.6, shuffle=True,
                categorical=False, sampling_table=None, seed=None)
    x.extend(x_skipgm)
    y.extend(y_skipgm)
    
x_train = np.asarray(x)
y_train = np.asarray(y)

In [86]:
# Build Model
model = keras.Sequential([
          keras.layers.Embedding(vocab_size, embed_size),
          keras.layers.GlobalAveragePooling1D(),
          keras.layers.Dense(16, activation='relu'),
          keras.layers.Dense(1, activation='sigmoid')
        ])

model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), metrics=['accuracy'])

history = model.fit(x, y, batch_size=2000, epochs=5)

Train on 49938 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [64]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) 

out_v = io.open('vecs_skipgram.tsv', 'w', encoding='utf-8')
out_m = io.open('meta_skipgram.tsv', 'w', encoding='utf-8')

for num, word in enumerate(word_index):
    vec = weights[num - 1] # skip 0, it's padding.
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    
out_v.close()
out_m.close()

(50006, 300)
