In [None]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
from keras.preprocessing import text

import pandas as pd
import itertools  
from keras.utils import np_utils
import numpy as np
import io
import os

from sklearn.metrics.pairwise import euclidean_distances

#### Types of WordEmbedding Algorithms:
        
        1. CBOW
        2. Skip-Gram with Negative Sampling 

#### Text Preprocessing Steps

1. Convert text to UTF-8 [Encode]
2. Remove special characters, numbers, punctuation, stop words, HTML tags, URLs, etc. [This can be done using regular expressions or the NLTK Python library]
3. Lemmatization & Stemming
        It is the process of converting a word to its base form, e.g., “caring” to “care”
4. Convert to lower case
5. Tokenize

In [None]:
# Locate the dataset directory. The DATASETS_DIR environment variable lets the
# notebook run on other machines; the original local path is kept as the default
# so existing behaviour is unchanged when the variable is not set.
datasets_dir = os.environ.get(
    'DATASETS_DIR', '/Users/ukannika/work/personal/machine-learning/datasets')
file_path = os.path.join(datasets_dir, 'tweets.csv')

# The CSV is read as ISO-8859-1 (Latin-1), as in the original cell.
raw_df = pd.read_csv(file_path, encoding = "ISO-8859-1")

# Only the tweet text column is used in the rest of the notebook.
tweets = raw_df['Tweet Content']
tweets.head(5)

In [None]:
# Report the dimensions of the tweet column loaded above.
tweet_shape = tweets.shape
print("Shape: ", tweet_shape)

##### Tokenizer
Text tokenization utility class.

num_words: *the maximum number of words to keep, based
            on word frequency. Only the most common `num_words-1` words will
            be kept.*
            
filters: *a string where each element is a character that will be
            filtered from the texts. The default is all punctuation, plus
            tabs and line breaks, minus the `'` character.*
            
lower: *boolean. Whether to convert the texts to lowercase.*

split: *str. Separator for word splitting.*

char_level: *if True, every character will be treated as a token.*

oov_token: *if given, it will be added to word_index and used to
            replace out-of-vocabulary words during texts_to_sequences calls*
        
**Methods :**
1. fit_on_texts => Updates internal vocabulary based on a list of texts. Should be used before texts_to_sequences

2. texts_to_sequences => Transforms each text in texts to a sequence of integers.


Step 1 => Create a vocabulary (each word is assigned a unique number). This vocabulary is created when
the tokenizer.fit_on_texts method is called.

Step 2 => Convert text to sequences. Each sentence becomes a list of the numbers assigned to its words.

In [None]:
oov_token = '<UNK>'

# `filters` is a plain collection of characters to strip, NOT a regular expression.
# The previous value appended regex text for URLs and non-ASCII characters to it,
# which put the letters h, t, p and s into the filter set and silently removed
# them from every tweet. URLs and non-ASCII runs are therefore removed with real
# regular expressions first, and the tokenizer only filters punctuation and
# whitespace control characters.
# NOTE(review): the original pattern looked like it was meant to drop URLs and
# non-ASCII characters; confirm `https?://\S+` matches the intended URL shape.
tweets_clean = (
    tweets
    .str.replace(r'https?://\S+', ' ', regex=True)    # drop URLs
    .str.replace(r'[^\x00-\x7F]+', ' ', regex=True)   # drop non-ASCII runs
)

tokenizer = tf.keras.preprocessing.text.Tokenizer(
                        num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r', lower=True,
                        split=' ', char_level=False, oov_token=oov_token, document_count=0)

# Build the vocabulary from the cleaned tweets.
tokenizer.fit_on_texts(tweets_clean)

# Generate sequences: each tweet becomes a list of word ids.
sequences = tokenizer.texts_to_sequences(tweets_clean)
print(tweets[0])
print(sequences[0])

Mappings from word to index and vice versa.

In [None]:
# Keep handles on the tokenizer's two lookup tables: word -> id and id -> word.
word_index, index_word = tokenizer.word_index, tokenizer.index_word

In [None]:
# Show entries 10..14 of the word -> id table.
for token in itertools.islice(word_index, 10, 15):
    print ("{}:{}".format(token, word_index[token]))

print("\n")

# Show entries 10..14 of the id -> word table.
for token_id in itertools.islice(index_word, 10, 15):
    print ("{}:{}".format(token_id, index_word[token_id]))

#### CBOW Implementation

##### Padding
Padding is a special form of masking where the masked steps are at the start or at the end of a sequence. Padding comes from the need to encode sequence data into contiguous batches: in order to make all sequences in a batch fit a given standard length, it is necessary to pad or truncate some sequences.

In [None]:
# Register the padding token under id 0 in both lookup tables.
index_word[0] = 'PAD'
word_index['PAD'] = 0

# The vocabulary size counts every entry in word_index, including 'PAD'.
vocab_size = len(word_index)
print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(itertools.islice(word_index.items(), 10)))

In [None]:
def generate_cbow_pairs(sequence, window_size):
    """Slide a fixed-size window over `sequence` to build (context, target) pairs.

    Each context is `window_size` consecutive items and its target is the item
    that immediately follows that window, so the context lies entirely to the
    left of the target.

    Args:
        sequence: indexable, sliceable sequence of items (e.g. a list of word ids).
        window_size: number of consecutive items forming one context.

    Returns:
        A tuple `(contexts, targets)` of two equally long lists. Both are empty
        when `sequence` has no more than `window_size` items.
    """
    # Every position with a full window before it is a target.
    target_positions = range(window_size, len(sequence))
    contexts = [sequence[pos - window_size:pos] for pos in target_positions]
    targets = [sequence[pos] for pos in target_positions]
    return (contexts, targets)

In [None]:
# Sanity-check generate_cbow_pairs on the first tweet only.
for sample in sequences[:1]:
    print(sample, "\n")
    contexts, targets = generate_cbow_pairs(sample, 4)
    print(contexts, targets)

##### Build train data

In [None]:
# CBOW hyperparameters: number of context words per sample, and the
# dimensionality of each learned word vector.
window_size, embed_size = 4, 300

In [None]:
# Generate the (contexts, targets) pair lists for every tweet, then flatten
# them into one list of context windows and one list of target ids.
pairs_per_tweet = [generate_cbow_pairs(seq, window_size) for seq in sequences]
train_features = list(
    itertools.chain.from_iterable(contexts for contexts, _ in pairs_per_tweet))
train_labels = list(
    itertools.chain.from_iterable(targets for _, targets in pairs_per_tweet))

# Keras consumes arrays, so convert the Python lists.
x_train = np.asarray(train_features)
y_train = np.asarray(train_labels)

print("Features Shape: ", x_train.shape)
print("Labels Shape: ", y_train.shape)

In [None]:
# One-hot encode the target word ids so they match the softmax output of the
# CBOW model (one column per vocabulary entry).
# tf.keras.utils.to_categorical is used instead of the standalone
# keras.utils.np_utils module so the notebook relies on a single Keras
# implementation (tf.keras) and does not depend on np_utils being available.
Y_labels = tf.keras.utils.to_categorical(y_train, vocab_size)
print(Y_labels.shape)
Y_labels

#### Build Model & Train

The Embedding layer turns positive integers (indexes) into dense vectors of fixed size.

e.g. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]

This layer can only be used as the first layer in a model.

In [None]:
# First understand the Embedding layer's input shape and output shape.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embed_size, input_length=window_size))

# Build a dummy batch that matches how the layer was declared: 32 rows, each
# holding `window_size` word ids drawn from [0, vocab_size).
# (Previously the batch had 10 columns and ids below a fixed 1000, which matched
# neither input_length=window_size nor the layer's vocab_size.)
input_array = np.random.randint(vocab_size, size=(32, window_size))

model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)

# Print output shape
# (32, window_size, embed_size) => (batch size, ids per row, embedding size)
print(output_array.shape)

# Now convert the 3D output to 2D, as we need to feed it into a Dense layer for training.
# Example: GlobalAveragePooling1D averages over the middle axis,
# turning a (1, 2, 3) tensor into a (1, 3) tensor.
input_shape = (1, 2, 3)
x = tf.random.normal(input_shape)
y = tf.keras.layers.GlobalAveragePooling1D()(x)
print(x.shape)
print(x)
print(y.shape)
print(y)

In [None]:
# CBOW-style model: embed each of the `window_size` context word ids, average
# the embeddings into one vector, then predict the target word with a softmax
# over the whole vocabulary.
model = keras.Sequential([
        keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size),
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dense(vocab_size, activation='softmax')])

# Compile the model. `learning_rate` replaces the deprecated `lr` argument.
# Categorical cross-entropy matches the one-hot encoded labels.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3), loss=keras.losses.CategoricalCrossentropy())

In [None]:
# Print the layer-by-layer summary of the CBOW model defined above.
model.summary()

In [None]:
# Train the CBOW model on the context windows against the one-hot targets.
history = model.fit(
    x=x_train,
    y=Y_labels,
    epochs=10,
    batch_size=2000,
)

#### Retrieve the learned embeddings

In [None]:
# The Embedding layer is the first layer of the model; the first array it
# returns from get_weights() is the learned embedding table.
embedding_layer = model.layers[0]
weights = embedding_layer.get_weights()[0]
print(weights.shape, weights[0], sep="\n")

# Weights Shape => (vocab_size, embedding_size) = (1125, 300)

#### Find Similar Words

In [None]:
# Write weights to disk to view them in the Embedding Projector. In practice,
# we can use the Embedding Projector, or project this data to 30 dimensions and
# then use t-SNE to visualize it.
#
# Each row of `weights` is addressed by the word's id (the value stored in
# word_index), not by its position while iterating the dict. Indexing by
# position paired words with the wrong vectors (the first word even received
# the last row through weights[-1]).
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word, idx in word_index.items():
        if idx == 0:
            continue  # skip 0, it's padding.
        vec = weights[idx]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
# Both files are closed by the `with` block, even if writing fails part-way.

#### pre-trained word vectors

In [None]:
# Convert a pre-trained GloVe text file into the vectors/metadata TSV pair
# expected by the Embedding Projector.
embeddings_index = {}  # kept as in the original cell; not populated here

glove_path = os.path.join('/Users/ukannika/Downloads/glove.6B/', 'glove.6B.100d.txt')

# All three files are managed by one `with` so the output files are flushed and
# closed (the original never closed them). The input is opened explicitly as
# UTF-8 instead of the platform default encoding.
# NOTE(review): assumes the GloVe file is UTF-8 encoded — confirm.
with open(glove_path, encoding='utf-8') as f, \
        io.open('vecs_pretrained.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta_pretrained.tsv', 'w', encoding='utf-8') as out_m:
    for line in f:
        # Each line is "<word> <coef> <coef> ...": split off the word once.
        word, coefs = line.split(maxsplit=1)
        coefs_vec = np.fromstring(coefs, 'f', sep=' ')
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in coefs_vec]) + "\n")

# View these in Embedding Projector

##### Skip Gram Models With Negative Sampling

The `tf.keras.preprocessing.sequence.skipgrams` function transforms a sequence of word indexes (list of integers) into tuples of words of the form:
    
    (word, word in the same window), with label 1 (positive samples).
    (word, random word from the vocabulary), with label 0 (negative samples).

In [None]:
# Generate skip-gram (word, other word) pairs and their 0/1 labels for the
# first tweet only, to inspect what the helper returns.
make_skipgrams = tf.keras.preprocessing.sequence.skipgrams
x, y = make_skipgrams(
    sequences[0],
    vocab_size,
    window_size=1,
    negative_samples=0.6,
    shuffle=True,
    categorical=False,
    sampling_table=None,
    seed=None,
)

print(x, "\n")
print(y)

In [None]:
# Build train data: collect the skip-gram pairs and labels of every tweet.
x, y = [], []

for sequence in sequences:
    x_skipgm, y_skipgm = tf.keras.preprocessing.sequence.skipgrams(
        sequence,
        vocab_size,
        window_size=1,
        negative_samples=0.6,
        shuffle=True,
        categorical=False,
        sampling_table=None,
        seed=None,
    )
    x += x_skipgm
    y += y_skipgm

# Array versions of the collected pairs and labels.
x_train, y_train = np.asarray(x), np.asarray(y)

In [None]:
# Build Model: embed both words of a pair, average the two embeddings, and
# classify the pair with a single sigmoid unit.
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, embed_size))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

# The sigmoid output is already a probability, hence from_logits=False.
pair_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
model.compile(optimizer='adam', loss=pair_loss, metrics=['accuracy'])

model.summary()

In [None]:
# Train on the NumPy arrays x_train / y_train built from the skip-gram pairs,
# not on the raw Python lists x / y: the arrays have an explicit
# (num_pairs, 2) / (num_pairs,) shape and do not rely on Keras converting
# nested lists implicitly.
history = model.fit(x_train, y_train, batch_size=2000, epochs=5)

In [None]:
# Pull the embedding table out of the first layer of the skip-gram model.
e = model.layers[0]
layer_arrays = e.get_weights()
weights = layer_arrays[0]
print(weights.shape)

# Weights Shape => (vocab_size, embedding_size) = (1125, 300)