## LSTM from Scrach

In [2]:
import numpy as np
from collections import defaultdict
import random
import tensorflow as tf




In [3]:
# Sample text corpus
corpus = [
    "the quick brown fox jumped over the lazy dog",
    "the dog sat on the mat",
    "the cat sat on the mat",
    "the quick red fox jumped over the sleeping cat"
]

In [4]:
# Preprocessing: Tokenize the text and build vocabulary
tokenized_corpus = [sentence.split() for sentence in corpus]
vocabulary = set(word for sentence in tokenized_corpus for word in sentence)
word2idx = {word: idx for idx, word in enumerate(vocabulary)}
idx2word = {idx: word for word, idx in word2idx.items()}

In [28]:
print("Tokenized corpus:", tokenized_corpus)
print("Vocabulary:", vocabulary)
print("Word to index:", word2idx)
print("Index to word:", idx2word)

Tokenized corpus: [['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog'], ['the', 'dog', 'sat', 'on', 'the', 'mat'], ['the', 'cat', 'sat', 'on', 'the', 'mat'], ['the', 'quick', 'red', 'fox', 'jumped', 'over', 'the', 'sleeping', 'cat']]
Vocabulary: {'over', 'brown', 'the', 'quick', 'dog', 'fox', 'sat', 'on', 'mat', 'sleeping', 'jumped', 'cat', 'lazy', 'red'}
Word to index: {'over': 0, 'brown': 1, 'the': 2, 'quick': 3, 'dog': 4, 'fox': 5, 'sat': 6, 'on': 7, 'mat': 8, 'sleeping': 9, 'jumped': 10, 'cat': 11, 'lazy': 12, 'red': 13}
Index to word: {0: 'over', 1: 'brown', 2: 'the', 3: 'quick', 4: 'dog', 5: 'fox', 6: 'sat', 7: 'on', 8: 'mat', 9: 'sleeping', 10: 'jumped', 11: 'cat', 12: 'lazy', 13: 'red'}


In [30]:
# Parameters
window_size = 2
embedding_dim = 50
learning_rate = 0.01
epochs = 10000

In [7]:
# Generate training data
def generate_training_data(tokenized_corpus, word2idx, window_size):

    """Generates training data for a word2vec model.
    Args:
        corpus: A list of sentences, where each sentence is a list of words.
        word2idx: A dictionary that maps words to their indices in the vocabulary.
        window_size: The size of the context window.

    Returns:
        A list of tuples, where each tuple contains the index of a center word
        and the index of a context word.
    """

    training_data = []
    for sentence in tokenized_corpus:
        sentence_indices = [word2idx[word] for word in sentence]
        for center_word_pos in range(len(sentence_indices)):
            center_word_idx = sentence_indices[center_word_pos]
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w
                if context_word_pos < 0 or context_word_pos >= len(sentence_indices) or context_word_pos == center_word_pos:
                    continue
                context_word_idx = sentence_indices[context_word_pos]
                training_data.append((center_word_idx, context_word_idx))
                
    return training_data

In [8]:
training_data = generate_training_data(tokenized_corpus, word2idx, window_size)

In [9]:
training_data

[(2, 3),
 (2, 1),
 (3, 2),
 (3, 1),
 (3, 5),
 (1, 2),
 (1, 3),
 (1, 5),
 (1, 10),
 (5, 3),
 (5, 1),
 (5, 10),
 (5, 0),
 (10, 1),
 (10, 5),
 (10, 0),
 (10, 2),
 (0, 5),
 (0, 10),
 (0, 2),
 (0, 12),
 (2, 10),
 (2, 0),
 (2, 12),
 (2, 4),
 (12, 0),
 (12, 2),
 (12, 4),
 (4, 2),
 (4, 12),
 (2, 4),
 (2, 6),
 (4, 2),
 (4, 6),
 (4, 7),
 (6, 2),
 (6, 4),
 (6, 7),
 (6, 2),
 (7, 4),
 (7, 6),
 (7, 2),
 (7, 8),
 (2, 6),
 (2, 7),
 (2, 8),
 (8, 7),
 (8, 2),
 (2, 11),
 (2, 6),
 (11, 2),
 (11, 6),
 (11, 7),
 (6, 2),
 (6, 11),
 (6, 7),
 (6, 2),
 (7, 11),
 (7, 6),
 (7, 2),
 (7, 8),
 (2, 6),
 (2, 7),
 (2, 8),
 (8, 7),
 (8, 2),
 (2, 3),
 (2, 13),
 (3, 2),
 (3, 13),
 (3, 5),
 (13, 2),
 (13, 3),
 (13, 5),
 (13, 10),
 (5, 3),
 (5, 13),
 (5, 10),
 (5, 0),
 (10, 13),
 (10, 5),
 (10, 0),
 (10, 2),
 (0, 5),
 (0, 10),
 (0, 2),
 (0, 9),
 (2, 10),
 (2, 0),
 (2, 9),
 (2, 11),
 (9, 0),
 (9, 2),
 (9, 11),
 (11, 2),
 (11, 9)]

In [10]:
# Initialize weights
W1 = np.random.rand(len(vocabulary), embedding_dim)
W2 = np.random.rand(embedding_dim, len(vocabulary))

In [11]:
W1.shape

(14, 50)

In [12]:
# Helper function: Softmax
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [31]:
# Training the model
for epoch in range(epochs):
    loss = 0
    for center_word_idx, context_word_idx in training_data:
        # Forward pass
        h = W1[center_word_idx]
        u = np.dot(W2.T, h)
        y_pred = softmax(u)
        
        # Calculate error
        e = np.zeros(len(vocabulary))
        e[context_word_idx] = 1
        error = e - y_pred
        
        # Backpropagation
        dW2 = np.outer(h, error)
        dW1 = np.dot(W2, error)
        
        W1[center_word_idx] += learning_rate * dW1
        W2 += learning_rate * dW2

        loss += -np.log(y_pred[context_word_idx])
        
    if (epoch + 1) % 100 == 0:
        print(f'Epoch: {epoch + 1}, Loss: {loss}')

Epoch: 100, Loss: 160.7800452392337
Epoch: 200, Loss: 160.72244895560266
Epoch: 300, Loss: 160.66923771321703
Epoch: 400, Loss: 160.620070536631
Epoch: 500, Loss: 160.574639166864
Epoch: 600, Loss: 160.53266315510993
Epoch: 700, Loss: 160.49388595249692
Epoch: 800, Loss: 160.45807179221922
Epoch: 900, Loss: 160.4250031957297
Epoch: 1000, Loss: 160.39447896588618
Epoch: 1100, Loss: 160.3663125568946
Epoch: 1200, Loss: 160.34033073371995
Epoch: 1300, Loss: 160.31637245264616
Epoch: 1400, Loss: 160.29428791022016
Epoch: 1500, Loss: 160.27393772037405
Epoch: 1600, Loss: 160.25519218947457
Epoch: 1700, Loss: 160.2379306668417
Epoch: 1800, Loss: 160.22204095427463
Epoch: 1900, Loss: 160.20741876267385
Epoch: 2000, Loss: 160.1939672072356
Epoch: 2100, Loss: 160.18159633518485
Epoch: 2200, Loss: 160.17022268179485
Epoch: 2300, Loss: 160.15976885170997
Epoch: 2400, Loss: 160.1501631234466
Epoch: 2500, Loss: 160.1413390755351
Epoch: 2600, Loss: 160.13323523313628
Epoch: 2700, Loss: 160.125794734

In [33]:
# Extract word embeddings
word_embeddings = {word: W1[idx] for word, idx in word2idx.items()}

In [35]:
print(word_embeddings)

{'over': array([ 0.2643237 ,  0.15359734,  0.2566751 , -0.49269645,  0.25349305,
       -0.36274086,  0.07544404,  0.01349164,  0.04836143, -0.17382901,
       -0.45694518, -0.30115153,  0.05815846,  0.79509194, -0.13766509,
        0.70968481, -0.07543288,  0.01416559, -0.36577318,  0.0248995 ,
       -0.37730451,  0.02311338, -0.24298693,  0.09364391,  0.20392836,
        0.69879893,  0.89376086,  0.28696915,  0.21500321,  0.13875812,
       -0.36617171,  0.57249123,  0.21578237, -0.11820204,  0.07730346,
       -0.74920352,  0.19502251,  0.96743235,  0.59802843,  0.09031106,
        0.16748159,  0.58650932,  0.12622882,  0.38731539,  0.61492919,
       -0.18646264, -0.25414947, -0.49893071,  0.38759397,  0.15442185]), 'brown': array([ 0.41553012,  0.72048019,  0.4980261 , -0.18519245,  0.41900809,
       -0.27936133,  0.5855726 , -0.09478651, -0.01247578,  0.235049  ,
       -0.12090455, -0.30194405,  0.25679069,  1.17407469, -0.10311129,
        0.86244504,  0.05790593,  0.47579246

## Word Embedding using TensorFlow

In [16]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
# Sample text corpus
corpus = [
    "the quick brown fox jumped over the lazy dog",
    "the dog sat on the mat",
    "the cat sat on the mat",
    "the quick red fox jumped over the sleeping cat"
]

In [18]:
# Preprocessing: Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

In [19]:
# Parameters
embedding_dim = 50
max_length = max(len(sequence) for sequence in sequences)

In [20]:
# Pad sequences
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

In [21]:
# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)
])




In [22]:
# Compile the model
model.compile(optimizer='adam', loss='mse')




In [23]:
# Get the embedding layer weights
embeddings = model.layers[0].get_weights()[0]

In [24]:
# Print word embeddings
word_embeddings = {word: embeddings[idx] for word, idx in word_index.items()}

In [25]:
print(word_embeddings)

{'the': array([-0.04612049, -0.01190308, -0.00153064, -0.01120682, -0.03748507,
        0.00288481, -0.01680446,  0.04362172,  0.02980744, -0.04893311,
        0.04205172, -0.04865813, -0.04556816, -0.02228706,  0.03424952,
       -0.00112004,  0.02273047,  0.03050229, -0.00623054, -0.02094885,
        0.04634091, -0.01049602,  0.01203208,  0.01687162,  0.01887866,
       -0.02861352, -0.03881355,  0.04917821, -0.00909869, -0.01181167,
       -0.01551112,  0.01500956, -0.02353489,  0.04423911,  0.04411631,
        0.0218701 ,  0.02088172,  0.03352297,  0.01336863,  0.03856951,
       -0.03614985, -0.02435851, -0.02646753,  0.02696675,  0.02806308,
       -0.03035482,  0.01249006,  0.03066972,  0.02265425,  0.02730176],
      dtype=float32), 'quick': array([-0.01442453,  0.01459267, -0.00989127, -0.02025933,  0.02713529,
        0.03997696,  0.02862414, -0.03576346, -0.02210573, -0.04239181,
        0.04378546, -0.02323896,  0.01567811, -0.01853814,  0.01903195,
       -0.02118861,  0.0