## LSTM from Scrach

In [54]:
import numpy as np
from collections import defaultdict
import random

In [55]:
# Sample text corpus
corpus = [
    "the quick brown fox jumped over the lazy dog",
    "the dog sat on the mat",
    "the cat sat on the mat",
    "the quick red fox jumped over the sleeping cat"
]

In [56]:
# Preprocessing: Tokenize the text and build vocabulary
tokenized_corpus = [sentence.split() for sentence in corpus]
vocabulary = set(word for sentence in tokenized_corpus for word in sentence)
word2idx = {word: idx for idx, word in enumerate(vocabulary)}
idx2word = {idx: word for word, idx in word2idx.items()}

In [57]:
# Parameters
window_size = 2
embedding_dim = 50
learning_rate = 0.01
epochs = 1000

In [58]:
# Generate training data
def generate_training_data(tokenized_corpus, word2idx, window_size):
    training_data = []
    for sentence in tokenized_corpus:
        sentence_indices = [word2idx[word] for word in sentence]
        for center_word_pos in range(len(sentence_indices)):
            center_word_idx = sentence_indices[center_word_pos]
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w
                if context_word_pos < 0 or context_word_pos >= len(sentence_indices) or context_word_pos == center_word_pos:
                    continue
                context_word_idx = sentence_indices[context_word_pos]
                training_data.append((center_word_idx, context_word_idx))
    return training_data

In [59]:
training_data = generate_training_data(tokenized_corpus, word2idx, window_size)

In [60]:
# Initialize weights
W1 = np.random.rand(len(vocabulary), embedding_dim)
W2 = np.random.rand(embedding_dim, len(vocabulary))

In [61]:
# Helper function: Softmax
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [62]:
# Training the model
for epoch in range(epochs):
    loss = 0
    for center_word_idx, context_word_idx in training_data:
        # Forward pass
        h = W1[center_word_idx]
        u = np.dot(W2.T, h)
        y_pred = softmax(u)
        
        # Calculate error
        e = np.zeros(len(vocabulary))
        e[context_word_idx] = 1
        error = e - y_pred
        
        # Backpropagation
        dW2 = np.outer(h, error)
        dW1 = np.dot(W2, error)
        
        W1[center_word_idx] += learning_rate * dW1
        W2 += learning_rate * dW2

        loss += -u[context_word_idx] + np.log(np.sum(np.exp(u)))
        
    if (epoch + 1) % 100 == 0:
        print(f'Epoch: {epoch + 1}, Loss: {loss}')

Epoch: 100, Loss: 165.61670556615383
Epoch: 200, Loss: 164.0472338973487
Epoch: 300, Loss: 163.50347425471477
Epoch: 400, Loss: 163.1389447086024
Epoch: 500, Loss: 162.84356337027336
Epoch: 600, Loss: 162.58942561779088
Epoch: 700, Loss: 162.36578010360975
Epoch: 800, Loss: 162.16686079388708
Epoch: 900, Loss: 161.9887877755731
Epoch: 1000, Loss: 161.82860364244493


In [63]:
# Extract word embeddings
word_embeddings = {word: W1[idx] for word, idx in word2idx.items()}

In [64]:
print(word_embeddings)

{'red': array([ 1.34284504,  0.48596913,  0.93122386,  0.24091977,  0.69870994,
        0.71792767,  0.82247418, -0.12521853,  0.56059745,  0.03651641,
        0.85196203,  0.55594785,  0.88842967,  0.71053781, -0.06635493,
       -0.24596134,  0.28533118, -0.09439753,  0.1908053 ,  0.3026512 ,
        0.58852646,  0.25546583,  0.95912175,  0.33163666, -0.22906976,
       -0.26683525,  0.25881204,  1.09317104,  0.14269744,  0.20945284,
        0.62096893,  0.5556406 ,  0.4695311 , -0.23116748,  0.07818583,
        0.55881944,  0.6433599 ,  0.38579393,  0.95370989,  0.53124692,
        0.45520997,  0.40721149,  0.17247706,  0.67548582,  0.05884562,
        0.87220211, -0.19802671,  1.00798248,  0.55590143,  0.74926016]), 'sleeping': array([ 0.85523059,  0.70592107,  0.86968361,  0.40033544,  0.35178697,
        0.7492172 ,  0.3194825 ,  0.2383344 ,  0.42704636,  0.07163806,
        0.03702935,  0.7530885 , -0.07928316,  0.20806225,  0.45011269,
        0.71280374,  0.21716324,  0.705667

## Word Embedding using TensorFlow

In [65]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [66]:
# Sample text corpus
corpus = [
    "the quick brown fox jumped over the lazy dog",
    "the dog sat on the mat",
    "the cat sat on the mat",
    "the quick red fox jumped over the sleeping cat"
]

In [67]:
# Preprocessing: Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

In [68]:
# Parameters
embedding_dim = 50
max_length = max(len(sequence) for sequence in sequences)

In [69]:
# Pad sequences
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

In [70]:
# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)
])

In [71]:
# Compile the model
model.compile(optimizer='adam', loss='mse')

In [72]:
# Get the embedding layer weights
embeddings = model.layers[0].get_weights()[0]

In [73]:
# Print word embeddings
word_embeddings = {word: embeddings[idx] for word, idx in word_index.items()}

In [74]:
print(word_embeddings)

{'the': array([-0.01241259, -0.01774875, -0.03346141, -0.02676596, -0.01879693,
        0.03532088, -0.01283132,  0.04316194,  0.00310711, -0.02778285,
        0.02844122, -0.03548081,  0.0278048 , -0.00316311, -0.04807645,
       -0.01086904, -0.04114916, -0.00942147,  0.02552435, -0.00955017,
        0.02342251,  0.02952727, -0.01891347,  0.02555095, -0.04900492,
       -0.01400251, -0.03615185,  0.01436577, -0.00489699, -0.02112705,
       -0.01310675, -0.04418906, -0.0294462 ,  0.01972851,  0.01470939,
        0.03180972,  0.03277678, -0.00360649,  0.01958463,  0.02837828,
       -0.03490628, -0.03802397, -0.03347663, -0.03064616,  0.0130899 ,
        0.00590051, -0.01553063, -0.02775564,  0.04552602,  0.04437527],
      dtype=float32), 'quick': array([-0.01111599, -0.01390284, -0.00893711, -0.04998709, -0.03684326,
        0.02854704,  0.01096918, -0.00909437, -0.02005515, -0.00279737,
        0.04925277, -0.03151499, -0.03774675, -0.00207589, -0.0108268 ,
       -0.00092208,  0.0