In [1]:
# a. Data preparation
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, LSTM

# Sample sentences for demonstration
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are friends",
]

# Fit the tokenizer so every distinct word in the corpus gets an integer index
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size: one slot per known word, plus one extra for padding
total_words = 1 + len(tokenizer.word_index)

# b. Generate training data
def generate_training_data(corpus, window_size=1):
    """Build (context, target) training pairs from ``corpus``.

    Each sentence is converted to word indices with the module-level
    ``tokenizer``. For every position that has ``window_size`` tokens on both
    sides, the surrounding tokens form one context row and the token at that
    position is the target.

    Args:
        corpus: List of sentence strings.
        window_size: Number of context tokens taken on each side of the
            target. Must be non-negative.

    Returns:
        A tuple ``(contexts, targets)`` of integer NumPy arrays with shapes
        ``(n_samples, 2 * window_size)`` and ``(n_samples,)``. The shapes hold
        even when no sentence is long enough to yield a sample.

    Raises:
        ValueError: If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    contexts, targets = [], []
    # Tokenize all sentences in one call instead of once per sentence.
    for tokens in tokenizer.texts_to_sequences(corpus):
        for center in range(window_size, len(tokens) - window_size):
            left = tokens[center - window_size:center]
            right = tokens[center + 1:center + window_size + 1]
            contexts.append(left + right)
            targets.append(tokens[center])

    # An explicit dtype and shape keep the result 2-D and integer-typed when
    # no samples were produced; a bare np.array([]) would be 1-D float64 and
    # break callers that read ``X.shape[1]``.
    context_array = np.array(contexts, dtype=int).reshape(
        len(contexts), 2 * window_size
    )
    target_array = np.array(targets, dtype=int)
    return context_array, target_array

X, y = generate_training_data(corpus)

# c. Train model
# The Embedding layer is created without `input_length`: Keras 3 deprecates
# that argument, and the context length is inferred from X when fit() first
# builds the model.
model = Sequential([
    Embedding(total_words, 10),
    LSTM(50),
    Dropout(0.2),
    # One softmax output per vocabulary index, matching the integer targets
    # consumed by sparse_categorical_crossentropy.
    Dense(total_words, activation='softmax'),
])
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(X, y, epochs=100, verbose=1)

# d. Output
print("Model training complete.")



Epoch 1/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step - accuracy: 0.1818 - loss: 2.5643
Epoch 2/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 97ms/step - accuracy: 0.2727 - loss: 2.5613
Epoch 3/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 69ms/step - accuracy: 0.3636 - loss: 2.5589
Epoch 4/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 71ms/step - accuracy: 0.3636 - loss: 2.5562
Epoch 5/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 71ms/step - accuracy: 0.4545 - loss: 2.5539
Epoch 6/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 44ms/step - accuracy: 0.5455 - loss: 2.5513
Epoch 7/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 48ms/step - accuracy: 0.4545 - loss: 2.5488
Epoch 8/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 40ms/step - accuracy: 0.3636 - loss: 2.5467
Epoch 9/100
1/1 ━━━━━━━━━━━━━━━━━━━━

In [7]:
# Output: the first layer of the trained model is the Embedding layer; its
# first weight array is the embedding matrix, from which word vectors are read
embedding_layer = model.layers[0]
embedding_weights = embedding_layer.get_weights()
embeddings = embedding_weights[0]

