In [55]:
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda
import seaborn as sns
import matplotlib.pyplot as plt

In [57]:
# Toy corpus kept as ONE string. The five sentences are glued together with the
# literal characters `", "` (quote, comma, space, quote); the text contains no
# '.' characters at all.
data = '", "'.join([
    'I love to play football',
    'Football is a great game',
    'I enjoy watching football matches',
    'The team played well',
    'Football brings people together',
])

In [59]:
# Break the corpus on '.' characters.
# NOTE(review): `data` contains no '.', so the result is a one-element list
# holding the whole corpus (see the cell output).
sentences = data.split(sep='.')
sentences

['I love to play football", "Football is a great game", "I enjoy watching football matches", "The team played well", "Football brings people together']

In [61]:
# Normalise every non-empty entry of `sentences` into lowercase text made of
# letters, digits and single spaces.
clean_sent = []
for raw in sentences:
    if not raw:
        # Empty pieces (e.g. produced by the split) carry no words.
        continue
    # Collapse each run of non-alphanumeric characters into one space.
    text = re.sub('[^a-zA-Z0-9]+', ' ', raw)
    # Replace a single word character that is followed by a space and then by
    # another space or the end of the string; then trim the ends. In the output
    # below, one-letter words such as 'i' and 'a' are still present.
    text = re.sub(r"(?:^| )\w (?:$| )", ' ', text).strip()
    clean_sent.append(text.lower())
clean_sent

['i love to play football football is a great game i enjoy watching football matches the team played well football brings people together']

In [63]:
# Fit a Keras Tokenizer on the cleaned text and encode that same text as lists
# of integer word ids (one list per entry of `clean_sent`).
tokenizer = Tokenizer()
# Builds the vocabulary; `tokenizer.word_index` is read later to size the model.
tokenizer.fit_on_texts(clean_sent)
# In the printed result the ids start at 1, and 'football' (the most frequent
# word in the corpus) is the word that received id 1.
sequences = tokenizer.texts_to_sequences(clean_sent)
print(sequences)

[[2, 3, 4, 5, 1, 1, 6, 7, 8, 9, 2, 10, 11, 1, 12, 13, 14, 15, 16, 1, 17, 18, 19]]


In [65]:
# Build id -> word and word -> id lookup tables by walking each encoded
# sequence in step with the words of the text it was produced from.
index_to_word, word_to_index = {}, {}

for token_ids, text in zip(sequences, clean_sent):
    words = text.split()
    for position, token_id in enumerate(token_ids):
        # Position `position` in the id list corresponds to the same position
        # in the whitespace-split text.
        word = words[position]
        index_to_word[token_id] = word
        word_to_index[word] = token_id
print(index_to_word, "\n")
print(word_to_index)

{2: 'i', 3: 'love', 4: 'to', 5: 'play', 1: 'football', 6: 'is', 7: 'a', 8: 'great', 9: 'game', 10: 'enjoy', 11: 'watching', 12: 'matches', 13: 'the', 14: 'team', 15: 'played', 16: 'well', 17: 'brings', 18: 'people', 19: 'together'} 

{'i': 2, 'love': 3, 'to': 4, 'play': 5, 'football': 1, 'is': 6, 'a': 7, 'great': 8, 'game': 9, 'enjoy': 10, 'watching': 11, 'matches': 12, 'the': 13, 'team': 14, 'played': 15, 'well': 16, 'brings': 17, 'people': 18, 'together': 19}


In [67]:
# Model/training-set hyper-parameters.
# +1 because the word ids printed above start at 1, so index 0 must also fit
# inside the embedding table and the output layer.
vocab_size = len(tokenizer.word_index) + 1
emb_size = 10
# Number of words taken on EACH side of the target word.
context_size = 2
contexts = []
targets = []

# Slide a window over every sequence: the centre id is the target, the
# `context_size` ids on each side form the context.
for sequence in sequences:
    for i in range(context_size, len(sequence) - context_size):
        target = sequence[i]
        # Slice by `context_size` instead of hard-coding offsets of 2, so the
        # window always matches the loop bounds and the model's input width
        # (2 * context_size). For context_size == 2 this is exactly
        # [seq[i-2], seq[i-1], seq[i+1], seq[i+2]].
        context = sequence[i - context_size:i] + sequence[i + 1:i + context_size + 1]
        contexts.append(context)
        targets.append(target)
print(contexts, "\n")
print(targets)

[[2, 3, 5, 1], [3, 4, 1, 1], [4, 5, 1, 6], [5, 1, 6, 7], [1, 1, 7, 8], [1, 6, 8, 9], [6, 7, 9, 2], [7, 8, 2, 10], [8, 9, 10, 11], [9, 2, 11, 1], [2, 10, 1, 12], [10, 11, 12, 13], [11, 1, 13, 14], [1, 12, 14, 15], [12, 13, 15, 16], [13, 14, 16, 1], [14, 15, 1, 17], [15, 16, 17, 18], [16, 1, 18, 19]] 

[4, 5, 1, 1, 6, 7, 8, 9, 2, 10, 11, 1, 12, 13, 14, 15, 16, 1, 17]


In [69]:
# Show the first few (context words -> target word) training pairs in readable
# form. Slicing instead of `range(5)` avoids an IndexError when fewer than five
# pairs were generated.
n_preview = 5
for context, target_id in zip(contexts[:n_preview], targets[:n_preview]):
    words = [index_to_word.get(token_id) for token_id in context]
    target = index_to_word.get(target_id)
    print(words, " -> ", target)

['i', 'love', 'play', 'football']  ->  to
['love', 'to', 'football', 'football']  ->  play
['to', 'play', 'football', 'is']  ->  football
['play', 'football', 'is', 'a']  ->  football
['football', 'football', 'a', 'great']  ->  is


In [71]:
# Training arrays: X holds one row of context ids per sample, Y the matching
# target id for each row.
X, Y = np.array(contexts), np.array(targets)

In [73]:
# Seed Python, NumPy and TensorFlow before any weights are created so that a
# fresh "Restart & Run All" yields the same initialisation and training run.
tf.keras.utils.set_random_seed(42)

# CBOW-style network: embed each context id, average the context embeddings
# into one vector, then classify that vector over the vocabulary.
model = Sequential([
    # `input_length` is deprecated in Keras 3 and only triggers a warning; the
    # input width is inferred from the data on the first call.
    Embedding(input_dim=vocab_size, output_dim=emb_size),
    # Mean over axis 1 (the context positions) -> one emb_size vector per sample.
    Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    Dense(256, activation='relu'),
    Dense(512, activation='relu'),
    # One probability per vocabulary index (index 0 included, see vocab_size).
    Dense(vocab_size, activation='softmax')
])

In [75]:
# Targets are plain integer ids (not one-hot vectors), hence the sparse variant
# of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# Keep the returned History object so the loss/accuracy curves can be inspected.
history = model.fit(x=X, y=Y, epochs=10)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.0000e+00 - loss: 2.9959
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.2105 - loss: 2.9897
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.2105 - loss: 2.9842
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.2105 - loss: 2.9785
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.2105 - loss: 2.9720
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.2105 - loss: 2.9645
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.2105 - loss: 2.9561
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.2105 - loss: 2.9464
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [77]:
def predict_word(model, context, index_lookup=None):
    """Predict the centre word for a single context window.

    Args:
        model: Object exposing ``predict(batch)`` that returns one row of
            scores per input row, indexed by word id.
        context: Sequence of context word ids; it is reshaped into a single
            batch row of shape ``(1, len(context))``.
        index_lookup: Optional mapping from word id to word. When omitted, the
            notebook-level ``index_to_word`` dictionary is used.

    Returns:
        The word whose id has the highest score, ignoring id 0.
    """
    lookup = index_to_word if index_lookup is None else index_lookup
    batch = np.array(context).reshape(1, -1)
    scores = np.asarray(model.predict(batch))[0]
    # The output layer has a unit for index 0, but no word is mapped to id 0
    # (word ids start at 1). Excluding it prevents a KeyError when that unit
    # happens to score highest; the +1 restores the original id numbering.
    best_id = int(np.argmax(scores[1:])) + 1
    return lookup[best_id]
# Example query. Per the id -> word table printed earlier, these ids stand for
# 'football', 'i', 'to', 'play'.
example_context = [1, 2, 4, 5]
predicted_word = predict_word(model=model, context=example_context)
print(f'Predicted word: {predicted_word}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
Predicted word: football
