In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, GlobalMaxPooling1D, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.metrics.pairwise import cosine_similarity

In [6]:
file_path = './LP-IV-datasets/CBOW/CBOW.txt'

with open(file_path, 'r') as file:
    file_contents = file.read()

In [7]:
file_contents

'The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. \n\nFurther, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. \n\nThe reproductive number – the number of secondary infections generated from one infected individual – is understood to be betwe

In [8]:
sentences = file_contents.split('.')

In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
total_words = len(tokenizer.word_index) + 1

window_size = 3
tokenized_sentences = tokenizer.texts_to_sequences(sentences)

data, labels = [], []

for sentence in tokenized_sentences:
    for i, target_word in enumerate(sentence):
        context = [
            sentence[j] for j in range(i - window_size, i + window_size + 1)
            if j != i and 0 <= j < len(sentence)
        ]
        data.append(context)
        labels.append(target_word)
        
data = pad_sequences(data)
labels = np.array(labels)

In [18]:
model = Sequential()
model.add(Embedding(input_dim = total_words, output_dim = 50, input_length = window_size * 2))
model.add(GlobalMaxPooling1D())
model.add(Dense(total_words, activation = 'softmax'))

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 6, 50)             5150      
                                                                 
 global_max_pooling1d_2 (Gl  (None, 50)                0         
 obalMaxPooling1D)                                               
                                                                 
 dense_1 (Dense)             (None, 103)               5253      
                                                                 
Total params: 10403 (40.64 KB)
Trainable params: 10403 (40.64 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [21]:
model.compile(optimizer = 'adam', metrics = ['accuracy'], loss = 'sparse_categorical_crossentropy')

In [None]:
model.fit(data, labels, epochs = 200, verbose = 1)

In [35]:
word_embeddings = model.layers[0].get_weights()[0]
word_embeddings

array([[-0.29338855, -0.36919317, -0.25913265, ...,  0.55457073,
        -0.27990696, -0.29657188],
       [-0.2924454 , -0.26446927,  0.9963598 , ..., -0.1663595 ,
        -0.28159782, -0.28429142],
       [-0.2926269 , -0.32237428, -0.20211813, ..., -0.29282713,
         0.91963553, -0.28594324],
       ...,
       [-0.28385955, -0.24094178, -0.24471655, ..., -0.01307701,
        -0.28086254, -0.26331747],
       [-0.28379327, -0.2400152 , -0.24442363, ..., -0.0207559 ,
        -0.28011993, -0.26325712],
       [-0.2866952 , -0.00583646, -0.00885593, ...,  0.25312158,
        -0.27885467, -0.256797  ]], dtype=float32)

In [43]:
target_word = 'infection'
target_embedding = word_embeddings[tokenizer.word_index[target_word]]

similarities = cosine_similarity(target_embedding.reshape(1, -1), word_embeddings)[0]
most_similar_indices = similarities.argsort()[-5:][::1]

most_similar_words = [word for word, idx in tokenizer.word_index.items() if idx in most_similar_indices]

print(most_similar_words)

['between', 'infection', 'individual', 'however', 'context']
