# Assignment No 5

Implement the Continuous Bag of Words (CBOW) Model. Stages can be:
a. Data preparation
b. Generate training data
c. Train model
d. Output

In [None]:
#a. Data preparation

In [None]:
import numpy as np
import keras.backend as K #imp
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
import tensorflow as tf
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
import gensim

In [None]:
# Read every line up front and close the file right away. A list (unlike the
# one-shot file iterator) can be iterated again, so the filtering cell that
# consumes `data` gives the same result when it is re-run.
# The encoding is explicit because the corpus contains non-ASCII characters.
with open(r"/content/corona.txt", encoding="utf-8") as corpus_file:
    data = corpus_file.readlines()
data[:3]

<_io.TextIOWrapper name='/content/corona.txt' mode='r' encoding='UTF-8'>

In [None]:
# Keep only the lines that contain at least two space characters
# (this drops blank and very short lines).
corona_data = list(filter(lambda line: line.count(' ') >= 2, data))
corona_data

['The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. \n',
 'Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. \n',
 'The reproductive number – the number of secondary infections generated from one infected individual – is understood to b

In [None]:
# Map every word to an integer id. Tokenizer ids start at 1; id 0 is never
# assigned to a word and is the value pad_sequences pads with.
# NOTE: this cell overwrites corona_data (raw lines -> integer sequences), so
# re-run the line-filtering cell before re-running this one.
vectorize = Tokenizer()
vectorize.fit_on_texts(corona_data)
corona_data = vectorize.texts_to_sequences(corona_data)

# Number of distinct ids in use: one per unique word plus the reserved id 0.
word_count = len(vectorize.index_word) + 1
# total_vocab sizes the Embedding input, the one-hot targets and the softmax
# output, so it must be the vocabulary size. The previous value, the total
# token count of the corpus, over-allocated the model and would be too small
# by one for a corpus in which no word repeats.
total_vocab = word_count
corona_data

[[1,
  38,
  2,
  8,
  9,
  39,
  40,
  41,
  2,
  42,
  13,
  1,
  43,
  23,
  3,
  44,
  11,
  24,
  45,
  46,
  47,
  1,
  14,
  25,
  48,
  10,
  26,
  2,
  27,
  12,
  11,
  24,
  15,
  16,
  1,
  14,
  13,
  49,
  50,
  17,
  4,
  5,
  6,
  1,
  15,
  16,
  7,
  4,
  5,
  6,
  9,
  51,
  10,
  18,
  19,
  52,
  20,
  28,
  7,
  3,
  6,
  1,
  15,
  16,
  9,
  29,
  20,
  30,
  53,
  31,
  3,
  32,
  54,
  55,
  17,
  4,
  5],
 [56,
  8,
  33,
  1,
  57,
  29,
  19,
  20,
  2,
  58,
  59,
  60,
  61,
  62,
  8,
  63,
  2,
  1,
  6,
  64,
  1,
  26,
  2,
  27,
  21,
  9,
  11,
  34,
  35,
  2,
  8,
  7,
  3,
  33,
  65,
  28,
  66,
  22,
  67,
  31,
  68,
  22,
  69,
  70,
  32,
  71,
  4,
  5,
  6,
  72,
  73,
  74,
  75,
  10,
  76,
  77,
  78,
  79,
  30,
  80,
  81,
  82,
  10,
  18,
  11,
  34,
  35,
  2,
  8],
 [1,
  83,
  36,
  21,
  1,
  36,
  2,
  84,
  85,
  86,
  25,
  87,
  88,
  89,
  21,
  9,
  90,
  10,
  18,
  13,
  37,
  12,
  37,
  19,
  7,
  4,
  5,
  6,
  91,
  

In [None]:
# Show the two size values computed in the tokenization cell, one per line.
print(total_vocab, word_count, sep="\n")

198
103


In [None]:
# Number of context tokens taken on each side of the target word.
window_size = 2

In [None]:
# b. Generate training data

In [None]:
# Defining utility to generate context word pairs
def cbow_model(data, window_size, total_vocab):
    """Yield one CBOW training pair per token of every sequence in ``data``.

    Args:
        data: iterable of token-id sequences (indexable, with ``len``).
        window_size: number of neighbours taken on each side of the target.
        total_vocab: number of classes used to one-hot encode the target.

    Yields:
        ``(contextual, final_target)`` where ``contextual`` is the result of
        ``pad_sequences`` on the single list of neighbouring ids, padded or
        truncated to ``2 * window_size`` entries, and ``final_target`` is the
        one-hot encoding of the target id produced by ``to_categorical``.
    """
    context_width = 2 * window_size
    for sequence in data:
        n_tokens = len(sequence)
        for position, target_id in enumerate(sequence):
            # Clamp the window to the sequence bounds and skip the target itself.
            first = max(position - window_size, 0)
            stop = min(position + window_size + 1, n_tokens)
            neighbours = [
                sequence[j] for j in range(first, stop) if j != position
            ]
            # Targets near either end have fewer neighbours; pad_sequences
            # fills the missing slots so every sample has the same width.
            contextual = pad_sequences([neighbours], maxlen=context_width)
            final_target = tf.keras.utils.to_categorical(
                [target_id], total_vocab
            )
            yield (contextual, final_target)

In [None]:
# c. train model

In [None]:
# Defining the model architecture
model = Sequential()
model.add(
    Embedding(
        input_dim=total_vocab,
        output_dim=100,
        input_length=window_size*2
    )
)
model.add(
    Lambda(
        lambda x: K.mean(x, axis=1),
        output_shape=(100,)
    )
)
model.add(
    Dense(
        total_vocab,
        activation='softmax'
    )
)

In [None]:
# Print each layer's output shape and parameter count.
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            19800     
                                                                 
 lambda (Lambda)             (None, 100)               0         
                                                                 
 dense (Dense)               (None, 198)               19998     
                                                                 
Total params: 39798 (155.46 KB)
Trainable params: 39798 (155.46 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# The targets are one-hot vectors and the output layer is a softmax, hence
# categorical cross-entropy; Adam is used with its default settings.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# Ten passes over the corpus. Every (context, target) pair produced by the
# generator is trained as its own batch, and the per-pair losses are summed
# into one cost figure per epoch.
for epoch in range(10):
    cost = sum(
        model.train_on_batch(x, y)
        for x, y in cbow_model(corona_data, window_size, total_vocab)
    )
    print("Epoch ", epoch, "\t: ", cost)

Epoch  0 	:  1041.8278551101685
Epoch  1 	:  993.0189785957336
Epoch  2 	:  905.5759208202362
Epoch  3 	:  828.1839485168457
Epoch  4 	:  774.7983016967773
Epoch  5 	:  724.5076154470444
Epoch  6 	:  671.9924651384354
Epoch  7 	:  617.8237195014954
Epoch  8 	:  563.515346288681
Epoch  9 	:  510.6042321920395


In [None]:
dimensions = 100
# UTF-8 is what gensim's load_word2vec_format reads by default, and the
# vocabulary contains non-ASCII tokens, so do not rely on the platform default.
vect_file = open('./vectors.txt', 'w', encoding='utf-8')
# word2vec text header: "<number of vectors> <vector size>". The count comes
# from the tokenizer (one row is written per entry of word_index) instead of
# being hard-coded, so the header stays correct if the corpus changes.
vect_file.write('{} {}\n'.format(len(vectorize.word_index), dimensions))


8

In [None]:
# The Embedding layer comes first in the model, so the first weight array is
# the embedding matrix; row `index` is the vector of the word with that id.
weights = model.get_weights()[0]
vect_file.writelines(
    '{} {}\n'.format(word, ' '.join(str(value) for value in weights[index, :]))
    for word, index in vectorize.word_index.items()
)
vect_file.close()

In [None]:
# d. Output

In [None]:
# Load the text-format (non-binary) vectors file back in as gensim KeyedVectors.
cbow_output = gensim.models.KeyedVectors.load_word2vec_format('vectors.txt', binary=False)

In [None]:
# Nearest neighbours of 'speed' in the learned embedding space, ranked by similarity score.
cbow_output.most_similar(positive=['speed'])


[('–transmission', 0.8709251284599304),
 ('number', 0.7510669827461243),
 ('difference', 0.7482622265815735),
 ('driver', 0.7140737175941467),
 ('interval', 0.7052752375602722),
 ('before', 0.6867612600326538),
 ('serial', 0.6823122501373291),
 ('symptoms', 0.6481000781059265),
 ('two', 0.6480923891067505),
 ('transmission', 0.6156459450721741)]

In [None]:
# import numpy as np: Imports the NumPy library.

# import keras.backend as K: Imports the Keras backend; K.mean is used inside the Lambda layer to average the context-word embeddings.

# from keras.models import Sequential: Imports the Sequential model from Keras.

# from keras.layers import Dense, Embedding, Lambda: Imports layers from Keras.

# import tensorflow as tf: Imports TensorFlow.

# from keras.utils import pad_sequences: Imports pad_sequences from Keras.

# from keras.preprocessing.text import Tokenizer, text_to_word_sequence: Imports Tokenizer and text_to_word_sequence functions from Keras.

# import gensim: Imports the Gensim library for word vector operations.

# data = open(r"/content/corona.txt"): Opens a text file named "corona.txt" for reading. This file contains text data.

# corona_data = [text for text in data if text.count(' ') >= 2]: Reads the lines from the file and stores them in the corona_data list if they contain at least two spaces.
# This is likely filtering for lines with a minimum number of words.

# vectorize = Tokenizer(): Initializes a Tokenizer from Keras. This will be used for text tokenization.

# vectorize.fit_on_texts(corona_data): Fits the Tokenizer on the corona_data to create a vocabulary.

# corona_data = vectorize.texts_to_sequences(corona_data): Converts the text data to sequences using the vocabulary created by the Tokenizer.

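# total_vocab: The vocabulary dimension handed to the model (Embedding input_dim, one-hot target width and softmax units). It has to be larger than the highest token index; the recorded run computed it as sum(len(s) for s in corona_data), i.e. the total number of tokens in the corpus (198), which is more than the 103 indices actually needed.

# word_count = len(vectorize.index_word) + 1: The number of unique words plus one, because Tokenizer indices start at 1 and index 0 is reserved (it is the value pad_sequences pads with).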

# Stage b: Generate training data
# window_size = 2: Sets the window size for context words.

# The cbow_model function is defined to generate training data. It iterates over the corona_data and generates context word pairs.

# Stage c: Train model
# The model architecture is defined using Keras Sequential model. It consists of an Embedding layer, a Lambda layer to calculate the mean, and a Dense layer for prediction.

# The model is compiled with categorical cross-entropy loss and the Adam optimizer.

# The code trains the model for 10 epochs. It iterates over the training data and performs batch training.

# Stage d: Output
# A file 'vectors.txt' is opened for writing. This file will store the word vectors.

# The word vectors are extracted from the trained model and written to 'vectors.txt'.

# The 'vectors.txt' file is saved with the word vectors.

# Finally, the code loads the saved word vectors using Gensim and uses the most_similar function to find words similar to 'speed' in the word vector space.

# The output of the code will include words that are similar to 'speed' based on the trained CBOW word vectors.
# This represents the semantic similarity between words in the context of the trained model.

# stage b
# cbow_model is a generator function that yields context word pairs for training the CBOW model.
# It takes three arguments: data (the tokenized text data), window_size (the context window size), and total_vocab (the total vocabulary size).
# It iterates over each text in the data.
# For each word in the text, it generates context words and a target word.
# The context words are determined by the window_size, and the target word is the current word being considered.
# contextual is created using pad_sequences to ensure all context sequences have the same length.
# final_target is one-hot encoded using tf.keras.utils.to_categorical.
# The function yields the contextual data and the corresponding target.

# stage c
# In this part, the model architecture is defined and trained.
# An embedding layer maps input words to a 100-dimensional vector space.
# A Lambda layer computes the mean of word vectors in the context.
# A dense layer with softmax activation predicts the target word.
# The model is compiled with categorical cross-entropy loss and the Adam optimizer.
# The model is trained for 10 epochs. For each epoch, it iterates over the training data generated by cbow_model and updates the model weights.
# The cost for each epoch is printed.

# stage d
# This part prepares and saves the word vectors obtained from the trained model.
# It writes the vectors to a file named 'vectors.txt'.
# The dimensions for the vectors are specified as 100.
# The word vectors are extracted from the model's weights and written to the file.
# Loading and Using the CBOW Model
# The code loads the saved word vectors from 'vectors.txt' using Gensim.
# It then uses the most_similar function to find words similar to 'speed' in the word vector space.

In [None]:
# Output explain -->

# Detailed explanation of each step and of the outputs shown above:

# The variable data is assigned an open file object for a file named 'corona.txt'.

# The variable corona_data is created by filtering the lines in the 'corona.txt' file to include only those lines that have at least two spaces.
# This results in a list of sentences or paragraphs.

# The variable vectorize is initialized as a Tokenizer object, which is used to convert text data into sequences of integers.
# It is then fit on the corona_data, and the text data in corona_data is transformed into sequences of integers.
# In the recorded run, total_vocab held the total number of tokens (word occurrences) in corona_data, 198, and that value was used as the model's vocabulary dimension.

# The window_size is set to 2. This variable will be used for generating context word pairs in the CBOW model.

# The cbow_model function is defined to generate context word pairs.
# It iterates through the sentences or paragraphs in data and for each word in the text, it forms a context window of words around it based on the window_size.
# It then yields the context window as the input and the target word as the output.

# The model architecture is defined using Keras:

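# An Embedding layer is added, where the input dimension is set to total_vocab and the output dimension is set to 100. The input dimension only has to exceed the largest token index (102); in the recorded run total_vocab was the total token count, 198, which is why the model summary shows a 198-wide Dense layer.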
# The input_length is set to window_size*2.
# A Lambda layer calculates the mean of the embeddings along the axis of words, resulting in a context vector of size 100.
# A Dense layer with total_vocab units and softmax activation is added for prediction.
# The model summary is printed, showing the layers and their output shapes, as well as the total number of parameters.

# The model is compiled with a categorical cross-entropy loss function and the Adam optimizer.

# The model is trained for 10 epochs. In each epoch, the cbow_model function is used to generate training data, and the train_on_batch method is used to train the model. The training cost for each epoch is printed.

# A file named 'vectors.txt' is opened for writing, and the dimensions of the vectors (102 words, each with 100 dimensions) are written to the file.

# The word embeddings (word vectors) are extracted from the model's weights and written to 'vectors.txt'.

# The gensim library is used to load the word vectors from 'vectors.txt', and the most similar words to 'speed' are calculated using the most_similar method.

In [None]:
# Output -->
# total_vocab: In the recorded run this is the total number of tokens (word occurrences) in corona_data, 198 - not the number of unique words.
# It is the value the CBOW model uses as its vocabulary dimension (embedding rows and softmax width), so the recorded model is larger than strictly needed.

# word_count: The number of unique words plus one (102 + 1 = 103).
# The extra one accounts for index 0, which the Tokenizer never assigns to a word and which pad_sequences uses as the padding value; no out-of-vocabulary (OOV) token is configured in this notebook.

# Epoch-wise training costs: During training, the model is trained for 10 epochs, and the training cost for each epoch is printed.
# The cost decreases with each epoch as the model learns to predict the target word from its context words.
# The training cost measures how well the model is fitting the training data.

# Epoch 0: 1041.83
# Epoch 1: 993.02
# Epoch 2: 905.58
# Epoch 3: 828.18
# Epoch 4: 774.80
# Epoch 5: 724.51
# Epoch 6: 671.99
# Epoch 7: 617.82
# Epoch 8: 563.52
# Epoch 9: 510.60
# The training cost is the categorical cross-entropy loss summed over all training pairs of the epoch, and the decreasing values indicate that the model is learning to predict the target word from its context more accurately.

# Dimensions of word vectors: The dimensions of the word vectors (word embeddings) are written to 'vectors.txt'.
# The file header specifies 102 words (the vocabulary size) and 100 dimensions for each word vector.

# cbow_output.most_similar(positive=['speed']): This line calculates the words that are most similar to the word 'speed' based on the word embeddings learned by the model.
# The output is a list of word similarity pairs with their similarity scores.

# ('–transmission', 0.8709): The token '–transmission' (the word with a leading en dash, which the tokenizer did not strip) is most similar to 'speed' with a similarity score of approximately 0.8709.
# ('number', 0.7511): The word 'number' is similar to 'speed' with a similarity score of approximately 0.7511.
# ('difference', 0.7483): 'Difference' is another word similar to 'speed' with a similarity score of approximately 0.7483.
# ('driver', 0.7141): The word 'driver' is similar to 'speed' with a similarity score of approximately 0.7141.
# ('interval', 0.7053): The word 'interval' has a similarity score of approximately 0.7053.
# ('before', 0.6868): 'Before' is similar to 'speed' with a similarity score of approximately 0.6868.
# ('serial', 0.6823): The word 'serial' is also similar with a similarity score of approximately 0.6823.
# ('symptoms', 0.6481): 'Symptoms' has a similarity score of approximately 0.6481.
# ('two', 0.6481): The word 'two' is similar with a similarity score of approximately 0.6481.
# ('transmission', 0.6156): 'Transmission' is similar to 'speed' with a similarity score of approximately 0.6156.
# These similarity scores indicate how closely related each word is to 'speed' in the vector space learned by the model.
# Higher similarity scores indicate closer semantic relationships between words.

In [None]:
# Questions

# What is the Continuous Bag of Words (CBOW) model?
# CBOW is a type of word embedding model used to represent words as continuous vectors in a dense vector space.
# It aims to predict a target word based on its context words.

# How is the training data prepared for the CBOW model in this code?
# The training data is prepared by tokenizing the text, and each word is encoded as an integer.
# The context-target word pairs are generated from the text data.
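# What is the role of the total_vocab variable?
# total_vocab is the vocabulary dimension used by the CBOW model (embedding rows, one-hot target width and softmax units); it was 198 in the recorded run.
# That value is the total token count of the corpus; the number of unique words is 102 (103 indices including the reserved index 0).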

# What does the model architecture of the CBOW model look like?
# The model architecture consists of an embedding layer, a Lambda layer for averaging context word embeddings, and a softmax-activated dense layer to predict the target word.
# The details are provided in the model summary.

# How does the model learn word embeddings in this code?
# The model learns word embeddings by minimizing the categorical cross-entropy loss during training.
# It updates the word embeddings so that the averaged context-word vectors predict the target word.

# What is the significance of the word vectors written to 'vectors.txt'?
# The word vectors represent semantically meaningful embeddings for each word.
# They can be used for various natural language processing tasks, such as word similarity and text generation.

# What does the output of cbow_output.most_similar(positive=['speed']) mean?
# It lists words that are most similar to 'speed' based on the learned word embeddings.
# The output includes similar words and their similarity scores.

# How many dimensions are used for the word vectors in this code?
# The word vectors have 100 dimensions, as specified in the code.

# What are some potential applications of word embeddings created by this CBOW model?
# Word embeddings can be used for tasks like sentiment analysis, machine translation, document classification, and information retrieval.

# What can you do to improve the performance of the CBOW model?
# You can adjust hyperparameters, increase the amount of training data, or fine-tune the model architecture to improve performance.