Implement the Continuous Bag of Words (CBOW) Model for the given (textual document 1) using the below steps:

a. Data preparation

b. Generate training data

c. Train model

d. Output

In [1]:
#imports

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
import numpy as np
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.optimizers import SGD
from keras.utils import to_categorical
from sklearn.metrics.pairwise import euclidean_distances

In [3]:
data="""The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult.  
"""

In [4]:
words = data.split()

In [5]:
words = [x.lower().replace(",","").replace(".","").replace("(","").replace(")","") for x in words]

In [6]:
words

['the',
 'speed',
 'of',
 'transmission',
 'is',
 'an',
 'important',
 'point',
 'of',
 'difference',
 'between',
 'the',
 'two',
 'viruses',
 'influenza',
 'has',
 'a',
 'shorter',
 'median',
 'incubation',
 'period',
 'the',
 'time',
 'from',
 'infection',
 'to',
 'appearance',
 'of',
 'symptoms',
 'and',
 'a',
 'shorter',
 'serial',
 'interval',
 'the',
 'time',
 'between',
 'successive',
 'cases',
 'than',
 'covid-19',
 'virus',
 'the',
 'serial',
 'interval',
 'for',
 'covid-19',
 'virus',
 'is',
 'estimated',
 'to',
 'be',
 '5-6',
 'days',
 'while',
 'for',
 'influenza',
 'virus',
 'the',
 'serial',
 'interval',
 'is',
 '3',
 'days',
 'this',
 'means',
 'that',
 'influenza',
 'can',
 'spread',
 'faster',
 'than',
 'covid-19',
 'further',
 'transmission',
 'in',
 'the',
 'first',
 '3-5',
 'days',
 'of',
 'illness',
 'or',
 'potentially',
 'pre-symptomatic',
 'transmission',
 '–transmission',
 'of',
 'the',
 'virus',
 'before',
 'the',
 'appearance',
 'of',
 'symptoms',
 '–',
 'is'

In [7]:
vocab = set(words)

In [8]:
vocab_size = len(vocab)
vocab_size

100

In [9]:
word_to_idx = {word : i for i, word in enumerate(vocab)}
idx_to_word = {i : word for i, word in enumerate(vocab)}

In [10]:
word_to_idx

{'reproductive': 0,
 'covid-19': 1,
 'we': 2,
 'however': 3,
 'people': 4,
 'driver': 5,
 'faster': 6,
 'not': 7,
 'and': 8,
 'present': 9,
 'making': 10,
 'estimated': 11,
 'while': 12,
 'shed': 13,
 'appear': 14,
 'to': 15,
 'both': 16,
 'days': 17,
 'point': 18,
 'learning': 19,
 'very': 20,
 'further': 21,
 'infections': 22,
 'infection': 23,
 'hours': 24,
 'incubation': 25,
 '3-5': 26,
 '25': 27,
 'influenza': 28,
 'cases': 29,
 'direct': 30,
 'time': 31,
 'viruses': 32,
 'can': 33,
 'higher': 34,
 'context': 35,
 'before': 36,
 'understood': 37,
 '–': 38,
 'there': 39,
 'infected': 40,
 'of': 41,
 'at': 42,
 'generated': 43,
 'estimates': 44,
 'or': 45,
 'virus': 46,
 'onset': 47,
 'pre-symptomatic': 48,
 'serial': 49,
 'individual': 50,
 'is': 51,
 'interval': 52,
 'that': 53,
 'difficult': 54,
 'time-specific': 55,
 'spread': 56,
 'the': 57,
 'in': 58,
 'major': 59,
 'appearance': 60,
 'are': 61,
 'between': 62,
 'comparisons': 63,
 'transmission': 64,
 'more': 65,
 'speed': 66

In [11]:
idx_to_word

{0: 'reproductive',
 1: 'covid-19',
 2: 'we',
 3: 'however',
 4: 'people',
 5: 'driver',
 6: 'faster',
 7: 'not',
 8: 'and',
 9: 'present',
 10: 'making',
 11: 'estimated',
 12: 'while',
 13: 'shed',
 14: 'appear',
 15: 'to',
 16: 'both',
 17: 'days',
 18: 'point',
 19: 'learning',
 20: 'very',
 21: 'further',
 22: 'infections',
 23: 'infection',
 24: 'hours',
 25: 'incubation',
 26: '3-5',
 27: '25',
 28: 'influenza',
 29: 'cases',
 30: 'direct',
 31: 'time',
 32: 'viruses',
 33: 'can',
 34: 'higher',
 35: 'context',
 36: 'before',
 37: 'understood',
 38: '–',
 39: 'there',
 40: 'infected',
 41: 'of',
 42: 'at',
 43: 'generated',
 44: 'estimates',
 45: 'or',
 46: 'virus',
 47: 'onset',
 48: 'pre-symptomatic',
 49: 'serial',
 50: 'individual',
 51: 'is',
 52: 'interval',
 53: 'that',
 54: 'difficult',
 55: 'time-specific',
 56: 'spread',
 57: 'the',
 58: 'in',
 59: 'major',
 60: 'appearance',
 61: 'are',
 62: 'between',
 63: 'comparisons',
 64: 'transmission',
 65: 'more',
 66: 'speed'

In [12]:
# generate context-word pairs
context_window = 2
embed_dims = 100

In [13]:
x_train=[]
y_train=[]

for index, i in enumerate(words):
    
    # edge case
    if index <=context_window-1 or index >= len(words)-context_window:
        continue
        
    #sliding context window
    start = index - context_window
    end = index + context_window +1
    
    #here, 2 words left and 2 words right of target word
    context = words[start:end]
    context.remove(i)
    target=i
    
    #data
    x_train.append([word_to_idx[w] for w in context])
    y_train.append([word_to_idx[target]])

In [14]:
x_train = np.array(x_train)
y_train = np.array(y_train)

In [15]:
y_train = to_categorical(y_train, num_classes=vocab_size)

In [16]:
# model building
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_dims, input_length=2*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_dims,)))
cbow.add(Dense(vocab_size, activation='softmax'))

In [17]:
cbow.compile(loss='categorical_crossentropy', optimizer=SGD(), metrics=['accuracy'])

In [18]:
cbow.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            10000     
                                                                 
 lambda (Lambda)             (None, 100)               0         
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
Total params: 20,100
Trainable params: 20,100
Non-trainable params: 0
_________________________________________________________________


In [1]:
#cbow.fit(x_train, y_train, epochs=150000, batch_size=256)

In [25]:
preds = cbow.predict(x_train)



In [26]:
preds.shape

(182, 100)

In [27]:
pred = preds.argmax(axis=1)
pred.shape

(182,)

In [28]:
i=0
for context in x_train:
    print("...............................")
    print("===========Context :===========")
    for w in context:
        print(idx_to_word[w])
    print("=========Predicted Target :====")
    print(idx_to_word[pred[i]])
    i+=1

...............................
the
speed
transmission
is
of
...............................
speed
of
is
an
transmission
...............................
of
transmission
an
important
is
...............................
transmission
is
important
point
an
...............................
is
an
point
of
important
...............................
an
important
of
difference
point
...............................
important
point
difference
between
of
...............................
point
of
between
the
difference
...............................
of
difference
the
two
between
...............................
difference
between
two
viruses
the
...............................
between
the
viruses
influenza
two
...............................
the
two
influenza
has
viruses
...............................
two
viruses
has
a
influenza
...............................
viruses
influenza
a
shorter
has
...............................
influenza
has
shorter
median
a
...............................
has
a
median
inc