## In this notebook we will try to implement the CBOWS and Skipgram models

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import re
import string
torch.manual_seed(1)

<torch._C.Generator at 0x7ef833fb9590>

In [2]:
from sklearn.model_selection import train_test_split

In [3]:

def clean_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation using string.punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove extra whitespaces
    text = re.sub('\s+', ' ', text).strip()

    return text

In [None]:
with open('old-man-and-the-sea.txt', 'r') as file:
    text = file.read()

print(text)

In [5]:
cleaned_text = clean_text(text)
print(cleaned_text)

what i want you to do said mr george wright as he leaned towards the old sailor is to be an uncle to me aye aye said the mystified mr kemp pausing with a mug of beer midway to his lips a rich uncle continued the young man lowering his voice to prevent any keen ears in the next bar from acquiring useless knowledge an uncle from new zealand who is going to leave me all is money wheres it coming from demanded mr kemp with a little excitement it aint coming was the reply youve only got to say youve got it fact of the matter is ive got my eye on a young lady theres another chap after er too and if she thought id got a rich uncle it might make all the difference she knows i ad an uncle that went to new zealand and was never heard of since thats what made me think of it mr kemp drank his beer in thoughtful silence how can i be a rich uncle without any brass he inquired at length i should ave to lend you somea little said mr wright illustration what i want you to do said mr george wright is to

In [6]:
cleaned_text = cleaned_text.split()

In [7]:
print(cleaned_text)

['what', 'i', 'want', 'you', 'to', 'do', 'said', 'mr', 'george', 'wright', 'as', 'he', 'leaned', 'towards', 'the', 'old', 'sailor', 'is', 'to', 'be', 'an', 'uncle', 'to', 'me', 'aye', 'aye', 'said', 'the', 'mystified', 'mr', 'kemp', 'pausing', 'with', 'a', 'mug', 'of', 'beer', 'midway', 'to', 'his', 'lips', 'a', 'rich', 'uncle', 'continued', 'the', 'young', 'man', 'lowering', 'his', 'voice', 'to', 'prevent', 'any', 'keen', 'ears', 'in', 'the', 'next', 'bar', 'from', 'acquiring', 'useless', 'knowledge', 'an', 'uncle', 'from', 'new', 'zealand', 'who', 'is', 'going', 'to', 'leave', 'me', 'all', 'is', 'money', 'wheres', 'it', 'coming', 'from', 'demanded', 'mr', 'kemp', 'with', 'a', 'little', 'excitement', 'it', 'aint', 'coming', 'was', 'the', 'reply', 'youve', 'only', 'got', 'to', 'say', 'youve', 'got', 'it', 'fact', 'of', 'the', 'matter', 'is', 'ive', 'got', 'my', 'eye', 'on', 'a', 'young', 'lady', 'theres', 'another', 'chap', 'after', 'er', 'too', 'and', 'if', 'she', 'thought', 'id', 'go

In [8]:
split_index = int(len(cleaned_text) * 0.8)

In [9]:
train = cleaned_text[:split_index]
test = cleaned_text[split_index:]

In [10]:
print(train)

['what', 'i', 'want', 'you', 'to', 'do', 'said', 'mr', 'george', 'wright', 'as', 'he', 'leaned', 'towards', 'the', 'old', 'sailor', 'is', 'to', 'be', 'an', 'uncle', 'to', 'me', 'aye', 'aye', 'said', 'the', 'mystified', 'mr', 'kemp', 'pausing', 'with', 'a', 'mug', 'of', 'beer', 'midway', 'to', 'his', 'lips', 'a', 'rich', 'uncle', 'continued', 'the', 'young', 'man', 'lowering', 'his', 'voice', 'to', 'prevent', 'any', 'keen', 'ears', 'in', 'the', 'next', 'bar', 'from', 'acquiring', 'useless', 'knowledge', 'an', 'uncle', 'from', 'new', 'zealand', 'who', 'is', 'going', 'to', 'leave', 'me', 'all', 'is', 'money', 'wheres', 'it', 'coming', 'from', 'demanded', 'mr', 'kemp', 'with', 'a', 'little', 'excitement', 'it', 'aint', 'coming', 'was', 'the', 'reply', 'youve', 'only', 'got', 'to', 'say', 'youve', 'got', 'it', 'fact', 'of', 'the', 'matter', 'is', 'ive', 'got', 'my', 'eye', 'on', 'a', 'young', 'lady', 'theres', 'another', 'chap', 'after', 'er', 'too', 'and', 'if', 'she', 'thought', 'id', 'go

In [11]:
print(test)

['his', 'sex', 'and', 'then', 'merely', 'in', 'the', 'interests', 'of', 'natural', 'science', 'dont', 'you', 'worry', 'he', 'said', 'as', 'the', 'other', 'paused', 'from', 'exhaustion', 'it', 'wont', 'be', 'for', 'long', 'now', 'long', 'said', 'mr', 'wright', 'panting', 'first', 'thing', 'tomorrow', 'morning', 'you', 'have', 'a', 'telegram', 'calling', 'you', 'backa', 'telegram', 'that', 'must', 'be', 'minded', 'dye', 'see', 'no', 'i', 'dont', 'said', 'mr', 'kemp', 'plainly', 'im', 'not', 'going', 'back', 'never', 'no', 'morenever', 'im', 'going', 'to', 'stop', 'here', 'and', 'court', 'mrs', 'bradshaw', 'mr', 'wright', 'fought', 'for', 'breath', 'youyou', 'cant', 'he', 'gasped', 'im', 'going', 'to', 'have', 'a', 'try', 'said', 'the', 'old', 'man', 'im', 'sick', 'of', 'going', 'to', 'sea', 'and', 'itll', 'be', 'a', 'nice', 'comfortable', 'home', 'for', 'my', 'old', 'age', 'you', 'marry', 'bella', 'and', 'ill', 'marry', 'her', 'mother', 'happy', 'family', 'mr', 'wright', 'trembling', 'wi

In [12]:
vocab, _ = train_test_split(cleaned_text, test_size=0.2, random_state=42)

In [13]:
vocab = set(vocab)

In [14]:
vocab_size = len(vocab)

In [15]:
print(vocab)

{'moment', 'family', 'hesitating', 'listen', 'gasped', 'happen', 'hadnt', 'led', 'her', 'admiring', 'smokes', 'reward', 'reply', 'communicated', 'let', 'deal', 'determination', 'affair', 'drank', 'pausedjust', 'admission', 'beside', 'sailor', 'face', 'must', 'steps', 'meant', 'too', 'last', 'hes', 'cold', 'grin', 'wind', 'long', 'turned', 'court', 'settle', 'leave', 'dignity', 'overdoing', 'chain', 'open', 'eyes', 'produced', 'coughing', 'quid', 'zealand', 'dream', 'plainly', 'hand', 'street', 'well', 'home', 'wants', 'lose', 'interrupted', 'secure', 'had', 'mother', 'praps', 'seems', 'cigars', 'difficulties', 'finally', 'feminine', 'towards', 'chilljust', 'expenditure', 'us', 'another', 'wrights', 'easy', 'can', 'wealth', 'something', 'watch', 'charmed', 'fancy', 'says', 'following', 'cough', 'words', 'breakfast', 'simple', 'dear', 'lasted', 'sun', 'calling', 'a', 'emptied', 'uncle', 'his', 'annoyance', 'year', 'penetrating', 'difference', 'having', 'three', 'either', 'natural', 'dime

In [16]:
#vocab_test = set(test)
ct1= 0
for word in train:
  if word not in vocab:
    ct1+=1

print(ct1)
print(len(vocab))
print(len(test))
print(len(train))

86
919
804
3216


In [17]:
ct=0
for word in test:
  if word not in vocab:
    ct+=1
print(ct)

30


In [18]:
word_to_ix = {word: i+1 for i, word in enumerate(vocab)}

In [19]:
word_to_ix['<UNK>'] = 0

In [20]:
print(word_to_ix['i'])

553


In [22]:
ix_to_word = {}

In [23]:
for key, val in word_to_ix.items():
  ix_to_word[val] = key

In [25]:
CONTEXT_SIZE = 2

In [26]:
data_train = []
for i in range(CONTEXT_SIZE, len(train) - CONTEXT_SIZE):
    context = (
        [train[i - j - 1] for j in range(CONTEXT_SIZE)]
        + [train[i + j + 1] for j in range(CONTEXT_SIZE)]
    )
    target = train[i]
    data_train.append((context, target))
print(data_train[:5])


[(['i', 'what', 'you', 'to'], 'want'), (['want', 'i', 'to', 'do'], 'you'), (['you', 'want', 'do', 'said'], 'to'), (['to', 'you', 'said', 'mr'], 'do'), (['do', 'to', 'mr', 'george'], 'said')]


In [27]:
data_test = []
for i in range(CONTEXT_SIZE, len(test) - CONTEXT_SIZE):
    context = (
        [test[i - j - 1] for j in range(CONTEXT_SIZE)]
        + [test[i + j + 1] for j in range(CONTEXT_SIZE)]
    )
    target = test[i]
    data_test.append((context, target))
print(data_test[:5])

[(['sex', 'his', 'then', 'merely'], 'and'), (['and', 'sex', 'merely', 'in'], 'then'), (['then', 'and', 'in', 'the'], 'merely'), (['merely', 'then', 'the', 'interests'], 'in'), (['in', 'merely', 'interests', 'of'], 'the')]


In [28]:
def make_context_vector(batches, word_to_ix, vocab):
  f_idx = []
  f_trg = []
  for batch in batches:
    lst = []
    trg = []
    for idx, item in enumerate(batch):
      context = item[0]
      target = item[1]
      lst.append(context)
      trg.append(target)
    idxs = [[word_to_ix[w] if w in vocab else 0 for w in sample] for sample in lst]
    target = [word_to_ix[sample] if sample in vocab else 0 for sample in trg]

    f_idx.append(idxs)
    f_trg.append(target)
  return torch.tensor(f_idx, dtype=torch.long), torch.tensor(f_trg,dtype=torch.long)

In [29]:
batch_size = 64
num_batches = len(data_train) // batch_size
batches = [data_train[i*batch_size:(i+1)*batch_size] for i in range(num_batches)]



In [30]:
inputs,targets = make_context_vector(batches, word_to_ix,vocab)

In [31]:
print(inputs.shape)
print(targets.shape)

torch.Size([50, 64, 4])
torch.Size([50, 64])


# CBOW Model

In [32]:
class CBOWModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOWModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size+1, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size+1)

    def forward(self, inputs):
        embedded = self.embeddings(inputs).mean(dim=1)
        output = self.linear(embedded)
        return output

In [33]:
EMBEDDING_DIM = 300
LEARNING_RATE = 0.001

In [34]:
model = CBOWModel(vocab_size, EMBEDDING_DIM)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [35]:
NUM_EPOCHS=30

In [None]:
for epoch in range(NUM_EPOCHS):
        id = 0
        total_loss = 0
        for input,target in zip(inputs,targets):
            optimizer.zero_grad()
            outputs = model(input)
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Batch {id+1}/50, Loss: {loss.item()}")
            id+=1
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {total_loss/50}")

In [37]:
context_tensor ,target = inputs[0], targets[0]

In [38]:
outputs = model(context_tensor[1].unsqueeze(0))
print(torch.argmax(outputs))
print(target[1])
#print(predicted_word_index)

tensor(498)
tensor(498)


In [39]:
inputs_test,targets_test = make_context_vector([data_test], word_to_ix,vocab)

In [40]:
print(inputs_test.shape)
print(targets_test.shape)

torch.Size([1, 800, 4])
torch.Size([1, 800])


In [41]:

test_vectors = inputs_test[0]
for i in range(100,200):
  print(data_test[i])
  output_test = model(test_vectors[i].unsqueeze(0))
  print(ix_to_word[int(torch.argmax(output_test))])


(['be', 'itll', 'nice', 'comfortable'], 'a')
only
(['a', 'be', 'comfortable', 'home'], 'nice')
i
(['nice', 'a', 'home', 'for'], 'comfortable')
in
(['comfortable', 'nice', 'for', 'my'], 'home')
in
(['home', 'comfortable', 'my', 'old'], 'for')
in
(['for', 'home', 'old', 'age'], 'my')
man
(['my', 'for', 'age', 'you'], 'old')
take
(['old', 'my', 'you', 'marry'], 'age')
said
(['age', 'old', 'marry', 'bella'], 'you')
said
(['you', 'age', 'bella', 'and'], 'marry')
have
(['marry', 'you', 'and', 'ill'], 'bella')
<UNK>
(['bella', 'marry', 'ill', 'marry'], 'and')
money
(['and', 'bella', 'marry', 'her'], 'ill')
at
(['ill', 'and', 'her', 'mother'], 'marry')
<UNK>
(['marry', 'ill', 'mother', 'happy'], 'her')
money
(['her', 'marry', 'happy', 'family'], 'mother')
of
(['mother', 'her', 'family', 'mr'], 'happy')
of
(['happy', 'mother', 'mr', 'wright'], 'family')
said
(['family', 'happy', 'wright', 'trembling'], 'mr')
mr
(['mr', 'family', 'trembling', 'with'], 'wright')
kemp
(['wright', 'mr', 'with', 'ra

In [51]:
test_case_words=['was','i','and','that']
test_case_nums = []
for w in test_case_words:
  if w in vocab:
    test_case_nums.append(word_to_ix[w])
  else:
    test_case_nums.append(0)

test_case_tensor = torch.tensor(test_case_nums)
print(test_case_nums)

output_test = model(test_case_tensor.unsqueeze(0))
print(ix_to_word[int(torch.argmax(output_test))])

[204, 553, 465, 878]
like


# Skipgram Model

In [89]:
class SkipgramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipgramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, vocab_size)
        self.linear2 = nn.Linear(embedding_dim, vocab_size)
        self.linear3 = nn.Linear(embedding_dim, vocab_size)
        self.linear4 = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        embedded = self.embeddings(inputs)
        output1 = self.linear1(embedded)
        output2 = self.linear2(embedded)
        output3 = self.linear3(embedded)
        output4 = self.linear4(embedded)
        return output1, output2, output3, output4

In [None]:
print(inputs)

In [78]:
targets1_skip ,targets2_skip, targets3_skip, targets4_skip = inputs[:,:,0], inputs[:,:,1], inputs[:,:,2], inputs[:,:,3]

In [79]:
inputs_skip = targets

In [None]:
for target1,target2,target3,target4,input in zip(targets1_skip,targets2_skip,targets3_skip,targets4_skip,inputs_skip):
  print(target1,target2,target3,target4,input)

In [133]:
NUM_EPOCHS = 500
LEARNING_RATE = 0.005

In [134]:
model_skip = SkipgramModel(vocab_size+1, EMBEDDING_DIM)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_skip.parameters(), lr=LEARNING_RATE)

In [None]:
for epoch in range(NUM_EPOCHS):
        id = 0
        total_loss = 0
        for target1,target2,target3,target4,input in zip(targets1_skip,targets2_skip,targets3_skip,targets4_skip,inputs_skip):
            optimizer.zero_grad()
            output1, output2, output3, output4 = model_skip(input)
            #print(output1.shape)
            loss1 = criterion(output1, target1)
            loss2 = criterion(output2, target2)
            loss3 = criterion(output3, target3)
            loss4 = criterion(output4, target4)
            loss = (loss1+loss2+loss3+loss4)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            #print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Batch {id+1}/50, Loss: {loss.item()}")
            id+=1
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {total_loss/50}")

In [105]:
inputs_skip[0][0]

tensor(349)

In [136]:
for i in range(10):
  output1, output2, output3, output4 = model_skip(inputs_skip[0][i].unsqueeze(0))
  print(data_train[i])
  print('------')
  print(ix_to_word[int(torch.argmax(output1))])
  print(ix_to_word[int(torch.argmax(output2))])
  print(ix_to_word[int(torch.argmax(output3))])
  print(ix_to_word[int(torch.argmax(output4))])
  print('--------------------------------')

(['i', 'what', 'you', 'to'], 'want')
------
i
what
to
sixpenny
--------------------------------
(['want', 'i', 'to', 'do'], 'you')
------
did
i
go
got
--------------------------------
(['you', 'want', 'do', 'said'], 'to')
------
<UNK>
the
the
the
--------------------------------
(['to', 'you', 'said', 'mr'], 'do')
------
itll
you
said
go
--------------------------------
(['do', 'to', 'mr', 'george'], 'said')
------
he
to
mr
wright
--------------------------------
(['said', 'do', 'george', 'wright'], 'mr')
------
said
the
kemp
<UNK>
--------------------------------
(['mr', 'said', 'wright', 'as'], 'george')
------
mr
the
wright
said
--------------------------------
(['george', 'mr', 'as', 'he'], 'wright')
------
mr
said
with
the
--------------------------------
(['wright', 'george', 'he', 'leaned'], 'as')
------
far
mr
he
as
--------------------------------
(['as', 'wright', 'leaned', 'towards'], 'he')
------
as
his
said
in
--------------------------------
