## Importing required libraries

In [1]:
from tqdm import tqdm
import logging
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split


from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.vocab import GloVe, vocab
from torchtext.datasets import AG_NEWS
from torchtext.data.functional import to_map_style_dataset


from torchdata.datapipes.iter import IterableWrapper, Mapper

from gensim.models import Word2Vec

import seaborn as sns
from sklearn.manifold import TSNE
from IPython.core.display import display, SVG

%matplotlib inline

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

  from IPython.core.display import display, SVG


#### Checking if CUDA is available

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

Using cuda device


#### Defing a funtion to plot word embeddings in 2d space

In [2]:
def plot_embeddings(word_embeddings, vocab=vocab):

    # performing t-SNE on the embeddings to reduce their dimentionality to 2D using "TSNE" function from "sklearn" libary
    tsne = TSNE(n_components=2, random_state=0)
    word_embeddings_2d = tsne.fit_transform(word_embeddings)

    # plotting the results with labels from vocab
    plt.figure(figsize=(15,15))
    for i, word in enumerate(vocab.get_itos()):
        plt.scatter(word_embeddings_2d[i,0], word_embeddings_2d[i,1])
        plt.annotate(word, (word_embeddings_2d[i,0], word_embeddings_2d[i,1]))

    plt.xlabel("t-SNE component 1")
    plt.ylabel("t-SNE component 2")
    plt.title("Word Embeddings visualized with t-SNE")
    plt.show()

#### Defining a function to return similar words to a given word by calculating Cosine distance~

In [3]:
def find_similar_word(word, word_embeddings, top_k=5):
    if word not in word_embeddings:
        print("Word not found in embeddings.")
        return []

    target_embedding = word_embeddings[word]

    # calculating cosine distange with all words
    similarities = {}
    for w, embedding in word_embeddings.items():
        if w != word:
            similarity = torch.dot(target_embedding, embedding) / (torch.norm(traget_embedding) * torch.norm(embedding))
            similarities[w] = similarity.item()

    # soting the similarities
    sorted_similarities = sorted(similarities.items(), key= lambda x: x[1], reverse=True)

    # returning the tok k similar words
    most_similar_word = [w for w, _ in sorted_similarities[:tok_k]]
    return most_similar_words

#### A toy dataset

In [4]:
toy_data = """I wish I was little bit taller
I wish I was a baller
She wore a small black dress to the party
The dog chased a big red ball in the park
He had a huge smile on his face when he won the race
The tiny kitten played with a fluffy toy mouse
The team celebrated their victory with a grand parade
She bought a small, delicate necklace for her sister
The mountain peak stood majestic and tall against the clear blue sky
The toddler took small, careful steps as she learned to walk
The house had a spacious backyard with a big swimming pool
He felt a sense of accomplishment after completing the challenging puzzle
The chef prepared a delicious, flavorful dish using fresh ingredients
The children played happily in the small, cozy room
The book had an enormous impact on readers around the world
The wind blew gently, rustling the leaves of the tall trees
She painted a beautiful, intricate design on the small canvas
The concert hall was filled with thousands of excited fans
The garden was adorned with colorful flowers of all sizes
I hope to achieve great success in my chosen career path
The skyscraper towered above the city, casting a long shadow
He gazed in awe at the breathtaking view from the mountaintop
The artist created a stunning masterpiece with bold brushstrokes
The baby took her first steps, a small milestone that brought joy to her parents
The team put in a tremendous amount of effort to win the championship
The sun set behind the horizon, painting the sky in vibrant colors
The professor gave a fascinating lecture on the history of ancient civilizations
The house was filled with laughter and the sound of children playing
She received a warm, enthusiastic welcome from the audience
The marathon runner had incredible endurance and determination
The child's eyes sparkled with excitement upon opening the gift
The ship sailed across the vast ocean, guided by the stars
The company achieved remarkable growth in a short period of time
The team worked together harmoniously to complete the project
The puppy wagged its tail, expressing its happiness and affection
She wore a stunning gown that made her feel like a princess
The building had a grand entrance with towering columns
The concert was a roaring success, with the crowd cheering and clapping
The baby took a tiny bite of the sweet, juicy fruit
The athlete broke a new record, achieving a significant milestone in her career
The sculpture was a masterpiece of intricate details and craftsmanship
The forest was filled with towering trees, creating a sense of serenity
The children built a small sandcastle on the beach, their imaginations running wild
The mountain range stretched as far as the eye could see, majestic and awe-inspiring
The artist's brush glided smoothly across the canvas, creating a beautiful painting
She received a small token of appreciation for her hard work and dedication
The orchestra played a magnificent symphony that moved the audience to tears
The flower bloomed in vibrant colors, attracting butterflies and bees
The team celebrated their victory with a big, extravagant party
The child's laughter echoed through the small room, filling it with joy
The sunflower stood tall, reaching for the sky with its bright yellow petals
The city skyline was dominated by tall buildings and skyscrapers
The cake was adorned with a beautiful, elaborate design for the special occasion
The storm brought heavy rain and strong winds, causing widespread damage
The small boat sailed peacefully on the calm, glassy lake
The artist used bold strokes of color to create a striking and vivid painting
The couple shared a passionate kiss under the starry night sky
The mountain climber reached the summit after a long and arduous journey
The child's eyes widened in amazement as the magician performed his tricks
The garden was filled with the sweet fragrance of blooming flowers
The basketball player made a big jump and scored a spectacular slam dunk
The cat pounced on a small mouse, displaying its hunting instincts
The mansion had a grand entrance with a sweeping staircase and chandeliers
The raindrops fell gently, creating a rhythmic patter on the roof
The baby took a big step forward, encouraged by her parents' applause
The actor delivered a powerful and emotional performance on stage
The butterfly fluttered its delicate wings, mesmerizing those who watched
The company launched a small-scale advertising campaign to test the market
The building was constructed with strong, sturdy materials to withstand earthquakes
The singer's voice was powerful and resonated throughout the concert hall
The child built a massive sandcastle with towers, moats, and bridges
The garden was teeming with a variety of small insects and buzzing bees
The athlete's muscles were well-developed and strong from years of training
The sun cast long shadows as it set behind the mountains
The couple exchanged heartfelt vows in a beautiful, intimate ceremony
The dog wagged its tail vigorously, a sign of excitement and happiness
The baby let out a tiny giggle, bringing joy to everyone around"""


#### Tokenizing and building the vocabulary

In [5]:
# The "basic_english" tokenizer from "torchtext" library
tokenizer = get_tokenizer("basic_english")

# A function to get tokenized text for one document at a time
def tokenize_data(sentences):
    for sentence in sentences:
        yield tokenizer(sentence)

# tokenizing the entire toy dataset
tokenized_toy_data = tokenizer(toy_data)

# building the vocabulary using 'bulid_vocab_from_iterator' function from "torchtext" library
vocab = build_vocab_from_iterator(tokenize_data(tokenized_toy_data), specials=['<unk>'])
vocab.set_default_index(vocab["<unk>"]) # This index will be returned when OOV token is queried

# printing the tokenized text and token indices of 1st example document
print("Example Document:-   ",toy_data.split('\n')[0])
print("Tokenized Document:- ",tokenizer(toy_data.split('\n')[0]))
print("Token Indices:-      ", vocab(tokenizer(toy_data.split('\n')[0])))

Example Document:-    I wish I was little bit taller
Tokenized Document:-  ['i', 'wish', 'i', 'was', 'little', 'bit', 'taller']
Token Indices:-       [20, 108, 20, 7, 272, 136, 376]


## Continuous Bag of Words (CBOW) model

#### Pre-processing the data for training

In [33]:
# generating target and context traing data
CONTEXT_SIZE = 2
cobow_data = []
for i in range(1, len(tokenized_toy_data)-CONTEXT_SIZE):
    
    context = (
        [tokenized_toy_data[i-j-1] for j in range(CONTEXT_SIZE)]+
        [tokenized_toy_data[i+j+1] for j in range(CONTEXT_SIZE)]
    )

    target = tokenized_toy_data[i]
    cobow_data.append((context, target))

print("Example trining data:- ([......Context words......], Traget word)")
print(f"                       {cobow_data[0]}")   
print(f"Token Indices       :- ({vocab(cobow_data[0][0])}, {vocab[cobow_data[0][1]]})")

Example trining data:- ([......Context words......], Traget word)
                       (['i', 'around', 'i', 'was'], 'wish')
Token Indices       :- ([20, 51, 20, 7], 108)


In [43]:
# a function to convert the pre-processed data into tensors for each "batch" from the "dataloader"
def collate_batch(batch):
    target_list, context_list, offsets = [], [], [0]

    for _context, _target in batch:
        target_list.append(vocab[_target])
        context_tensor = torch.tensor(vocab(_context), dtype=torch.int64)
        context_list.append(context_tensor)
        offsets.append(context_tensor.size(0))

    target_list_tensor = torch.tensor(target_list, dtype=torch.int64)
    context_list_tensor = torch.cat(context_list)
    offesets_tensor = torch.tensor(offsets[:-1]).cumsum(dim=0)
    
    return target_list_tensor.to(device), context_list_tensor.to(device), offesets_tensor.to(device)

# processing the first 10 traning data using the collate_batch function
target_tensor, context_tensor, offsets_tensor = collate_batch(cobow_data[0:10])
print("The first 10 traning data tensors -")
print("\noffsets_tensor :-\n\n", offsets_tensor)
print("\n\ncontext_tensor:-\n\n", context_tensor)
print("\n\ntarget_tensor:-\n\n", target_tensor)

The first 10 traning data tensors -

offsets_tensor :-

 tensor([ 0,  4,  8, 12, 16, 20, 24, 28, 32, 36], device='cuda:0')


context_tensor:-

 tensor([ 20,  51,  20,   7, 108,  20,   7, 272,  20, 108, 272, 136,   7,  20,
        136, 376, 272,   7, 376,  20, 136, 272,  20, 108, 376, 136, 108,  20,
         20, 376,  20,   7, 108,  20,   7,   2,  20, 108,   2, 133],
       device='cuda:0')


target_tensor:-

 tensor([108,  20,   7, 272, 136, 376,  20, 108,  20,   7], device='cuda:0')


#### Creating dataloaders for ML model

In [47]:
BATCH_SIZE = 64

# creating dataloaders using "DataLoader" function from "pytorch" library
dataloader_cbow = DataLoader(
    cobow_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch
)