# Assignment 1: Practical Deep Learning for Language Processing (DS405B)
submitted by Tim-Moritz Bündert (ID: 5635975) on November 23, 2021

In [None]:
# Mount Google Drive at /content/drive so that files stored there can be read and written.
# Requires the google.colab package, i.e. a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory on the mounted Drive; Part B passes it as `cache` to the torchtext vector loaders.
data_dir = "/content/drive/MyDrive/University/DS405B_PDL_for_LP/Assignment_1/vocab/"

# Part A - Learning Word2Vec on Custom Dataset using Continuous Bag of Words

## Installing and loading packages

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.15.1-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 4.9 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 52.7 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.4 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 46.4 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 33.6 MB/s 
Collecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.1-py3-none-any.whl (5.7 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.man

In [None]:
from datasets import load_dataset
import torchtext
import torch
import torch.nn as nn
import numpy as np
import random
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

## Load data and tokenizer

In [None]:
# Fetch the training split of the AG News dataset from the Hugging Face hub.
data_train = load_dataset("ag_news", split="train")

Downloading:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset ag_news/default (download: 29.88 MiB, generated: 30.23 MiB, post-processed: Unknown size, total: 60.10 MiB) to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548...


Downloading:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/751k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset ag_news downloaded and prepared to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548. Subsequent calls will reuse this data.


In [None]:
# torchtext's "basic_english" tokenizer; the sample below shows that it lower-cases the text
# and splits punctuation into separate tokens.
tokenizer = torchtext.data.get_tokenizer("basic_english")
tokens = tokenizer('This is an example sentence.') # ensure that tokenizer works
tokens

['this', 'is', 'an', 'example', 'sentence', '.']

## Build custom PyTorch `Dataset` class

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that serves the items of an indexable collection of texts."""

    def __init__(self, text_data):
        # The collection is stored as given; nothing is copied or preprocessed here.
        self.samples = text_data

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.samples[idx]

    def __len__(self):
        """Return how many samples the dataset holds."""
        return len(self.samples)
    
# Only the "text" column of the Hugging Face dataset is wrapped, so each item is one raw text.
dataset_train = TextDataset(data_train["text"]) # build class from the imported Hugging Face dataset

## Create vocabulary

In [None]:
# Build the vocabulary from the tokenized training texts; "<unk>" is registered as the only
# special token and min_freq=50 sets the minimum frequency a token needs to be included.
vocab = torchtext.vocab.build_vocab_from_iterator(map(tokenizer, dataset_train), specials=["<unk>"], min_freq=50)
# Tokens that are not part of the vocabulary are looked up as "<unk>".
vocab.set_default_index(vocab["<unk>"])

In [None]:
# Sanity check: convert a short list of tokens into their vocabulary indices.
vocab(['here', 'is', 'an', 'example'])

[475, 21, 30, 5297]

## Set up Dataloader

In [None]:
WINDOW_SIZE = 11 # length of one sliding window: the centre (target) token plus WINDOW_SIZE - 1 context tokens
MAX_SEQUENCE_LENGTH = 256 # texts are cut to this many tokens before windows are extracted

def collate_batches_BOW(batch):
  """Turn a batch of raw texts into CBOW training pairs.

  Every text is tokenized and mapped to token ids with the module-level
  ``tokenizer`` and ``vocab``. Texts with fewer than WINDOW_SIZE tokens are
  skipped; longer ones are truncated to MAX_SEQUENCE_LENGTH tokens. A window
  of WINDOW_SIZE tokens is then slid over the text one position at a time:
  the middle token of each window is the target, the remaining tokens are
  the context.

  Args:
    batch: iterable of raw text strings.

  Returns:
    Tuple ``(batch_input, batch_output)`` of ``torch.long`` tensors with
    shapes ``(num_windows, WINDOW_SIZE - 1)`` and ``(num_windows,)``.
  """
  centre = WINDOW_SIZE // 2
  batch_input, batch_output = [], []
  for text in batch:
    text_token_ids = vocab(tokenizer(text)) # transform text into sequence of token IDs

    if len(text_token_ids) < WINDOW_SIZE:
      continue # not enough tokens for a single full window

    text_token_ids = text_token_ids[:MAX_SEQUENCE_LENGTH]

    for start in range(len(text_token_ids) - (WINDOW_SIZE - 1)):
      window = text_token_ids[start: start + WINDOW_SIZE]
      # context = everything in the window except the centre token
      batch_input.append(window[:centre] + window[centre + 1:])
      batch_output.append(window[centre])

  # The explicit shape keeps the context tensor two-dimensional even when every text in the
  # batch was too short, so the model's mean over dimension 1 stays valid.
  batch_input = torch.tensor(batch_input, dtype=torch.long).reshape(len(batch_output), WINDOW_SIZE - 1)
  batch_output = torch.tensor(batch_output, dtype=torch.long)
  return batch_input, batch_output

train_dataloader = DataLoader(dataset_train, batch_size=96, shuffle=True, collate_fn=collate_batches_BOW)

## Define `word2vecModel`

In [None]:
class word2vecModel(nn.Module):
  """CBOW model: the context word embeddings are averaged and passed through one linear layer."""

  def __init__(self, vocab_size, embedding_dim=300):
    """Create the embedding table and the output layer.

    Args:
      vocab_size: number of rows in the embedding table and number of output scores.
      embedding_dim: length of each word vector (default 300).
    """
    super().__init__()

    # `embeddings` is registered before `linear`, so its weight is the first entry of parameters().
    self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
    self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

  def forward(self, x):
    """Return one score per vocabulary entry for every row of context token ids in ``x``."""
    x = self.embeddings(x)
    x = x.mean(dim=1) # for CBOW: averaging of context (input) words, see https://arxiv.org/pdf/1301.3781.pdf [alternative: use EmbeddingBag]
    x = self.linear(x)
    return x

## Training of the CBOW model

In [None]:
model = word2vecModel(vocab_size=len(vocab))
# Use the GPU when one is present and fall back to the CPU otherwise, so the cell also runs
# on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

word2vecModel(
  (embeddings): Embedding(7934, 300)
  (linear): Linear(in_features=300, out_features=7934, bias=True)
)

In [None]:
# Cross-entropy over the vocabulary scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

num_epochs = 15
for epoch in range(num_epochs):
  running_loss = [] # one loss value per batch of the current epoch
  for inputs, labels in train_dataloader:
    # move the (context, target) pair of this batch to the device the model lives on
    inputs, labels = inputs.to(device), labels.to(device)

    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    running_loss.append(loss.item())

  # the reported value is the plain mean of the per-batch losses
  print(f'Epoch: {epoch} Training Loss: {np.mean(running_loss)}')

  # Note: experimenting with different window sizes (i.e., 7, 9, 11) and learning rates (i.e., 0.01, 0.02), all models performed similarly in terms of the training loss.
  # The present configuration yielded the (slightly) best performance in terms of the training loss.

Epoch: 0 Training Loss: 4.939947088241577
Epoch: 1 Training Loss: 4.266551254463196
Epoch: 2 Training Loss: 4.05287971534729
Epoch: 3 Training Loss: 3.9275704530715942
Epoch: 4 Training Loss: 3.84435050239563
Epoch: 5 Training Loss: 3.784074138069153
Epoch: 6 Training Loss: 3.738144139289856
Epoch: 7 Training Loss: 3.702759964752197
Epoch: 8 Training Loss: 3.674602738380432
Epoch: 9 Training Loss: 3.65082503528595
Epoch: 10 Training Loss: 3.6314190202713013
Epoch: 11 Training Loss: 3.615622275733948
Epoch: 12 Training Loss: 3.601422473716736
Epoch: 13 Training Loss: 3.5891007692337036
Epoch: 14 Training Loss: 3.578954795837402


## Retrieve embeddings and list nearest neighbors of seed words

In [None]:
# Read the embedding table by name instead of by its position in model.parameters(), and
# convert it to a NumPy array on the CPU in one step.
embeddings = model.embeddings.weight.detach().cpu().numpy()

In [None]:
def cosine_similarity(word_vec, words_mat, eps=1e-6): # counterpart of torch.nn.CosineSimilarity(dim=1, eps=1e-6)
  """Cosine similarity between one vector and every row of a matrix.

  Args:
    word_vec: vector with d entries.
    words_mat: array of shape (n, d) holding one vector per row.
    eps: lower bound applied to the product of the two norms, so that a
      zero-norm vector yields a similarity of 0 instead of NaN.

  Returns:
    1-D array with n entries; entry i is the cosine similarity between
    ``word_vec`` and row i of ``words_mat``.
  """
  word_vec = np.ravel(word_vec)
  numerator = np.matmul(words_mat, word_vec) # inner product of word_vec with each row (word vector) of words_mat

  word_vec_norm = np.sqrt(np.sum(word_vec**2))
  words_mat_norms = np.sqrt(np.sum(words_mat**2, axis = 1))

  # clamp the denominator so that all-zero vectors do not cause a division by zero
  denominator = np.maximum(word_vec_norm * words_mat_norms, eps)

  return np.true_divide(numerator, denominator)

In [None]:
def nearest_neighbours(word: str, topN: int = 10):
    """Find the vocabulary words whose embeddings are closest to that of ``word``.

    Uses the module-level ``vocab``, ``embeddings`` and ``cosine_similarity``.

    Args:
        word: query word.
        topN: maximum number of neighbours to return.

    Returns:
        Dict mapping each neighbour word to its cosine similarity with ``word``,
        ordered from most to least similar. For a word that maps to index 0 a
        message is printed and an empty dict is returned.
    """
    word_id = vocab[word]
    if word_id == 0: # index 0 is treated as the "<unk>" entry that unknown words fall back to
        print("Out of vocabulary word")
        return {} # empty result, so callers can still iterate over it

    word_vec = embeddings[word_id]
    cos_sim = cosine_similarity(word_vec, embeddings) # compute cosine similarity of word_vec with all words in vocabulary

    # Remove the query word by id rather than assuming that it occupies rank 0.
    ranked_ids = np.argsort(-cos_sim)
    topN_ids = ranked_ids[ranked_ids != word_id][:topN]

    return {vocab.lookup_token(int(sim_word_id)): cos_sim[sim_word_id] for sim_word_id in topN_ids}

In [None]:
random.seed(42)

# randrange excludes its upper bound, so every drawn index is a valid vocabulary position
# (randint(0, len(vocab)) can also return len(vocab), which has no token).
seed_words = [vocab.lookup_token(random.randrange(len(vocab))) for _ in range(5)]
print(f'Randomly selected words: {seed_words}')

Randomly selected words: ['trademark', 'rebel', '10', 'texans', 'islamabad']


In [None]:
# For every seed word, print only the neighbour words; the similarity values are discarded.
for seed_word in seed_words:
  print(f'Seed word: {seed_word} \nNearest neighbours: {[word for word, sim in nearest_neighbours(seed_word).items()]} \n')

Seed word: trademark 
Nearest neighbours: ['peers', 'pause', 'confidence', 'flagship', 'netflix', 'portfolio', 'bid-rigging', 'hollinger', 'dollar', 'amazon'] 

Seed word: rebel 
Nearest neighbours: ['rebels', 'insurgent', 'guerrilla', 'judgment', 'insurgents', 'rebel-held', 'militant', 'militia', 'insurgency', 'separatist'] 

Seed word: 10 
Nearest neighbours: ['20', '15', '14', '40', '30', 'eight', '50', 'five', 'seven', 'nine'] 

Seed word: texans 
Nearest neighbours: ['lions', 'astros', 'jaguars', 'pistons', 'bucks', 'bowl', 'yankees', 'lakers', 'tigers', 'offseason'] 

Seed word: islamabad 
Nearest neighbours: ['kabul', 'washington', 'athens', 'tehran', 'moscow', 'rome', 'london', 'kathmandu', 'port-au-prince', 'dhaka'] 



# Part B - Movie Genre Classification using Pre-trained Word Vectors

## Loading packages

In [None]:
import torchtext
import nltk
from nltk.corpus import stopwords 
import numpy as np
import pandas as pd
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Defining genres and movies and removing stopwords from movie titles

In [None]:
# Candidate genre labels; each label is looked up as a single word in the pre-trained vectors.
genres = ['action', 'adventure', 'comedy', 'drama', 'fantasy', 'horror', 'romance', 'thriller']
# Movie titles to be assigned to one of the genres above.
movies = ['The Hangover', 'Shutter Island', 'Fight Club', 'Jumanji', 'Narcos', 'The Matrix', 'Rush Hour', 'The Mummy', 'Iron Man', 'Silence of the Lambs', 'Batman Begins', 'Spider Man', 'The Hobbit', 'Troy', 'Jurassic Park', 'Scary Movie', 'Mission Impossible', 'Ted', 'Eat Pray Love', 'The Notebook', 'Love Actually', 'The Terminal', 'Crazy Stupid Love', 'Twilight', 'The Martian', 'Pursuit of Happyness']

In [None]:
tokenizer = torchtext.data.get_tokenizer("basic_english")

# Build the stop-word set once instead of once per token.
english_stopwords = set(stopwords.words("english"))

movies_filtered = []
for movie in movies:
  filtered_words = [word for word in tokenizer(movie) if word.lower() not in english_stopwords] # remove stopwords
  movies_filtered.append(filtered_words)

print(movies_filtered)

[['hangover'], ['shutter', 'island'], ['fight', 'club'], ['jumanji'], ['narcos'], ['matrix'], ['rush', 'hour'], ['mummy'], ['iron', 'man'], ['silence', 'lambs'], ['batman', 'begins'], ['spider', 'man'], ['hobbit'], ['troy'], ['jurassic', 'park'], ['scary', 'movie'], ['mission', 'impossible'], ['ted'], ['eat', 'pray', 'love'], ['notebook'], ['love', 'actually'], ['terminal'], ['crazy', 'stupid', 'love'], ['twilight'], ['martian'], ['pursuit', 'happyness']]


## Construct the class `PreTrainedEmbeddings` to evaluate pre-trained embeddings

In [None]:
class PreTrainedEmbeddings(object):
  """Assigns genres to movie titles by comparing pre-trained word vectors."""

  def __init__(self, vocab):
    """Keep a reference to ``vocab``; the methods read its ``stoi`` mapping and ``vectors`` table."""
    self.vocab = vocab

  def get_word_vec(self, word):
    """Return the stored vector of ``word``.

    The row index is taken from ``self.vocab.stoi[word]``, so a word missing
    from that mapping fails with whatever the mapping raises (``KeyError``
    for a plain dict).
    """
    return self.vocab.vectors[self.vocab.stoi[word]] # retrieve word vector for given word

  def cosine_similarity(self, movie_title, genre): # plain cosine similarity; the norms are not clamped by an eps
    """Average cosine similarity between the words of a title and a genre word.

    Args:
      movie_title: iterable of the title's tokens.
      genre: genre word to compare against.

    Returns:
      Mean of the cosine similarities between each title word and ``genre``.
    """
    genre_vec = self.get_word_vec(genre).numpy()
    genre_vec_norm = np.sqrt(np.sum(genre_vec**2)) # identical for every title word, so computed once

    cos_sim = []
    for word in movie_title:
      word_vec = self.get_word_vec(word).numpy()
      word_vec_norm = np.sqrt(np.sum(word_vec**2))

      numerator = np.inner(word_vec, genre_vec)
      denominator = word_vec_norm * genre_vec_norm

      cos_sim.append(np.true_divide(numerator, denominator))

    return np.average(cos_sim) # output the average cosine similarity of the words in the movie_title with genre

  def evaluate_embeddings(self, print_cos_sim = False, movie_titles = None, movie_tokens = None, genre_names = None):
    """Pick, for every movie, the genre with the highest average cosine similarity.

    Args:
      print_cos_sim: if True, print the chosen genre and its similarity per movie.
      movie_titles: display titles; defaults to the module-level ``movies``.
      movie_tokens: token list per title, aligned with ``movie_titles``;
        defaults to the module-level ``movies_filtered``.
      genre_names: candidate genre words; defaults to the module-level ``genres``.

    Returns:
      List with one dict per movie holding the keys 'movie', 'genre' and 'cos_sim'.

    Raises:
      ValueError: if ``movie_titles`` and ``movie_tokens`` differ in length.
    """
    movie_titles = movies if movie_titles is None else movie_titles
    movie_tokens = movies_filtered if movie_tokens is None else movie_tokens
    genre_names = genres if genre_names is None else genre_names

    if len(movie_titles) != len(movie_tokens):
      raise ValueError("movie_titles and movie_tokens must have the same length")

    emb_results = []
    for title, title_tokens in zip(movie_titles, movie_tokens):
      cos_sim_movie = [self.cosine_similarity(title_tokens, genre) for genre in genre_names]
      idx_highest_sim = np.argmax(cos_sim_movie) # select index of genre with highest average cosine similarity
      best_genre = genre_names[idx_highest_sim]
      best_sim = cos_sim_movie[idx_highest_sim]
      if print_cos_sim:
        print(f'Movie: {title} \nHighest similarity: {best_genre} (cosine similarity = {best_sim})\n')
      emb_results.append({'movie': title, 'genre': best_genre, 'cos_sim': best_sim})

    return emb_results

## Loading embeddings

### GloVe: 42B

In [None]:
# Load the 300-dimensional GloVe "42B" vectors via torchtext, with data_dir as the cache directory.
glove_42B = torchtext.vocab.GloVe(name='42B', dim=300, cache=data_dir) # options: 300d
# The length of `itos` is reported as the number of loaded words.
print('Loaded {} words'.format(len(glove_42B.itos)))

Loaded 1917494 words


In [None]:
# Pick the best-matching genre per movie with these vectors and keep only the result list.
glove_42B_model = PreTrainedEmbeddings(glove_42B)
glove_42B_results = glove_42B_model.evaluate_embeddings()
del glove_42B, glove_42B_model # free up memory

### GloVe: 840B

In [None]:
# Load the 300-dimensional GloVe "840B" vectors via torchtext, with data_dir as the cache directory.
glove_840B = torchtext.vocab.GloVe(name='840B', dim=300, cache=data_dir) # options: 300d
# The length of `itos` is reported as the number of loaded words.
print('Loaded {} words'.format(len(glove_840B.itos)))

Loaded 2196017 words


In [None]:
# Pick the best-matching genre per movie with these vectors and keep only the result list.
glove_840B_model = PreTrainedEmbeddings(glove_840B)
glove_840B_results = glove_840B_model.evaluate_embeddings()
del glove_840B, glove_840B_model # free up memory

### GloVe: twitter.27B - 50d

In [None]:
# Load the 50-dimensional GloVe "twitter.27B" vectors via torchtext, with data_dir as the cache directory.
glove_twitter_27B_50d = torchtext.vocab.GloVe(name='twitter.27B', dim=50, cache=data_dir) # options: 25d, 50d, 100d, & 200d
# The length of `itos` is reported as the number of loaded words.
print('Loaded {} words'.format(len(glove_twitter_27B_50d.itos)))

Loaded 1193514 words


In [None]:
# Pick the best-matching genre per movie with these vectors and keep only the result list.
glove_twitter_27B_50d_model = PreTrainedEmbeddings(glove_twitter_27B_50d)
glove_twitter_27B_50d_results = glove_twitter_27B_50d_model.evaluate_embeddings()
del glove_twitter_27B_50d, glove_twitter_27B_50d_model # free up memory

### GloVe: twitter.27B - 200d

In [None]:
# Load the 200-dimensional GloVe "twitter.27B" vectors via torchtext, with data_dir as the cache directory.
glove_twitter_27B_200d = torchtext.vocab.GloVe(name='twitter.27B', dim=200, cache=data_dir) # options: 25d, 50d, 100d, & 200d
# The length of `itos` is reported as the number of loaded words.
print('Loaded {} words'.format(len(glove_twitter_27B_200d.itos)))

Loaded 1193514 words


In [None]:
# Pick the best-matching genre per movie with these vectors and keep only the result list.
glove_twitter_27B_200d_model = PreTrainedEmbeddings(glove_twitter_27B_200d)
glove_twitter_27B_200d_results = glove_twitter_27B_200d_model.evaluate_embeddings()
del glove_twitter_27B_200d, glove_twitter_27B_200d_model # free up memory

### GloVe: 6B - 50d

In [None]:
# Load the 50-dimensional GloVe "6B" vectors via torchtext, with data_dir as the cache directory.
glove_6B_50d = torchtext.vocab.GloVe(name='6B', dim=50, cache=data_dir) # options: 50d, 100d, 200d, & 300d
# The length of `itos` is reported as the number of loaded words.
print('Loaded {} words'.format(len(glove_6B_50d.itos)))

Loaded 400000 words


In [None]:
# Pick the best-matching genre per movie with these vectors and keep only the result list.
glove_6B_50d_model = PreTrainedEmbeddings(glove_6B_50d)
glove_6B_50d_results = glove_6B_50d_model.evaluate_embeddings()
del glove_6B_50d, glove_6B_50d_model # free up memory

### GloVe: 6B - 300d

In [None]:
# Load the 300-dimensional GloVe "6B" vectors via torchtext, with data_dir as the cache directory.
glove_6B_300d = torchtext.vocab.GloVe(name='6B', dim=300, cache=data_dir) # options: 50d, 100d, 200d, & 300d
# The length of `itos` is reported as the number of loaded words.
print('Loaded {} words'.format(len(glove_6B_300d.itos)))

Loaded 400000 words


In [None]:
# Pick the best-matching genre per movie with these vectors and keep only the result list.
glove_6B_300d_model = PreTrainedEmbeddings(glove_6B_300d)
glove_6B_300d_results = glove_6B_300d_model.evaluate_embeddings()
del glove_6B_300d, glove_6B_300d_model # free up memory

### FastText

In [None]:
# Load the English FastText vectors via torchtext, with data_dir as the cache directory.
fasttext = torchtext.vocab.FastText(language='en', cache=data_dir)
# The length of `itos` is reported as the number of loaded words.
print('Loaded {} words'.format(len(fasttext.itos)))

Loaded 2519370 words


In [None]:
# Pick the best-matching genre per movie with these vectors and keep only the result list.
fasttext_model = PreTrainedEmbeddings(fasttext)
fasttext_results = fasttext_model.evaluate_embeddings()
del fasttext, fasttext_model # free up memory

## Evaluation of the different pre-trained embeddings

In [None]:
# One entry per embedding set, in the order in which the column groups appear in the table.
labelled_results = {
    'GloVe 42B (300d)': glove_42B_results,
    'GloVe 840B (300d)': glove_840B_results,
    'GloVe twitter.27B (50d)': glove_twitter_27B_50d_results,
    'GloVe twitter.27B (200d)': glove_twitter_27B_200d_results,
    'GloVe 6B (50d)': glove_6B_50d_results,
    'GloVe 6B (300d)': glove_6B_300d_results,
    'FastText (300d)': fasttext_results,
}
results = list(labelled_results.values())

# Two columns per embedding set, keyed by (embedding name, sub-column name).
column_data = {}
for label, model_results in labelled_results.items():
    column_data[(label, 'Genre')] = [entry['genre'] for entry in model_results]
    column_data[(label, 'Cosine sim.')] = [entry['cos_sim'] for entry in model_results]

# Rows are labelled with the movie titles taken from the first result list.
df = pd.DataFrame(column_data, index=[entry['movie'] for entry in results[0]])
df.columns = pd.MultiIndex.from_tuples(list(column_data)) # set column and subcolumn names
df.index.name = ''

df

Unnamed: 0_level_0,GloVe 42B (300d),GloVe 42B (300d),GloVe 840B (300d),GloVe 840B (300d),GloVe twitter.27B (50d),GloVe twitter.27B (50d),GloVe twitter.27B (200d),GloVe twitter.27B (200d),GloVe 6B (50d),GloVe 6B (50d),GloVe 6B (300d),GloVe 6B (300d),FastText (300d),FastText (300d)
Unnamed: 0_level_1,Genre,Cosine sim.,Genre,Cosine sim.,Genre,Cosine sim.,Genre,Cosine sim.,Genre,Cosine sim.,Genre,Cosine sim.,Genre,Cosine sim.
,,,,,,,,,,,,,,
The Hangover,comedy,0.36806,horror,0.33647,horror,0.575117,horror,0.433153,horror,0.473312,horror,0.204566,comedy,0.366995
Shutter Island,adventure,0.300728,adventure,0.214342,adventure,0.596485,adventure,0.342784,action,0.312471,adventure,0.138688,adventure,0.232774
Fight Club,action,0.481213,action,0.38607,action,0.67761,action,0.499022,action,0.59311,action,0.34593,adventure,0.228288
Jumanji,horror,0.161638,thriller,0.11397,adventure,0.473823,horror,0.214121,thriller,0.357306,thriller,0.216618,adventure,0.498415
Narcos,horror,-0.035211,thriller,0.050296,horror,0.052661,thriller,0.128229,horror,0.07239,thriller,0.05745,thriller,0.338854
The Matrix,action,0.362815,action,0.238645,fantasy,0.661613,action,0.363033,fantasy,0.331969,action,0.144498,action,0.200232
Rush Hour,action,0.383655,adventure,0.255677,action,0.635423,action,0.447791,action,0.522562,action,0.276815,drama,0.261956
The Mummy,horror,0.364619,horror,0.290128,adventure,0.430433,adventure,0.29742,horror,0.393163,horror,0.227057,horror,0.44333
Iron Man,action,0.396015,action,0.275752,action,0.578071,action,0.414325,action,0.419585,action,0.212302,adventure,0.21327


Based on the above comparison, all of the pre-trained word embeddings provide reasonable results with suitable classifications for most of the movies.

Subjectively, I would tend to think that the `FastText` embeddings worked best overall, considering the suitable classification of more difficult movies such as *The Hangover* and *Spider Man*. In particular, `FastText` performs well for the genre *comedy* with movies such as *Ted* or *Crazy Stupid Love*.
Also, `GloVe 42B (300d)` yields a very good performance.

On the other hand, `GloVe 840B (300d)`, for example, performs well on genres such as *adventure* and *action*.

Furthermore, when considering the pre-trained embeddings which were evaluated using both a lower and a higher dimensionality of word vectors, both dimensionalities predominantly lead to the same predicted genre. However, it can be noticed that the embeddings with lower dimensionality usually provide a higher cosine similarity for that genre than their higher-dimensional counterparts.

In general, it is important to consider that this task is inherently subjective, as humans would also disagree regarding the specific genre of some movies. In addition, the title alone may not suffice to infer the genre of a film.