In [10]:
import torchtext.vocab as vocab
import torch
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/abhiojha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abhiojha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abhiojha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/abhiojha/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### TODO:
1. Download pretrained models - [Fasttext](https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip) and [Glove](https://nlp.stanford.edu/data/glove.6B.zip)
2. Unzip and place the downloaded files (*only* `glove.6B.300d.text` and `wiki-news-300d-1M.vec`) in the `data` directory at the root of this repo.

In [15]:
sample_text = "The quick brown fox jumps over the lazy dog."
tokens = sample_text.lower().split()

### GloVE

In [7]:
import numpy as np

def preprocess_text(text: str) -> list:
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = ''.join(c for c in text if c not in '.,;:-')
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return tokens

def load_glove_model(file) -> dict:
    # init an empty dict to store "word" as key and its "embedding" as value.
    glove_model = {}
    with open(file,'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    return glove_model

embedding_dict = load_glove_model("../data/glove.6B.300d.txt")

# Let's check embeddings of a word
hello_embedding = embedding_dict['hello']
print(hello_embedding)
# Let's print the embedding vector dimension
# This should be 300 as we are using the pretrained model, which generates 300 dim embedding vector
print(hello_embedding.shape)

[-3.3712e-01 -2.1691e-01 -6.6365e-03 -4.1625e-01 -1.2555e+00 -2.8466e-02
 -7.2195e-01 -5.2887e-01  7.2085e-03  3.1997e-01  2.9425e-02 -1.3236e-02
  4.3511e-01  2.5716e-01  3.8995e-01 -1.1968e-01  1.5035e-01  4.4762e-01
  2.8407e-01  4.9339e-01  6.2826e-01  2.2888e-01 -4.0385e-01  2.7364e-02
  7.3679e-03  1.3995e-01  2.3346e-01  6.8122e-02  4.8422e-01 -1.9578e-02
 -5.4751e-01 -5.4983e-01 -3.4091e-02  8.0017e-03 -4.3065e-01 -1.8969e-02
 -8.5670e-02 -8.1123e-01 -2.1080e-01  3.7784e-01 -3.5046e-01  1.3684e-01
 -5.5661e-01  1.6835e-01 -2.2952e-01 -1.6184e-01  6.7345e-01 -4.6597e-01
 -3.1834e-02 -2.6037e-01 -1.7797e-01  1.9436e-02  1.0727e-01  6.6534e-01
 -3.4836e-01  4.7833e-02  1.6440e-01  1.4088e-01  1.9204e-01 -3.5009e-01
  2.6236e-01  1.7626e-01 -3.1367e-01  1.1709e-01  2.0378e-01  6.1775e-01
  4.9075e-01 -7.5210e-02 -1.1815e-01  1.8685e-01  4.0679e-01  2.8319e-01
 -1.6290e-01  3.8388e-02  4.3794e-01  8.8224e-02  5.9046e-01 -5.3515e-02
  3.8819e-02  1.8202e-01 -2.7599e-01  3.9474e-01 -2

In [14]:
# Now let's create the embedding matrix for sample_text
sample_tokens = preprocess_text(sample_text)
sample_embedding_matrix = []

for sample_token in sample_tokens:
    sample_embedding_matrix.append(embedding_dict[sample_token])

# we should have as many embedding vectors (rows of embedding matrix) as there are sample tokens
assert len(sample_embedding_matrix) == len(sample_tokens)

# lets print a token and its embedding
print(sample_tokens[2])
print(sample_embedding_matrix[2])

brown
[ 0.2793     0.18372   -0.11257    0.21734   -0.21657   -0.50335
 -0.27194    0.32181    0.031892  -0.37998    0.15544   -0.32953
 -0.19827    0.20403    0.26768    0.292     -0.34187   -0.10766
 -0.43697   -0.14488    0.14634    0.21591    0.12576    0.14895
 -0.21763    0.030797   0.10949   -0.41689   -0.30296   -0.14592
 -0.56228    0.33282   -0.20436   -0.24403   -1.4732     0.68345
  0.45336    0.43671   -0.15641    0.15075   -0.24265   -0.040059
  0.22323    0.19523    0.37445   -0.18509   -0.10302   -0.055363
 -0.17274   -0.45401   -0.14729   -0.24133   -0.043826  -0.23243
  0.42367    0.15906   -0.14039   -0.36185   -0.26695   -0.42724
 -0.08843   -0.099597   0.24257   -0.05424    0.10746   -1.1304
  0.024651  -0.10212    0.046319  -0.68792    0.4214    -0.25844
  0.17052    0.097878   0.026835   0.32044    0.0062988  0.24575
  0.20126   -0.16771    0.19825    0.28939   -0.064994  -0.38766
  0.52509    0.38195    0.32421    0.20683   -0.48472   -0.080334
 -0.15345    0.35

### FastText

In [16]:
def load_fasttext_model(file) -> dict:
    # init an empty dict to store "word" as key and its "embedding" as value.
    fasttext_model = {}
    with open(file,'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            fasttext_model[word] = embedding
    return fasttext_model

fasttext_embeddings_dict = load_fasttext_model("../data/wiki-news-300d-1M.vec")

In [18]:
# Now let's create the embedding matrix for sample_text
sample_tokens = preprocess_text(sample_text)
sample_embedding_matrix = []

for sample_token in sample_tokens:
    sample_embedding_matrix.append(fasttext_embeddings_dict[sample_token])

# we should have as many embedding vectors (rows of embedding matrix) as there are sample tokens
assert len(sample_embedding_matrix) == len(sample_tokens)

# lets print a token and its embedding
print(sample_tokens[5])
print(sample_embedding_matrix[5])

over
[-0.0083  0.0266  0.0112  0.0168 -0.0291 -0.034   0.0204 -0.074  -0.1905
 -0.0266  0.0245 -0.0254  0.0589  0.0443 -0.0458 -0.0519  0.0587  0.1551
 -0.1766 -0.0956 -0.0185 -0.0804 -0.0007 -0.1643  0.0288 -0.0548  0.0263
  0.0897  0.0194 -0.1682 -0.0025  0.0461 -0.0981 -0.0895  0.0057  0.0592
 -0.0685 -0.0009  0.0894 -0.0738  0.008   0.0338 -0.114   0.0358 -0.0308
  0.0174  0.0246 -0.023   0.0236  0.0314 -0.0347 -0.2517 -0.6334  0.0026
  0.0807  0.0234 -0.0055  0.0734 -0.0869 -0.0892 -0.0552 -0.0022  0.0213
  0.0716 -0.0678 -0.0334  0.0464 -0.077  -0.0407 -0.0212  0.0162  0.0318
 -0.0227 -0.1448 -0.0689 -0.0721  0.0393  0.0293 -0.0636 -0.0816  0.0161
  0.0443  0.0326 -0.2373  0.0244  0.0431 -0.0376  0.0537  0.041   0.0221
  0.0108 -0.0856 -0.0308 -0.1633 -0.1061  0.0288 -0.0396 -0.0036  0.0098
 -0.1006 -0.1569 -0.0825  0.0698  0.0929 -0.3181 -0.0673  0.3534  0.033
  0.0156  0.0464  0.053   0.1033 -0.1109  0.0439 -0.0069 -0.0607  0.0438
  0.0525 -0.0186 -0.1776 -0.0716 -0.0673  0.077