In [None]:
!wget -c 'https://www.dropbox.com/s/1agrh5hdnkqd24c/en_wiki.txt?dl=0' -O en_wiki.txt

from google.colab import drive
drive.mount('/content/drive')
! cp -R '/content/drive/My Drive/web' ./

--2020-11-05 17:17:45--  https://www.dropbox.com/s/1agrh5hdnkqd24c/en_wiki.txt?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.1, 2620:100:601f:1::a27d:901
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/1agrh5hdnkqd24c/en_wiki.txt [following]
--2020-11-05 17:17:45--  https://www.dropbox.com/s/raw/1agrh5hdnkqd24c/en_wiki.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uca3d5dd66e4807ef2cf5b6145c5.dl.dropboxusercontent.com/cd/0/inline/BCovwoduYGjzx6DSqJTP-E1e8jBu2AX1M0rPB15J16x4L2KS-hbdPtoy86guu-BO3u6xna0mEtu7p8k-WKy_LVLS5ETqYWWRGafFRj3ESJf6OHLMm--GxIApE873nvWYijk/file# [following]
--2020-11-05 17:17:45--  https://uca3d5dd66e4807ef2cf5b6145c5.dl.dropboxusercontent.com/cd/0/inline/BCovwoduYGjzx6DSqJTP-E1e8jBu2AX1M0rPB15J16x4L2KS-hbdPtoy86guu-BO3u6xna0mEtu7p8k-WKy_LVLS5ETqYWWRGafFRj3ESJf6OHLM

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import defaultdict
import numpy as np
from scipy import sparse
import random

from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_GloVe
from web.evaluate import evaluate_similarity
from six import iteritems

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!




In [None]:
random.seed(4)
text = open('/content/en_wiki.txt').read()
text = text[:len(text)//5]
text = text.replace('\n','')
sent_tokenized_corpus = sent_tokenize(text)
tokenized_corpus = []
for sent in sent_tokenized_corpus:
  words = word_tokenize(sent)
  tokenized_corpus.append(words)

In [None]:
window_size = 4
threshold = 10
dimension = 100
alpha = 0.75
x_max = 100
learning_rate = 0.01
no_of_epochs = 20

In [None]:
# Steps:
# build vocabulary with freq and index
# build co-occurence matrix
# train word vectors

In [None]:
def get_vocabulary(data):
  vocabulary_dict = defaultdict(int)
  for sent in data:
    for word in sent:
      vocabulary_dict[word] += 1

  vocab = {token:(index, frequency) for index, (token, frequency) in enumerate(vocabulary_dict.items())}
  token_index_map = {token:index for token, (index,frequency) in vocab.items()}
  return vocab, token_index_map

testdata = [["My", "name", "is", "Umang."],["I", "am", "21", "years", "old."]]
vocab, token_index_map = get_vocabulary(testdata)
print(vocab)
print(token_index_map)

{'My': (0, 1), 'name': (1, 1), 'is': (2, 1), 'Umang.': (3, 1), 'I': (4, 1), 'am': (5, 1), '21': (6, 1), 'years': (7, 1), 'old.': (8, 1)}
{'My': 0, 'name': 1, 'is': 2, 'Umang.': 3, 'I': 4, 'am': 5, '21': 6, 'years': 7, 'old.': 8}


In [None]:
def build_X(vocab, token_index_map, train_set, window_size, threshold):

  vocab_size = len(vocab)

  X = sparse.lil_matrix((vocab_size, vocab_size))
  id2word = dict((i, word) for word, (i, _) in vocab.items())
  for sent in train_set:
    token_indices = [token_index_map[word] for word in sent]

    for sent_index, token_index in enumerate(token_indices):

      context_indices = token_indices[max(0, sent_index-window_size) : sent_index]
      context_len = len(context_indices)

      for left_sent_index, left_token_index in enumerate(context_indices):
        dist = len(context_indices) - left_sent_index

        X[token_index,left_token_index] += 1
        X[left_token_index,token_index] += 1

  X_tuples = []

  for i, (row, data) in enumerate(zip(X.rows,X.data)):
        if vocab[id2word[i]][1] < threshold:
            continue

        for index, j in enumerate(row):
            if vocab[id2word[i]][1] < threshold:
                continue

            X_tuples.append((i, j, data[index])) 

  return X_tuples

In [None]:
def epoch(vocab, data):

  global_cost = 0
  random.shuffle(data)

  for (word_vector, context_vector, word_bias, context_bias, word_vector_gradsq, 
       context_vector_gradsq, word_bias_gradsq, context_bias_gradsq, x_ij) in data:

    if x_ij < x_max:
      weight = pow(x_ij/x_max, alpha)
    else:
      weight = 1

      inner_cost = word_vector.dot(context_vector) + word_bias[0] + context_bias[0] - np.log(x_ij)

      cost = weight*(pow(inner_cost,2))

      global_cost += 0.5*cost
      word_grad = inner_cost * context_vector
      context_grad = inner_cost * word_vector
      word_bias_grad = inner_cost
      context_bias_grad = inner_cost

      word_vector -= (learning_rate*word_grad)/np.sqrt(word_vector_gradsq)
      context_vector -= (learning_rate*context_grad)/np.sqrt(context_vector_gradsq)
      word_bias -= (learning_rate*word_bias_grad)/np.sqrt(word_bias_gradsq)
      context_bias -= (learning_rate*context_bias_grad)/np.sqrt(context_bias_gradsq)

      word_vector_gradsq += np.square(word_grad)
      context_vector_gradsq += np.square(context_grad)
      word_bias_gradsq += pow(word_bias_grad,2)
      context_bias_gradsq += pow(context_bias_grad,2)

  return global_cost

In [None]:
def train_glove(vocab, X_tuples):

  vocab_size = len(vocab)
  W = (np.random.rand(vocab_size*2, dimension)-0.5)/float(dimension)
  B = (np.random.rand(vocab_size*2)-0.5)/float(dimension)
  gradient_sq_W = np.ones((vocab_size*2, dimension))
  gradient_sq_B = np.ones((vocab_size*2))

  data = []
  for (word_index, context_index, x_ij) in X_tuples:
    data_entry = ( W[word_index], W[context_index+vocab_size], B[word_index:word_index+1], B[context_index+vocab_size: context_index+vocab_size+1],
                  gradient_sq_W[word_index], gradient_sq_W[context_index+vocab_size],
                  gradient_sq_B[word_index: word_index+1], gradient_sq_B[context_index+vocab_size: context_index+vocab_size+1],
                  x_ij)
    data.append(data_entry)

  for i in range(no_of_epochs):
    print("Iteration Number:",i+1)
    cost = epoch(vocab, data)
    print("Cost =", cost)
    print('')

  return W,B

In [None]:
vocab, token_index_map = get_vocabulary(tokenized_corpus)
# print(vocab['The'])
# print(token_index_map['The'])
X_tuples = build_X(vocab, token_index_map, tokenized_corpus, window_size, threshold)

In [None]:
W, B = train_glove(vocab, X_tuples)

Iteration Number: 1
Cost = 241912.99212779765

Iteration Number: 2
Cost = 209048.29277624196

Iteration Number: 3
Cost = 136041.34292156497

Iteration Number: 4
Cost = 76095.61382862608

Iteration Number: 5
Cost = 53454.34053044682

Iteration Number: 6
Cost = 42731.68987090381

Iteration Number: 7
Cost = 36026.42721954118

Iteration Number: 8
Cost = 31214.91260206464

Iteration Number: 9
Cost = 27516.427733289416

Iteration Number: 10
Cost = 24557.443094084025

Iteration Number: 11
Cost = 22132.454892497106

Iteration Number: 12
Cost = 20108.22723551717

Iteration Number: 13
Cost = 18396.182518164038

Iteration Number: 14
Cost = 16931.85261266378

Iteration Number: 15
Cost = 15669.864382774802

Iteration Number: 16
Cost = 14573.9449782668

Iteration Number: 17
Cost = 13617.670373072317

Iteration Number: 18
Cost = 12778.120843805278

Iteration Number: 19
Cost = 12039.382841251712

Iteration Number: 20
Cost = 11385.886221033013



In [None]:
vocab_size = len(vocab)
main_w = W[:vocab_size,:]
context_w = W[vocab_size:, :]
main_b = B[:vocab_size]
context_b = B[vocab_size:]

In [None]:
glove_vector = fetch_GloVe(corpus="wiki-6B", dim=100)


Dataset created in /root/web_data/embeddings

Downloading data from http://nlp.stanford.edu/data/glove.6B.zip ...


100%|██████████| 862M/862M [06:27<00:00, 2.22Mb/s]


...done. (388 seconds, 6 min)
Extracting data from /root/web_data/embeddings/glove.6B/glove.6B.zip...
   ...done.


In [None]:
tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353()
}

In [None]:
vocab_vector = {}
for token, id in token_index_map.items():
  vocab_vector[token] = main_w[id]

task_WS353 = [[],[]]
name = "WS353"
for i in range(len(tasks[name].X)):
  if tasks[name].X[i][0] in vocab_vector and tasks[name].X[i][1] in vocab_vector:
    task_WS353[0].append([tasks[name].X[i][0], tasks[name].X[i][1]])
    task_WS353[1].append(tasks[name].y[i])
task_WS353[0] = np.array(task_WS353[0])
task_WS353[1] = np.array(task_WS353[1])


task_MEN = [[], []]
name = "MEN"

for i in range(len(tasks[name].X)):
  if tasks[name].X[i][0] in vocab_vector and tasks[name].X[i][1] in vocab_vector:
    task_MEN[0].append([tasks[name].X[i][0], tasks[name].X[i][1]])
    task_MEN[1].append(tasks[name].y[i][0])
task_MEN[0] = np.array(task_MEN[0])
task_MEN[1] = np.array(task_MEN[1])

In [None]:
name = "WS353"
print ("Spearman correlation of scores on {}= {} for embeddings obtained using implementation".format(name, evaluate_similarity(vocab_vector, task_WS353[0], task_WS353[1])))
print ("Spearman correlation of scores on {}= {} for pretrained embeddings".format(name, evaluate_similarity(glove_vector, task_WS353[0], task_WS353[1])))

  A = np.vstack(w.get(word, mean_vector) for word in X[:, 0])
  B = np.vstack(w.get(word, mean_vector) for word in X[:, 1])
Missing 21 words. Will replace them with mean vector


Spearman correlation of scores on WS353= 0.022085059258130542 for embeddings obtained using implementation
Spearman correlation of scores on WS353= 0.5316690635677356 for pretrained embeddings


In [None]:
name = "MEN"
print ("Spearman correlation of scores on {}= {} for embeddings obtained using implementation".format(name, evaluate_similarity(vocab_vector, task_MEN[0], task_MEN[1])))
print ("Spearman correlation of scores on {}= {} for pretrained embeddings".format(name, evaluate_similarity(glove_vector, task_MEN[0], task_MEN[1])))

Spearman correlation of scores on MEN= 0.02950118267341735 for embeddings obtained using implementation
Spearman correlation of scores on MEN= 0.6965263061520983 for pretrained embeddings


  A = np.vstack(w.get(word, mean_vector) for word in X[:, 0])
  B = np.vstack(w.get(word, mean_vector) for word in X[:, 1])
