In [31]:
import re
import nltk
nltk.download('brown')
from nltk.corpus import brown
import itertools

corpus = []

for cat in ['news']:
    for text_id in brown.fileids(cat):
        raw_text = list(itertools.chain.from_iterable(brown.sents(text_id)))
        text = ' '.join(raw_text)
        text = text.lower()
        text.replace('\n', ' ')
        text = re.sub('[^a-z ]+', '', text)
        corpus.append([w for w in text.split() if w != ''])

[nltk_data] Downloading package brown to
[nltk_data]     /home/csgrad/souvikda/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [32]:
len(corpus)

44

In [28]:
corpus[0]

['fulton',
 'county',
 'grand',
 'jury',
 'said',
 'friday',
 'an',
 'investigation',
 'atlantas',
 'recent',
 'primary',
 'election',
 'produced',
 'no',
 'evidence',
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 'jury',
 'further',
 'said',
 'termend',
 'presentments',
 'city',
 'executive',
 'committee',
 'which',
 'had',
 'overall',
 'charge',
 'election',
 'deserves',
 'praise',
 'and',
 'thanks',
 'city',
 'atlanta',
 'manner',
 'which',
 'election',
 'conducted',
 'septemberoctober',
 'term',
 'jury',
 'had',
 'been',
 'charged',
 'by',
 'fulton',
 'superior',
 'court',
 'judge',
 'durwood',
 'pye',
 'investigate',
 'reports',
 'possible',
 'irregularities',
 'hardfought',
 'primary',
 'which',
 'won',
 'by',
 'mayornominate',
 'ivan',
 'allen',
 'jr',
 'only',
 'relative',
 'handful',
 'such',
 'reports',
 'received',
 'jury',
 'said',
 'considering',
 'widespread',
 'interest',
 'election',
 'number',
 'of',
 'voters',
 'size',
 'this',
 'city',
 'jury',
 'said',
 'i

![image.png](attachment:image.png)

In [33]:
from collections import Counter
import random, math

def subsample_frequent_words(corpus):
    filtered_corpus = []
    word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
    sum_word_counts = sum(list(word_counts.values()))
    word_counts = {word: word_counts[word]/float(sum_word_counts) for word in word_counts}
    for text in corpus:
        filtered_corpus.append([])
        for word in text:
            if random.random() < (1+math.sqrt(word_counts[word] * 1e3)) * 1e-3 / float(word_counts[word]):
                filtered_corpus[-1].append(word)
    return filtered_corpus

In [3]:
corpus = subsample_frequent_words(corpus)
vocabulary = set(itertools.chain.from_iterable(corpus))

word_to_index = {w: idx for (idx, w) in enumerate(vocabulary)}
index_to_word = {idx: w for (idx, w) in enumerate(vocabulary)}

![image.png](attachment:image.png)

In [4]:
import numpy as np

context_tuple_list = []
w = 4

for text in corpus:
    for i, word in enumerate(text):
        first_context_word_index = max(0,i-w)
        last_context_word_index = min(i+w, len(text))
        for j in range(first_context_word_index, last_context_word_index):
            if i!=j:
                context_tuple_list.append((word, text[j]))

In [30]:
context_tuple_list[0:10]

[('fulton', 'county'),
 ('fulton', 'grand'),
 ('fulton', 'jury'),
 ('county', 'fulton'),
 ('county', 'grand'),
 ('county', 'jury'),
 ('county', 'said'),
 ('grand', 'fulton'),
 ('grand', 'county'),
 ('grand', 'jury')]

In [35]:
print("There are {} pairs of target and context words".format(len(context_tuple_list)))

There are 473861 pairs of target and context words


![image.png](attachment:image.png)

In [6]:
import torch
import torch.nn as  nn
import torch.autograd  as autograd
import torch.optim as optim
import torch.nn.functional as F


class Word2Vec(nn.Module):

    def __init__(self, embedding_size, vocab_size):
        super(Word2Vec, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, vocab_size)
        
    def forward(self, context_word):
        emb = self.embeddings(context_word)
        hidden = self.linear(emb)
        out = F.log_softmax(hidden)
        return out

![image.png](attachment:image.png)

In [7]:
class EarlyStopping():
    def __init__(self, patience=5, min_percent_gain=0.15):
        self.patience = patience
        self.loss_list = []
        self.min_percent_gain = min_percent_gain / 100.
        
    def update_loss(self, loss):
        self.loss_list.append(loss)
        if len(self.loss_list) > self.patience:
            del self.loss_list[0]
    
    def stop_training(self):
        if len(self.loss_list) == 1:
            return False
        gain = (max(self.loss_list) - min(self.loss_list)) / max(self.loss_list)
        print("Loss gain: {}%".format(round(100*gain,2)))
        if gain < self.min_percent_gain:
            return True
        else:
            return False

![image.png](attachment:image.png)

In [8]:
from tqdm import tqdm

In [13]:
vocabulary_size = len(vocabulary)

net = Word2Vec(embedding_size=2, vocab_size=vocabulary_size)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters())
early_stopping = EarlyStopping()
context_tensor_list = []

for target, context in tqdm(context_tuple_list):
    target_tensor = autograd.Variable(torch.LongTensor([word_to_index[target]]))
    context_tensor = autograd.Variable(torch.LongTensor([word_to_index[context]]))
    context_tensor_list.append((target_tensor, context_tensor))
    
while True:
    losses = []
    for target_tensor, context_tensor in tqdm(context_tensor_list[:10000]):
        net.zero_grad()
        log_probs = net(context_tensor)
        loss = loss_function(log_probs, target_tensor)
        loss.backward()
        optimizer.step()
        losses.append(loss.data)
    print("Loss: ", np.mean(losses))
    early_stopping.update_loss(np.mean(losses))
    if early_stopping.stop_training():
        break

100%|██████████| 473861/473861 [00:05<00:00, 90201.74it/s] 
  out = F.log_softmax(hidden)
100%|██████████| 10000/10000 [00:08<00:00, 1134.60it/s]


Loss:  9.002841


100%|██████████| 10000/10000 [00:09<00:00, 1084.93it/s]


Loss:  6.9405665
Loss gain: 22.91%


100%|██████████| 10000/10000 [00:08<00:00, 1139.90it/s]


Loss:  6.711876
Loss gain: 25.45%


100%|██████████| 10000/10000 [00:09<00:00, 1083.79it/s]


Loss:  6.660246
Loss gain: 26.02%


100%|██████████| 10000/10000 [00:10<00:00, 976.70it/s]


Loss:  6.650625
Loss gain: 26.13%


100%|██████████| 10000/10000 [00:08<00:00, 1129.75it/s]


Loss:  6.6478686
Loss gain: 4.22%


100%|██████████| 10000/10000 [00:08<00:00, 1144.67it/s]


Loss:  6.645508
Loss gain: 0.99%


100%|██████████| 10000/10000 [00:08<00:00, 1134.10it/s]


Loss:  6.6430955
Loss gain: 0.26%


100%|██████████| 10000/10000 [00:08<00:00, 1125.71it/s]

Loss:  6.6407166
Loss gain: 0.15%





In [14]:
import numpy as np

def get_closest_word(word, topn=5):
    word_distance = []
    emb = net.embeddings
    pdist = nn.PairwiseDistance()
    i = word_to_index[word]
    lookup_tensor_i = torch.tensor([i], dtype=torch.long)
    v_i = emb(lookup_tensor_i)
    for j in range(len(vocabulary)):
        if j != i:
            lookup_tensor_j = torch.tensor([j], dtype=torch.long)
            v_j = emb(lookup_tensor_j)
            word_distance.append((index_to_word[j], float(pdist(v_i, v_j))))
    word_distance.sort(key=lambda x: x[1])
    return word_distance[:topn]

In [27]:
get_closest_word("food")

[('skips', 0.017352640628814697),
 ('resolve', 0.021886317059397697),
 ('surprise', 0.026888087391853333),
 ('blasting', 0.02943773940205574),
 ('aunts', 0.03072146512567997)]

#HW

1. How to train faster?
2. Better technique than softmax?