In [1]:
from collections import Counter
import random
import numpy as np
import utils

# Read the whole extracted corpus file into memory as one string.
with open('../data/text8') as f:
    text = f.read()

# Turn the raw text into a list of word tokens
# (presumably cleaned/filtered by utils.preprocess -- verify against utils).
words = utils.preprocess(text)
print(words[:30])

# Report corpus size and vocabulary size (a set keeps one copy of each word).
n_total_words = len(words)
n_unique_words = len(set(words))
print("Total words in text: {}".format(n_total_words))
print("Unique words: {}".format(n_unique_words))

# Build the word <-> integer id lookup tables, then encode the corpus as ids.
vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)
int_words = [vocab_to_int[token] for token in words]

print(int_words[:30])

# Subsampling threshold and the number of occurrences of every word id.
threshold = 1e-5
word_counts = Counter(int_words)

# discard some frequent words, according to the subsampling equation
# create a new list of words for training
def discard_prob(threshold, frac):
    ''' Probability of discarding one occurrence of a word when subsampling.

        Computes 1 - sqrt(threshold / frac), where `frac` is the word's
        relative frequency in the corpus. The result is negative when
        frac < threshold, so such words are never discarded by a
        `random.random() > discard_prob(...)` test.
    '''
    keep_term = np.sqrt(threshold / frac)
    return 1 - keep_term

# The discard probability depends only on the word id, so compute it once per
# unique word instead of once per corpus token (the corpus is far larger than
# the vocabulary). One random draw is still made per token, in corpus order,
# so the resulting sample is the same as the per-token computation would give.
total_word_count = len(int_words)
drop_probs = {
    word: discard_prob(threshold, count/total_word_count)
    for word, count in word_counts.items()
}

# Keep a token only when its random draw exceeds the word's discard probability.
train_words = [word for word in int_words if random.random() > drop_probs[word]]
print(train_words[:30])

# Words far from the current word are usually less related to it than nearby words,
# so we give distant words less weight by sampling them less often as training targets.
# With a maximum window size 𝐶 = 5, for each training word we draw a random integer 𝑅
# from the inclusive range [1, 𝐶] and use the 𝑅 words before and the 𝑅 words after
# the current word as its correct labels.

def get_target(words, idx, window_size=5):
    ''' Get a list of words in a window around an index.

        A radius R is drawn uniformly from the integers 1..window_size, and the
        (up to) R words before and R words after position `idx` are returned.
        The window is clipped at the start and end of `words`, and the word at
        `idx` itself is never included.

        words: sequence of words (or word ids) supporting slicing
        idx: position of the centre word in `words`
        window_size: largest radius that may be drawn; must be >= 1

        Returns a list: the preceding words in order, then the following words.
        Raises ValueError if window_size is smaller than 1.
    '''
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    # randint is inclusive on both ends, so R is never 0 and honours window_size
    R = random.randint(1, window_size)

    first = max(idx - R, 0)               # clip at the start of the sequence
    last = min(idx + R, len(words) - 1)   # clip at the end of the sequence

    return list(words[first:idx]) + list(words[idx + 1:last + 1])

def get_batches(words, batch_size, window_size=5):
    ''' Generate (inputs, targets) training pairs, one batch of words at a time.

        `words` is cut down to a whole number of batches of `batch_size` words.
        For every word in a batch, its context words are obtained from
        get_target (restricted to that batch) and the word is repeated once per
        context word, so the two yielded lists always have equal length.
    '''

    # drop the trailing words that do not fill a complete batch
    full_batches = len(words)//batch_size
    words = words[:full_batches*batch_size]

    for start in range(0, len(words), batch_size):
        batch = words[start:start+batch_size]
        inputs, targets = [], []
        for position, center_word in enumerate(batch):
            context = get_target(batch, position, window_size)
            targets.extend(context)
            # pair the centre word with each of its context words
            inputs.extend([center_word]*len(context))
        yield inputs, targets

def cosine_similarity(embedding, valid_size=16, valid_window=100, device='cpu'):
    """ Returns the cosine similarity of validation words with words in the embedding matrix.
        Here, embedding should be a PyTorch embedding module.

        embedding: module exposing `.weight` (n_vocab x n_embed) that can be
            called with a LongTensor of word ids
        valid_size: number of validation words; half are drawn from ids
            [0, valid_window) and half from [1000, 1000 + valid_window)
            (an odd valid_size is rounded down to an even count)
        valid_window: width of each id range the validation words are drawn from
        device: device on which the validation id tensor is created

        Returns (valid_examples, similarities): the sampled word ids, and a
        (len(valid_examples) x n_vocab) tensor whose entry [i, j] is the cosine
        similarity between validation word i and vocabulary word j.
    """

    # sim = (a . b) / |a||b|

    embed_vectors = embedding.weight

    # magnitude of every embedding vector, |b|, as a row: (1, n_vocab)
    magnitudes = embed_vectors.pow(2).sum(dim=1).sqrt().unsqueeze(0)

    # pick N words from our ranges (0,window) and (1000,1000+window). lower id implies more frequent
    frequent_ids = random.sample(range(valid_window), valid_size//2)
    rarer_ids = random.sample(range(1000, 1000+valid_window), valid_size//2)
    valid_examples = torch.tensor(frequent_ids + rarer_ids, dtype=torch.long, device=device)

    valid_vectors = embedding(valid_examples)

    # magnitude of each validation vector, |a|, as a column: (n_valid, 1).
    # Dividing by |a| as well as |b| makes the values true cosines in [-1, 1];
    # it rescales each row by a positive constant, so per-row rankings
    # (e.g. from topk) are unaffected.
    valid_magnitudes = valid_vectors.pow(2).sum(dim=1).sqrt().unsqueeze(1)

    similarities = torch.mm(valid_vectors, embed_vectors.t())/(valid_magnitudes*magnitudes)

    return valid_examples, similarities


['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst']
Total words in text: 16680599
Unique words: 63641
[5233, 3080, 11, 5, 194, 1, 3133, 45, 58, 155, 127, 741, 476, 10571, 133, 0, 27349, 1, 0, 102, 854, 2, 0, 15067, 58112, 1, 0, 150, 854, 3580]
[5233, 10571, 27349, 854, 15067, 58112, 10712, 2731, 371, 539, 2757, 7088, 247, 5233, 44611, 792, 5233, 602, 8983, 4147, 4186, 153, 5233, 447, 1818, 4860, 6753, 7573, 1774, 566]


In [2]:
import torch
from torch import nn
import torch.optim as optim

# Now define this SkipGram model
class SkipGram(nn.Module):
    """ Skip-gram model with a full softmax output.

        An input word id is embedded, projected by a linear layer to one score
        per vocabulary word, and normalised with log-softmax over dimension 1.
    """

    def __init__(self, n_vocab, n_embed):
        """ n_vocab: number of words in the vocabulary
            n_embed: size of each embedding vector
        """
        super().__init__()

        self.n_vocabulary, self.n_embedding = n_vocab, n_embed

        # word id -> embedding vector
        self.in_embed = nn.Embedding(n_vocab, n_embed)
        # embedding vector -> one score per vocabulary word
        self.fc = nn.Linear(n_embed, n_vocab)
        self.LogSoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """ Return log-probabilities over the vocabulary for a batch of word ids. """
        scores = self.fc(self.in_embed(x))
        return self.LogSoftmax(scores)

# Define the skipgram model with negative sampling
class SkipGramNeg(nn.Module):
    """ Skip-gram model for training with negative sampling.

        Holds two embedding tables of shape (n_vocab, n_embed): `in_embed` for
        input (centre) words, and `out_embed` for output (context) words and
        for sampled noise words.
    """

    def __init__(self, n_vocab, n_embed, noise_dist=None):
        """ n_vocab: number of words in the vocabulary
            n_embed: size of each embedding vector
            noise_dist: optional 1-D tensor of sampling weights, one per word
                id, used to draw noise words; None means uniform sampling
        """
        super().__init__()

        self.n_vocab = n_vocab
        self.n_embed = n_embed
        self.noise_dist = noise_dist

        # define embedding layers for input and output words
        self.in_embed = nn.Embedding(n_vocab, n_embed)
        self.out_embed = nn.Embedding(n_vocab, n_embed)

        # Initialize both embedding tables with uniform distribution
        self.in_embed.weight.data.uniform_(-1, 1)
        self.out_embed.weight.data.uniform_(-1, 1)

    def forward_input(self, input_words):
        """ Return the input-table embeddings for a tensor of word ids. """
        return self.in_embed(input_words)

    def forward_output(self, output_words):
        """ Return the output-table embeddings for a tensor of word ids. """
        return self.out_embed(output_words)

    def forward_noise(self, batch_size, n_samples):
        """ Generate noise vectors with shape (batch_size, n_samples, n_embed) """
        if self.noise_dist is None:
            # Sample words uniformly
            noise_dist = torch.ones(self.n_vocab)
        else:
            noise_dist = self.noise_dist

        # Sample words from our noise distribution
        noise_words = torch.multinomial(noise_dist,
                                        batch_size * n_samples,
                                        replacement=True)

        # Move the sampled ids to the exact device holding the embedding table.
        # Using the weight's own device (rather than the bare string "cuda")
        # also works when the model lives on a non-default GPU.
        noise_words = noise_words.to(self.out_embed.weight.device)

        # look up the noise embeddings in the output table
        noise_vectors = self.out_embed(noise_words)

        # reshape the embeddings so that they have dims (batch_size, n_samples, n_embed)
        noise_vectors = noise_vectors.view(batch_size, n_samples, self.n_embed)

        return noise_vectors

# Define the loss function for skipgram with negative sampling
class NegativeSamplingLoss(nn.Module):
    """ Negative-sampling loss for skip-gram training.

        For each example the loss is
            -( log sigmoid(out . in) + sum_k log sigmoid(-noise_k . in) )
        and the value returned is the mean over the batch.
    """

    def __init__(self):
        super().__init__()

    def forward(self, input_vectors, output_vectors, noise_vectors):
        """ input_vectors: (batch_size, embed_size) embeddings of the input words
            output_vectors: (batch_size, embed_size) embeddings of the true context words
            noise_vectors: (batch_size, n_samples, embed_size) embeddings of sampled noise words

            Returns a scalar tensor: the average loss over the batch.
        """

        batch_size, embed_size = input_vectors.shape

        # Input vectors should be a batch of column vectors
        input_vectors = input_vectors.view(batch_size, embed_size, 1)

        # Output vectors should be a batch of row vectors
        output_vectors = output_vectors.view(batch_size, 1, embed_size)

        # bmm = batch matrix multiplication
        # correct log-sigmoid loss: (batch, 1, 1) -> (batch,)
        # logsigmoid is used instead of sigmoid().log(), which underflows to
        # log(0) = -inf when a score is a large negative number.
        out_loss = nn.functional.logsigmoid(torch.bmm(output_vectors, input_vectors))
        out_loss = out_loss.view(batch_size)

        # incorrect log-sigmoid loss: (batch, n_samples, 1) -> (batch, n_samples)
        noise_loss = nn.functional.logsigmoid(torch.bmm(noise_vectors.neg(), input_vectors))
        # Drop only the trailing singleton dimension; a bare squeeze() would also
        # collapse the batch or sample dimension when either has size 1.
        noise_loss = noise_loss.squeeze(2).sum(1)  # sum the losses over the sample of noise vectors

        # negate and sum correct and noisy log-sigmoid losses
        # return average batch loss
        return -(out_loss + noise_loss).mean()

In [8]:
## Now train the model

# check if GPU is available
# Train on the GPU when PyTorch can see one, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# True  -> skip-gram with negative sampling (SkipGramNeg + NegativeSamplingLoss)
# False -> skip-gram with a full softmax output (SkipGram + NLLLoss)
NEG = True

# Hyperparameters shared by both variants
embedding_dim = 300 # you can change, if you want
epochs = 5
batch_size = 512

if NEG:
    # Noise distribution: unigram frequencies raised to the 3/4 power, renormalised.
    # The counts are sorted in descending order, so entry i is used as the weight
    # of word id i -- this assumes ids were assigned by descending frequency
    # (TODO confirm against utils.create_lookup_tables).
    freqs = Counter(int_words)
    word_freqs = np.array(sorted(freqs.values(), reverse=True))
    unigram_dist = word_freqs/word_freqs.sum()
    smoothed_dist = unigram_dist**(0.75)
    noise_dist = torch.from_numpy(smoothed_dist/np.sum(smoothed_dist))

    print_every = 1500
    model = SkipGramNeg(len(vocab_to_int), embedding_dim, noise_dist=noise_dist).to(device)
    criterion = NegativeSamplingLoss()
else:
    print_every = 500
    model = SkipGram(len(vocab_to_int), embedding_dim).to(device)
    criterion = nn.NLLLoss()

# Both variants are optimised with Adam at the same learning rate.
optimizer = optim.Adam(model.parameters(), lr=0.003)

# Training loop: `steps` counts processed batches across all epochs.
steps = 0
for e in range(epochs):

    # each batch is a pair of equal-length lists: centre words and their context words
    for inputs, targets in get_batches(train_words, batch_size):
        steps += 1
        inputs = torch.LongTensor(inputs).to(device)
        targets = torch.LongTensor(targets).to(device)

        if NEG:
            # embeddings of the centre words, the true context words, and 5 noise words each
            input_vectors = model.forward_input(inputs) # batch_size x n_embedding
            output_vectors = model.forward_output(targets) # batch_size x n_embedding
            noise_vectors = model.forward_noise(inputs.shape[0], 5) # batch_size x n_samples x n_embedding
            # negative sampling loss
            loss = criterion(input_vectors, output_vectors, noise_vectors)
        else:
            # full-softmax variant: log-probabilities scored against the context ids
            log_ps = model(inputs)
            loss = criterion(log_ps, targets)

        # standard optimisation step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # every `print_every` batches, show the nearest neighbours of some validation words
        if steps % print_every == 0:
            valid_examples, valid_similarities = cosine_similarity(model.in_embed, device=device)
            _, closest_idxs = valid_similarities.topk(6) # topk highest similarities

            valid_examples = valid_examples.to('cpu')
            closest_idxs = closest_idxs.to('cpu')
            for valid_idx, neighbours in zip(valid_examples, closest_idxs):
                # skip the first (top-ranked) neighbour -- presumably the word itself
                closest_words = [int_to_vocab[neighbour.item()] for neighbour in neighbours[1:]]
                print(int_to_vocab[valid_idx.item()] + " | " + ', '.join(closest_words))
            print("...")

three | five, of, reputation, anselm, jefferson
would | vols, greatly, groundbreaking, participating, conclude
at | the, of, energy, on, dynasty
a | of, the, and, in, identified
th | budget, does, warmer, asserts, mercury
he | buna, confronted, its, tobago, representative
are | temple, monck, musical, which, the
these | ghz, mandan, ross, flat, classification
shows | neal, pigeon, belarusians, shaded, civilians
operating | powerbook, referee, humor, hugging, checkpoint
orthodox | handles, subsidiary, scientists, lost, winfrey
applications | ot, benjamin, amman, stabilized, lammas
freedom | buffy, xs, pessoa, reciprocated, jointed
woman | michael, diphtheria, stressing, technically, divisional
quite | showing, protestant, spending, gigs, pnc
something | inker, whosoever, mature, lying, mediawiki
...
into | guthrie, two, seven, the, exercise
not | the, a, to, and, in
their | of, to, five, the, and
who | zero, with, their, production, of
often | two, beth, an, s, especially
been | this, n

In [10]:
# save the model
import os

if NEG:
    model_name_f = 'skipgramNEG_embeddim300_anarchism_wiki.pth'
    model_name_w = 'skipgramNEG_embeddim300_anarchism_wiki_weights.pth'
else:
    model_name_f = 'skipgram_embeddim300_anarchism_wiki.pth'
    model_name_w = 'skipgram_embeddim300_anarchism_wiki_weights.pth'

# Make sure the output directory exists first: torch.save does not create it,
# and failing here would lose the model that was just trained.
os.makedirs('models', exist_ok=True)

# weights only (state_dict)
torch.save(model.state_dict(), 'models/' + model_name_w)
# the whole pickled module; NOTE(review): loading this file later needs the
# model class definition to be available -- prefer the state_dict for sharing
torch.save(model, 'models/' + model_name_f)

In [None]:
# Visualizing the word vectors
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Pull the trained input-embedding matrix out of the model as a NumPy array on the CPU.
embeddings = model.in_embed.weight.to('cpu').data.numpy()
n_rows, n_cols = embeddings.shape[0], embeddings.shape[1]
print(f'Size of embedding {n_rows} x {n_cols}')

# Project the embeddings of the first `viz_words` word ids with t-SNE
# (default TSNE settings).
viz_words = 380
tsne = TSNE()
embed_tsne = tsne.fit_transform(embeddings[:viz_words, :])

# One point per word, labelled with the word itself.
fig, ax = plt.subplots(figsize=(16, 16))
for idx in range(viz_words):
    x_coord, y_coord = embed_tsne[idx, :]
    ax.scatter(x_coord, y_coord, color='steelblue')
    ax.annotate(int_to_vocab[idx], (x_coord, y_coord), alpha=0.7)