<a href="https://colab.research.google.com/github/ttolofari/hello-world/blob/master/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F
import utils

In [0]:
# Download the text8 archive (-c resumes a partial download, -q suppresses wget's output).
!wget -cq "https://s3.amazonaws.com/video.udacity-data.com/topher/2018/October/5bbe6499_text8/text8.zip"

In [0]:
!unzip text8.zip

Archive:  text8.zip
replace text8? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [0]:
# Loading the data: read the whole extracted corpus into one string.
# The encoding is passed explicitly so the read does not depend on the platform's default locale.

with open('text8', encoding='utf-8') as f:
    text = f.read()

In [0]:
# Peek at the first 100 characters to confirm the file was read.
print(text[:100])

In [0]:
# Preprocessing the data using the project's `utils.preprocess` helper
# (presumably tokenizes/cleans the raw text into a list of words -- confirm in utils.py)

words = utils.preprocess(text)
print(words[:30])  # preview the first 30 entries

In [0]:
# Print some stats about this word data: total number of tokens and vocabulary size
print("Total words in text: {}".format(len(words)))
print("Unique words: {}".format(len(set(words)))) # `set` removes any duplicate words

In [0]:
# Create dictionaries to convert words to integers and back again (integers to words).
# Both mappings come from the project's `utils.create_lookup_tables` helper.
# NOTE(review): the validation sampling in `cosine_similarity` assumes lower ids are
# more frequent words -- confirm that this helper assigns ids that way.

vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)

# Encode the whole corpus as a list of integer word ids
int_words = [vocab_to_int[word] for word in words]

print(int_words[:30])

In [0]:
# Subsampling

# Discard occurrences of very frequent words: each occurrence of a word w is dropped with
# probability P(w) = 1 - sqrt(threshold / f(w)), where f(w) is w's relative frequency in the corpus.



from collections import Counter
import random

threshold = 1e-5  # the `threshold` term of the subsampling formula
word_counts = Counter(int_words)  # word id -> number of occurrences


print(list(word_counts.items())[0]) # Show a single (word id, count) pair as a sanity check




In [0]:
total_count = len(int_words)  # number of tokens in the corpus


# Relative frequency f(w) of every word id
freqs = {word: count/total_count for word, count in word_counts.items()}
# Drop probability P(w) = 1 - sqrt(threshold / f(w)). It is negative for words rarer than
# `threshold`, so those words always pass the keep test below.
p_drop = {word: 1 - np.sqrt(threshold/freqs[word]) for word in word_counts}
# discard some frequent words, according to the subsampling equation
# create a new list of words for training
# NOTE(review): no seed is set for `random` in the visible cells, so `train_words` differs between runs
train_words = [word for word in int_words if random.random() < (1 - p_drop[word])]

print(train_words[:30])

In [0]:
# Making Batches

# With the skip-gram architecture, for each word in the text we define a surrounding context and grab all the words in a window of size C around that word.
# Distant words are usually less related to the current word, so we give them less weight: the actual window radius R is sampled at random from [1, C], which means nearer words end up in more training pairs.


def get_target(words, idx, window_size=5):
    '''Return the context words surrounding position `idx`.

    A radius R is drawn uniformly from 1..window_size (inclusive); the result
    holds up to R items before `idx` followed by up to R items after it. The
    item at `idx` itself is never included, and the window is clipped at the
    start of the sequence.

    words:       sliceable sequence supporting `+` concatenation (e.g. a list)
    idx:         position of the centre word
    window_size: largest radius that may be drawn

    Returns a new list of context items.
    '''
    radius = np.random.randint(1, window_size + 1)

    # Clamp the left edge so a negative start cannot wrap around the sequence
    left_edge = max(idx - radius, 0)
    right_edge = idx + radius + 1

    before = words[left_edge:idx]
    after = words[idx + 1:right_edge]

    return list(before + after)

In [0]:
# Testing the code

# Toy corpus: the integers 0..9 stand in for word ids
int_text = list(range(10))
print('Input: ', int_text)
idx = 5 # word index of interest

# The window radius is drawn at random, so the printed target can change between runs
target = get_target(int_text, idx=idx, window_size=5)
print('Target: ', target)

In [0]:
# Generating Batches

def get_batches(words, batch_size, window_size = 5):
    '''Create a generator of word batches as a tuple (inputs, targets).

    The sequence is cut into consecutive chunks of `batch_size` words; trailing
    words that do not fill a complete chunk are dropped. For every word in a
    chunk, `get_target` supplies its context words (taken from within the same
    chunk), and the word is repeated once per context word so that inputs and
    targets line up pairwise.

    words:       sliceable sequence of word ids
    batch_size:  number of centre words per batch
    window_size: maximum context radius passed on to `get_target`

    Yields one `(x, y)` pair of equal-length lists per complete chunk.
    '''
    n_batches = len(words)//batch_size

    # keep only as many words as fill complete batches
    words = words[:n_batches*batch_size]

    for idx in range(0, len(words), batch_size):
        x, y = [], []
        batch = words[idx:idx + batch_size]

        for ii in range(len(batch)):
            batch_x = batch[ii]
            batch_y = get_target(batch, ii, window_size)
            y.extend(batch_y)
            # repeat the centre word so that x[k] pairs with y[k]
            x.extend([batch_x]*len(batch_y))

        # Yield once per batch, after every word in it has been processed.
        # Yielding inside the loop above would hand out partially filled
        # lists after each single word.
        yield x, y

In [0]:
# Pull the first batch out of a toy corpus of 20 integer ids
int_text = list(range(20))
x, y = next(get_batches(int_text, batch_size=4, window_size=5))

print('x\n', x)
print('y\n', y)

In [0]:
# Building the Graph


# This uses cosine similarity to calculate how similar words are to each other.


def cosine_similarity(embedding, valid_size=16, valid_window=100, device='cpu'):
    """Score randomly chosen validation words against every embedding vector.

    `embedding` should be a PyTorch embedding module. Half of the validation
    ids are sampled from range(valid_window) and the other half from
    range(1000, 1000 + valid_window); lower ids are taken to be more
    frequent words.

    Returns `(valid_examples, similarities)`: the sampled ids as a LongTensor
    on `device`, and a matrix with one row per validation word and one column
    per embedding row. Each entry is the dot product a . b divided by |b|,
    the norm of the embedding row; the validation vector's own norm |a| is
    not divided out.
    """
    weights = embedding.weight

    # Draw the validation ids: first the low-id pool, then the pool starting at 1000
    half = valid_size // 2
    low_ids = np.array(random.sample(range(valid_window), half))
    high_ids = np.array(random.sample(range(1000, 1000 + valid_window), half))
    valid_examples = torch.LongTensor(np.concatenate((low_ids, high_ids))).to(device)

    # |b| for every embedding row, laid out as a single row for broadcasting
    row_norms = torch.sqrt(torch.sum(weights ** 2, dim=1)).unsqueeze(0)

    valid_vectors = embedding(valid_examples)
    dot_products = torch.mm(valid_vectors, weights.t())
    similarities = dot_products / row_norms

    return valid_examples, similarities

In [0]:
# Define and train the SkipGram model.


import torch
from torch import nn
import torch.optim as optim

In [0]:
class SkipGram(nn.Module):
    """Skip-gram network: embedding lookup, linear projection, log-softmax.

    n_vocab: number of rows in the embedding table and number of output scores
    n_embed: size of each embedding vector
    """

    def __init__(self, n_vocab, n_embed):
        super().__init__()

        # Layers are created in this order so that seeded initialisation is reproducible
        self.embed = nn.Embedding(num_embeddings=n_vocab, embedding_dim=n_embed)
        self.output = nn.Linear(in_features=n_embed, out_features=n_vocab)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map a 1-D batch of word ids to log-probabilities, one row of n_vocab values per id."""
        return self.log_softmax(self.output(self.embed(x)))

In [0]:
# Training


# check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

embedding_dim=300 # you can change, if you want

model = SkipGram(len(vocab_to_int), embedding_dim).to(device)
criterion = nn.NLLLoss()  # pairs with the log-probabilities returned by SkipGram.forward
optimizer = optim.Adam(model.parameters(), lr=0.003)

print_every = 500  # run the nearest-word check every this many optimisation steps
steps = 0
epochs = 5

# train for some number of epochs
for e in range(epochs):

    # get input and target batches
    for inputs, targets in get_batches(train_words, 512):
        steps += 1
        inputs, targets = torch.LongTensor(inputs), torch.LongTensor(targets)
        inputs, targets = inputs.to(device), targets.to(device)

        # one optimisation step: forward pass, loss, backward pass, parameter update
        log_ps = model(inputs)
        loss = criterion(log_ps, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if steps % print_every == 0:
            # getting examples and similarities
            # This is inspection only, so autograd is switched off: no graph
            # is built for the (validation words x vocabulary) product.
            with torch.no_grad():
                valid_examples, valid_similarities = cosine_similarity(model.embed, device=device)
                _, closest_idxs = valid_similarities.topk(6) # topk highest similarities

            valid_examples, closest_idxs = valid_examples.to('cpu'), closest_idxs.to('cpu')
            for ii, valid_idx in enumerate(valid_examples):
                # drop the first (highest-scoring) hit -- normally the validation word itself
                closest_words = [int_to_vocab[idx.item()] for idx in closest_idxs[ii]][1:]
                print(int_to_vocab[valid_idx.item()] + " | " + ', '.join(closest_words))
            print("...")