In [101]:
# Program Hyperparameters

neighborhood_size = 2
embedding_dims = 2
negative = 5
device = 'cuda'
num_epochs = 200
learning_rate = 2e-1
lr_decay = 0.99
loss_hist = []

In [67]:
import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader

import pandas as pd
import numpy as np

from sklearn import decomposition
from pathlib import Path
from random import sample

import warnings
warnings.filterwarnings("ignore")

import nltk

with open('walks') as f:
    corpus = f.read().split('\n')

corpus = sample(corpus, len(corpus) // 200)

In [68]:
from IPython.display import display
import ipywidgets as widgets

def create_vocabulary(corpus):
    '''Creates a dictionary with all unique words in corpus with id'''
    vocabulary = {}
    i = 0
    for s in corpus:
        for w in s.split():
            if w not in vocabulary:
                vocabulary[w] = i
                i+=1
    return vocabulary

def prepare_set(corpus, n_gram = 1):
    '''Creates a dataset with Input column and Outputs columns for neighboring words. 
       The number of neighbors = n_gram*2'''
    columns = ['Input'] + [f'Output{i+1}' for i in range(n_gram*2)]
    result = pd.DataFrame(columns = columns)
    for sentence in corpus:
        for i,w in enumerate(sentence.split()):
            inp = [w]
            out = []
            for n in range(1,n_gram+1):
                # look back
                if (i-n)>=0:
                    out.append(sentence.split()[i-n])
                else:
                    out.append('<padding>')
                
                # look forward
                if (i+n)<len(sentence.split()):
                    out.append(sentence.split()[i+n])
                else:
                    out.append('<padding>')
            row = pd.DataFrame([inp+out], columns = columns)
            result = result.append(row, ignore_index = True)
    return result

def prepare_set_ravel(corpus, n_gram = 1):
    '''Creates a dataset with Input column and Output column for neighboring words. 
       The number of neighbors = n_gram*2'''
    columns = ['Input', 'Output']
    result = pd.DataFrame(columns = columns)
    k = 0
    o = widgets.HTML()
    display(o)
    for sentence in corpus:
        o.value = (f'<p>{k / len(corpus)}% ({k} / {len(corpus)})</p>')
        k += 1
        for i,w in enumerate(sentence.split()):
            inp = w
            for n in range(1,n_gram+1):
                # look back
                if (i-n)>=0:
                    out = sentence.split()[i-n]
                    row = pd.DataFrame([[inp,out]], columns = columns)
                    result = result.append(row, ignore_index = True)
                
                # look forward
                if (i+n)<len(sentence.split()):
                    out = sentence.split()[i+n]
                    row = pd.DataFrame([[inp,out]], columns = columns)
                    result = result.append(row, ignore_index = True)
    return result

In [69]:
# Create Vocabulary for indexing
vocabulary = create_vocabulary(corpus)

print("Vocab done")

# Create train embedding
# train_emb = prepare_set(corpus, n_gram=neighborhood_size)
train_emb = prepare_set_ravel(corpus, n_gram=neighborhood_size)
train_emb.Input = train_emb.Input.map(vocabulary)
train_emb.Output = train_emb.Output.map(vocabulary)

Vocab done


HTML(value='')

In [116]:
import torch.nn as nn

class SkipGram(nn.Module):
    """
    Implementation of Skip-Gram model described in paper:
    https://arxiv.org/abs/1301.3781
    """
    def __init__(self):
        super(SkipGram, self).__init__()
        vocab_size = len(vocabulary)
        
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dims
        )

        
        self.linear = nn.Linear(
            in_features=embedding_dims,
            out_features=vocab_size,
        )

    def forward(self, x):
        print(x.shape)
        x = self.embeddings(x)
        print(x.shape)
        x = self.linear(x)
        print(x.shape)
        return x

def print_size(model):    
    param_size = 0
    buffer_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()

    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    print((param_size + buffer_size) / (1024 ** 2))

In [117]:
# define loss func
model = SkipGram()
model = model.to(device)

loss_f = torch.nn.CrossEntropyLoss()
optim = torch.optim.SGD(model.parameters(), lr=learning_rate)

inputs_dataloader = DataLoader(train_emb.Input.values, batch_size=train_emb.shape[0])
label_dataloader = DataLoader(train_emb.Output.values, batch_size=train_emb.shape[0])

for epo in range(num_epochs):
    for x,y in zip(inputs_dataloader, label_dataloader):
        
        x = x.to(device)
        y = y.to(device)
        
        optim.zero_grad()
        y_pred = model(x)

        #compute loss
        loss = loss_f(y_pred, y)
        
        # bakpropagation step
        loss.backward()
        optim.step()
        
    if epo % 10 == 0:
        learning_rate *= lr_decay
    loss_hist.append(loss)
    if epo % 50 == 0:
        print(f'Epoch {epo}, loss = {loss}')

torch.Size([246768])
torch.Size([246768, 1])


OutOfMemoryError: CUDA out of memory. Tried to allocate 35.20 GiB (GPU 0; 7.93 GiB total capacity; 11.67 MiB already allocated; 6.61 GiB free; 26.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF