In [1]:
# Program Hyperparameters

neighborhood_size = 2
embedding_dims = 2
negative = 5
device = 'cuda'
num_epochs = 200
learning_rate = 2e-1
lr_decay = 0.99
loss_hist = []

In [6]:
import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader

import pandas as pd
import numpy as np

from sklearn import decomposition
from pathlib import Path
from random import sample

import warnings
warnings.filterwarnings("ignore")

import nltk

with open('walks') as f:
    corpus = f.read().split('\n')

corpus = sample(corpus, len(corpus) // 200)

In [7]:
print(sum(map(lambda x: len(x.split(' ')), corpus)))

64664


In [8]:
from IPython.display import display
import ipywidgets as widgets

def create_vocabulary(corpus):
    '''Creates a dictionary with all unique words in corpus with id'''
    vocabulary = {}
    i = 0
    for s in corpus:
        for w in s.split():
            if w not in vocabulary:
                vocabulary[w] = i
                i+=1
    return vocabulary

def prepare_set(corpus, n_gram = 1):
    '''Creates a dataset with Input column and Outputs columns for neighboring words. 
       The number of neighbors = n_gram*2'''
    columns = ['Input'] + [f'Output{i+1}' for i in range(n_gram*2)]
    result = pd.DataFrame(columns = columns)
    for sentence in corpus:
        for i,w in enumerate(sentence.split()):
            inp = [w]
            out = []
            for n in range(1,n_gram+1):
                # look back
                if (i-n)>=0:
                    out.append(sentence.split()[i-n])
                else:
                    out.append('<padding>')
                
                # look forward
                if (i+n)<len(sentence.split()):
                    out.append(sentence.split()[i+n])
                else:
                    out.append('<padding>')
            row = pd.DataFrame([inp+out], columns = columns)
            result = result.append(row, ignore_index = True)
    return result

def prepare_set_ravel(corpus, n_gram = 1):
    '''Creates a dataset with Input column and Output column for neighboring words. 
       The number of neighbors = n_gram*2'''
    columns = ['Input', 'Output']
    result = pd.DataFrame(columns = columns)
    k = 0
    o = widgets.HTML()
    display(o)
    for sentence in corpus:
        o.value = (f'<p>{k / len(corpus)}% ({k} / {len(corpus)})</p>')
        k += 1
        for i,w in enumerate(sentence.split()):
            inp = w
            for n in range(1,n_gram+1):
                # look back
                if (i-n)>=0:
                    out = sentence.split()[i-n]
                    row = pd.DataFrame([[inp,out]], columns = columns)
                    result = result.append(row, ignore_index = True)
                
                # look forward
                if (i+n)<len(sentence.split()):
                    out = sentence.split()[i+n]
                    row = pd.DataFrame([[inp,out]], columns = columns)
                    result = result.append(row, ignore_index = True)
    return result

In [9]:
# Create Vocabulary for indexing
vocabulary = create_vocabulary(corpus)

print("Vocab done")

# Create train embedding
# train_emb = prepare_set(corpus, n_gram=neighborhood_size)
train_emb = prepare_set_ravel(corpus, n_gram=neighborhood_size)
train_emb.Input = train_emb.Input.map(vocabulary)
train_emb.Output = train_emb.Output.map(vocabulary)

Vocab done


HTML(value='')

In [10]:
import torch.nn as nn

class SkipGram(nn.Module):
    """
    Implementation of Skip-Gram model described in paper:
    https://arxiv.org/abs/1301.3781
    """
    def __init__(self):
        super(SkipGram, self).__init__()
        vocab_size = len(vocabulary)
        
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dims
        )
        
        self.linear = nn.Linear(
            in_features=embedding_dims,
            out_features=vocab_size,
        )

    def forward(self, x):
        x = self.embeddings(x)
        x = self.linear(x)
        return x

In [12]:
%%time
        
# define loss func
model = SkipGram()
model = model.to(device)

loss_f = torch.nn.CrossEntropyLoss() # see details: https://pytorch.org/docs/stable/nn.html
optim = torch.optim.SGD(model.parameters(), lr=learning_rate)

input_loader = DataLoader(train_emb.Input.values, batch_size=1024)
label_loader = DataLoader(train_emb.Output.values, batch_size=1024)


for epo in range(num_epochs):
    for x,y in zip(input_loader, label_loader):
        
        x = x.to(device)
        y = y.to(device)
        
        optim.zero_grad()
     
        y_pred = model(x)

        #compute loss
        loss = loss_f(y_pred, y)
        
        # bakpropagation step
        loss.backward()
        optim.step()
        
    if epo%10 == 0:
        learning_rate *= lr_decay
    loss_hist.append(loss)
    if epo%50 == 0:
        print(f'Epoch {epo}, loss = {loss}')

Epoch 0, loss = 10.742963790893555
Epoch 50, loss = 9.851836204528809
Epoch 100, loss = 9.75027084350586
Epoch 150, loss = 9.688007354736328
CPU times: user 7min 8s, sys: 504 ms, total: 7min 9s
Wall time: 7min 11s


In [20]:
df = pd.read_csv("results.csv")
df.index = df['_id']
df = df['_labels'].apply(lambda x: x[1:])

In [26]:
list(map(lambda x: model.embeddings(torch.tensor(x, device=device)), list(df.index)))

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [35]:
%%time

loss_f = torch.nn.CrossEntropyLoss() # see details: https://pytorch.org/docs/stable/nn.html
optim = torch.optim.SGD(model.parameters(), lr=learning_rate)

input_loader = DataLoader(df.index, batch_size=1024)
label_loader = DataLoader(df, batch_size=1024)

convert = {
    'Author': torch.tensor(0,device = device),
    'Paper': torch.tensor(1, device=device),
    'Conference': torch.tensor(2, device=device)
}


for epo in range(num_epochs):
    for x,y in zip(input_loader, label_loader):
        
        x = torch.tensor(x, device=device)
        y = torch.cat(list(map(lambda z: convert[z], y)))
        
        optim.zero_grad()
     
        y_pred = model.embeddings(x)

        #compute loss
        loss = loss_f(y_pred, y)
        
        # bakpropagation step
        loss.backward()
        optim.step()
        
    if epo%10 == 0:
        learning_rate *= lr_decay
    loss_hist.append(loss)
    if epo%50 == 0:
        print(f'Epoch {epo}, loss = {loss}')

RuntimeError: zero-dimensional tensor (at position 0) cannot be concatenated