In [None]:
import torch, json
import numpy as np
from utils.misc import AverageMeter

# Load the artist vocabulary. JSON text is UTF-8, so the encoding is stated
# explicitly instead of relying on the platform's locale default (which would
# garble non-ASCII artist names on some systems).
with open('./data/vocabulary.json', encoding='utf-8') as fp:
    vocab = json.load(fp)

# Each vocabulary value is a mapping read via its 'id', 'count' and 'name' keys.
# artist_weights: list of (id, count) tuples; artist_names: list of names, in the
# same iteration order as the vocabulary.
artist_weights = [(entry.get('id'), entry.get('count')) for entry in vocab.values()]
artist_names = [entry.get('name') for entry in vocab.values()]

epochs = 10
vocab_size = len(artist_weights)
# Best practice for embedding size is the 4th root of the number of categories
# https://developers.googleblog.com/2017/11/introducing-tensorflow-feature-columns.html
embedding_size = round(vocab_size ** 0.25)

In [None]:
from utils.data import ArtistPairDataset
from torch.utils.data import DataLoader, random_split

# Build the artist-pair dataset and the loader that feeds the training loop.
# NOTE(review): batching appears to happen inside ArtistPairDataset (it receives
# batch_size=1024), which would explain why the DataLoader itself is kept at
# batch_size=1 -- confirm against utils.data.
ds = ArtistPairDataset(
    './data',
    artist_weights=artist_weights,
    negative_samples=20,
    batch_size=1024,
)
dl = DataLoader(
    ds,
    shuffle=True,
    num_workers=5,
    batch_size=1,
)
# TODO: hold out a test split, e.g. random_split(ds, [train_size, test_size])

In [None]:
# https://dl-nlp.github.io/word2vec2.pdf

import torch.nn as nn
from tqdm import tqdm_notebook
from model import Skipgram
from tensorboardX import SummaryWriter

# Model, optimizer and logging setup.
model = Skipgram(vocab_size, embedding_size).cuda()
optimizer = torch.optim.SparseAdam(model.parameters())
writer = SummaryWriter()
losses = AverageMeter()

for e in range(epochs):
    # Loss statistics are tracked per epoch.
    losses.reset()
    # Pre-set the global step so the per-epoch embedding snapshot below does not
    # hit an undefined name if the loader yields no batches.
    n_iter = e * len(dl)

    for i, (inputs_batch, labels_batch, neg_samples) in enumerate(tqdm_notebook(dl)):
        # Global step counter across all epochs, used as the x-axis for logging.
        n_iter = e * len(dl) + i

        # Call the module itself rather than .forward() so that any registered
        # hooks run; the module call dispatches to forward().
        # NOTE(review): squeeze() drops every size-1 dimension, not only the
        # leading one added by the DataLoader; squeeze(0) may be safer if a
        # batch can ever contain a single row -- confirm the shapes produced
        # by ArtistPairDataset.
        loss = model(
            inputs_batch.squeeze().cuda(),
            labels_batch.squeeze().cuda(),
            neg_samples.squeeze().cuda()
        )

        # compute gradient and do optimizer step
        optimizer.zero_grad()
        loss.backward()
        # Clip the total gradient norm to 5 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()

        losses.update(loss.item(), ds.batch_size)

        # Every 10000 iterations, log parameter and gradient histograms plus
        # the loss meter's current (val) and average (avg) readings.
        if i % 10000 == 0:
            for name, param in model.named_parameters():
                writer.add_histogram(name, param, n_iter)
                # Gradients are converted with to_dense() before being logged.
                writer.add_histogram('{}-gradient'.format(name), param.grad.to_dense(), n_iter)

            writer.add_scalar('loss', losses.val, n_iter)
            writer.add_scalar('avg_loss', losses.avg, n_iter)

    # Snapshot the input embeddings once per epoch, labelled with artist names.
    writer.add_embedding(model.input_embeddings(), metadata = artist_names, global_step=n_iter)

In [None]:
import os

# Persist the learned input embeddings; the file name encodes the embedding
# size, epoch count, negative-sample count and batch size of this run.
embeddings_path = './embeddings/aDE17_{}d-{}e-{}ns-{}bs.npy'.format(
    embedding_size, epochs, ds.negative_samples, ds.batch_size)
# np.save does not create missing parent directories; make sure the target
# directory exists so a finished training run is not lost at the save step.
os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
np.save(embeddings_path, model.input_embeddings())