## Inference

In [1]:
# imports
import numpy as np
import random
import pickle

In [2]:
# seed Python's RNG first so the random samples shown below are repeatable
random.seed(42)

# name of the experiment whose artifacts (tokenizer, embeddings) are loaded
experiment_name = "default_hyperparams"

### Load Tokenizer and Embeddings

In [3]:
# directory holding this experiment's artifacts, relative to the notebook
experiment_dir = "/".join(("../experiments", experiment_name))

In [4]:
# load tokenizer
# NOTE: pickle.load can execute arbitrary code -- only unpickle files that were
# produced by this project's own training run.
# The context manager closes the file handle as soon as loading is done
# (a bare open(...) inside the call would leave it open until garbage collection).
with open(experiment_dir + "/data/tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# peek at five random vocabulary entries (repeatable thanks to random.seed above)
random.sample(list(tokenizer.word_index.keys()), 5)

['the pigeon detectives - unforgettable',
 'young money - bedrock',
 'you me at six - fireworks',
 'gin wigmore - devil in me',
 'snow patrol - chasing cars']

In [5]:
# search for tracks / artists
# The matches are collected with a comprehension so that no loop variable is
# left behind in the notebook namespace: a top-level `for track_name in ...`
# rebinds the global `track_name`, which the "Try it out" cells also use, so
# re-running this cell out of order would silently change the query track.
artist_query = "bloc party"
matching_tracks = [name for name in tokenizer.word_index.keys() if artist_query in name]

# print one match per line; print nothing at all when there are no matches
if matching_tracks:
    print("\n".join(matching_tracks))

bloc party - one month off
bloc party - helicopter
bloc party - hunting for witches
bloc party - i still remember
bloc party - on
bloc party - signs


In [6]:
# load embeddings
# NOTE: pickle.load can execute arbitrary code -- only unpickle files that were
# produced by this project's own training run.
# The context manager closes the file handle as soon as loading is done.
with open(experiment_dir + "/embeddings.pkl", "rb") as embeddings_file:
    embedding_weights = pickle.load(embeddings_file)
print("embeddings shape:", embedding_weights.shape)

embeddings shape: (8630, 100)


### Get top-n most similar tracks

In [7]:
# function to get top-n most similar tracks
def get_most_similar_tracks(track_name, n=10, tokenizer=None, embedding_weights=None):
    """Print the `n` tracks whose embeddings are most cosine-similar to `track_name`.

    Parameters
    ----------
    track_name : str
        Key of `tokenizer.word_index` identifying the query track.
    n : int, default 10
        Maximum number of similar tracks to print. Fewer are printed when
        fewer candidate tracks exist.
    tokenizer : object, optional
        Object exposing `word_index` (name -> row index) and `index_word`
        (row index -> name). Defaults to the notebook-level `tokenizer`,
        looked up when the function is called.
    embedding_weights : numpy.ndarray, optional
        2-D array with one embedding per row, indexed by the tokenizer's
        indices. Defaults to the notebook-level `embedding_weights`, looked
        up when the function is called.

    Raises
    ------
    KeyError
        If `track_name` is not in `tokenizer.word_index`.
    """
    # Resolve the defaults at call time rather than at definition time, so that
    # re-running the loading cells is picked up without redefining the function.
    if tokenizer is None:
        tokenizer = globals()["tokenizer"]
    if embedding_weights is None:
        embedding_weights = globals()["embedding_weights"]

    # get track embedding
    if track_name not in tokenizer.word_index:
        raise KeyError("track '{}' not found in tokenizer vocabulary".format(track_name))
    track_idx = tokenizer.word_index[track_name]
    track_vector = embedding_weights[track_idx, :]

    # compute cosine similarities against all rows; rows with a zero norm have
    # no defined similarity and are skipped instead of producing NaN (NaN would
    # otherwise sort to the top of the ranking)
    num_rows = embedding_weights.shape[0]
    dot_products = embedding_weights @ track_vector
    denominators = np.linalg.norm(track_vector) * np.linalg.norm(embedding_weights, axis=1)
    has_similarity = denominators > 0
    similarities = np.zeros(num_rows, dtype=float)
    np.divide(dot_products, denominators, out=similarities, where=has_similarity)

    # candidates: rows that map back to a track name (e.g. a padding row does
    # not), have a defined similarity, and are not the query track itself --
    # the query is excluded by index, not by assuming it always ranks first
    named_rows = np.array(
        [idx for idx in tokenizer.index_word if 0 <= idx < num_rows], dtype=np.intp
    )
    is_candidate = np.zeros(num_rows, dtype=bool)
    is_candidate[named_rows] = True
    is_candidate &= has_similarity
    is_candidate[track_idx] = False
    candidate_idxs = np.flatnonzero(is_candidate)

    # get most similar tracks' indices, best first; clamp so that asking for
    # more tracks than exist (or for n <= 0) does not raise
    top_k = max(0, min(n, len(candidate_idxs)))
    if top_k > 0:
        candidate_sims = similarities[candidate_idxs]
        top = np.argpartition(candidate_sims, -top_k)[-top_k:]
        top = top[np.argsort(candidate_sims[top])[::-1]]
        most_similar_idxs = candidate_idxs[top]
    else:
        most_similar_idxs = candidate_idxs[:0]

    # print most similar tracks, along with their positions in training data
    print("top {} tracks most similar to '{}' (pos. {}):".format(n, track_name, track_idx))
    for idx in most_similar_idxs:
        idx = int(idx)
        print("- (sim. {:.3f}): '{}' (pos. {})".format(similarities[idx], tokenizer.index_word[idx], idx))

### Try it out

In [8]:
# how many similar tracks to list
n = 10

# query track; must be spelled exactly as it appears in the tokenizer vocabulary
track_name = "bloc party - helicopter"

In [9]:
# list the n tracks closest to the query track in embedding space
get_most_similar_tracks(track_name=track_name, n=n)

top 10 tracks most similar to 'bloc party - helicopter' (pos. 5053):
- (sim. 0.450): 'pierce the veil - hell above' (pos. 657)
- (sim. 0.372): 'kanye west - hell of a life' (pos. 5055)
- (sim. 0.367): 'foster the people - helena beat' (pos. 5052)
- (sim. 0.358): 'markus krunegård - hela livet var ett disco' (pos. 5050)
- (sim. 0.355): 'shout out louds - fall hard' (pos. 4396)
- (sim. 0.339): 'matmatah - l'apologie' (pos. 2011)
- (sim. 0.338): 'tom petty and the heartbreakers - it'll all work out' (pos. 5676)
- (sim. 0.332): 'lord huron - love like ghosts' (pos. 6263)
- (sim. 0.327): 'håkan hellström - jag vet inte vem jag är men jag vet att jag är din' (pos. 2398)
- (sim. 0.319): 'tired pony - held in the arms of your words' (pos. 5051)
