<a href="https://www.kaggle.com/code/vladyslavhutov/music-similarity-2?scriptVersionId=115102629" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [2]:
from fastai.tabular.all import *
from fastai.collab import *
import json
from pathlib import Path
from collections import namedtuple
from sklearn.neighbors import KNeighborsClassifier
import torch
from torch.utils.data import Dataset
import math
import functools
import logging


In [3]:
# Notebook-wide logger used by the artist pipeline functions below.
logger = logging.getLogger(__name__)
# A logger without its own level falls back to its ancestors' level (WARNING on
# an unconfigured root logger), so INFO records would be discarded before any
# handler sees them. Set the level explicitly so `logger.info` is emitted.
logger.setLevel(logging.INFO)

c_handler = logging.StreamHandler()
c_handler.setLevel(logging.INFO)

# Re-running this cell returns the same logger object; only attach a handler
# when none is present so every message is not printed once per re-run.
if not logger.handlers:
    logger.addHandler(c_handler)

In [4]:
# Every entry of the playlists input directory is collected; the read_for_*
# helpers below pass each one to pd.read_csv.
p = Path('/kaggle/input/spotify-playlists-csv')
paths = [*p.iterdir()]

In [5]:
# Auxiliary tables. `common_artists` is consumed by get_common_artists_index,
# which reads its `count` and `artist_uri` columns. `users_followers` is loaded
# here but is not referenced by any of the visible cells.
users_followers = pd.read_csv(
    filepath_or_buffer='/kaggle/input/users-followers/user_stat.csv',
)
common_artists = pd.read_csv(
    filepath_or_buffer='/kaggle/input/common-spotify-artists/common_artist_uri.csv',
)

## Negative samples

In [6]:
def get_factor(N, ser):
    """Return how many times ``ser`` must be repeated to hold at least ``N`` elements.

    The result is ``ceil(N / len(ser))`` and never less than 1.
    """
    repeats = math.ceil(N / len(ser))
    return repeats if repeats > 1 else 1

def generate_negative_pairs(N, users, items):
    """Build up to ``N`` randomly paired (user, item) rows labelled ``rating = 0``.

    Each series is repeated ``get_factor(N, series)`` times and shuffled onto a
    fresh 0..n-1 index. The two shuffled series are then placed side by side,
    aligned on that index. Rows where either side is missing are dropped and
    the first ``N`` remaining rows are kept.

    The pairing is purely random: no set of positive pairs is consulted here.
    """
    item_reps, user_reps = get_factor(N, items), get_factor(N, users)

    # Items are shuffled before users, matching the order in which the two
    # shuffles consume the random state.
    shuffled_items = items.repeat(item_reps).sample(frac=1, ignore_index=True)
    shuffled_users = users.repeat(user_reps).sample(frac=1, ignore_index=True)

    side_by_side = pd.concat([shuffled_users, shuffled_items], axis=1)
    negatives = side_by_side.dropna().iloc[:N]
    negatives['rating'] = 0
    return negatives

def generate_negative_artists(N, users, artists):
    """Return up to ``N`` random (pid, artist_uri) negative pairs indexed by both columns.

    The frame comes from ``generate_negative_pairs`` and carries a single
    ``rating`` column equal to 0.

    ``pid`` is cast to int64 before it becomes an index level, mirroring
    ``generate_negative_tracks`` and ``prepare_positive``. The side-by-side
    concat in ``generate_negative_pairs`` can pad the shorter series with NaN,
    which turns an integer ``pid`` column into floats. Without the cast the
    negative index would not share the positive index's dtype.
    """
    df = generate_negative_pairs(N, users, artists)
    df['pid'] = df['pid'].astype('int64')
    return df.set_index(['pid', 'artist_uri'])

def generate_negative_tracks(N, users, tracks):
    """Return up to ``N`` random (pid, track_uri) negative pairs indexed by both columns.

    ``pid`` is cast to int64 before being moved into the index; the frame keeps
    the ``rating`` column produced by ``generate_negative_pairs``.
    """
    negatives = generate_negative_pairs(N, users, tracks)
    negatives['pid'] = negatives['pid'].astype('int64')
    return negatives.set_index(['pid', 'track_uri'])

In [7]:
def join_dfs(df1, df2):
    """Stack ``df1`` on top of ``df2`` and return the combined frame.

    The concat is called with ``sort=True`` and ``copy=False``. Only the
    combined frame is returned; the objects the caller passed in are not
    modified.
    """
    return pd.concat([df1, df2], sort=True, copy=False)


def reindex(df):
    """Move the index of ``df`` back into ordinary columns, in place.

    The same (mutated) frame object is returned for call chaining.
    """
    df.reset_index(drop=False, inplace=True)
    return df

In [8]:
# Named thresholds passed as the `mentions` cut-off of get_common_artists_index.
N_10k, N_50k, N_88k = 10_000, 50_000, 88_000

def get_common_artists_index(mentions):
    """Return the ``artist_uri`` index of the artists kept for training.

    A row of the module-level ``common_artists`` table is kept when its
    ``count`` is strictly greater than ``mentions`` and its ``artist_uri`` is
    not the ``'#na#'`` placeholder.
    """
    frequent_enough = common_artists['count'] > mentions
    not_placeholder = common_artists.artist_uri != '#na#'
    kept = common_artists[frequent_enough & not_placeholder]
    return kept.set_index('artist_uri').index

In [21]:
def read_for_artists(whitelist):
    """Load ``pid`` / ``artist_uri`` from every file in ``paths``, keeping whitelisted artists.

    All files are concatenated onto a fresh 0..n-1 index first; rows whose
    ``artist_uri`` is not in ``whitelist`` are filtered out afterwards.
    """
    frames = (pd.read_csv(f, usecols=['pid', 'artist_uri']) for f in paths)
    everything = pd.concat(frames, ignore_index=True)
    return everything[everything.artist_uri.isin(whitelist)]
    
    
def read_for_tracks(whitelist):
    """Load (pid, track_uri) rows from every file in ``paths`` for whitelisted artists.

    ``artist_uri`` is read only to filter rows against ``whitelist`` and is
    removed from the returned frame.
    """
    frames = (
        pd.read_csv(f, usecols=['pid', 'artist_uri', 'track_uri'])
        for f in paths
    )
    everything = pd.concat(frames, ignore_index=True)
    allowed = everything[everything.artist_uri.isin(whitelist)]
    return allowed.drop(columns=['artist_uri'])
    
def clean_by_users(df, u_min, u_max):
    """Drop every row belonging to a ``pid`` with too few or too many rows.

    A ``pid`` is removed when its row count is below ``u_min`` or above
    ``u_max``; counts equal to either bound are kept.
    """
    rows_per_user = df['pid'].value_counts()
    out_of_range = (rows_per_user < u_min) | (rows_per_user > u_max)
    rejected = rows_per_user.index[out_of_range]
    return df[~df['pid'].isin(rejected)]


def preprocess_artists(df):
    """Collapse repeated (pid, artist_uri) rows to their first occurrence, in place.

    The same (mutated) frame object is returned.
    """
    pair_columns = ['pid', 'artist_uri']
    df.drop_duplicates(subset=pair_columns, keep='first', inplace=True)
    return df


def prepare_positive(df, item_name, f = lambda x: 1):
    """Label observed (pid, item) rows and index them by that pair.

    ``rating`` is computed by applying ``f`` to each ``pid`` (the default
    labels every row 1), then ``pid`` is cast to int64. Both columns are
    written onto the frame that was passed in; the returned frame is a new one
    indexed by ``['pid', item_name]``.
    """
    df['rating'] = df['pid'].apply(f)
    df['pid'] = df['pid'].astype('int64')
    return df.set_index(['pid', item_name])




In [10]:
# Pairs a learner with the data loaders it was built from.
Model = namedtuple('Model', 'learn dls')
    
def setup_cosine(df, item_name, bs=4096, device=None):
    """Build collaborative-filtering data loaders and a 50-factor learner.

    Args:
        df: frame with ``pid``, ``item_name`` and ``rating`` columns.
        item_name: name of the item column (e.g. ``'track_uri'``).
        bs: batch size for the data loaders.
        device: torch device for the data loaders. When omitted, CUDA is used
            if it is available and the CPU otherwise, so the notebook no
            longer fails on a machine without a GPU.

    Returns:
        ``Model(learn, dls)``. The learner is created with ``n_factors=50`` and
        predictions constrained to ``y_range=(0, 1)``.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    dls = CollabDataLoaders.from_df(
        df,
        item_name=item_name,
        rating_name='rating',
        username='pid',
        bs=bs,
        device=device,
    )
    learn = collab_learner(dls, n_factors=50, y_range=(0, 1))

    return Model(learn=learn, dls=dls)

def setup_nn(df, item_name, layers, bs=4096, device=None):
    """Build collaborative-filtering data loaders and a neural-network learner.

    Args:
        df: frame with ``pid``, ``item_name`` and ``rating`` columns.
        item_name: name of the item column (e.g. ``'track_uri'``).
        layers: hidden layer sizes forwarded to ``collab_learner``.
        bs: batch size for the data loaders.
        device: torch device for the data loaders. When omitted, CUDA is used
            if it is available and the CPU otherwise, so the notebook no
            longer fails on a machine without a GPU.

    Returns:
        ``Model(learn, dls)``. The learner is created with ``use_nn=True`` and
        predictions constrained to ``y_range=(0, 1)``.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    dls = CollabDataLoaders.from_df(
        df,
        item_name=item_name,
        rating_name='rating',
        username='pid',
        bs=bs,
        device=device,
    )
    learn = collab_learner(dls, layers=layers, use_nn=True, y_range=(0, 1))

    return Model(learn=learn, dls=dls)

In [11]:
def get_weights(model):
    """Return the whole item-embedding weight matrix of ``model`` as a NumPy array.

    For an ``EmbeddingNN`` the weights are taken from ``embeds[1]``; for any
    other model they are taken from ``i_weight``.
    """
    net = model.learn.model
    layer = net.embeds[1] if isinstance(net, EmbeddingNN) else net.i_weight
    return layer.weight.cpu().data.numpy()

def get_knn(model, class_name, k=21):
    """Fit a k-nearest-neighbours model on the item embeddings of ``model``.

    The embedding rows from ``get_weights`` are the samples, and the entries of
    ``model.dls.classes[class_name].items`` are their labels.
    """
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(get_weights(model), model.dls.classes[class_name].items)
    print('trained knn classifier')
    return knn

def get_emb(model, items):
    """Return the item-embedding rows selected by ``items`` as a NumPy array.

    ``items`` is used directly to index the weight tensor (the callers in this
    file pass a slice). The embedding layer is chosen the same way as in
    ``get_weights``.
    """
    # todo can get N items
    net = model.learn.model
    layer = net.embeds[1] if isinstance(net, EmbeddingNN) else net.i_weight
    selected = layer.weight[items]
    return selected.cpu().data.numpy()

def get_similar(knn, model, items):
    """Return, for each embedding selected by ``items``, the indices of its nearest neighbours.

    Only neighbour indices are returned; distances are not requested.
    """
    queries = get_emb(model, items)
    neighbour_idx = knn.kneighbors(queries, return_distance=False)
    return neighbour_idx

In [12]:
def model_classes(model):
    """Return the item vocabulary of ``model``.

    The ``'track_uri'`` entry of ``model.dls.classes`` is preferred when it
    exists; otherwise the ``'artist_uri'`` entry is returned.
    """
    vocab = model.dls.classes
    column = 'track_uri' if 'track_uri' in vocab else 'artist_uri'
    return vocab[column]

def to_uri(model, i):
    """Look up entry ``i`` of the model's item vocabulary."""
    vocab_items = model_classes(model).items
    return vocab_items[i]
    
def total_items(model):
    """Return the number of entries in the model's item vocabulary."""
    vocab_items = model_classes(model).items
    return len(vocab_items)

def compute_similarity(model, knn, bs):
    """Lazily yield ``(uri, [neighbour uris])`` for every item of ``model``.

    Items are queried against ``knn`` in batches of ``bs`` embedding rows. The
    queried item's own index is removed from its neighbour list. Nothing is
    computed until the returned generator is iterated.
    """
    batch_starts = range(0, total_items(model), bs)

    def indexed_neighbours():
        # Pair each item index of a batch with its row of neighbour indices.
        # zip stops at the shorter side, so a final partial batch yields only
        # as many pairs as get_similar returned rows.
        for start_idx in batch_starts:
            stop_idx = start_idx + bs
            yield from zip(
                range(start_idx, stop_idx),
                get_similar(knn, model, slice(start_idx, stop_idx)),
            )

    return (
        (to_uri(model, idx), [to_uri(model, i) for i in similar if i != idx])
        for idx, similar in indexed_neighbours()
    )
    

def store_similarity_pairs(similarity, name):
    """Write one line per item to the file ``name``.

    Each line is the item key, a space, then its neighbours joined by single
    spaces. ``similarity`` is any iterable of ``(key, neighbours)`` pairs and
    is consumed exactly once.
    """
    with open(name, 'w') as out:
        for uri, neighbours in similarity:
            out.writelines((uri, ' ', ' '.join(neighbours), '\n'))

In [22]:
# Build the track training set: observed (pid, track_uri) rows rated 1 plus
# randomly paired rows rated 0.

# Artists whose `count` in common_artists exceeds 50_000.
allowed_artists = get_common_artists_index(N_50k)
df = read_for_tracks(allowed_artists)
# Number of negative rows to generate, as a fraction of the positive rows.
negative_ratio = 0.5
# Keep only pids that have between 4 and 30 rows (inclusive).
df = clean_by_users(df, 4, 30)
# Keep only tracks that still occur more than 100 times after the pid filter.
track_counts = df.groupby('track_uri').track_uri.count()
track_wl = track_counts[track_counts > 100].index
df = df[df.track_uri.isin(track_wl)]
del track_counts
# Negatives are drawn from the surviving pids and whitelisted tracks. This must
# run before prepare_positive, which moves `pid` into the index.
df2 = generate_negative_tracks(math.ceil(negative_ratio * len(df)), df.pid.drop_duplicates(), track_wl.to_series())
df = prepare_positive(df, 'track_uri')
df = join_dfs(df, df2)
# Back to flat pid / track_uri / rating columns for the data loaders.
df = reindex(df)
# Bare expression: displays the frame as the cell output.
df

Unnamed: 0,pid,track_uri,rating
0,896625,0Zbbxnx4SGGHoIow4PpISP,1
1,896625,2Gl0FzuLxflY6nPifJp5Dr,1
2,896625,48UPSzbZjgc449aqz8bxox,1
3,896625,2mb6YdoavL6db22p7XKJQZ,1
4,896626,7DFnq8FYhHMCylykf6ZCxA,1
...,...,...,...
9891822,561731,2ncMlnrDwSfnYnV5ixxuib,0
9891823,590718,2vTdEBKuXAFi6T0IbRuDff,0
9891824,441613,4ehErMHPM40f92dks3vb2p,0
9891825,325701,15MTd64KUMG7CF6mOyovsQ,0


In [23]:
# Neural collaborative-filtering model over (pid, track_uri) with hidden layers 50-35-20.
track_model = setup_nn(df, 'track_uri', layers=[50, 35, 20])
# The output table below shows three epochs; presumably 5e-3 is the peak
# learning rate and wd the weight decay (TODO confirm against the fastai docs).
track_model.learn.fit_one_cycle(3, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.157627,0.155853,05:28
1,0.123063,0.124634,05:28
2,0.086869,0.119675,05:27


In [24]:
# k=11 neighbours per query: compute_similarity drops the query item's own
# index from its neighbour list, leaving the other neighbours.
knn = get_knn(track_model, 'track_uri', k=11)

trained knn classifier


In [25]:
# Lazy generator: neighbours are only computed (128 items per batch) once `it`
# is iterated, and it can be consumed a single time.
it = compute_similarity(track_model, knn, 128)

In [26]:
# Drains the generator, writing one "<uri> <neighbour uris...>" line per item.
store_similarity_pairs(it, 'track_similarity.txt')

In [None]:
def artist_training_set(n_common, users_min_max, negative_ratio):
    """Assemble the (pid, artist_uri, rating) frame used to train the artist model.

    Args:
        n_common: an artist is kept only if its ``count`` exceeds this value.
        users_min_max: ``(min, max)`` rows a pid may have; pids outside the
            range are removed.
        negative_ratio: number of random negative rows to generate, as a
            fraction of the de-duplicated positive rows.

    Returns:
        A flat frame of positive rows (rating 1) followed by negative rows
        (rating 0).
    """
    logger.info('Reading allowed artists set')
    whitelist = get_common_artists_index(n_common)

    logger.info('Reading events dataset for artists training')
    events = read_for_artists(whitelist)

    logger.info('Cleaning dataset: removing noisy users')
    events = clean_by_users(events, users_min_max[0], users_min_max[1])

    logger.info('Preprocessing dataset for artists training')
    events = preprocess_artists(events)

    logger.info('Generating negative pairs')
    n_negative = math.ceil(negative_ratio * len(events))
    # Taken before prepare_positive, which moves `pid` into the index.
    known_users = events.pid.drop_duplicates()
    negatives = generate_negative_artists(n_negative, known_users, whitelist.to_series())

    logger.info('Preparing positive examples')
    positives = prepare_positive(events, 'artist_uri')

    logger.info('Joining datasets')
    combined = join_dfs(positives, negatives)

    logger.info('Reindexing dataset')
    return reindex(combined)

def artist_flow(n_common, users_min_max, negative_ratio, layers, epochs, filename,
                n_factors=None, bs=128):
    """Train the artist model end to end and write its similarity file.

    Steps: build the training set, fit the neural collaborative-filtering
    model, fit a k-NN model on the artist embeddings, then write one line of
    neighbours per artist to ``filename``.

    Args:
        n_common: an artist is kept only if its ``count`` exceeds this value.
        users_min_max: ``(min, max)`` rows a pid may have.
        negative_ratio: negatives to generate, as a fraction of the positives.
        layers: hidden layer sizes for ``setup_nn``.
        epochs: number of epochs passed to ``fit_one_cycle``.
        filename: output path for the similarity pairs.
        n_factors: accepted so call sites that pass ``n_factors=...`` keep
            working; it is not forwarded because ``setup_nn`` takes no such
            argument.
        bs: number of items queried per batch when computing neighbours.

    The previous body referenced an undefined ``n_factors`` name and passed it
    positionally into ``setup_nn``'s ``layers`` slot alongside ``layers=``. It
    also called ``compute_similarity_map``, which does not exist in this
    notebook; the function defined here is ``compute_similarity``.
    """
    logger.info('Preparing training dataset')
    df = artist_training_set(n_common, users_min_max, negative_ratio)

    logger.info('Setting up model')
    author_model = setup_nn(df, 'artist_uri', layers=layers)

    logger.info('Training Neural Network model')
    author_model.learn.fit_one_cycle(epochs, 5e-3, wd=0.1)

    logger.info('Training KNN model')
    knn = get_knn(author_model, 'artist_uri')

    logger.info('Computing similarity map')
    # Lazy generator; it is drained by store_similarity_pairs below.
    similarity = compute_similarity(author_model, knn, bs)

    logger.info('Writing similarity pairs')
    store_similarity_pairs(similarity, filename)
    



In [None]:
# artist_flow(n_common=N_10k, 
#             users_min_max=(5, 40), 
#             negative_ratio=1,
#             layers=[50, 50, 25], 
#             epochs=4,
#             filename='similarity-smaller.txt')

In [None]:
# artist_flow(n_common=N_10k, 
#             users_min_max=(5, 40), 
#             negative_ratio=1, 
#             n_factors=50, 
#             layers=[50, 50, 25, 25], 
#             epochs=4,
#             filename='similarity-bigger.txt')

In [None]:
# artist_flow(n_common=N_10k, 
#             users_min_max=(5, 40), 
#             negative_ratio=0.5, 
#             n_factors=50,
#             layers=[100, 50, 50, 25], 
#             epochs=4,
#             filename='similarity-smaller-05.txt')

In [None]:
# artist_flow(n_common=N_10k, 
#             users_min_max=(5, 40), 
#             negative_ratio=0.5, 
#             n_factors=50, 
#             layers=[100, 100, 50, 50, 25, 25], 
#             epochs=4,
#             filename='similarity-bigger-05.txt')