In [24]:
import pandas as pd

# Load the raw play log from CSV into a DataFrame.
song_record_data = pd.read_csv("song_record.csv")
# Bare last expression so the notebook renders the frame as this cell's output.
song_record_data

Unnamed: 0,user_id,song_id,plays
0,641b13763a63f32bb9d71460,Z66D0F60,165
1,641b13763a63f32bb9d71460,Z6BIWFEZ,95
2,641b13763a63f32bb9d71460,Z6B0IC89,23
3,641b13763a63f32bb9d71460,Z6AFUB8O,410
4,641b13763a63f32bb9d71460,ZZF0WU6O,428
...,...,...,...
82,test2,ZO6EIB7B,142
83,test2,Z6UBADAF,413
84,test2,ZW807CUD,332
85,test1,ZW6CZ0OW,385


In [37]:
from scipy.sparse import csr_matrix
# Reshape the long play log into a wide table: one row per song_id, one column
# per user_id, cell = plays. Pairs that never occur come out as NaN and are
# replaced by 0. Note: DataFrame.pivot raises if a (song_id, user_id) pair
# appears more than once in the input.
wide_song_data = (
    song_record_data
    .pivot(index="song_id", columns="user_id", values="plays")
    .fillna(0)
)

# CSR copy of the same table (rows: songs, columns: users) for the KNN models.
wide_song_data_sparse = csr_matrix(wide_song_data.values)

In [40]:
# Densify the sparse matrix for a visual check (rows: songs, columns: users).
# This materialises the full array, so it is only suitable for small matrices.
wide_song_data_sparse.toarray()

array([[165.,   0.,   0.,   0.],
       [279.,   0.,   0.,   0.],
       [179.,   0.,   0., 402.],
       [323.,   0.,   0.,   0.],
       [423.,   0.,   0.,   0.],
       [380.,   0.,   0.,   0.],
       [ 19.,   0.,   0.,   0.],
       [457.,   0.,   0.,   0.],
       [ 87.,   0.,   0.,   0.],
       [484.,   0.,   0.,   0.],
       [410.,   0.,   0.,   0.],
       [ 23.,   0.,   0.,   0.],
       [221.,   0.,   0.,   0.],
       [382.,   0.,   0.,   0.],
       [281.,   0.,   0.,   0.],
       [ 97.,   0.,   0.,   0.],
       [ 95.,   0.,   0.,   0.],
       [236.,   0.,   0.,   0.],
       [167.,   0.,   0.,   0.],
       [218.,   0.,   0.,   0.],
       [430.,   0.,   0.,   0.],
       [ 41.,   0.,   0.,   0.],
       [ 57.,   0.,   0.,   0.],
       [ 61.,   0.,   0.,   0.],
       [194.,   0.,   0.,   0.],
       [436.,   0., 413.,   0.],
       [ 40.,   0.,   0.,   0.],
       [ 13.,   0.,   0.,   0.],
       [307.,   0.,   0.,   0.],
       [ 67.,   0.,   0.,   0.],
       [ 9

In [26]:
import numpy as np


def save_sparse_csr(filename, array):
    """Write the CSR components of ``array`` to a NumPy ``.npz`` archive.

    Args:
        filename: Destination passed straight to ``np.savez`` (which appends
            ``.npz`` when the name does not already end with it).
        array: Object exposing ``data``, ``indices``, ``indptr`` and ``shape``
            attributes, e.g. a ``scipy.sparse.csr_matrix``.

    Returns:
        None. The archive holds four arrays named ``data``, ``indices``,
        ``indptr`` and ``shape``.
    """
    csr_parts = {
        "data": array.data,
        "indices": array.indices,
        "indptr": array.indptr,
        "shape": array.shape,
    }
    np.savez(filename, **csr_parts)


def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an ``.npz`` archive of its components.

    The archive must contain the arrays ``data``, ``indices``, ``indptr`` and
    ``shape`` (the layout written by ``np.savez`` with those keyword names).

    Args:
        filename: Path (or file object) accepted by ``np.load``. Note that
            ``np.savez`` appends ``.npz`` to names lacking it, so pass the
            full on-disk name here.

    Returns:
        scipy.sparse.csr_matrix built from the stored components.

    Raises:
        KeyError: If one of the four expected arrays is missing.
        OSError: If the file cannot be opened (propagated from ``np.load``).
    """
    # np.load returns an NpzFile that keeps the archive open; read every member
    # inside the context manager so the file handle is closed deterministically.
    with np.load(filename) as loader:
        data = loader["data"]
        indices = loader["indices"]
        indptr = loader["indptr"]
        # Stored as a small integer array; hand scipy a plain tuple of ints.
        shape = tuple(int(dim) for dim in loader["shape"])
    return csr_matrix((data, indices, indptr), shape=shape)
# Persist the song x user CSR matrix to disk as an .npz archive.
save_sparse_csr("song_data_sparse_matrix.npz",wide_song_data_sparse)

In [27]:
# Also export the dense song x user table; with to_csv's defaults the index
# (song_id) is written as the first CSV column.
wide_song_data.to_csv("song_play_record.csv")

In [46]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour index over the rows of the sparse
# song x user matrix, ranked by cosine distance.
model_knn_cosine = NearestNeighbors(
    n_neighbors=15,
    metric="cosine",
    algorithm="brute",
)
model_knn_cosine.fit(wide_song_data_sparse)

In [49]:
# Brute-force nearest-neighbour index over the same sparse matrix, ranked by
# Euclidean distance.
model_knn_euclidean = NearestNeighbors(n_neighbors=15, metric="euclidean", algorithm="brute")
model_knn_euclidean.fit(wide_song_data_sparse)

In [43]:
import pickle
from fuzzywuzzy import fuzz

In [75]:
def print_song_recommendations(song_id, song_play_matrix, knn_model, k):
    """
    Print the ``k`` songs closest to ``song_id`` according to a fitted KNN model.

    Inputs:
        song_id: id of song; it must equal one of the labels in
            ``song_play_matrix.index`` exactly.
        song_play_matrix: song_play_matrix (not the sparse one, the pandas
            dataframe). Its row order must match the data ``knn_model`` was
            fitted on, because neighbour positions returned by the model are
            translated back to song ids through this index.
        knn_model: our previously fitted knn_model (any object exposing
            ``kneighbors(X, n_neighbors=...)``; its ``effective_metric_`` /
            ``metric`` attribute selects the output format).
        k: the number of nearest neighbors to print.
    Prints: Song recommendations for the provided song id. For a cosine model
        the score is reported as a similarity (``1 - distance``); for every
        other metric the raw distance is reported, since ``1 - distance`` is
        not a similarity there.
    Return: None
    """
    # Exact label lookup: only perfect matches were ever accepted, so a plain
    # equality scan finds the first matching row position in a single pass.
    query_index = next(
        (
            position
            for position, label in enumerate(song_play_matrix.index)
            if label == song_id
        ),
        None,
    )
    if query_index is None:
        print("Your song didn't match any song id in the data. Try again")
        return None

    # Ask for one extra neighbour because the query song is normally returned
    # as its own nearest neighbour, but never request more rows than exist.
    n_available = getattr(knn_model, "n_samples_fit_", len(song_play_matrix.index))
    query_vector = song_play_matrix.iloc[query_index, :].values.reshape(1, -1)
    distances, indices = knn_model.kneighbors(
        query_vector, n_neighbors=min(k + 1, n_available)
    )

    # Remove the query by position rather than assuming it is the first hit:
    # songs with identical play vectors tie at distance 0 and may outrank it.
    neighbours = [
        (position, distance)
        for distance, position in zip(distances.flatten(), indices.flatten())
        if position != query_index
    ][:k]

    metric = getattr(knn_model, "effective_metric_", None) or getattr(
        knn_model, "metric", None
    )
    is_cosine = metric == "cosine"
    if is_cosine:
        header = "Cosine similarity compare with {0}\n"
        row = "{0}: {1}, with similarity of {2}"
    else:
        header = "Recommendation for {0}\n"
        row = "{0}: {1}, with distance of {2}"

    print(header.format(song_play_matrix.index[query_index]))
    for rank, (position, distance) in enumerate(neighbours, start=1):
        score = 1 - distance if is_cosine else distance
        print(row.format(rank, song_play_matrix.index[position], score))
    return None

In [73]:
def build_model(metric):
    """Fit a 15-neighbour ``NearestNeighbors`` index for the given metric.

    Args:
        metric: Distance metric forwarded to ``NearestNeighbors`` (the
            notebook calls this with "cosine", "manhattan" and "euclidean").

    Returns:
        The estimator after fitting on the notebook-level
        ``wide_song_data_sparse`` matrix, which must already be defined.
    """
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    knn = NearestNeighbors(n_neighbors=15, metric=metric, algorithm="auto")
    return knn.fit(wide_song_data_sparse)

In [76]:
# Top-5 neighbours of song ZWA7CE0B using a freshly fitted cosine model.
print_song_recommendations("ZWA7CE0B", wide_song_data, build_model("cosine"), k=5)


Cosine similarity compare with ZWA7CE0B

1: ZW6CZ0OW, with similarity of 0.9081041999676464
2: ZWZD9DCB, with similarity of 0.8238213698182139
3: ZW7IUA6E, with similarity of 0.6196819964162361
4: ZW8W777E, with similarity of 0.35087374133219607
5: ZW9CFDU9, with similarity of 0.35087374133219607


In [69]:
# Same query against a freshly fitted Manhattan-distance model.
# NOTE(review): this cell's execution count (69) predates the In [75] cell that
# defines print_song_recommendations; re-run after Restart & Run All to refresh
# the saved output.
print_song_recommendations("ZWA7CE0B", wide_song_data, build_model("manhattan"), k=5)


Recommendation for ZWA7CE0B

1: ZW6CZ0OW, with distance of 261.0
2: ZWZD9DCB, with distance of 403.0
3: ZZ9IZ898, with distance of 418.0
4: ZZU9060B, with distance of 421.0
5: Z66D0F60, with distance of 422.0


In [71]:
# Same query against a freshly fitted Euclidean-distance model.
# NOTE(review): this cell's execution count (71) predates the In [75] cell that
# defines print_song_recommendations; re-run after Restart & Run All to refresh
# the saved output.
print_song_recommendations("ZWA7CE0B", wide_song_data, build_model("euclidean"), k=5)


Recommendation for ZWA7CE0B

1: ZW6CZ0OW, with distance of 236.4339231159522
2: ZWZD9DCB, with distance of 337.9778099224859
3: ZW7IUA6E, with distance of 345.05072090925995
4: ZZ9IZ898, with distance of 411.0596063833079
5: ZZU9060B, with distance of 411.1216365018995
