In [1]:
# library
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

In [213]:
def get_user_profile_matrix(track_data,
                            triplets_data,
                            export_as_csv = True,
                            export_path = "user_profile_mat.csv") :
    """Build one feature-profile row per user from the songs they play most.

    For each user, the songs whose ``play_count`` is at or above that user's
    median play count are looked up in ``track_data`` and their numeric
    feature columns are averaged.

    Parameters
    ----------
    track_data : pandas.DataFrame
        Song features indexed by song id. Only numeric columns are used.
    triplets_data : pandas.DataFrame
        Listening records indexed by ``user_id`` with ``song_id`` and
        ``play_count`` columns.
    export_as_csv : bool, default True
        When true, the result is also written to ``export_path``.
    export_path : str, default "user_profile_mat.csv"
        Destination of the CSV export.

    Returns
    -------
    pandas.DataFrame
        One row per user (index named ``user_id``, in sorted order) and one
        column per numeric column of ``track_data``. A user whose selected
        songs are all missing from ``track_data`` gets a row of NaN.
    """
    feature_columns = track_data.select_dtypes(include=['number']).columns
    # Restrict to the numeric features up front: averaging a frame that still
    # holds text columns raises TypeError on pandas >= 2.0.
    numeric_tracks = track_data[feature_columns]

    user_ids = []
    user_profiles = []
    # groupby hands over each user's rows as a DataFrame even when the user has
    # a single record; `triplets_data.loc[user_id]` collapses that case to a
    # Series, which breaks the play-count filter. It also avoids rescanning the
    # whole index once per user.
    for user_id, user_triplets in triplets_data.groupby("user_id"):
        median_threshold = user_triplets["play_count"].median()
        at_or_above_median = user_triplets["play_count"] >= median_threshold
        liked_song_ids = user_triplets.loc[at_or_above_median, "song_id"]
        liked_songs = numeric_tracks[numeric_tracks.index.isin(liked_song_ids)]
        user_ids.append(user_id)
        user_profiles.append(liked_songs.mean(axis=0))

    user_profile_mat_df = pd.DataFrame(user_profiles,
                                       columns=feature_columns,
                                       index=pd.Index(user_ids, name="user_id"))

    if export_as_csv:
        user_profile_mat_df.to_csv(export_path, index=True)

    return user_profile_mat_df

In [208]:
# Single-user sample: triplets are keyed by user, track features by song.
one_user_triplets_data = pd.read_csv("one_user_triplets.csv").set_index("user_id")
one_user_track_data = pd.read_csv("one_user_track_data.csv").set_index("song_id")

In [214]:
# Build the profile matrix for the single-user sample and export it as CSV.
one_user_user_profile_mat = get_user_profile_matrix(
    track_data=one_user_track_data,
    triplets_data=one_user_triplets_data,
    export_as_csv=True,
    export_path="one_user_user_profile_mat.csv",
)

In [210]:
# Five-user sample: triplets are keyed by user, track features by song.
five_users_triplets_data = pd.read_csv("five_users_triplets.csv").set_index("user_id")
five_users_track_data = pd.read_csv("five_users_track_data.csv").set_index("song_id")

In [215]:
# Build the profile matrix for the five-user sample and export it as CSV.
five_users_user_profile_mat = get_user_profile_matrix(
    track_data=five_users_track_data,
    triplets_data=five_users_triplets_data,
    export_as_csv=True,
    export_path="five_users_user_profile_mat.csv",
)

In [230]:
def get_user_song_count_idxmat(user_profile_mat_df, 
                               track_data,
                               triplets_data):
    """Return a boolean user-by-song mask of the pairs present in the triplets.

    Rows follow the order of ``user_profile_mat_df.index`` and columns follow
    the order of ``track_data.index``. A cell is True when ``triplets_data``
    (indexed by user id, with a ``song_id`` column) contains that
    (user, song) pair. A user or song id missing from the respective index
    raises ``KeyError``.
    """
    user_index = user_profile_mat_df.index
    song_index = track_data.index

    # Position of every id along its axis of the mask.
    row_of_user = {uid: pos for pos, uid in enumerate(np.asarray(user_index))}
    col_of_song = {sid: pos for pos, sid in enumerate(np.asarray(song_index))}

    listened_mask = np.zeros((len(user_index), len(song_index)), dtype=bool)

    for uid, sid in zip(triplets_data.index, triplets_data["song_id"]):
        listened_mask[row_of_user[uid], col_of_song[sid]] = True

    return listened_mask

In [264]:
# Mask of the (user, song) pairs already present in the single-user triplets.
one_user_user_song_count_idxmat = get_user_song_count_idxmat(
    user_profile_mat_df=one_user_user_profile_mat,
    track_data=one_user_track_data,
    triplets_data=one_user_triplets_data,
)

In [235]:
# Mask of the (user, song) pairs already present in the five-user triplets.
five_users_user_song_count_idxmat = get_user_song_count_idxmat(
    user_profile_mat_df=five_users_user_profile_mat,
    track_data=five_users_track_data,
    triplets_data=five_users_triplets_data,
)

In [260]:
# Sanity check: tally the False/True entries in the first user's mask row.
np.unique(five_users_user_song_count_idxmat[0], return_counts = True)

(array([False,  True]), array([141,  51]))

In [262]:
# Cross-check: number of triplet rows for the first user of the profile matrix;
# it should equal the True count of that user's mask row.
len(five_users_triplets_data.loc[five_users_user_profile_mat.index[0],].song_id)

51

In [299]:
from copy import deepcopy

class contentbasedRec:
    """Content-based recommender scoring every track against every user profile.

    ``train_model`` computes, for each configured measure, an
    ``(n_users, n_tracks)`` matrix of pairwise distances between the rows of
    ``user_profile_df`` and the rows of ``track_df``, plus a "rank" copy in
    which the entries flagged by ``user_song_count_idxmat`` are overwritten
    with ``seen_fill_value``.

    Despite the ``*_similarity_mat`` attribute names, the stored values are
    distances: a smaller value means the track is closer to the profile.
    """

    # Public measure name -> (scipy ``cdist`` metric, attribute-name prefix).
    _MEASURES = {
        "cosine": ("cosine", "cosine"),
        "euclidean": ("euclidean", "euclid"),
        "pearson": ("correlation", "pearson"),
    }

    def __init__(self, 
                 user_profile_df : pd.DataFrame, 
                 track_df: pd.DataFrame,
                 user_song_count_idxmat,
                 similarity_measures = ("cosine", "euclidean", "pearson"),
                 seen_fill_value = -1):
        """Store the inputs; no matrix is computed until ``train_model`` runs.

        Parameters
        ----------
        user_profile_df : pandas.DataFrame
            One row of features per user.
        track_df : pandas.DataFrame
            One row of features per track, with the same feature columns in
            the same order as ``user_profile_df``.
        user_song_count_idxmat : array-like of bool, shape (n_users, n_tracks)
            True where the rank matrices must be overwritten.
        similarity_measures : iterable of str or str
            Any of "cosine", "euclidean", "pearson". Other names are ignored
            by ``train_model``.
        seen_fill_value : scalar, default -1
            Value written into the rank matrices for masked entries. The
            default is below every distance, so masked tracks sort first in
            ascending order; pass e.g. ``np.inf`` to push them to the end.
        """
        self.user_profile_df = user_profile_df
        self.track_df = track_df
        self.user_song_count_idxmat = user_song_count_idxmat
        # Copy into a fresh list so instances never share (or mutate) the
        # default or the caller's list.
        if isinstance(similarity_measures, str):
            similarity_measures = [similarity_measures]
        self.similarity_measures = list(similarity_measures)
        self.seen_fill_value = seen_fill_value

        self.cosine_similarity_mat = None
        self.euclid_similarity_mat = None
        self.pearson_similarity_mat = None

        self.cosine_rank_mat = None
        self.euclid_rank_mat = None
        self.pearson_rank_mat = None

    def train_model(self):
        """Compute the distance and rank matrices for each configured measure."""
        user_profile_mat = self.user_profile_df.to_numpy()
        track_mat = self.track_df.to_numpy()
        # Force a boolean mask: an integer 0/1 array would otherwise be
        # interpreted as fancy indices instead of a selection mask.
        seen_mask = np.asarray(self.user_song_count_idxmat, dtype=bool)

        for measure, (metric, prefix) in self._MEASURES.items():
            if measure not in self.similarity_measures:
                continue
            # NOTE(review): the `pairwise_distances` helper used here before is
            # not imported anywhere in the visible notebook (presumably
            # sklearn.metrics.pairwise_distances -- confirm). scipy's cdist
            # provides the same three metrics.
            distances = cdist(user_profile_mat, track_mat, metric=metric)
            rank = distances.copy()
            rank[seen_mask] = self.seen_fill_value
            setattr(self, prefix + "_similarity_mat", distances)
            setattr(self, prefix + "_rank_mat", rank)

    def _stored_matrix(self, similarity_measure, suffix):
        """Return the stored matrix for a measure, or raise for unknown names."""
        try:
            prefix = self._MEASURES[similarity_measure][1]
        except (KeyError, TypeError):
            raise ValueError(
                "contentbasedRec: unsupported similarity measure: "
                "{!r}".format(similarity_measure)
            ) from None
        return getattr(self, prefix + suffix)

    def get_rank_matrix(self, similarity_measure):
        """Return the masked matrix for ``similarity_measure``.

        Returns ``None`` when that measure has not been trained.
        Raises ``ValueError`` for a name other than "cosine", "euclidean"
        or "pearson".
        """
        return self._stored_matrix(similarity_measure, "_rank_mat")

    def get_similarity_matrix(self, similarity_measure):
        """Return the unmasked distance matrix for ``similarity_measure``.

        Returns ``None`` when that measure has not been trained.
        Raises ``ValueError`` for a name other than "cosine", "euclidean"
        or "pearson".
        """
        return self._stored_matrix(similarity_measure, "_similarity_mat")

In [304]:
# Recommender over the single-user sample, configured with all three measures.
one_user_contentbasedModel = contentbasedRec(
    one_user_user_profile_mat,
    one_user_track_data,
    one_user_user_song_count_idxmat,
    similarity_measures=["cosine", "euclidean", "pearson"],
)

In [305]:
# Fill the per-measure similarity and rank matrices on the model.
one_user_contentbasedModel.train_model()

In [306]:
# Cosine rank matrix: entries flagged in user_song_count_idxmat are overwritten with -1.
one_user_contentbasedModel.get_rank_matrix("cosine")

array([[-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1.]])

In [307]:
# The same cosine matrix without the -1 overwrite applied.
one_user_contentbasedModel.get_similarity_matrix("cosine")

array([[0.00456773, 0.00229487, 0.02040282, 0.00210494, 0.05268596,
        0.00143027, 0.00212368, 0.00329451, 0.00353369, 0.03472449,
        0.00301217, 0.00599521, 0.01317689, 0.00431755, 0.00771602,
        0.00274429, 0.00184319, 0.00502366, 0.0115661 , 0.00158068,
        0.00239489, 0.01237354, 0.00564642, 0.00614419, 0.00488489,
        0.00345697, 0.00183337, 0.00293211, 0.06424435, 0.00198611,
        0.01397982, 0.04665674, 0.00302289, 0.00558816, 0.00106238,
        0.00197098, 0.00323199, 0.02419769, 0.00503696, 0.00183442,
        0.00680343, 0.00228804, 0.02480027, 0.00216363, 0.00287868,
        0.018375  , 0.00396836, 0.00267028, 0.00261581, 0.00222803,
        0.00990175, 0.04067543, 0.00733465, 0.00200688, 0.01158184,
        0.00428961, 0.0149963 , 0.00145735, 0.00233697, 0.01176011,
        0.00552849, 0.00245553, 0.00337232, 0.00574594, 0.00161139,
        0.00181391, 0.01262923, 0.0028816 , 0.00763823, 0.00209533,
        0.0014891 , 0.00586602, 0.00380273, 0.00

In [309]:
# Recommender over the five-user sample, configured with all three measures.
five_users_contentbasedModel = contentbasedRec(
    five_users_user_profile_mat,
    five_users_track_data,
    five_users_user_song_count_idxmat,
    similarity_measures=["cosine", "euclidean", "pearson"],
)

In [310]:
# Fill the per-measure similarity and rank matrices on the model.
five_users_contentbasedModel.train_model()

In [311]:
# Cosine rank matrix: entries flagged in user_song_count_idxmat are overwritten with -1.
five_users_contentbasedModel.get_rank_matrix("cosine")

array([[-1.00000000e+00,  5.35162719e-03, -1.00000000e+00,
        -1.00000000e+00,  4.77368715e-03,  2.95858327e-03,
        -1.00000000e+00,  1.83925046e-02, -1.00000000e+00,
         8.94369751e-03, -1.00000000e+00,  1.94753058e-03,
         3.73815367e-03, -1.00000000e+00,  2.17155062e-02,
         5.57994205e-02,  1.71207848e-03, -1.00000000e+00,
         4.61934196e-03,  4.50133355e-03,  3.45638308e-02,
        -1.00000000e+00, -1.00000000e+00,  2.84942826e-03,
        -1.00000000e+00,  7.29129876e-04,  3.77304930e-02,
         6.90883400e-03,  1.20708137e-02,  3.66137905e-03,
         4.61067447e-03, -1.00000000e+00,  6.88196931e-03,
         9.89047408e-03,  2.29453309e-02,  4.05210888e-03,
         2.19040467e-03,  5.56315644e-03,  5.45256921e-03,
        -1.00000000e+00, -1.00000000e+00,  1.24571172e-02,
        -1.00000000e+00, -1.00000000e+00,  1.65810586e-03,
         1.33524701e-02,  3.44010010e-03,  3.34771910e-03,
         4.40203437e-02, -1.00000000e+00,  9.15221082e-0