# Preprocessing and Modeling

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import euclidean_distances, pairwise_distances
from sklearn.neighbors import NearestNeighbors

from scipy.spatial.distance import cosine, cdist
from scipy.stats import pearsonr
from scipy.linalg import svd

from collections import defaultdict
import flatdict

import spotipy as sp
import sys
from sys import argv, exit
from spotipy.oauth2 import SpotifyClientCredentials
%matplotlib inline

In [2]:
df_cont= pd.read_csv('/Users/josephlim/Desktop/Data Science/Capstone Projects/Music Recommendation System- Capstone 3/Data/df_content_cleaned.csv')
df_collab= pd.read_csv('/Users/josephlim/Desktop/Data Science/Capstone Projects/Music Recommendation System- Capstone 3/Data/df_collab_cleaned.csv')

In [3]:
df_cont.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist            2000 non-null   object 
 1   song              2000 non-null   object 
 2   duration_ms       2000 non-null   int64  
 3   explicit          2000 non-null   bool   
 4   year              2000 non-null   int64  
 5   popularity        2000 non-null   int64  
 6   danceability      2000 non-null   float64
 7   energy            2000 non-null   float64
 8   key               2000 non-null   int64  
 9   loudness          2000 non-null   float64
 10  mode              2000 non-null   int64  
 11  speechiness       2000 non-null   float64
 12  acousticness      2000 non-null   float64
 13  instrumentalness  2000 non-null   float64
 14  liveness          2000 non-null   float64
 15  valence           2000 non-null   float64
 16  tempo             2000 non-null   float64


### Collaborative Filtering Recommendation System

#### Singular Value Decomposition (SVD)

In [4]:
df_collab.columns

Index(['user_id', 'artist', 'song', 'playlistname'], dtype='object')

In [5]:
df_collab.shape

(12856838, 4)

In [6]:
df_collab.artist.nunique()

289603

In [7]:
df_collab.user_id.nunique()

15914

In [8]:
df_collab.playlistname.nunique()

157320

In [9]:
df_collab.song.nunique()

2004523

Since we're planning on using collaborative filtering model in conjunction to our content-based filtering model, it makes sense to only keep the songs that are in both content-based filtering dataset and collaborative filtering dataset.

In [10]:
songs_cont= df_cont.song.unique().tolist()
songs_collab= df_collab.song.unique().tolist()

common_songs= [x for x in songs_collab if x in songs_cont]
len(common_songs)

1515

In [11]:
df_common_song= df_collab.loc[df_collab['song'].isin(common_songs)]
df_common_song['song'].nunique()

1515

In [12]:
# we can make sure that we're comparing correct songs by matching artist names as well
content_artist= df_cont['artist'].unique().tolist()
collab_artist= df_collab['artist'].unique().tolist()

common_artist= [x for x in collab_artist if x in(content_artist)]
df_common_collab= df_common_song.loc[df_common_song.artist.isin(common_artist)]

df_common_collab['song'].nunique()

1393

In [13]:
df_comm_coll_fil= df_common_collab[['user_id', 'song']]

In [14]:
# user-playlist matrix
M_playlist= pd.pivot_table(data=df_comm_coll_fil,
                columns='song',
                index='user_id',
                aggfunc=np.count_nonzero).fillna(0).astype(bool).replace({False:0, True:1})

unique_users= df_common_collab['user_id'].unique().tolist()
M_playlist.index= range(len(unique_users))
M_playlist

song,#SELFIE - Original Mix,#thatPOWER,'Till I Collapse,1 Thing,17,1973,21 Guns,21 Questions,21 Seconds,212,...,You're Beautiful,"You're Gonna Go Far, Kid",Young Forever,"Young, Wild & Free (feat. Bruno Mars)",Your Body,Your Love Alone Is Not Enough (feat. Nina Persson),Your Love Is My Drug,Youth of the Nation,human,m.A.A.d city
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12832,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12833,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12834,0,0,0,0,0,0,1,0,0,1,...,1,1,0,0,0,0,0,0,0,0


In [15]:
M_playlist.shape

(12836, 1393)

In [16]:
user_id_pivot=list(M_playlist.index)
song_titles_pivot= list(M_playlist.columns)

In [17]:
# cosine similarity for users and songs they added into their playlists

cosine_sim= 1- pairwise_distances(M_playlist, metric='cosine')
df_cos= pd.DataFrame(cosine_sim)

df_cos.index= user_id_pivot

df_cos

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12826,12827,12828,12829,12830,12831,12832,12833,12834,12835
0,1.000000,0.048507,0.000000,0.083406,0.000000,0.000000,0.000000,0.061721,0.057470,0.000000,...,0.000000,0.129777,0.000000,0.000000,0.000000,0.063246,0.0,0.000000,0.142306,0.031623
1,0.048507,1.000000,0.084440,0.000000,0.073127,0.171499,0.000000,0.037424,0.023231,0.000000,...,0.000000,0.078689,0.062622,0.000000,0.000000,0.000000,0.0,0.000000,0.062753,0.076696
2,0.000000,0.084440,1.000000,0.000000,0.000000,0.000000,0.000000,0.080582,0.033347,0.000000,...,0.037113,0.084717,0.044947,0.000000,0.077850,0.000000,0.0,0.000000,0.180162,0.027524
3,0.083406,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.096523,0.099860,0.000000,...,0.000000,0.270604,0.053838,0.122975,0.000000,0.131876,0.0,0.000000,0.134877,0.032969
4,0.000000,0.073127,0.000000,0.000000,1.000000,0.000000,0.000000,0.093048,0.000000,0.077850,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.078013,0.047673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12831,0.063246,0.000000,0.000000,0.131876,0.000000,0.000000,0.000000,0.048795,0.060578,0.000000,...,0.000000,0.000000,0.000000,0.046625,0.000000,1.000000,0.0,0.000000,0.061365,0.000000
12832,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.000000
12833,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.095783,0.000000,...,0.000000,0.000000,0.000000,0.049147,0.000000,0.000000,0.0,1.000000,0.021562,0.000000
12834,0.142306,0.062753,0.180162,0.134877,0.078013,0.045739,0.000000,0.139735,0.154892,0.050104,...,0.082745,0.167892,0.066806,0.200282,0.115711,0.061365,0.0,0.021562,1.000000,0.092048


In [18]:
df_cos.shape

(12836, 12836)

In [19]:
pearson_sim= 1- pairwise_distances(M_playlist, metric= 'correlation')
df_pearsons= pd.DataFrame(pearson_sim)

df_pearsons.index= user_id_pivot

df_pearsons

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12826,12827,12828,12829,12830,12831,12832,12833,12834,12835
0,1.000000,0.034224,-0.021058,0.067352,-0.012061,-0.005126,-0.003623,0.039408,0.021016,-0.014104,...,-0.017125,0.110143,-0.014104,-0.024982,-0.008114,0.052556,-0.003623,-0.010901,0.096251,0.009135
1,0.034224,1.000000,0.068670,-0.014402,0.063953,0.168464,-0.002979,0.018637,-0.008039,-0.011597,...,-0.014080,0.061662,0.051753,-0.020540,-0.006671,-0.009452,-0.002979,-0.008963,0.018787,0.059190
2,-0.021058,0.068670,1.000000,-0.020183,-0.013897,-0.005907,-0.004175,0.055347,-0.010233,-0.016252,...,0.018129,0.060847,0.029484,-0.028786,0.069581,-0.013246,-0.004175,-0.012561,0.129439,0.001481
3,0.067352,-0.014402,-0.020183,1.000000,-0.011560,-0.004913,-0.003473,0.075986,0.067131,-0.013518,...,-0.016413,0.254967,0.041065,0.102159,-0.007777,0.122441,-0.003473,-0.010449,0.090460,0.011454
4,-0.012061,0.063953,-0.013897,-0.011560,1.000000,-0.003383,-0.002391,0.079129,-0.025994,0.069275,...,-0.011301,-0.014940,-0.009308,-0.016487,-0.005355,-0.007586,-0.002391,-0.007194,0.045450,0.033225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12831,0.052556,-0.009452,-0.013246,0.122441,-0.007586,-0.003224,-0.002279,0.034734,0.038550,-0.008872,...,-0.010772,-0.014240,-0.008872,0.031872,-0.005104,1.000000,-0.002279,-0.006857,0.028967,-0.014621
12832,-0.003623,-0.002979,-0.004175,-0.003473,-0.002391,-0.001016,-0.000718,-0.004726,-0.007809,-0.002796,...,-0.003395,-0.004489,-0.002796,-0.004953,-0.001609,-0.002279,1.000000,-0.002161,-0.012198,-0.004609
12833,-0.010901,-0.008963,-0.012561,-0.010449,-0.007194,-0.003058,-0.002161,-0.014218,0.076594,-0.008413,...,-0.010215,-0.013504,-0.008413,0.035240,-0.004840,-0.006857,-0.002161,1.000000,-0.012932,-0.013865
12834,0.096251,0.018787,0.129439,0.090460,0.045450,0.033032,-0.012198,0.075652,0.044658,0.007867,...,0.033988,0.110817,0.026316,0.139673,0.100045,0.028967,-0.012198,-0.012932,1.000000,0.024367


In [20]:
# function to find k-similar users given user id and user-playlist matrix (M_playlist)

def k_sim_users(user_id, matrix, metric= 'cosine', k=5):
    similarities=[]
    indices=[]
    model= NearestNeighbors(metric=metric)
    model.fit(matrix)
    
    distances, indices= model.kneighbors(matrix.iloc[user_id-1, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()
    print ('{0} most similar users for User {1}:\n'.format(k, user_id))
    
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == user_id:
            continue;

        else:
            print ('{0}: User {1}, {2}'.format(i, indices.flatten()[i]+1, similarities.flatten()[i]))
            
    return similarities,indices



In [21]:
similarities, indices= k_sim_users(16, M_playlist, metric='cosine')



5 most similar users for User 16:

1: User 3998, 0.3922322702763681
2: User 5622, 0.3922322702763681
3: User 11587, 0.3202563076101743
4: User 5071, 0.3202563076101743
5: User 2750, 0.3202563076101743


In [22]:
similarities, indices= k_sim_users(16, M_playlist, metric='correlation')



5 most similar users for User 16:

1: User 3998, 0.39067830955707283
2: User 5622, 0.39067830955707283
3: User 2750, 0.3175992093362974
4: User 5071, 0.3175992093362958
5: User 11587, 0.31759920933628827


In [23]:
def collab_music_rec(user, song, M_playlist, metric= 'cosine', k=5):
    prediction=0
    similarities, indices= k_sim_users(user, M_playlist, metric=metric, k=k)
    
    user_songs= M_playlist.iloc[user]
    user_songlist= user_songs[user_songs.eq(1)].index.to_list()
    
    sim_user_songs= M_playlist.iloc[indices.flatten()[1]+1]
    sim_user_songlist= sim_user_songs[sim_user_songs.eq(1)].index.to_list()
    
    user_rec= [x for x in sim_user_songlist if x not in user_songlist][:5]
    
    print ('\nRecommended songs for user {0}:'.format(user_id_pivot[user]), *user_rec, sep= "\n")

In [24]:
collab_music_rec(18, 305, M_playlist)



5 most similar users for User 18:

1: User 4661, 0.7385489458759966
2: User 5324, 0.5477225575051662
3: User 1052, 0.5107539184552492
4: User 832, 0.492365963917331
5: User 12291, 0.4714045207910318

Recommended songs for user 18:
Clarity
Demons
It Girl
One More Night
Paradise


### Content-Based Filtering Recommendation System

We will recommend songs based on songs' audio features and metadata. First, we will compute average vectors of features of songs the user has listened to. We will then recommend datapoints that are closest to those vectors.

In [25]:
# a fxn to use for songs not in our dataset
def find_song(song, artist):
    song_data= defaultdict()
    query= sp.search(q= 'track:{} artist: {}'.format(song, artist), limit=1)
    
    if query['tracks']['items']==[]:
        return None
    
    query['tracks']['items'][0]
    track_id= query['id']    
    audio_features= sp.audio_features(track_id)[0]
    
    song_data['song'] = [title]
    song_data['year'] = [year]
    song_data['explicit'] = [int(query['explicit'])]
    song_data['duration_ms'] = [query['duration_ms']]
    song_data['popularity'] = [query['popularity']]
    
    for k, v in audio_features.items():
        song_data[k]= v
    
    return pd.DataFrame(song_data)

In [26]:
def gather_data(song_provided, dataset):
#    song is also a dataframe
    try:
        song_data= dataset[(dataset['song']==song_provided['song']) & (dataset['artist']== song_provided['artist'])].iloc[0]
        
        return song_data
    
    except IndexError:
        
        find_song(song_provided['song'], song_provided['artist'])

In [27]:
num_cols= df_cont.select_dtypes(np.number).columns.tolist()

def mean_vector(track_list, dataset):
    vector_list=[]
    
    for song in track_list:
        song_data= gather_data(song, dataset)
        
        if song_data is None:
            print('{} not found.'.format(song['song']))
            continue
            
        vector= song_data[num_cols].values
        vector_list.append(vector)
    
    matrix= np.array(list(vector_list))
    return np.mean(matrix, axis=0)

In [28]:
def flatten_dict(track_dict_list):
    
    flattened_dict = defaultdict()
    for key in track_dict_list[0].keys():
        flattened_dict[key] = []
    
    for dictionary in track_dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
            
    return flattened_dict


In [29]:
# KMeans clustering
KMeans_pipe= Pipeline([('scaler', StandardScaler()), ('kmeans', KMeans())])
X= df_cont[num_cols]
KMeans_pipe.fit(X)

def content_based_recommendation(track_dict_list, dataset, n_songs=6):

    basic_info= ['song', 'artist']
    song_dict= flatten_dict(track_dict_list)
    
    cluster_center= mean_vector(track_dict_list, dataset)
    scaler= KMeans_pipe.steps[0][1]
    scaled_datapoint= scaler.transform(dataset[num_cols])
    scaled_centers= scaler.transform(cluster_center.reshape(1, -1))
    distances= cdist(scaled_centers, scaled_datapoint, 'cosine')   #cdist used because the data is more than 1-D
    index_list= list(np.argsort(distances)[:,:int(n_songs)][0])
    
    rec_data= dataset.iloc[index_list]
    recommendations= rec_data[~rec_data['song'].isin(song_dict['song'])]
    return recommendations[basic_info].to_dict(orient= 'record')

In [30]:
test_track_list= [{'song':'Oops!...I Did It Again', 'artist': 'Britney Spears'}, 
                  {'song':'All The Small Things', 'artist': 'blink-182'},
                  {'song':'Breathe', 'artist':'Faith Hill' }]

content_based_recommendation(test_track_list, df_cont)

  return recommendations[basic_info].to_dict(orient= 'record')


[{'song': 'She Hates Me', 'artist': 'Puddle Of Mudd'},
 {'song': 'Life Is A Rollercoaster', 'artist': 'Ronan Keating'},
 {'song': 'Sk8er Boi', 'artist': 'Avril Lavigne'},
 {'song': 'Grace Kelly', 'artist': 'MIKA'},
 {'song': "Lovin' Each Day", 'artist': 'Ronan Keating'}]

### Hybrid Filtering Recommendation System

We will first get content-based recommendation system to initial recommendations. We will then use collaborative filtering to only retain recommendations with recommended artists.

In [31]:
df_comm_cont_artist= df_cont.loc[df_cont['artist'].isin(common_artist)]
df_comm_cont= df_comm_cont_artist.loc[df_comm_cont_artist['song'].isin(common_songs)]

df_comm_cont

Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
0,Britney Spears,Oops!...I Did It Again,211160,False,2000,77,0.751,0.834,1,-5.444,0,0.0437,0.3000,0.000018,0.3550,0.894,95.053,pop
1,blink-182,All The Small Things,167066,False,1999,79,0.434,0.897,0,-4.918,1,0.0488,0.0103,0.000000,0.6120,0.684,148.726,"rock, pop"
2,Faith Hill,Breathe,250546,False,1999,66,0.529,0.496,7,-9.007,1,0.0290,0.1730,0.000000,0.2510,0.278,136.859,"pop, country"
3,Bon Jovi,It's My Life,224493,False,2000,78,0.551,0.913,0,-4.063,0,0.0466,0.0263,0.000013,0.3470,0.544,119.992,"rock, metal"
4,*NSYNC,Bye Bye Bye,200560,False,2000,65,0.614,0.928,8,-4.806,0,0.0516,0.0408,0.001040,0.0845,0.879,172.656,pop
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1990,Sam Smith,How Do You Sleep?,202204,False,2019,73,0.477,0.682,1,-4.931,0,0.0925,0.1530,0.000000,0.0763,0.345,110.567,pop
1991,NSG,Options,240081,True,2020,57,0.836,0.621,1,-4.684,0,0.0894,0.3890,0.000092,0.1040,0.762,101.993,"World/Traditional, hip hop"
1995,Jonas Brothers,Sucker,181026,False,2019,79,0.842,0.734,1,-5.065,0,0.0588,0.0427,0.000000,0.1060,0.952,137.958,pop
1996,Taylor Swift,Cruel Summer,178426,False,2019,78,0.552,0.702,9,-5.707,1,0.1570,0.1170,0.000021,0.1050,0.564,169.994,pop


In [32]:
def hybrid_music_rec(recently_listened_track, content_based_dataset, collab_based_dataset, user, n_songs):
    cont_rec_songs_artists=content_based_recommendation(recently_listened_track, content_based_dataset,
                                                        n_songs= n_songs*100)
    
    content_recommended_songs=[x['song'] for x in cont_rec_songs_artists]
    
    df_collab_cont_filtered= collab_based_dataset.loc[collab_based_dataset.song.isin(content_recommended_songs)]
    df_coll_cont_art_song= df_collab_cont_filtered[['user_id', 'song']]
    
    M_playlist= pd.pivot_table(data= df_coll_cont_art_song,
                columns='song',
                index='user_id',
                aggfunc=np.count_nonzero).fillna(0).astype(bool).replace({False:0, True:1})
    
    unique_users= df_coll_cont_art_song['user_id'].unique().tolist()
    M_playlist.index= range(len(unique_users))
    M_playlist_col= M_playlist.columns.tolist()
    song= M_playlist_col.index(content_recommended_songs[0])
    
    return collab_music_rec(user, song, M_playlist)

In [33]:
recently_listened_track= [{'song':'Oops!...I Did It Again', 'artist': 'Britney Spears'}]

hybrid_music_rec(recently_listened_track, df_comm_cont, df_common_collab, user=109, n_songs=10)

  return recommendations[basic_info].to_dict(orient= 'record')


5 most similar users for User 109:

1: User 2232, 0.816496580927726
2: User 6661, 0.7071067811865475
3: User 3268, 0.7071067811865475
4: User 8089, 0.7071067811865475
5: User 5894, 0.7071067811865475

Recommended songs for user 109:
21 Guns
Bootylicious
Cooler Than Me - Single Mix
Demons
Don't Stop Movin'


