<div style="font-size: 48px; font-weight: bold">Music Recommendation System</div>

# Download data

In [1]:
!mkdir -p data
!wget -qc http://millionsongdataset.com/sites/default/files/AdditionalFiles/unique_tracks.txt -O data/unique_tracks.txt
!wget -qc http://millionsongdataset.com/sites/default/files/AdditionalFiles/tracks_per_year.txt -O data/tracks_per_year.txt
!wget -qc http://millionsongdataset.com/sites/default/files/AdditionalFiles/artist_location.txt -O data/artist_location.txt

!wget -qc http://millionsongdataset.com/sites/default/files/AdditionalFiles/track_metadata.db -O data/track_metadata.db
!wget -qc http://www.ee.columbia.edu/~thierry/artist_term.db -O data/artist_term.db
!wget -qc http://www.ee.columbia.edu/~thierry/artist_similarity.db -O data/artist_similarity.db

In [2]:
!wget -qc http://millionsongdataset.com/sites/default/files/challenge/train_triplets.txt.zip -O data/train_triplets.txt.zip

In [3]:
!unzip -o data/train_triplets.txt.zip -d data

Archive:  data/train_triplets.txt.zip
  inflating: data/train_triplets.txt  


In [4]:
!rm data/train_triplets.txt.zip

In [5]:
!ls -lah data

total 1.5G
drwxr-xr-x 3 root root   12 Apr 15 13:36 .
drwxrwxrwx 5 root root   10 Apr 15 13:36 ..
-rw-r--r-- 1 root root 1.1M Jan 25  2011 artist_location.txt
-rw-r--r-- 1 root root 322M Jan  1  2011 artist_similarity.db
-rw-r--r-- 1 root root 133M Jan  1  2011 artist_term.db
drwxr-xr-x 4 1721 1721    4 Dec 19  2010 MillionSongSubset
-rw-r--r-- 1 root root  11M Apr 14 13:41 SongCSV.csv
-rw-r--r-- 1 root root 712M Mar 27  2011 track_metadata.db
-rw-r--r-- 1 root root  34M Jan 25  2011 tracks_per_year.txt
-rw-r--r-- 1 root root 2.8G Dec 19  2011 train_triplets.txt
-rw-rw-r-- 1 root root 121M Apr 14 15:55 triplets_file_unknown.csv
-rw-r--r-- 1 root root  81M Jan 25  2011 unique_tracks.txt


# Model evaluation methods

In [6]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if len(actual)==0:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

# Load data

In [56]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import sqlite3
from sklearn.model_selection import train_test_split

In [8]:
from pathlib import Path
BASE_PATH = Path('data')

In [9]:
def get_tables(conn):
    cursorObj = conn.cursor()
    cursorObj.execute('SELECT name from sqlite_master where type= "table"')
    return cursorObj.fetchall()

In [10]:
triplets_df = pd.read_csv(BASE_PATH / 'triplets_file_unknown.csv')
songs_csv_df = pd.read_csv(BASE_PATH / 'SongCSV.csv')
unique_tracks_df = pd.read_csv(BASE_PATH / 'unique_tracks.txt', sep='<SEP>', names=['track_id', 'song_id', 'artist_name', 'song_title'])
tracks_per_year_df = pd.read_csv(BASE_PATH / 'tracks_per_year.txt', sep='<SEP>', names=['year', 'track_id', 'artist_name', 'song_title'])
artist_location_df = pd.read_csv(BASE_PATH / 'artist_location.txt', sep='<SEP>', names=['artist_id', 'artist_latitude', 'artist_longitude', 'artist_name', 'artist_location'])

In [11]:
track_metadata_db = sqlite3.connect(BASE_PATH / 'track_metadata.db')
artist_term_db = sqlite3.connect(BASE_PATH / 'artist_term.db')
artist_similarity_db = sqlite3.connect(BASE_PATH / 'artist_similarity.db')

In [12]:
songs_data_df = pd.read_sql_query("SELECT * FROM songs", track_metadata_db)
artist_term_df = pd.read_sql_query("SELECT * FROM artist_term", artist_term_db).groupby('artist_id').agg(tuple).applymap(list).reset_index()
artist_mbtag_df = pd.read_sql_query("SELECT * FROM artist_mbtag", artist_term_db).groupby('artist_id').agg(tuple).applymap(list).reset_index()
artist_similarity_df = pd.read_sql_query("SELECT * FROM similarity", artist_similarity_db).groupby('target').agg(tuple).applymap(list).reset_index()
songs_data_df.drop_duplicates(['song_id'], inplace=True)

In [13]:
songs_csv_df

Unnamed: 0,SongNumber,SongID,AlbumID,AlbumName,ArtistID,MusicbrainzID,PlayMeID,7DigitalID,ArtistFamiliarity,ArtistLatitude,...,KeySignature,KeySignatureConfidence,Mode,ModeConfidence,Tempo,TimeSignature,TimeSignatureConfidence,ArtistTerms,ArtistTermsFrequency,StartOfFadeOut
0,1,SOMJBYD12A6D4F8557,25824,Da Ghetto Psychic,ARD0S291187B9B7BF5,56503d6d-094e-4c28-ae3d-04cc748ade5b,-1,17970,0.556496,,...,1,0.000,1,0.000,111.787,1,0.000,['breakbeat' 'dirty south rap' 'hip hop' 'elec...,[1.0 0.8386187613378114 0.9353130749575775 0.9...,114.782
1,2,SOQHXMF12AB0182363,539081,I Need You,ARGSJW91187B9B1D6B,eb722df4-372e-4742-8586-f7ff658012d6,-1,60488,0.360031,35.21962,...,5,0.354,0,0.485,146.765,1,0.000,['alternative rock' 'indie rock' 'rock' 'alter...,[0.9560807508575261 0.9213064099169126 1.0 0.6...,207.012
2,3,SONHOTT12A8C13493C,287650,Friend Or Foe,AR7G5I41187FB4CE6C,e188a520-9cb7-4f73-a3d7-2f70c6538e92,12697,19072,0.630382,,...,0,0.751,1,0.749,119.293,4,0.000,['pop rock' 'new wave' 'dance rock' 'rock' 'ne...,[0.9885838625154639 0.9672504640243684 0.82060...,217.124
3,4,SOUDSGM12AC9618304,692313,Superinstrumental,ARNTLGG11E2835DDB9,4d96f7d0-2f0e-4e92-ba70-a405f96f8cec,-1,242273,0.550514,,...,7,0.053,0,0.473,114.041,4,0.878,['breakcore' 'miami bass' 'new rave' 'grime' '...,[0.9156017479341086 0.862352379556314 0.862352...,261.747
4,5,SOHKNRJ12A6701D1F8,8876,Gin & Phonic,AR10USD1187B99F3F1,d89de379-665d-425c-b2e9-41b95d1edb36,-1,21128,0.801136,,...,4,0.000,0,0.160,101.430,3,0.408,['post-hardcore' 'screamo' 'emo' 'hardcore' 'p...,[0.999818045125181 0.9999454585723618 1.0 0.81...,181.023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,SOJZLAJ12AB017E8A2,346402,Reality,ARS1DCR1187B9A4A56,0159799c-ef7f-4c37-a011-25b6572c0f62,-1,7562,0.433508,,...,1,0.931,1,0.565,118.123,4,0.205,['lovers rock' 'reggae' 'roots reggae' 'dub' '...,[0.9601150044322461 1.0 0.7945583536354357 0.8...,186.015
9996,9997,SOFAOMI12A6D4FA2D8,64501,Once Upon A Shattered Life,ARYXOV81187B99831D,56e7a15e-8b19-4f09-b3e7-c5fb830e3120,7399,39252,0.609182,,...,2,0.315,0,0.406,150.575,4,0.317,['christian rock' 'christian alternative rock'...,[0.926219623593863 0.8377585867605521 0.837758...,193.167
9997,9998,SOUWMIW12AB0184748,760072,The Unforgettable Grandpa Jones,ARQ91R31187FB38A88,8488e884-f8b9-4c31-a6ec-a92be00d450f,23838,39299,0.450646,37.82245,...,2,0.101,1,0.394,119.271,4,0.150,['bluegrass' 'classic country' 'country gospel...,[1.0 0.962214591561547 0.9322226256056128 0.81...,136.615
9998,9999,SOJARSR12AB0184939,637025,Gift Grub 10,AROIHOI122988FEB8E,,-1,15464,0.334543,,...,7,0.546,1,0.630,90.050,4,0.433,['irish' 'comedy' 'funny' 'parody' 'stand-up c...,[0.9412616260534631 1.0 0.9250172052920823 0.8...,182.671


In [14]:
songs_data_df

Unnamed: 0,track_id,title,song_id,release,artist_id,artist_mbid,artist_name,duration,artist_familiarity,artist_hotttnesss,year,track_7digitalid,shs_perf,shs_work
0,TRMMMYQ128F932D901,Silent Night,SOQMMHC12AB0180CB8,Monster Ballads X-Mas,ARYZTJS1187B98C555,357ff05d-848a-44cf-b608-cb34b5701ae5,Faster Pussy cat,252.05506,0.649822,0.394032,2003,7032331,-1,0
1,TRMMMKD128F425225D,Tanssi vaan,SOVFVAK12A8C1350D9,Karkuteillä,ARMVN3U1187FB3A1EB,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9,Karkkiautomaatti,156.55138,0.439604,0.356992,1995,1514808,-1,0
2,TRMMMRX128F93187D9,No One Could Ever,SOGTUKN12AB017F4F1,Butter,ARGEKB01187FB50750,3d403d44-36ce-465c-ad43-ae877e65adc4,Hudson Mohawke,138.97098,0.643681,0.437504,2006,6945353,-1,0
3,TRMMMCH128F425532C,Si Vos Querés,SOBNYVR12A8C13558C,De Culo,ARNWYLR1187B9B2F9C,12be7648-7094-495f-90e6-df4189d68615,Yerba Brava,145.05751,0.448501,0.372349,2003,2168257,-1,0
4,TRMMMWA128F426B589,Tangle Of Aspens,SOHSBXH12A8C13B0DF,Rene Ablaze Presents Winter Sessions,AREQDTE1269FB37231,,Der Mystic,514.29832,0.000000,0.000000,0,2264873,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRYYYUS12903CD2DF0,O Samba Da Vida,SOTXAME12AB018F136,Pacha V.I.P.,AR7Z4J81187FB3FC59,9d50cb20-7e42-45cc-b0dd-154c3e92a577,Kiko Navarro,217.44281,0.528617,0.411595,0,7522478,-1,0
999996,TRYYYJO128F426DA37,Jago Chhadeo,SOXQYIQ12A8C137FBB,Naale Baba Lassi Pee Gya,ART5FZD1187B9A7FCF,2357c400-9109-42b6-b3fe-9e2d9f8e3872,Kuldeep Manak,244.16608,0.401500,0.374866,0,1632096,-1,0
999997,TRYYYMG128F4260ECA,Novemba,SOHODZI12A8C137BB3,Dub_Connected: electronic music,ARZ3R6M1187B9AF750,8b97e9c8-61f5-4615-9a96-276f24204e34,Gabriel Le Mar,553.03791,0.556918,0.336914,0,2219291,-1,0
999998,TRYYYDJ128F9310A21,Faraday,SOLXGOR12A81C21EB7,The Trance Collection Vol. 2,ARCMCOK1187B9B1073,4ac5f3de-c5ad-475e-ad50-41f1ef9dba20,Elude,484.51873,0.403668,0.256935,0,5472456,-1,0


In [15]:
artist_term_df

Unnamed: 0,artist_id,term
0,AR002UA1187B9A637D,"[garage rock, country rock, free jazz, oi, spa..."
1,AR003FB1187B994355,"[rock, punk, alternative rock, hip hop, texas,..."
2,AR006821187FB5192B,"[orchestra, opera, religious music, requiem, c..."
3,AR009211187B989185,"[lovers rock, reggae, roots reggae, uk garage,..."
4,AR009SZ1187B9A73F4,"[chill-out, future jazz, neofolk, downtempo, f..."
...,...,...
43938,ARZZXJY1187B99E2BB,"[world, acoustic, folk, country, flute, tribal..."
43939,ARZZXT51187FB4627E,"[blues-rock, psychedelic rock, soul jazz, jazz..."
43940,ARZZYRB1187B99D0B6,"[acid jazz, jazz, ambient, psychedelic rock, v..."
43941,ARZZYRH11C8A416A12,"[dark wave, shoegaze, pop rock, emo, indie roc..."


In [46]:
print(len(triplets_df))
# Remove users who have listened to less than 30 songs
triplets_df = triplets_df[triplets_df.groupby('user_id').user_id.transform('count')>29]
print(len(triplets_df))

2000000
1277683


In [59]:
songs_df_raw = pd.merge(triplets_df, songs_data_df, on='song_id', how='left')
songs_df_raw = pd.merge(songs_df_raw, artist_term_df, on='artist_id', how='left')
songs_df_raw = pd.merge(songs_df_raw, artist_mbtag_df, on='artist_id', how='left')
songs_df_raw = pd.merge(songs_df_raw, artist_location_df.drop(['artist_name'], axis=1), on='artist_id', how='left')
songs_df_raw['song_display_name'] = songs_df_raw['title'] + ' - ' + songs_df_raw['artist_name']
songs_df_raw = songs_df_raw.head(10000)
display(songs_df_raw)

Unnamed: 0,user_id,song_id,listen_count,track_id,title,release,artist_id,artist_mbid,artist_name,duration,...,year,track_7digitalid,shs_perf,shs_work,term,mbtag,artist_latitude,artist_longitude,artist_location,song_display_name
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,TRIQAUQ128F42435AD,The Cove,Thicker Than Water,ARC8CQZ1187B98DECA,ff6e677f-91dd-4986-a174-8db0474b1799,Jack Johnson,112.63955,...,0,2093263,-1,0,"[rock, singer-songwriter, folk, soundtrack, mo...","[folk, soft rock, acoustic, rock, american, su...",40.70380,-73.83168,"Queens, NY",The Cove - Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,TRMHBXZ128F4238406,Entre Dos Aguas,Flamenco Para Niños,ARC1SF21187FB51D0F,7bc8dd01-35ea-4b18-a64d-3a364dc76a52,Paco De Lucia,358.24281,...,1976,2631392,-1,0,"[flamenco, tango, world fusion, british pop, c...","[spanish, flamenco]",,,,Entre Dos Aguas - Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,TRHNCIR128F42334A5,Stronger,Graduation,ARRH63Y1187FB47783,164f0d73-1234-4e2c-8743-d77bf2191051,Kanye West,311.84934,...,2007,1351505,-1,0,"[hip hop, rap, soul, east coast rap, alternati...","[hip-hop, american, hip hop rnb and dance hall...",51.34481,12.38551,"Atlanta, GA",Stronger - Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,TRYBNIB128F428E704,Constellations,In Between Dreams,ARC8CQZ1187B98DECA,ff6e677f-91dd-4986-a174-8db0474b1799,Jack Johnson,201.63873,...,2005,2424087,-1,0,"[rock, singer-songwriter, folk, soundtrack, mo...","[folk, soft rock, acoustic, rock, american, su...",40.70380,-73.83168,"Queens, NY",Constellations - Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,TRKRHYM128F42934A9,Learn To Fly,There Is Nothing Left To Lose,AR6XPWV1187B9ADAEB,67f66c07-6e61-4026-ade5-7e782fad3a5d,Foo Fighters,235.28444,...,1999,3360473,-1,0,"[grunge, alternative rock, hard rock, rock, se...","[rock, post-grunge, american, alternative rock...",,,,Learn To Fly - Foo Fighters
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,8244d38b6505d44c6f44a4b29ad7fa260578e419,SODASIJ12A6D4F5D89,1,TRVPJPK128F1459159,The Invisible Man,The Invisible Man,ARSBWPY1187FB3C199,ac82b7db-5910-41c2-9d10-f02df4f4e1e1,Michael Cretu,301.06077,...,1985,242060,-1,0,"[disco, italian disco, downtempo, europop, pop...",,,,,The Invisible Man - Michael Cretu
9996,8244d38b6505d44c6f44a4b29ad7fa260578e419,SOEFSXD12AF72ACED4,1,TRVVRNY128F931D017,Crazy In Love,Dangerously In Love,AR65K7A1187FB4DAA4,183105b5-3e68-4748-9086-2c1c11bf7a3d,Beyoncé feat. Jay-Z,235.93751,...,2003,6008835,-1,0,"[hip hop, pop, rock, soul, adult contemporary,...","[hip hop rnb and dance hall, rnb, pop]",,,,Crazy In Love - Beyoncé feat. Jay-Z
9997,8244d38b6505d44c6f44a4b29ad7fa260578e419,SOEGIYH12A6D4FC0E3,1,TRLGMFJ128F4217DBE,Horn Concerto No. 4 in E flat K495: II. Romanc...,Mozart - Eine kleine Nachtmusik,ARDIGKU124549A43DE,,Barry Tuckwell/Academy of St Martin-in-the-Fie...,277.15873,...,0,2486330,-1,0,,,,,,Horn Concerto No. 4 in E flat K495: II. Romanc...
9998,8244d38b6505d44c6f44a4b29ad7fa260578e419,SOEZMOB12A8C139067,1,TRGGZPN128F42891DB,We Ride,Massive R & B - Spring 2007,ARKU3Z61187FB51DCA,73e5e69d-3554-40d8-8516-00cb38737a1c,Rihanna,234.34404,...,2006,1077728,-1,0,"[dance pop, dancehall, hip hop, ballad, pop ro...","[barbadian, pop and chart, barbade, barbadien,...",13.11199,-59.59895,"St Michael, Barbados",We Ride - Rihanna


In [60]:
songs_df, songs_df_test = train_test_split(songs_df_raw, random_state=42, test_size=0.2, stratify=songs_df_raw['user_id'])

In [61]:
print(len(songs_df))
print(len(songs_df['song_display_name'].unique()))
print(len(songs_df['song_id'].unique()))

8000
4434
4461


In [62]:
songs_liked_by_users = []
user_ids = songs_df_test.user_id.unique()
for i in user_ids:
    songs_liked_by_users.append(songs_df_test[songs_df_test['user_id'] == i]['song_id'].to_numpy())

In [63]:
song_grouped = songs_df.groupby(['song_id', 'song_display_name']).agg({'listen_count':'count'}).reset_index()
grouped_sum = song_grouped['listen_count'].sum()
print(grouped_sum)
song_grouped['percentage'] = (song_grouped['listen_count'] / grouped_sum ) * 100
song_grouped.sort_values(['listen_count', 'song_display_name'], ascending=[0,1])

song_grouped

8000


Unnamed: 0,song_id,song_display_name,listen_count,percentage
0,SOAAAGQ12A8C1420C8,Orgelblut - Bohren & Der Club Of Gore,1,0.0125
1,SOAAFAC12A67ADF7EB,Rome Wasn't Built In A Day - Morcheeba,2,0.0250
2,SOAAKPM12A58A77210,So Confused (feat. Butta Creame) (amended albu...,1,0.0125
3,SOAAOYI12AB01831CE,Criminal - Gotan Project,1,0.0125
4,SOAAVUV12AB0186646,STARSTRUKK [FEATURINGKATYPERRY] (Explicit Bonu...,2,0.0250
...,...,...,...,...
4456,SOZZIOH12A67ADE300,Watch The Tapes - LCD Soundsystem,3,0.0375
4457,SOZZLZN12A8AE48D6D,Afterlife (Album Version) - Avenged Sevenfold,1,0.0125
4458,SOZZRHE12A6702165F,Tommy The Cat - Primus,2,0.0250
4459,SOZZTCU12AB0182C58,Throw Some D's Remix - Rich Boy / Andre 3000 /...,2,0.0250


In [64]:
# songs_csv_df (subset of 10K songs) to be used for song analysis data contains song features like tempo, danceability, energy, etc.

songs_csv_df.head()

Unnamed: 0,SongNumber,SongID,AlbumID,AlbumName,ArtistID,MusicbrainzID,PlayMeID,7DigitalID,ArtistFamiliarity,ArtistLatitude,...,KeySignature,KeySignatureConfidence,Mode,ModeConfidence,Tempo,TimeSignature,TimeSignatureConfidence,ArtistTerms,ArtistTermsFrequency,StartOfFadeOut
0,1,SOMJBYD12A6D4F8557,25824,Da Ghetto Psychic,ARD0S291187B9B7BF5,56503d6d-094e-4c28-ae3d-04cc748ade5b,-1,17970,0.556496,,...,1,0.0,1,0.0,111.787,1,0.0,['breakbeat' 'dirty south rap' 'hip hop' 'elec...,[1.0 0.8386187613378114 0.9353130749575775 0.9...,114.782
1,2,SOQHXMF12AB0182363,539081,I Need You,ARGSJW91187B9B1D6B,eb722df4-372e-4742-8586-f7ff658012d6,-1,60488,0.360031,35.21962,...,5,0.354,0,0.485,146.765,1,0.0,['alternative rock' 'indie rock' 'rock' 'alter...,[0.9560807508575261 0.9213064099169126 1.0 0.6...,207.012
2,3,SONHOTT12A8C13493C,287650,Friend Or Foe,AR7G5I41187FB4CE6C,e188a520-9cb7-4f73-a3d7-2f70c6538e92,12697,19072,0.630382,,...,0,0.751,1,0.749,119.293,4,0.0,['pop rock' 'new wave' 'dance rock' 'rock' 'ne...,[0.9885838625154639 0.9672504640243684 0.82060...,217.124
3,4,SOUDSGM12AC9618304,692313,Superinstrumental,ARNTLGG11E2835DDB9,4d96f7d0-2f0e-4e92-ba70-a405f96f8cec,-1,242273,0.550514,,...,7,0.053,0,0.473,114.041,4,0.878,['breakcore' 'miami bass' 'new rave' 'grime' '...,[0.9156017479341086 0.862352379556314 0.862352...,261.747
4,5,SOHKNRJ12A6701D1F8,8876,Gin & Phonic,AR10USD1187B99F3F1,d89de379-665d-425c-b2e9-41b95d1edb36,-1,21128,0.801136,,...,4,0.0,0,0.16,101.43,3,0.408,['post-hardcore' 'screamo' 'emo' 'hardcore' 'p...,[0.999818045125181 0.9999454585723618 1.0 0.81...,181.023


In [65]:
artist_location_df.head()

Unnamed: 0,artist_id,artist_latitude,artist_longitude,artist_name,artist_location
0,ARZGXZG1187B9B56B6,-16.96595,-61.14804,Endless Blue,Santa Cruz
1,AR8K6F31187B99C2BC,46.44231,-93.36586,Go Fish,"Twin Cities, MN"
2,ARHJJ771187FB5B581,51.59678,-0.33556,Screaming Lord Sutch,"Harrow, Middlesex, England"
3,ARJ8YLL1187FB3CA93,40.69626,-73.83301,Morton Gould,"Richmond Hill, NY"
4,ARYBAGV11ECC836DAC,43.58828,-79.64372,Crash Parallel,Mississauga


In [66]:
import numpy as np
import pandas

# Popularity based recommendation

In [67]:
# Get a count of user_ids for each unique song as popularity score
songs_df_grouped = songs_df.groupby(['song_id', 'artist_id', 'song_display_name']).agg({'user_id': 'count'})\
    .reset_index()\
    .rename(columns={'user_id': 'score'})

# sort songs based on popularity score
song_recommendation_df = songs_df_grouped.sort_values(['score', 'song_display_name'], ascending=[0, 1])
# generate recommendation rank based on popularity score
song_recommendation_df['Rank'] = song_recommendation_df['score'].rank(ascending=0, method='first')

song_recommendation_df.head(10)

Unnamed: 0,song_id,artist_id,song_display_name,score,Rank
987,SOFRQTD12A81C233C0,AR0IVTL1187B9AD520,Sehr kosmisch - Harmonia,28,1.0
155,SOAXGDH12A8C13F8A1,ARNCHOP121318C56B8,Dog Days Are Over (Radio Edit) - Florence + Th...,24,2.0
2477,SONYKOW12AB01849C9,AR73S4G1187B9A03C2,Secrets - OneRepublic,22,3.0
1846,SOKLRPJ12A8C13C3FE,ARJ7KF01187B98D717,The Scientist - Coldplay,20,4.0
276,SOBOAFP12A8C131F36,ARML3X41187FB35F2E,Lucky (Album Version) - Jason Mraz & Colbie Ca...,19,5.0
3312,SOSXLTC12AF72A7F54,ARF2EHS1187B994F4E,Revelry - Kings Of Leon,19,6.0
3842,SOWCKVR12A8C142411,ARF2EHS1187B994F4E,Use Somebody - Kings Of Leon,19,7.0
282,SOBONKR12A58A7A7E0,AR5E44Z1187B9A1D74,You're The One - Dwight Yoakam,19,8.0
3489,SOTWSXL12A8C143349,ARS54I31187FB46721,Love Story - Taylor Swift,16,9.0
3799,SOVWADY12AB0189C63,ARYAVBS1187FB5B46C,Party In The U.S.A. - Miley Cyrus,16,10.0


### Model 1: Get popular song recommendations that the user has not listened to before

In [80]:
# Model 1: Get popular song recommendations that the user has not listened to before
def get_popular_song_recommendation_model_1(user_id):
    songs_listened_by_user = songs_df.loc[songs_df['user_id'] == user_id, 'song_id'].unique()
    user_recommendations = song_recommendation_df.loc[
        ~song_recommendation_df['song_id'].isin(songs_listened_by_user)
    ].drop(columns=['artist_id'])
    user_recommendations.insert(0, 'user_id', user_id)
    return user_recommendations

user_id = 'b80344d063b5ccb3212f76538f3d9e43d87dca9e'
get_popular_song_recommendation_model_1(user_id).head(10)

Unnamed: 0,user_id,song_id,song_display_name,score,Rank
155,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAXGDH12A8C13F8A1,Dog Days Are Over (Radio Edit) - Florence + Th...,24,2.0
2477,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SONYKOW12AB01849C9,Secrets - OneRepublic,22,3.0
1846,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOKLRPJ12A8C13C3FE,The Scientist - Coldplay,20,4.0
276,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBOAFP12A8C131F36,Lucky (Album Version) - Jason Mraz & Colbie Ca...,19,5.0
3312,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOSXLTC12AF72A7F54,Revelry - Kings Of Leon,19,6.0
3842,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOWCKVR12A8C142411,Use Somebody - Kings Of Leon,19,7.0
282,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBONKR12A58A7A7E0,You're The One - Dwight Yoakam,19,8.0
3489,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOTWSXL12A8C143349,Love Story - Taylor Swift,16,9.0
3799,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOVWADY12AB0189C63,Party In The U.S.A. - Miley Cyrus,16,10.0
136,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAUWYT12A81C206F1,Undo - Björk,16,11.0


#### Evaluation

In [81]:
predictions_popularity_1 = []
for i in user_ids:
    user_recommendations = get_popular_song_recommendation_model_1(i)
    predictions_popularity_1.append(user_recommendations['song_id'].to_numpy())

mapk(songs_liked_by_users, predictions_popularity_1, 10)

0.020995611147684876

### Model 2: Get popular song recommendations that the user has not listened to before, but from the artists that the user has listened to before

In [82]:
# Model 2: Get popular song recommendations that the user has not listened to before,
#          but from the artists that the user has listened to before.
def get_popular_song_recommendation_model_2(user_id):
    songs_listened_by_user = songs_df.loc[songs_df['user_id'] == user_id, 'song_id'].unique()
    artists_listened_by_user = songs_df.loc[songs_df['user_id'] == user_id, 'artist_id'].unique()
    user_recommendations = song_recommendation_df.loc[
        (~song_recommendation_df['song_id'].isin(songs_listened_by_user)) & 
        (song_recommendation_df['artist_id'].isin(artists_listened_by_user))
    ].drop(columns=['artist_id'])
    user_recommendations.insert(0, 'user_id', user_id)
    return user_recommendations

user_id = 'b80344d063b5ccb3212f76538f3d9e43d87dca9e'
get_popular_song_recommendation_model_2(user_id).head(10)

Unnamed: 0,user_id,song_id,song_display_name,score,Rank
3167,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOSCIZP12AB0181D2F,Alejandro - Lady GaGa,13,20.0
2408,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SONOYIB12A81C1F88C,I Kissed A Girl - Katy Perry,11,31.0
939,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFKABN12A8AE476C6,Just Dance - Lady GaGa / Colby O'Donis,11,32.0
4100,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOXNZOW12AB017F756,Half Of My Heart - John Mayer,9,39.0
3273,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOSPXWA12AB0181875,Bubble Toes - Jack Johnson,8,54.0
824,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOEQJBS12A8AE475A4,The Funeral (Album Version) - Band Of Horses,7,85.0
873,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOEYVHS12AB0181D31,Monster - Lady GaGa,6,105.0
1907,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOKUTUM12A6701D9CD,Do You Remember - Jack Johnson,5,125.0
3540,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOUFPNI12A8C142D19,Heartbreak Warfare - John Mayer,5,138.0
2925,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOQLUTQ12A8AE48037,The Pretender - Foo Fighters,5,179.0


#### Evaluation

In [85]:
predictions_popularity_2 = []
for i in user_ids:
    user_recommendations = get_popular_song_recommendation_model_2(i)
    predictions_popularity_2.append(user_recommendations['song_id'].to_numpy())

mapk(songs_liked_by_users, predictions_popularity_2, 10)

0.03570301286583929

# Similarity based recommendation

In [73]:
# get all unique songs in the dataset
all_song_ids = list(songs_df['song_id'].unique())
print(f"no. of unique songs in the dataset: {len(all_song_ids)}")

no. of unique songs in the dataset: 4461


In [106]:
# create song co-occurence matrix of shape len(user_song_ids) x len(all_song_ids)
def create_co_occurence_matrix(user_song_ids, all_song_ids):
    # get users who listened to each song in user_song_ids
    user_songs_user_ids = [
        set(songs_df.loc[songs_df['song_id'] == song_id, 'user_id'].unique())
        for song_id in user_song_ids
    ]

    # initialize co-occurence matrix of size len(user_song_ids) x len(all_song_ids)
    co_occurence_matrix = np.matrix(np.zeros(shape=(len(user_song_ids), len(all_song_ids))), float)

    # calculate similarity between user songs and all songs
    for i, song_id in enumerate(all_song_ids):
        # get listeners of song i
        user_ids_i = set(songs_df.loc[songs_df['song_id'] == song_id, 'user_id'].unique())

        for j, _ in enumerate(user_song_ids):
            user_ids_j = user_songs_user_ids[j]  # listeners of song j

            users_intersection = user_ids_i.intersection(user_ids_j)  # intersection of listeners of songs i and j

            # calculate co-occurence_matrix[i,j] as Jaccard Index
            if len(users_intersection) != 0:
                users_union = user_ids_i.union(user_ids_j)  # union of listeners of songs i and j

                co_occurence_matrix[j, i] = float(len(users_intersection)) / float(len(users_union))
            else:
                co_occurence_matrix[j, i] = 0

    return co_occurence_matrix

user_id = 'b80344d063b5ccb3212f76538f3d9e43d87dca9e'

# get all songs listened by this user
user_song_ids = songs_df.loc[songs_df['user_id'] == user_id, 'song_id'].unique()
print(f"No. of unique songs listened by the user: {len(user_song_ids)}")

co_occurence_matrix = create_co_occurence_matrix(user_song_ids, all_song_ids)
print(f"Non zero values in co-occurence matrix: {np.count_nonzero(co_occurence_matrix)}")
print(f"Shape of co-occurence matrix: {co_occurence_matrix.shape}")

No. of unique songs listened by the user: 36
Non zero values in co-occurence matrix: 5316
Shape of co-occurence matrix: (36, 4461)


In [105]:
def get_top_recommendations(user_id, co_occurence_matrix, all_song_ids, user_song_ids):
    # calculate weighted average of the scores in co-occurence matrix for all user songs.
    similarity_scores = co_occurence_matrix.sum(axis=0) / float(co_occurence_matrix.shape[0])
    similarity_scores = np.array(similarity_scores)[0].tolist()

    # sort the indices of similarity_scores based on their value, while maintaining the score
    similarity_scores_sorted = sorted(((e, i) for i, e in enumerate(list(similarity_scores))), reverse=True)

    df = pandas.DataFrame(columns=['user_id', 'song_id', 'song_display_name', 'score', 'rank'])

    # find top 10 recommendations and update the dataframe
    rank = 1
    for score, i in similarity_scores_sorted:
        if (
            ~np.isnan(score) and 
            all_song_ids[i] not in user_song_ids and 
            rank <= 10
        ):
            song_id = all_song_ids[i]
            song_display_name = songs_df.loc[songs_df['song_id'] == song_id, 'song_display_name'].iloc[0]
            df.loc[len(df)] = [user_id, all_song_ids[i], song_display_name, score, rank]
            rank += 1

    if len(df) == 0:
        print(f"WARN: The user {user_id} has not listened to any songs for training the similarity based recommendation model.")
    return df

get_top_recommendations(user_id, co_occurence_matrix, all_song_ids, user_song_ids)

Unnamed: 0,user_id,song_id,song_display_name,score,rank
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SORYLNP12A8C13C3E7,Lump Sum - Bon Iver,0.038228,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOVXBOT12A8C136816,Drops Of Jupiter - Train,0.03386,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOLCKAS12A8C14242A,Eye Of The Tiger - Survivor,0.032833,3
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOSEUUV12A6701E93C,No One Knows - Queens Of The Stone Age,0.032407,4
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOSMTVN12A6701E910,Song For The Dead - Queens Of The Stone Age,0.032407,5
5,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOYFMNU12AB0181435,ROCKSTAR 101 - Rihanna / Slash,0.030884,6
6,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFLJIY12A8C13FF69,DLZ - TV On The Radio,0.030093,7
7,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAFQGA12A8C1367FA,I'm Still Breathing - Katy Perry,0.030071,8
8,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFKABN12A8AE476C6,Just Dance - Lady GaGa / Colby O'Donis,0.029038,9
9,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOCDNMP12AB018440C,Vuelvo Al Sur - Gotan Project,0.028736,10


In [102]:
# Get songs similar to the songs listened by the user
def get_similar_songs_for_user(user_id):
    # get all unique songs for this user
    user_song_ids = songs_df.loc[songs_df['user_id'] == user_id, 'song_id'].unique()
    print(f"no. of unique songs for the user: {len(user_song_ids)}")

    co_occurence_matrix = create_co_occurence_matrix(user_song_ids, all_song_ids)
    print(f"non zero values in co-occurence_matrix: {np.count_nonzero(co_occurence_matrix)}")
    return get_top_recommendations(user_id, co_occurence_matrix, all_song_ids, user_song_ids)


user_id = 'b80344d063b5ccb3212f76538f3d9e43d87dca9e'
get_similar_songs_for_user(user_id)

no. of unique songs for the user: 36
non zero values in co-occurence_matrix: 5316


Unnamed: 0,user_id,song_id,song_display_name,score,rank
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SORYLNP12A8C13C3E7,Lump Sum - Bon Iver,0.038228,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOVXBOT12A8C136816,Drops Of Jupiter - Train,0.03386,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOLCKAS12A8C14242A,Eye Of The Tiger - Survivor,0.032833,3
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOSEUUV12A6701E93C,No One Knows - Queens Of The Stone Age,0.032407,4
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOSMTVN12A6701E910,Song For The Dead - Queens Of The Stone Age,0.032407,5
5,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOYFMNU12AB0181435,ROCKSTAR 101 - Rihanna / Slash,0.030884,6
6,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFLJIY12A8C13FF69,DLZ - TV On The Radio,0.030093,7
7,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAFQGA12A8C1367FA,I'm Still Breathing - Katy Perry,0.030071,8
8,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFKABN12A8AE476C6,Just Dance - Lady GaGa / Colby O'Donis,0.029038,9
9,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOCDNMP12AB018440C,Vuelvo Al Sur - Gotan Project,0.028736,10


### Evaluation

In [92]:
predictions_similarity = []

for i in user_ids:
    recommendations_df = get_similar_songs_for_user(i)
    # print(recommendations_df['song_id'])
    # print(songs_liked_by_users[0])
    predictions_similarity.append(recommendations_df['song_id'].to_numpy())

mapk(songs_liked_by_users, predictions_similarity, 10)

no. of unique songs for the user: 46
non zero values in co-occurence_matrix: 10068
no. of unique songs for the user: 86
non zero values in co-occurence_matrix: 17222
no. of unique songs for the user: 42
non zero values in co-occurence_matrix: 13767
no. of unique songs for the user: 105
non zero values in co-occurence_matrix: 21875
no. of unique songs for the user: 98
non zero values in co-occurence_matrix: 26587
no. of unique songs for the user: 105
non zero values in co-occurence_matrix: 29275
no. of unique songs for the user: 101
non zero values in co-occurence_matrix: 23008
no. of unique songs for the user: 47
non zero values in co-occurence_matrix: 9602
no. of unique songs for the user: 24
non zero values in co-occurence_matrix: 1534
no. of unique songs for the user: 70
non zero values in co-occurence_matrix: 12581
no. of unique songs for the user: 58
non zero values in co-occurence_matrix: 6400
no. of unique songs for the user: 136
non zero values in co-occurence_matrix: 31812
no.

0.023362476531831367

In [79]:
# Get similar songs to a given list of songs
song_ids = ['SOAKIMP12A8C130995', 'SOMMJUQ12AF72A5931']

co_occurence_matrix = create_co_occurence_matrix(song_ids, all_song_ids)
recommendations_df = get_top_recommendations(user_id, co_occurence_matrix, all_song_ids, song_ids)
recommendations_df

no. of unique songs in the dataset: 4461


Unnamed: 0,user_id,song_id,song_display_name,score,rank
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOVYIYI12A8C138D88,He Doesn't Know Why - Fleet Foxes,0.5,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOQJLDY12AAF3B456D,Love Song For No One - John Mayer,0.5,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOXRXDG12A8C131DE5,City Love - John Mayer,0.5,3
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOJPFQG12A58A7833A,Clarity - John Mayer,0.5,4
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOZOBWN12A8C130999,Holes To Heaven - Jack Johnson,0.5,5
5,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SONSAEZ12A8C138D7A,Ragged Wood - Fleet Foxes,0.5,6
6,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOIYTOA12A6D4F9A23,Let It Be Sung - Jack Johnson / Matt Costa / Z...,0.5,7
7,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOPCVQE12AC468AF36,Country Road - Jack Johnson / Paula Fuga,0.5,8
8,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOUKXIN12A8C133C7F,Drive - Incubus,0.5,9
9,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODDNQT12A6D4F5F7E,Apuesta Por El Rock 'N' Roll - Héroes del Sile...,0.5,10


# Recommendation using Nearest Neighbours

In [177]:
songs_cat_cols = ["KeySignature", "Mode", "TimeSignature"]
songs_num_cols = ["Danceability", "Duration", "Tempo"]

In [179]:
from sklearn.model_selection import train_test_split
songs_df_train, songs_df_test = train_test_split(songs_csv_df,test_size=0.2,random_state=42)

In [180]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

pipeline = ColumnTransformer([
                              ('ohe_encoder', OneHotEncoder(handle_unknown = 'ignore'), songs_cat_cols),
                              ('scaler', StandardScaler(), songs_num_cols)
])

In [181]:
from sklearn.neighbors import NearestNeighbors

data = pipeline.fit_transform(songs_csv_df)

neigh = NearestNeighbors(n_neighbors=2)
neigh.fit(data)

print(neigh.kneighbors(pipeline.transform(songs_df_test.head(1)), 4, return_distance=True))

(array([[0.        , 0.12098556, 0.1294099 , 0.18359538]]), array([[6252, 7501, 2604, 5933]]))


In [182]:
def get_listened_songs_with_count(df_func, userId):
  return df_func[(df_func['user_id'] == userId) & (df_func['listen_count']>0)]

In [183]:
songs_data_df

Unnamed: 0,track_id,title,song_id,release,artist_id,artist_mbid,artist_name,duration,artist_familiarity,artist_hotttnesss,year,track_7digitalid,shs_perf,shs_work
0,TRMMMYQ128F932D901,Silent Night,SOQMMHC12AB0180CB8,Monster Ballads X-Mas,ARYZTJS1187B98C555,357ff05d-848a-44cf-b608-cb34b5701ae5,Faster Pussy cat,252.05506,0.649822,0.394032,2003,7032331,-1,0
1,TRMMMKD128F425225D,Tanssi vaan,SOVFVAK12A8C1350D9,Karkuteillä,ARMVN3U1187FB3A1EB,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9,Karkkiautomaatti,156.55138,0.439604,0.356992,1995,1514808,-1,0
2,TRMMMRX128F93187D9,No One Could Ever,SOGTUKN12AB017F4F1,Butter,ARGEKB01187FB50750,3d403d44-36ce-465c-ad43-ae877e65adc4,Hudson Mohawke,138.97098,0.643681,0.437504,2006,6945353,-1,0
3,TRMMMCH128F425532C,Si Vos Querés,SOBNYVR12A8C13558C,De Culo,ARNWYLR1187B9B2F9C,12be7648-7094-495f-90e6-df4189d68615,Yerba Brava,145.05751,0.448501,0.372349,2003,2168257,-1,0
4,TRMMMWA128F426B589,Tangle Of Aspens,SOHSBXH12A8C13B0DF,Rene Ablaze Presents Winter Sessions,AREQDTE1269FB37231,,Der Mystic,514.29832,0.000000,0.000000,0,2264873,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRYYYUS12903CD2DF0,O Samba Da Vida,SOTXAME12AB018F136,Pacha V.I.P.,AR7Z4J81187FB3FC59,9d50cb20-7e42-45cc-b0dd-154c3e92a577,Kiko Navarro,217.44281,0.528617,0.411595,0,7522478,-1,0
999996,TRYYYJO128F426DA37,Jago Chhadeo,SOXQYIQ12A8C137FBB,Naale Baba Lassi Pee Gya,ART5FZD1187B9A7FCF,2357c400-9109-42b6-b3fe-9e2d9f8e3872,Kuldeep Manak,244.16608,0.401500,0.374866,0,1632096,-1,0
999997,TRYYYMG128F4260ECA,Novemba,SOHODZI12A8C137BB3,Dub_Connected: electronic music,ARZ3R6M1187B9AF750,8b97e9c8-61f5-4615-9a96-276f24204e34,Gabriel Le Mar,553.03791,0.556918,0.336914,0,2219291,-1,0
999998,TRYYYDJ128F9310A21,Faraday,SOLXGOR12A81C21EB7,The Trance Collection Vol. 2,ARCMCOK1187B9B1073,4ac5f3de-c5ad-475e-ad50-41f1ef9dba20,Elude,484.51873,0.403668,0.256935,0,5472456,-1,0


In [184]:
triplets_train, triplets_test = train_test_split(triplets_df,test_size=0.2,random_state=42, stratify=triplets_df['user_id'])

In [190]:
def get_song_predictions(user_id):
    listened_songs_with_count = get_listened_songs_with_count(triplets_train, user_id)

    pred_songs_with_weight=[]
    for index, row in listened_songs_with_count.iterrows():
        song_row = songs_csv_df[songs_csv_df['SongID']==row["song_id"]]
        if len(song_row) == 0:
            continue
        neighbours_predict = (neigh.kneighbors(pipeline.transform(song_row), 11, return_distance=True))

        prediction_indices = neighbours_predict[1][0]
        prediction_distances = neighbours_predict[0][0]
        for i in range(len(prediction_indices)):
            if prediction_distances[i]>0:
                pred_songs_with_weight.append((songs_csv_df.iloc[prediction_indices[i]]['SongID'], row['listen_count']/prediction_distances[i]))

    pred_songs_with_weight.sort(key = lambda x: -x[1])
    listened_songs = listened_songs_with_count['song_id'].values

    df = pandas.DataFrame(columns=['user_id', 'song_id', 'song_display_name', 'score'])
    for song_id, weight in pred_songs_with_weight:
        #if song_id in listened_songs:
            #continue
        song_info_df = songs_data_df.loc[songs_data_df['song_id'] == song_id]
        song_display_name = (song_info_df['title'] + ' - ' + song_info_df['artist_name']).iloc[0]
        df.loc[len(df)] = [user_id, song_id, song_display_name, weight]

    return df

In [186]:
user_id = "b64cdd1a0bd907e5e00b39e345194768e330d652"
display(get_song_predictions(user_id))
print(triplets_test[triplets_test['user_id'] == user_id]['song_id'].to_numpy())

Unnamed: 0,user_id,song_id,song_display_name,score
0,b64cdd1a0bd907e5e00b39e345194768e330d652,SOZNZIN12A6701E95F,Janie's Got A Gun - Aerosmith,36.03581
1,b64cdd1a0bd907e5e00b39e345194768e330d652,SONOQCH12A6D4FE0EE,Walk With Me - Angélla Christie,24.214133
2,b64cdd1a0bd907e5e00b39e345194768e330d652,SOSIANM12AB018CC80,Only Human - Usher Featuring The Nu Beginning,18.862833
3,b64cdd1a0bd907e5e00b39e345194768e330d652,SOJHJKZ12AB0182AA2,Les choses - Edgar Bori,17.530574
4,b64cdd1a0bd907e5e00b39e345194768e330d652,SOPJSAG12AC468ADAF,Stand Up and Praise Him - Vickie Winans,14.53412
5,b64cdd1a0bd907e5e00b39e345194768e330d652,SOJRFYT12A6D4F9972,I Cannot Believe It's True - Phil Collins,13.221819
6,b64cdd1a0bd907e5e00b39e345194768e330d652,SOYEUES12AB0187F14,Underlying Lies - Scott Matthews,12.49564
7,b64cdd1a0bd907e5e00b39e345194768e330d652,SOYOZCL12AB0189C16,Heart it Races (Frank Tetaz Remix) - Architect...,11.500072
8,b64cdd1a0bd907e5e00b39e345194768e330d652,SOLZCZD12A8C13833B,Baños De Budapest (Extended Remix) - Hevia,10.972725
9,b64cdd1a0bd907e5e00b39e345194768e330d652,SOIMZSM12AB0188CB8,Time Will Tell - Ice,10.433003


['SOIDSDT12A6D4F98DB' 'SOWUTFF12A8C138AB2' 'SOTIEEP12A6701C779'
 'SOTSVKB12A8C139EF0' 'SOXFTSF12A6D4F7278' 'SOMGVYU12A8C1314FF'
 'SOZPQES12A6D4F8E57' 'SOFWANS12AF72A12E6' 'SOBDMNP12AF72AB1E1'
 'SOYUOVK12AB0185549' 'SOOSADY12A6701F119' 'SOYSWHI12AAA8C5BFD'
 'SOXGQCN12A8C1384FA']


## Evaluation

In [187]:
len(triplets_test['user_id'].unique())

20391

In [194]:
predictions_similarity = []
user_ids = triplets_df['user_id'].unique()[:100]

songs_liked_by_users_knn = []
for i in user_ids:
    songs_liked_by_users_knn.append(triplets_df[triplets_df['user_id'] == i]['song_id'].to_numpy())
    recommendations_df = get_song_predictions(i)
    print(len(recommendations_df))
    # print(recommendations_df['song_id'])
    # print(songs_liked_by_users[0])
    predictions_similarity.append(recommendations_df['song_id'].to_numpy())

mapk(songs_liked_by_users_knn, predictions_similarity, 10)

0
30
10
10
50
0
0
10
20
0
0
10
20
0
0
10
10
30
20
20
10
10
0
0
0
10
20
0
30
0
0
0
0
10
10
0
20
10
30
60
0
10
10
0
10
10
0
30
0
20
0
0
0
50
10
40
10
0
0
0
20
0
0
10
10
0
60
10
20
10
20
10
20
0
20
0
0
0
20
20
0
20
0
0
20
0
0
0
20
10
40
10
0
0
10
10
0
0
10
0


0.0

In [161]:
mapk(songs_liked_by_users_knn, [x for x in predictions_similarity if len(x) != 0], 10)

0.0

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=07a75db5-b040-4684-ba0a-30d718353198' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>