In [4]:
import pandas as pd

In [5]:
# Load the four CSV inputs. In the cells visible here `df` is only previewed;
# the three per-user "top" frames are the ones the helper functions query.
df = pd.read_csv("spotify_millsongdata.csv")
df_tracks = pd.read_csv("user_top_tracks.csv")
df_artists = pd.read_csv("user_top_artists.csv")
df_albums = pd.read_csv("user_top_albums.csv")
# NOTE(review): only a cell's last expression is rendered, and the output shown
# for this cell is the tracks preview, so this df.head() likely displays nothing.
df.head()

df_tracks.head()

Unnamed: 0,user_id,rank,track_name,artist_name,playcount,mbid
0,1,1,Ice Cream,New Young Pony Club,54,417a3d0b-0291-4e08-b167-50971dbdae36
1,1,2,Hiding On The Staircase,New Young Pony Club,51,f6ac2fd2-b33b-48ff-a094-537cf530cf20
2,1,3,Descend,New Young Pony Club,49,33434fd1-79a1-3882-bb31-fbd430815a93
3,1,4,Air War,Crystal Castles,48,06d3fea3-e806-35ed-9562-aa396a97de14
4,1,5,Get Lucky,New Young Pony Club,48,ab351b25-7126-3792-9b0d-ce44fd0c0462


In [6]:
# Preview the last rows of the tracks frame; the displayed output shows that
# `mbid` can be empty for some tracks.
df_tracks.tail()

Unnamed: 0,user_id,rank,track_name,artist_name,playcount,mbid
23822545,479484,46,teeth,Loser,192,
23822546,479484,47,MISA MISA!,Corpse,191,
23822547,479484,48,title track,Machine Gun Kelly,191,b9874074-b2d6-4135-bc2a-f4ef44b0ae69
23822548,479484,49,В последний раз,Tima Belorusskih,190,
23822549,479484,50,fantasize,ericdoa,189,


In [235]:
def top_50_songs(user):
    """Return the track names recorded for `user` in `df_tracks`, in frame order.

    Rows are selected where `user_id` equals `user`; an unknown user yields
    an empty list.
    """
    is_user_row = df_tracks["user_id"] == user
    return df_tracks.loc[is_user_row, "track_name"].tolist()

def top_50_artists(user):
    """Return the artist names recorded for `user` in `df_artists`, in frame order.

    Rows are selected where `user_id` equals `user`; an unknown user yields
    an empty list.
    """
    is_user_row = df_artists["user_id"] == user
    return df_artists.loc[is_user_row, "artist_name"].tolist()

def top_50_albums(user):
    """Return the album names recorded for `user` in `df_albums`, in frame order.

    Rows are selected where `user_id` equals `user`; an unknown user yields
    an empty list.
    """
    is_user_row = df_albums["user_id"] == user
    return df_albums.loc[is_user_row, "album_name"].tolist()

In [237]:
def shared_songs(user1, user2):
    """Return user1's top songs that also appear among user2's top songs.

    The result keeps user1's ordering (and any repeats in user1's list).
    """
    mine = top_50_songs(user1)
    theirs = top_50_songs(user2)
    return [title for title in mine if title in theirs]

def shared_artists(user1, user2):
    """Return user1's top artists that also appear among user2's top artists.

    The result keeps user1's ordering (and any repeats in user1's list).
    """
    mine = top_50_artists(user1)
    theirs = top_50_artists(user2)
    return [name for name in mine if name in theirs]

def shared_albums(user1, user2):
    """Return user1's top albums that also appear among user2's top albums.

    The result keeps user1's ordering (and any repeats in user1's list).
    """
    mine = top_50_albums(user1)
    theirs = top_50_albums(user2)
    return [title for title in mine if title in theirs]

# Example: songs that appear in the top tracks of both user 5 and user 7.
print(shared_songs(5, 7))

['Time to Pretend', 'Walking on a Dream']


In [219]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def _tfidf_similarity_total(names1, names2):
    """Sum all pairwise TF-IDF cosine similarities between two lists of names.

    Both lists are vectorized together so they share one vocabulary, then
    split back apart. Element (i, j) of the similarity matrix is the cosine
    similarity between the i-th name of `names1` and the j-th of `names2`;
    the return value is the sum over the whole matrix.

    Returns 0.0 when either list has no usable names or no name yields a
    token, instead of letting the vectorizer raise.
    """
    # TfidfVectorizer only accepts strings; non-string entries (e.g. NaN from
    # missing values) would make it raise, so they are left out.
    docs1 = [name for name in names1 if isinstance(name, str)]
    docs2 = [name for name in names2 if isinstance(name, str)]
    if not docs1 or not docs2:
        return 0.0

    try:
        tfidf = TfidfVectorizer().fit_transform(docs1 + docs2)
    except ValueError:
        # Raised when no document produces a token (empty vocabulary).
        return 0.0

    # Rows [0, split) belong to the first list, the rest to the second.
    split = len(docs1)
    return cosine_similarity(tfidf[:split], tfidf[split:]).sum()


def similarity_coefficient(user1, user2):
    """Return an unnormalized similarity score between two users.

    The score is the sum of three totals, one each for the users' top songs,
    top artists and top albums, where each total is the summed pairwise
    TF-IDF cosine similarity between the two users' name lists. A category
    with no usable names for either user contributes 0.
    """
    return (
        _tfidf_similarity_total(top_50_songs(user1), top_50_songs(user2))
        + _tfidf_similarity_total(top_50_artists(user1), top_50_artists(user2))
        + _tfidf_similarity_total(top_50_albums(user1), top_50_albums(user2))
    )

# Module-level similarity threshold. user_coefficients() takes its own
# `threshold` argument, and the calls visible in this notebook pass explicit
# values rather than this global.
threshold = 0.1

def user_coefficients(user, threshold, candidates=range(1, 100)):
    """Return the candidate user ids that are sufficiently similar to `user`.

    Each candidate's similarity to `user` is divided by `user`'s similarity
    with themself; candidates whose ratio exceeds `threshold` are returned
    in iteration order.

    Parameters:
        user: id of the reference user.
        threshold: minimum ratio (exclusive) for a candidate to be kept.
        candidates: iterable of user ids to compare against; defaults to
            ids 1 through 99.

    Returns:
        List of matching candidate ids. `user` itself is never included.
        Empty when the self-similarity baseline is zero.
    """
    # Loop-invariant baseline: compute once rather than once per candidate.
    self_similarity = similarity_coefficient(user, user)
    if not self_similarity:
        # Nothing to normalize against; avoids dividing by zero.
        return []

    users = []
    for candidate in candidates:
        # Skip the reference user by id instead of relying on a float
        # `ratio != 1` test, which would also drop a different user whose
        # lists happen to score identically.
        if candidate == user:
            continue
        ratio = similarity_coefficient(user, candidate) / self_similarity
        if ratio > threshold:
            users.append(candidate)
    return users

# Self-similarity of user 10: the kind of baseline user_coefficients() divides by.
print(similarity_coefficient(10, 10))

# Users among ids 1..99 whose similarity to user 87, relative to user 87's
# self-similarity, exceeds 0.12.
print(user_coefficients(87, 0.12))

183.01922377028035
[1, 4, 5, 7, 12, 14, 15, 18, 19, 20, 21, 22, 24, 26, 27, 28, 31, 32, 33, 35, 41, 42, 43, 44, 46, 49, 50, 51, 53, 54, 55, 58, 59, 62, 63, 66, 67, 68, 69, 71, 72, 73, 74, 75, 77, 78, 80, 81, 83, 89, 91, 92, 93, 94, 96, 97, 99]


In [241]:
def all_combinations(list):
    """Return every unordered pair of items as a two-element list.

    Pairs follow input order: for items at positions i < j the pair is
    `[item_i, item_j]`, ordered by i and then j. Accepts any iterable, not
    only sized, indexable sequences. Fewer than two items yields `[]`.

    NOTE: the parameter name shadows the built-in `list`; it is kept so that
    callers passing it by keyword keep working.
    """
    from itertools import combinations

    return [[first, second] for first, second in combinations(list, 2)]

def new_music(user, t):
    """Print songs, artists and albums that pairs of similar users share but `user` lacks.

    Users similar to `user` are found with `user_coefficients(user, t)`. For
    every pair of those users, the items both have in common are collected,
    skipping anything already in `user`'s own lists and anything already
    collected. The three resulting lists are printed; nothing is returned.
    """
    # Index 0 = songs, 1 = artists, 2 = albums throughout.
    known = (top_50_songs(user), top_50_artists(user), top_50_albums(user))
    pairs = all_combinations(user_coefficients(user, t))
    fresh = ([], [], [])

    for pair in pairs:
        first, second = pair[0], pair[1]
        shared = (
            shared_songs(first, second),
            shared_artists(first, second),
            shared_albums(first, second),
        )
        for found, already_known, collected in zip(shared, known, fresh):
            for item in found:
                if not (item in already_known or item in collected):
                    collected.append(item)

    songs_out, artists_out, albums_out = fresh
    print("Songs: ", songs_out, "\n\nArtists: ", artists_out, "\n\nAlbums: ", albums_out)

# Example run for user 1165 with a similarity threshold of 0.28; prints the
# items shared by pairs of similar users that user 1165 does not already have.
new_music(1165, 0.28)

Songs:  [] 

Artists:  [] 

Albums:  ['Greatest Hits']


In [289]:
import random

def song_recommender(songs, N):
    temp = []
    users = []
    for song in songs:
        users += df_tracks.loc[df_tracks["track_name"] == song, "user_id"].tolist()
    while(len(temp) < N):
        rand = random.sample(users, 2)
        r1, r2 = rand[0], rand[1]
        rand_songs = shared_songs(r1, r2)
        if(len(rand_songs) == 0):
            continue
        rand_song = random.sample(rand_songs, 1)[0]
        if(rand_song not in temp and rand_song not in songs):
            temp.append(rand_song)
    print(temp)

def artist_recommender(artists, N):
    temp = []
    users = []
    for artist in artists:
        users += df_artists.loc[df_artists["artist_name"] == artist, "user_id"].tolist()
    while(len(temp) < N):
        rand = random.sample(users, 2)
        r1, r2 = rand[0], rand[1]
        rand_artists = shared_artists(r1, r2)
        if(len(rand_artists) == 0):
            continue
        rand_artist = random.sample(rand_artists, 1)[0]
        if(rand_artist not in temp and rand_artist not in artists):
            temp.append(rand_artist)
    print(temp)

def album_recommender(albums, N):
    temp = []
    users = []
    for album in albums:
        users += df_albums.loc[df_albums["album_name"] == album, "user_id"].tolist()
    while(len(temp) < N):
        rand = random.sample(users, 2)
        r1, r2 = rand[0], rand[1]
        rand_albums = shared_albums(r1, r2)
        if(len(rand_albums) == 0):
            continue
        rand_album = random.sample(rand_albums, 1)[0]
        if(rand_album not in temp and rand_album not in albums):
            temp.append(rand_album)
    print(temp)
    
# Example calls: each prints 5 recommendations for the given seeds. Sampling
# uses the `random` module and no seed is set in the cells visible here, so the
# printed lists presumably differ from run to run.
song_recommender(["I'm Dancing in the Show Tonight", "Ocean Man"], 5)
artist_recommender(["Kendrick Lamar", "Eminem", "Drake"], 5)
album_recommender(["The Dark Side of the Moon", "Meddle", "Rumours"], 5)

["What Deaner Was Talkin' About", 'Not Allowed', 'Buckingham Green', 'In the Aeroplane Over the Sea', 'The Mollusk']
['Skrillex', 'Toro y Moi', 'Kanye West', 'Lady Gaga', 'Tim Hecker']
['Blonde', 'Greatest Hits', 'To Pimp a Butterfly', 'OK Computer', 'Kid A']
