# Spotify Music Recommendation Model

> Modeling recommendation relationships between Spotify tracks using the Million Playlist Dataset and graph theory.

## Links

<ul>
    <li><a href="https://www.kaggle.com/datasets/himanshuwagh/spotify-million">Million Playlist Dataset</a></li>
</ul>

In [1]:
import csv
import heapq
import json
import os
import pickle
from itertools import combinations

import numpy as np
import rustworkx as rx

In [2]:
%%time
# loading data

files = os.listdir('./data/data')

uriToData = {}
numPlaylists = 0

for file in files:
    if file == '.DS_Store':
        continue
    with open(f"./data/data/{file}") as f:
        d = json.load(f)
        playlists = d["playlists"]
        for i in range(len(playlists)):
            if playlists[i]["num_followers"] < 5:
                continue
            if playlists[i]["num_tracks"] > 70:
                continue
            if playlists[i]["num_tracks"] / playlists[i]["num_artists"] > 2:
                continue
            playlist = playlists[i]["tracks"]
            numPlaylists += 1
            for j in range(len(playlist)):
                playlist[j]["track_uri"] = playlist[j]["track_uri"].split(':')[2]
                if playlist[j]["track_uri"] not in uriToData:
                    uriToData[playlist[j]["track_uri"]] = (len(uriToData), playlist[j]["track_name"], playlist[j]["artist_name"])

allPlaylists = np.full((numPlaylists, 70), -1, dtype='int32')
counter = 0

for file in files:
    if file == '.DS_Store':
        continue
    with open(f"./data/data/{file}") as f:
        d = json.load(f)
        playlists = d["playlists"]
        for i in range(len(playlists)):
            if playlists[i]["num_followers"] < 5:
                continue
            if playlists[i]["num_tracks"] > 70:
                continue
            if playlists[i]["num_tracks"] / playlists[i]["num_artists"] > 2:
                continue
            playlist = playlists[i]["tracks"]
            for j in range(len(playlist)):
                allPlaylists[counter, j] = uriToData[playlist[j]["track_uri"].split(':')[2]][0]
            counter += 1

CPU times: user 4min 3s, sys: 20.5 s, total: 4min 23s
Wall time: 4min 32s


In [3]:
%%time
graph = rx.PyGraph(multigraph=False)

for k, v in uriToData.items():
    graph.add_node(k)

for i in range(numPlaylists):
    playlist = allPlaylists[i, :]
    for j in range(70):
        if playlist[j] == -1:
            break
        for k in range(j + 1, 70):
            if playlist[k] == -1:
                break
            if graph.has_edge(playlist[j], playlist[k]):
                graph.update_edge(playlist[j], playlist[k], graph.get_edge_data(playlist[j], playlist[k]) + 1)
            else:
                graph.add_edge(playlist[j], playlist[k], 1)
        

CPU times: user 56.7 s, sys: 571 ms, total: 57.3 s
Wall time: 58.1 s


# Creating an API for our Model

Creating a few functions to easily get recommendations from our co-occurrence graph and print them to the console.

In [5]:
# functions for interfacing with the recommendation graph and the track lookup map

def get_uri(url):
    """Extract the track id from a Spotify track link.

    Accepts a web URL (with or without a query string, fragment or trailing
    slash), a ``spotify:track:<id>`` URI, or a bare id.

    Args:
        url: the link or id as a string.

    Returns:
        The last path/URI segment, i.e. the track id.
    """
    # Share links end in "?si=..."; drop query, fragment and trailing slashes.
    path = url.strip().split('?', 1)[0].split('#', 1)[0].rstrip('/')
    # The id is the last segment of a "/"-separated URL or a ":"-separated URI.
    return path.rsplit('/', 1)[-1].rsplit(':', 1)[-1]

def get_recommendations(url, numRecs=5):
    """Return the tracks that most often share a playlist with the given track.

    Args:
        url: Spotify track link, passed through ``get_uri`` to obtain the id.
        numRecs: maximum number of recommendations to return.

    Returns:
        A list of ``(track_name, edge_weight, artist_name)`` tuples ordered by
        descending edge weight, or ``None`` (after printing a message) when
        the track id is not a key of ``uriToData``.
    """
    try:
        trackIdx = uriToData[get_uri(url)][0]
    except KeyError:
        print("song not found in database")
        return None

    # Each incident edge is (source, target, weight). Self-loops are dropped
    # so that a track is never recommended to itself.
    incidentEdges = [
        edge
        for edge in dict(graph.incident_edge_index_map(trackIdx)).values()
        if edge[0] != edge[1]
    ]
    # Top-N by weight without sorting every edge; ties keep their original
    # order, exactly like sorted(..., reverse=True)[:numRecs].
    strongest = heapq.nlargest(numRecs, incidentEdges, key=lambda edge: edge[2])

    result = []
    for source, target, weight in strongest:
        # Take whichever endpoint is not the queried track, so the result does
        # not depend on the order in which the edge's endpoints are reported.
        otherIdx = target if source == trackIdx else source
        trackData = uriToData[graph[otherIdx]]
        result.append((trackData[1], weight, trackData[2]))
    return result

def print_recommendations(recs):
    """Print one "<track> by <artist> - <count>" line per recommendation.

    Args:
        recs: iterable of ``(track_name, count, artist_name)`` tuples, or
            ``None``, in which case "no recommendations" is printed instead.
    """
    if recs is not None:
        for title, weight, performer in recs:
            print(f"{title} by {performer} - {weight}")
        return
    print("no recommendations")

In [6]:
%%time

# Fluorescent Adolescent by Arctic Monkeys
recommendations = get_recommendations("https://open.spotify.com/track/3DNRdudZ2SstnDCVKFdXxG", numRecs=10)

CPU times: user 1.31 ms, sys: 2.51 ms, total: 3.82 ms
Wall time: 4.78 ms


In [7]:
# Print the recommendations fetched above, one "<track> by <artist> - <count>" line each.
print_recommendations(recommendations)

The Nutcracker - Ballet, Op.71, Act II: No. 13 - Waltz of the Flowers by Pyotr Ilyich Tchaikovsky - 2
Chopin : Nocturne No.2 in E flat major Op.9 No.2 by Frédéric Chopin - 2
On the Beautiful Blue Danube, Op. 314 by Johann Strauss II - 1
Nocturne No.20 in C sharp minor, Op.posth. - Trans. Nathan Milstein by Frédéric Chopin - 1
Orff: Carmina Burana: Introduction, Fortuna Imperatrix Mundi, No. 1 "O Fortuna" (Chorus) by Carl Orff - 1
Tchaikovsky: Overture with Military Band, Op. 49, "1812 Overture" by Pyotr Ilyich Tchaikovsky - 1
El Club De Los Poetas Muertos by Orchestra Cinema Paradise - 1
Symphony No. 9 in D Minor, Op. 125 "Choral": IV. Finale: Presto - Allegro assai by Ludwig van Beethoven - 1
Symphony No. 5 in C Minor, Op. 67: I. Allegro con brio by Ludwig van Beethoven - 1
Liebesträum No. 2 ‘Seliger tod’, S. 541 by Franz Liszt - 1


In [8]:
# Persist the graph and the track lookup table as pickles under ./web.
for targetPath, payload in (
    ('./web/recommendation_graph.pkl', graph),
    ('./web/uriToData.pkl', uriToData),
):
    with open(targetPath, 'wb+') as outFile:
        pickle.dump(payload, outFile)


In [8]:
# Export the track lookup table to names.csv, one fully quoted row per track.
# utf-8 is set explicitly: track and artist names contain non-ASCII characters
# that the platform default encoding may not be able to represent.
with open('names.csv', 'w+', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['uri', 'name', 'artist', 'graph_index']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(
        {'uri': uri, 'name': name, 'artist': artist, 'graph_index': graphIndex}
        for uri, (graphIndex, name, artist) in uriToData.items()
    )

In [13]:
# Length of the longest artist name in the lookup table (0 when the table is empty).
maxArtistLen = max((len(trackData[2]) for trackData in uriToData.values()), default=0)

In [14]:
# Display the longest artist-name length computed in the previous cell.
maxArtistLen

267