In [1]:
import gzip
from collections import defaultdict
import random
import tensorflow as tf
import math
import statistics
import csv




In [2]:
data = []
with open("spotify_dataset.csv", encoding="utf8") as csv_file:
    csv_reader = csv.DictReader(csv_file)

    #ignore first row
    next(csv_reader)

    csv_reader.fieldnames = ["user_id", "artist_name", "track_name", "playlist_name"]

    for l in csv_reader:
        data.append(l)
        

In [3]:
print(data[0])

{'user_id': '9cc0cfd4d7d7885102480dd99e7a90d6', 'artist_name': 'Elvis Costello & The Attractions', 'track_name': "(What's So Funny 'Bout) Peace, Love And Understanding", 'playlist_name': 'HARD ROCK 2010'}


In [4]:
songArtistPerPlaylist = defaultdict(list)
songIDs = {}
userIDs = {}
interactions = []

for d in data:
    playlist = d["playlist_name"]
    track = d["track_name"]
    artist = d["artist_name"]
    user = d["user_id"]
    if not user in userIDs: userIDs[user] = len(userIDs)
    if not track in songIDs: songIDs[track] = len(songIDs)
    interactions.append((user, track))
    songArtistPerPlaylist[(user, playlist)].append([artist,track])

In [5]:

dataTrain = defaultdict(list)
dataValid = defaultdict(list)

# 80 20 split
for playlist in songArtistPerPlaylist:
    playlist_length = len(songArtistPerPlaylist[playlist])
    split_index = int(0.8 * playlist_length)
    for i in range(0, split_index):
        dataTrain[playlist].append(songArtistPerPlaylist[playlist][i])
    for i in range(split_index):
        dataValid[playlist].append(songArtistPerPlaylist[playlist][i])

In [6]:
print(len(dataTrain[("9cc0cfd4d7d7885102480dd99e7a90d6", "HARD ROCK 2010")]))
print(len(songArtistPerPlaylist[("9cc0cfd4d7d7885102480dd99e7a90d6", "HARD ROCK 2010")]))
# it works

52
66


In [7]:
print(len(songArtistPerPlaylist))
print(len(dataTrain))
print(len(dataValid))

240039
222626
222626


In [8]:
songsPerUser = defaultdict(list)
usersPerSong = defaultdict(list)
for playlist in dataTrain:
    user = playlist[0]
    for song in dataTrain[playlist]:
        artist = song[0]
        track = song[1]
        songsPerUser[user].append(song)
        usersPerSong[(artist, track)].append(user)

In [9]:
#print(songsPerUser)

In [10]:
betaU = {}
betaI = {}
for u in songsPerUser:
    betaU[u] = 0

for t in usersPerSong:
    betaI[t] = 0

In [11]:
def MSE(y, ypred):
    sse = [(y_val - predicted_val) ** 2 for y_val, predicted_val in zip(y, ypred)]
    return sum(sse) / len(sse)

In [12]:

class BPRbatch(tf.keras.Model):
    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Initialize variables
        self.betaI = tf.Variable(tf.random.normal([len(songIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(songIDs),K],stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb

    # Prediction for a single instance
    def predict(self, u, i):
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
                            tf.nn.l2_loss(self.gammaU) +\
                            tf.nn.l2_loss(self.gammaI))
    
    def score(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))


In [13]:
optimizer = tf.keras.optimizers.Adam(0.01)
modelBPR = BPRbatch(5, 0.00001)

songs = list(songIDs.keys())

def trainingStepBPR(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ = [], [], []
        for _ in range(Nsamples):
            u,i = random.choice(interactions) # positive sample
            j = random.choice(songs) # negative sample
            while j in songsPerUser[u]:
                j = random.choice(songs)
            sampleU.append(userIDs[u])
            sampleI.append(songIDs[i])
            sampleJ.append(songIDs[j])

        loss = model(sampleU,sampleI,sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()




In [14]:
for i in range(100):
    obj = trainingStepBPR(modelBPR, interactions)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

iteration 10, objective = 0.6812658
iteration 20, objective = 0.66956604
iteration 30, objective = 0.6592799
iteration 40, objective = 0.6515281
iteration 50, objective = 0.6445346
iteration 60, objective = 0.6385529
iteration 70, objective = 0.6320902
iteration 80, objective = 0.6260958
iteration 90, objective = 0.6218401
iteration 100, objective = 0.61744994


In [15]:
def predictUser(user, song):
    u = userIDs[user]
    g = songIDs[song]
    return modelBPR.predict(u, g).numpy()

In [16]:
def notInPlaylistDataTrain(user, playlist):
    pair = (user, playlist)
    playedSongs = [playlist_song[1] for playlist_song in dataTrain[pair]]
    unplayedSongs = []
    for song in songIDs:
        if song not in playedSongs:
            unplayedSongs.append(song)
    return unplayedSongs


In [17]:
predictions = {}
for playlist in dataValid:
    user = playlist[0]
    desiredLength = len(dataValid[playlist])

    # we want to find the desiredLength number of songs to add to this playlist

    songScoreList = []
     
    # find which songs havent been interacted with
    unplayedSongs = notInPlaylistDataTrain(user, playlist[1])
    # go through each and predict
    for song in unplayedSongs:
        predictedScore = predictUser(user, song)
        songScoreList.append((song, predictedScore))

    # we sort the tuples by the predicted score value. higher predicted scores in front since more likely
    sortedSongScoreList = sorted(songScoreList, key=lambda x:x[1], reverse=True)  

    #slice and take the desiredLength of the sortedList to add.
    predictions[playlist] = [song[0] for song in sortedSongScoreList[0,desiredLength]]

In [None]:
# now lets calculate accuracy and MSE

actualSongs = []
predictedSongs = []
for playlist in dataValid:
    for song in dataValid[playlist]:
        actualSongs.append(song[1])

for playlist in predictions:
    for song in predictions[playlist]:
        predictedSongs.append(song)

ourMSE = MSE(actualSongs,predictedSongs)
print(ourMSE)

correct_predictions = sum(1 for actual, predicted in zip(actualSongs, predictedSongs) if actual == predicted)

accuracy = (correct_predictions/ len(actualSongs))

print(accuracy)