In [None]:
import gzip
from collections import defaultdict
import random
import tensorflow as tf
import math
import statistics
import csv

In [None]:
# Load every row of the playlist dataset as a dict with normalised column names.
data = []
# newline="" is what the csv module expects so it can handle line endings
# (including newlines embedded in quoted fields) itself.
with open("spotify_dataset.csv", encoding="utf8", newline="") as csv_file:
    # Supplying fieldnames up front means DictReader does NOT consume the first
    # line as a header; it is yielded like any other row.
    csv_reader = csv.DictReader(
        csv_file,
        fieldnames=["user_id", "artist_name", "track_name", "playlist_name"],
    )

    # Skip the header line only. (Calling next() on a DictReader that has no
    # explicit fieldnames reads the header *and* discards the first data row.)
    next(csv_reader, None)

    data.extend(csv_reader)
        

In [None]:
# Show the first parsed row as a quick check of the column-name mapping.
print(data[0])

In [None]:
# Lookup structures filled in a single pass over the raw rows.
songArtistPerPlaylist = defaultdict(list)  # (user, playlist) -> [[artist, track], ...]
songIDs = {}       # track name -> integer index, in first-seen order
userIDs = {}       # user id    -> integer index, in first-seen order
interactions = []  # one (user, track) pair per row

for d in data:
    playlist, track, artist, user = (
        d[column]
        for column in ("playlist_name", "track_name", "artist_name", "user_id")
    )
    # setdefault inserts only when the key is new, so an existing index is kept.
    userIDs.setdefault(user, len(userIDs))
    songIDs.setdefault(track, len(songIDs))
    interactions.append((user, track))
    songArtistPerPlaylist[user, playlist].append([artist, track])

In [None]:

# Fraction of each playlist, counted from its start, that is used for training.
TRAIN_FRACTION = 0.8

dataTrain = defaultdict(list)
dataValid = defaultdict(list)

# Per-playlist 80/20 split: the first int(0.8 * n) songs go to training and the
# remaining songs go to validation, so the two parts never overlap.
for playlist, songs in songArtistPerPlaylist.items():
    split_index = int(TRAIN_FRACTION * len(songs))
    train_part = songs[:split_index]
    valid_part = songs[split_index:]
    # Only create an entry when there is something to store, so len(dataTrain)
    # and len(dataValid) count playlists that actually contributed songs.
    if train_part:
        dataTrain[playlist] = train_part
    if valid_part:
        dataValid[playlist] = valid_part

In [None]:
# Spot-check one (user, playlist) key: training-split size vs. full playlist size.
# Both containers are defaultdicts, so looking up a key that is absent inserts an
# empty list for it as a side effect.
print(len(dataTrain[("9cc0cfd4d7d7885102480dd99e7a90d6", "HARD ROCK 2010")]))
print(len(songArtistPerPlaylist[("9cc0cfd4d7d7885102480dd99e7a90d6", "HARD ROCK 2010")]))
# Expectation: the first count equals int(0.8 * second count), per the split above.

In [None]:
# Number of (user, playlist) keys in the full mapping and in each split.
print(len(songArtistPerPlaylist))
print(len(dataTrain))
print(len(dataValid))

In [None]:
# Training-split indexes:
#   songsPerUser: user -> list of [artist, track] entries (same list objects as dataTrain)
#   usersPerSong: (artist, track) -> list of users
songsPerUser = defaultdict(list)
usersPerSong = defaultdict(list)
for playlist, playlist_songs in dataTrain.items():
    user = playlist[0]  # keys are (user, playlist_name) tuples
    for song in playlist_songs:
        artist, track = song
        songsPerUser[user].append(song)
        usersPerSong[artist, track].append(user)

In [None]:
# Printing the whole mapping would dump every user's complete song list into the
# cell output; report its size and a small sample from one user instead.
print(len(songsPerUser), "users with training songs")
for sample_user, sample_songs in songsPerUser.items():
    print(sample_user, sample_songs[:5])
    break

In [None]:
# Zero-initialised bias tables: one entry per user in songsPerUser and one per
# (artist, track) key in usersPerSong.
betaU = dict.fromkeys(songsPerUser, 0)
betaI = dict.fromkeys(usersPerSong, 0)

In [None]:
def MSE(y, ypred):
    """Return the mean squared error between two sequences.

    Values are paired with ``zip``, so if the inputs differ in length the
    extra elements of the longer one are ignored. Empty input raises
    ``ZeroDivisionError`` because there is nothing to average.
    """
    squared_errors = []
    for actual, guess in zip(y, ypred):
        diff = actual - guess
        squared_errors.append(diff ** 2)
    return sum(squared_errors) / len(squared_errors)

In [None]:

class BPRbatch(tf.keras.Model):
    """BPR-style pairwise ranking model with item biases and K latent factors.

    Parameter table sizes come from the module-level ``userIDs`` and ``songIDs``
    dictionaries, so both must be populated before an instance is created.

    Args:
        K: number of latent dimensions for the user and item factor tables.
        lamb: coefficient applied to the L2 regulariser returned by ``reg``.
    """

    def __init__(self, K, lamb):
        super().__init__()
        # Initialize variables (small random values around zero)
        self.betaI = tf.Variable(tf.random.normal([len(songIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(songIDs),K],stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb

    # Prediction for a single instance
    # NOTE: this shadows tf.keras.Model.predict with a different signature.
    def predict(self, u, i):
        """Return the score betaI[i] + <gammaU[u], gammaI[i]> for one user/item index pair."""
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        """Return lamb times the summed L2 losses of all three parameter tables."""
        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
                            tf.nn.l2_loss(self.gammaU) +\
                            tf.nn.l2_loss(self.gammaI))

    def score(self, sampleU, sampleI):
        """Return a score per (user, item) pair for parallel index sequences.

        ``sampleU`` and ``sampleI`` are converted to int32 tensors and used to
        look up rows of the parameter tables.
        """
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        """Return the mean pairwise loss -log(sigmoid(x_ui - x_uj)) over the batch.

        ``sampleI`` holds the positive item index and ``sampleJ`` the negative
        item index for the user at the same position in ``sampleU``.
        """
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        # log_sigmoid is the numerically stable form of log(sigmoid(x)); the
        # naive composition underflows to log(0) = -inf for large negative margins.
        return -tf.reduce_mean(tf.math.log_sigmoid(x_ui - x_uj))


In [None]:
# Optimiser and model instance shared by the training step below.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
modelBPR = BPRbatch(K=3, lamb=1e-5)

# Pool of track names that negative samples are drawn from; trainingStepBPR
# reads it under the name `games`.
games = list(songIDs)

def trainingStepBPR(model, interactions, Nsamples=50000):
    """Run one optimisation step on a freshly sampled batch of (u, i, j) triples.

    Relies on the module-level ``optimizer``, ``games``, ``songsPerUser``,
    ``userIDs`` and ``songIDs``.

    Args:
        model: callable as ``model(sampleU, sampleI, sampleJ)`` returning a loss,
            and exposing ``reg()`` and ``trainable_variables``.
        interactions: sequence of (user, track) pairs to draw positives from.
        Nsamples: number of triples sampled for this step.

    Returns:
        The step's loss (including the regulariser) as a NumPy value.
    """
    sampleU, sampleI, sampleJ = [], [], []
    # Per-call cache: user -> set of track names the user has in songsPerUser.
    # songsPerUser stores [artist, track] lists, so testing a bare track name
    # against that list directly can never match; compare track names instead.
    seenTracks = {}
    for _ in range(Nsamples):
        u, i = random.choice(interactions)  # positive sample
        if u not in seenTracks:
            seenTracks[u] = {song[1] for song in songsPerUser.get(u, ())}
        seen = seenTracks[u]
        j = random.choice(games)  # negative sample
        # Reject tracks the user already has, and the positive itself.
        while j in seen or j == i:
            j = random.choice(games)
        sampleU.append(userIDs[u])
        sampleI.append(songIDs[i])
        sampleJ.append(songIDs[j])

    # Only the differentiable computation needs to be recorded on the tape.
    with tf.GradientTape() as tape:
        loss = model(sampleU, sampleI, sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    # Variables untouched by this batch have no gradient and are skipped.
    optimizer.apply_gradients(
        (grad, var)
        for grad, var in zip(gradients, model.trainable_variables)
        if grad is not None
    )
    return loss.numpy()

In [None]:
# Run 100 training steps, reporting the objective after every tenth one.
for i in range(100):
    obj = trainingStepBPR(modelBPR, interactions)
    if (i + 1) % 10 == 0:
        print(f"iteration {i + 1}, objective = {obj!s}")