# GitHub Fetch

In [3]:
import requests

requests.get("https://github.com/eifuentes/lastfm-dataset-1K/releases/download/v1.0/lastfm-dataset-1k.snappy.parquet")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fastparquet
  Downloading fastparquet-2022.12.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 4.9 MB/s 
Collecting pandas>=1.5.0
  Downloading pandas-1.5.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
[K     |████████████████████████████████| 12.2 MB 55.7 MB/s 
Collecting cramjam>=2.3
  Downloading cramjam-2.6.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 43.1 MB/s 
Installing collected packages: pandas, cramjam, fastparquet
  Attempting uninstall: pandas
    Found existing installation: pandas 1.3.5
    Uninstalling pandas-1.3.5:
      Successfully uninstalled pandas-1.3.5
Successfully installed cramjam-2.6.2 fastparquet-2022.12.0 pandas-1.5.2


# Parameters + Imports

In [12]:
import pandas as pd
import requests
from pprint import pprint
from collections import OrderedDict
import random
from pprint import pprint
import numpy as np
import time

EXPERIMENT_USER_COUNT = 100
EXPERIMENT_TRACK_COUNT = 500
EXPERIMENT_SPLIT = 400
BASKET_SIZE = 10

# Read raw data

In [13]:
raw = pd.read_parquet("lastfm-dataset-1k.snappy.parquet")
raw.style.hide_index()
raw.head()

  raw.style.hide_index()


Unnamed: 0,user_id,timestamp,artist_id,artist_name,track_id,track_name
0,user_000001,2006-08-13 13:59:20+00:00,09a114d9-7723-4e14-b524-379697f6d2b5,Plaid & Bob Jaroc,c4633ab1-e715-477f-8685-afa5f2058e42,The Launching Of Big Face
1,user_000001,2006-08-13 14:03:29+00:00,09a114d9-7723-4e14-b524-379697f6d2b5,Plaid & Bob Jaroc,bc2765af-208c-44c5-b3b0-cf597a646660,Zn Zero
2,user_000001,2006-08-13 14:10:43+00:00,09a114d9-7723-4e14-b524-379697f6d2b5,Plaid & Bob Jaroc,aa9c5a80-5cbe-42aa-a966-eb3cfa37d832,The Return Of Super Barrio - End Credits
3,user_000001,2006-08-13 14:17:40+00:00,67fb65b5-6589-47f0-9371-8a40eb268dfb,Tommy Guerrero,d9b1c1da-7e47-4f97-a135-77260f2f559d,Mission Flats
4,user_000001,2006-08-13 14:19:06+00:00,1cfbc7d1-299c-46e6-ba4c-1facb84ba435,Artful Dodger,120bb01c-03e4-465f-94a0-dce5e9fac711,What You Gonna Do?


# Rework data

### Get top tracks from dataset

In [14]:
top_tracks = raw.track_id.value_counts()[:1000].index.to_list()

### Remove Null users

In [15]:
end_users = []

raw = raw.dropna()

users = raw.user_id.unique()

end_users = users[:EXPERIMENT_USER_COUNT]

### Extract useful data from raw

In [16]:
data = {}
for user in end_users:
    data[user] = raw[raw["user_id"] == user][:EXPERIMENT_TRACK_COUNT].track_id.to_list()


# Extract track counts

In [17]:
GP = False

split = {}

# Dictionary of users containing their baskets
baskets = {}
explore_counts = {}
# Dictionary we will use for the prediction
prediction = {}
exploration = {}

for user in end_users:
  l = len(data[user])
  if l >= EXPERIMENT_SPLIT:
    lsplit = EXPERIMENT_SPLIT
  else:
    lsplit = (int)(l*0.8)
  split[user] = data[user][:lsplit]
  exploration[user] = data[user][(lsplit-l):]
  
  basket = OrderedDict()
  explore_count = OrderedDict()
  mem = []
  for track in split[user]:
    if track in mem:
      basket[track] += 1
    elif track != None:
      basket[track] = 1
      mem.append(track)

  mem = []
  for track in exploration[user]:
    if track in mem:
      explore_count[track] += 1
    elif track != None:
      explore_count[track] = 1
      mem.append(track)
  #print(basket)

  baskets[user] = []
  explore_counts[user] = []
  for title, count in basket.items():
    baskets[user].append([title, count])
  for title, count in explore_count.items():
    explore_counts[user].append([title, count])


In [18]:
print(baskets["user_000061"])

[['3d02535e-a5ef-4c25-8ad4-250542288fbd', 1], ['d2c36a97-6596-439e-9018-3d9ff08278c1', 1], ['652e6278-f7a3-4590-9f76-4d18b124922e', 1], ['2e25a2d0-f5e3-43ca-a8d5-a68c6f758f7b', 1]]


In [None]:
Nr = []
Ne = []
tReps = {}
tExps = {}

for user in end_users:
    tReps[user] = []
    tExps[user] = []
    for track in explore_counts[user]:
        if track[1] > 1:
            tReps[user].append(track[0])
            if user not in Nr:
                Nr.append(user)
        elif track[1] == 1:
            tExps[user].append(track[0])
            if user not in Ne:
                Ne.append(user)

print(len(Nr)/100)
print(len(Ne)/100)

# Baseline Algorithm

In [26]:
def baseline_algo(user_basket, user):
  frequency_list = {}
  #print(user_basket)
  for track in user_basket:
    if track[0] not in frequency_list.values():
      count = track[1]
      if count not in frequency_list.keys():
        frequency_list[count] = [track]
      else:
        frequency_list[count].append(track)
  #print(frequency_list)
  highest = 0
  for key in frequency_list.keys():
    if int(key) > highest:
      highest = key
  #print(highest)
  
  #predicted_track = random.choice(frequency_list[highest])
  next_basket = []
  limit = BASKET_SIZE
  count = 0
  for value in sorted(frequency_list.items(), reverse = True):
    lg = len(value[1])
    count += lg
    #print(count)
    if lg >= limit:
      for track in value[1][:limit]:
        next_basket.append(track)
      #print(next_basket)
      break
    else:
      limit -= lg
      for track in value[1]:
        next_basket.append(track)

  if len(next_basket) < BASKET_SIZE:
    for track in top_tracks:
      if track not in next_basket and track not in split[user]:
        next_basket.append([track, 0])
        
      if len(next_basket) >= BASKET_SIZE:
        break

  return next_basket

predicted_baskets = {}
for user in end_users:
  base = baseline_algo(baskets[user], user)
  predicted_baskets[user] = base

# Metrics

### Functions

In [27]:
import math

#print(predicted_baskets["user_000001"])
#print(data["user_000001"])
# Can only implement this if we can somehow rank the results of the algorithm

def recRep(basket, target):
  if len(target) == 0:
    return 0
  res = 0
  for track in basket:
    if track[0] in target:
      res += 1
  return res/len(target)

def preRep(basket, target):
  if len(target) == 0:
    return 0
  count = 0
  for track in basket:
    if track[0] in target:
      count += 1
  return count/len(basket)

def recExp(basket, target):
  if len(target) == 0:
    return 0
  res = 0
  for track in basket:
    if track[0] in target:
      res += 1
  return res/len(target)

def preExp(basket, target):
  if len(target) == 0:
    return 0
  count = 0
  for track in basket:
    if track[0] in target:
      count += 1
  return count/len(basket)

def PHR(basket, target):
  for track in basket:
    if track[0] in target:
      return 1
  return 0

def DCG(basket, history):
  res = 1
  for i, track in enumerate(basket[1:]):
    if track[0] in history:
      res += 1/(math.log2(i+2))
  return res

def NDCG(basket, history):
  res = DCG(basket, history)
  # Our algorithm always returns results in the optimal order : first, results the user listened to most, then the ones they may not have listened to.
  # So, NDCG is always 1
  return res/res

def average_score_array(array):
  for i in range(len(array)):
    if i >= 0 and i <= 2:
      array[i] = array[i]/len(Nr)
    elif i >= 3 and i <= 5:
      array[i] = array[i]/len(Ne)
    else:
      array[i] = array[i]/EXPERIMENT_USER_COUNT
  return array

def compute_tRep(testing_data):
  res = []
  for track in testing_data:
    if track[1] > 1:
      res.append()

### Computation

In [36]:
scores = {}

for user in Nr:
  #scores : Repeat Recall, Repeat Precision, Repeat PHR
  scores[user] = {}
  scores[user]["recRep"] = recRep(predicted_baskets[user], tReps[user])
  scores[user]["preRep"] = preRep(predicted_baskets[user], tReps[user])
  scores[user]["PHRRep"] = PHR(predicted_baskets[user], tReps[user])

for user in Ne:
  tExp = list(set(exploration[user]) - set(split[user]))
  #print("tExp: ", tExp)
  #print("predicted: ", predicted_baskets[user])
  if(user not in scores.keys()):
    scores[user] = {}
  
  scores[user]["recExp"] = recExp(predicted_baskets[user], tExps[user])
  scores[user]["preExp"] = preExp(predicted_baskets[user], tExps[user])
  scores[user]["PHRExp"] = PHR(predicted_baskets[user], tExps[user])

for user in end_users:
  scores[user]["NDCG"] = NDCG(predicted_baskets[user], data[user])

#print(scores)
avg_scores = [0,0,0,0,0,0,0]

for user in end_users:
  for i in range(7):
    if user in Nr and i == 0:
      avg_scores[i] += scores[user]["recRep"]
    elif user in Nr and i == 1:
      avg_scores[i] += scores[user]["preRep"]
    elif user in Nr and i == 2:
      avg_scores[i] += scores[user]["PHRRep"]
    elif user in Ne and i == 3:
      avg_scores[i] += scores[user]["recExp"]
    elif user in Ne and i == 4:
      avg_scores[i] += scores[user]["preExp"]
    elif user in Ne and i == 5:
      avg_scores[i] += scores[user]["PHRExp"]
    elif i == 6:
      avg_scores[i] += scores[user]["NDCG"]

avg_scores = average_score_array(avg_scores)

print("RepeatRecall, RepeatPrecision, RepeatPHR, ExploreRecall, ExplorePrecision, ExplorePHR, NDCG:")
print(avg_scores)

RepeatRecall, RepeatPrecision, RepeatPHR, ExploreRecall, ExplorePrecision, ExplorePHR, NDCG:
[0.12146810439593225, 0.1555555555555555, 0.5666666666666667, 0.025910608307885327, 0.14141414141414138, 0.6262626262626263, 1.0]
