In [9]:
import pandas as pd
import numpy as np
import scipy.stats as sp
import scipy.special as sps
import csv

In [10]:
sample = pd.read_csv('sampled_interactions.tsv', sep='\t', names=['user_id', 'track_id', 'play_count', 'gender'])
sample

Unnamed: 0,user_id,track_id,play_count,gender
0,1,2154,4,m
1,1,2241,4,m
2,1,3270,6,m
3,1,4003,2,m
4,1,5080,2,m
...,...,...,...,...
268362,120320,1480735,2,m
268363,120320,2626622,3,m
268364,120320,2465242,4,m
268365,120320,14175074,2,m


In [11]:
interactions = sample[['user_id', 'track_id', 'play_count']]
users = sample[['user_id', 'gender']].drop_duplicates()

men = set(users[users['gender'] == 'm']['user_id'].values)
women = set(users[users['gender'] == 'f']['user_id'].values)
print(f'men: {len(men)}')
print(f'women: {len(women)}')
print(f'total listening events: {interactions["play_count"].sum()}')

track_ids = np.sort(np.array(interactions['track_id'].unique(), dtype = np.int32))
print(f'unique tracks: {len(track_ids)}')

popularities_df = interactions[['track_id', 'play_count']].groupby('track_id').sum()
popularities = np.array(popularities_df['play_count'].values)

print(f'average sum of track playcounts: {popularities.mean()}')
popularities_df.head(5) # P(t)   

men: 17977
women: 4939
total listening events: 1967620
unique tracks: 7140
average sum of track playcounts: 275.57703081232495


Unnamed: 0_level_0,play_count
track_id,Unnamed: 1_level_1
482,12009
521,8427
724,17837
1041,501
1112,10192


In [12]:
grouped_by_users = interactions[['user_id', 'track_id']].groupby('user_id')
user_histories = {user_id: group['track_id'].values for user_id, group in grouped_by_users}

print(f'example listening history: {user_histories[1]}')
print(f'average length of user history: {np.mean([len(value) for key, value in user_histories.items()])}')

example listening history: [2154 2241 3270 4003 5080 6353 7724 8543]
average length of user history: 11.65900599530802


In [13]:
# decile-binning so that the cumulative popularity of all tracks in one bin is approximately 10% of the total popularity
sorted_pops_df = popularities_df.sort_values('play_count', ascending = False)
sorted_tracks = np.array(sorted_pops_df.index.values)
sorted_pops = np.array(sorted_pops_df['play_count'].values)

total_popularity = popularities.sum()
bin_limit = total_popularity / 10

bins = [list() for i in range(0,10)]

current_pop_sum = 0
current_bin = 0
for i, track in np.ndenumerate(sorted_tracks):
    current_pop_sum = current_pop_sum + sorted_pops[i]
    
    bins[current_bin].append(track) 
    
    if current_pop_sum >= bin_limit:
        current_pop_sum = 0
        current_bin = current_bin + 1


In [14]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(1, 20))

interactions['scaled_counts'] = scaler.fit_transform(interactions[['play_count']])
interactions = interactions.drop(columns = 'play_count')
interactions = interactions.rename(columns = {'scaled_counts': 'play_count'})
interactions['play_count'].sort_values()

268366     1.000000
171755     1.000000
171756     1.000000
171758     1.000000
70998      1.000000
            ...    
16803     14.049217
209478    15.721104
198703    15.905295
261966    16.585384
198710    20.000000
Name: play_count, Length: 268367, dtype: float64

In [15]:
REC_LIST_SIZE = 10
zeros = np.zeros(len(track_ids), dtype = np.int32)
ones = np.ones(len(track_ids), dtype = np.int32)

def non_zero_median(a): #  the median in these distributions is typically 0, so np.median gets div-by-zero
    return np.median(a[a.nonzero()])


class AlgoResult:
    def __init__(self, name, recs, score):
        self.name = name
        self.recs = recs
        self.score = score

class Evaluation:        
    def __init__(self, testset, algos_list):
        self.users = set([uid for (uid, _, _) in testset])
        self.women = set([uid for uid in self.users if uid in women])
        self.men = set([uid for uid in self.users if uid in men])

        self.algos_list = algos_list # list of AlgoResults
        
        self.metrics = [np.mean, non_zero_median, np.var, sp.skew, sp.kurtosis]        
                
    def calc_metrics(self, uid, rec_list):
        h = np.where(np.isin(track_ids, user_histories[uid]), ones, zeros) * popularities
        r = np.where(np.isin(track_ids, rec_list[:len(user_histories[uid])]), ones, zeros) * popularities
        
        calc = []
        for metric in self.metrics:
            mh = metric(h)
            mr = metric(r)
            calc.append(100 * (mr - mh) / mh)
            
            
        kt, _ = sp.kendalltau(h,r)    
        calc.append(kt)
        return calc

    def aggregate_metrics(self, recs, name, score, gender = None):
        if gender == "women":
            metrics = np.array([self.calc_metrics(uid, rec_list) for (uid, rec_list) in recs if uid in women])
        elif gender == "men":
            metrics = np.array([self.calc_metrics(uid, rec_list) for (uid, rec_list) in recs if uid in men])
        else:
            gender = "all"
            metrics = np.array([self.calc_metrics(uid, rec_list) for (uid, rec_list) in recs])   
            
        metrics = np.median(metrics, axis = 0)
        return pd.Series({'algorithm': name, 'gender': gender, 'mean': metrics[0], 'median': metrics[1],
                          'variance': metrics[2], 'skew': metrics[3], 'kurtosis': metrics[4],
                          'kendall-tau': metrics[5], 'rmse': score})
            
    def eval_algo(self, algo):
        print(f'processing {algo.name}')
        
        a = self.aggregate_metrics(algo.recs, algo.name, algo.score)
        b = self.aggregate_metrics(algo.recs, algo.name, algo.score, "women")
        c = self.aggregate_metrics(algo.recs, algo.name, algo.score, "men")
        
        return pd.DataFrame(data = [a, b, c])
 
    def process(self):
        return pd.concat([self.eval_algo(algo) for algo in self.algos_list], ignore_index = True)



    def calc_kl(self, uid, rec_list):
        print(uid, end = '\r')
        #sum_kl = 0
        
        #for b in bins:
            #zeros_b = np.zeros(len(b), dtype = np.int32)
            #ones_b = np.ones(len(b), dtype = np.int32)
            #h = np.where(np.isin(b, user_histories[uid]), ones_b, zeros_b) * popularities
            #r = np.where(np.isin(b, rec_list[:len(user_histories[uid])]), ones_b, zeros_b) * popularities
            #h = [popularities_df.loc[track]['play_count'] if track in user_histories[uid] else 0 for track in b]
            #r = [popularities_df.loc[track]['play_count'] if track in rec_list[:len(user_histories[uid])] else 0 for track in b]
            #h = np.array(h)
            #r = np.array(r)
            
            #sum_kl = sum_kl + np.sum(np.where(h != 0, h * np.log(h / r), 0))
            #sum_kl = sum_kl + sum(sps.kl_div(h,r))
                        
        #return sum_kl
        
        
        h = np.where(np.isin(track_ids, user_histories[uid]), ones, zeros) * popularities
        r = np.where(np.isin(track_ids, rec_list[:len(user_histories[uid])]), ones, zeros) * popularities

        #return sps.kl_div(h,r)
        return sp.kendalltau(h,r)
            
            


In [16]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import NormalPredictor, BaselineOnly, KNNWithMeans, KNNBaseline, SVD, NMF, SlopeOne, CoClustering
from surprise.accuracy import rmse
from surprise.model_selection import GridSearchCV

dataset = Dataset.load_from_df(interactions, Reader(rating_scale=(1, interactions['play_count'].max())))

# param_grid = 
# SVD - {'n_factors': [20, 50, 100, 150], 'n_epochs': [5, 10, 20, 25, 30], 'lr_all': [0.002, 0.005, 0.01],
#        'reg_all': [0.02, 0.1, 0.4, 0.6], 'verbose': [True]}
# NMF - {'n_factors': [5, 15, 30], 'n_epochs': [5, 15, 50, 75], 'biased': [True, False] , 'verbose': [True]}
# Co-clustering - {'n_cltr_u ': [3,5,10], 'n_cltr_i ': [3,5,10], 'n_epochs': [10, 20, 35, 50], 'verbose': [True]}
# KNNWithMeans & KNNBaseline - {'k': [10, 20, 40, 75, 100], 'verbose': [True],
#                               'sim_options': {'name': ['msd', 'cosine'], 'user_based': [False]}}

#gs = GridSearchCV(KNNBaseline, param_grid, measures = ['rmse', 'mae'], cv = 5, joblib_verbose = True, n_jobs = -1)
#gs.fit(dataset)

#print(gs.best_score['rmse'])
#print(gs.best_params['rmse'])
#algo = gs.best_estimator['rmse']
# SVD - {'n_factors': 20, 'n_epochs': 25, 'lr_all': 0.005, 'reg_all': 0.6, 'verbose': True}
# NMF - {'n_factors': 5, 'n_epochs': 75, 'biased': True, 'verbose': True}
# Co-clustering - {'n_cltr_u': 3, 'n_cltr_i': 3, 'n_epochs': 10, 'verbose': True}
# KNNWithMeans & KNNBaseline - {'k': 100, 'verbose': True, 'sim_options': {'name': 'cosine', 'user_based': False}}

In [228]:
from collections import defaultdict
from surprise.model_selection import train_test_split

def get_top_n(predictions, n = REC_LIST_SIZE):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

def train_and_rec(trainset, testset, algorithm):
    algorithm.fit(trainset)
    predictions = algorithm.test(testset)  
    top_n = get_top_n(predictions)
 
    score = rmse(predictions, verbose=False)
    
    return score, [(uid, [iid for (iid, _) in user_ratings]) for uid, user_ratings in top_n.items()]

def get_algo_results(trainset, testset):
    algorithms = [NormalPredictor(),
                  BaselineOnly(verbose = True),
                  SlopeOne(),
                  KNNWithMeans(verbose = True, k = 100, sim_options = {'name': 'cosine', 'user_based': False}),
                  KNNBaseline(verbose = True, k = 100, sim_options = {'name': 'cosine', 'user_based': False}),
                  SVD(verbose = True, random_state = 42, n_epochs = 25, n_factors = 20, lr_all = 0.005, reg_all = 0.6),
                  NMF(verbose = True, random_state = 42, n_epochs = 75, n_factors = 5, biased = True),
                  CoClustering(verbose = True, random_state = 42, n_cltr_u = 3, n_cltr_i = 3, n_epochs = 10)]       
        
    res = []
    for algorithm in algorithms:
        score, recs = train_and_rec(trainset, testset, algorithm)
        res.append(AlgoResult(type(algorithm).__name__, recs, score))
           
    return res

In [229]:
trainset, testset = train_test_split(dataset, test_size=.2, random_state = 42)
res = get_algo_results(trainset, testset)

import time
start = time.time()

evals = Evaluation(testset, res)
scores = evals.process()

end = time.time()
print(end - start)

scores.to_csv('output.txt')

Estimating biases using als...


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  algorithm.fit(trainset)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 

In [230]:
pd.read_csv('output.txt')

Unnamed: 0.1,Unnamed: 0,algorithm,gender,mean,median,variance,skew,kurtosis,kendall-tau,rmse
0,0,NormalPredictor,all,-83.235975,2.840673,-91.979982,37.721537,77.794421,0.447151,0.247181
1,1,NormalPredictor,women,-82.92457,1.839306,-92.172687,35.695129,73.986031,0.447276,0.247181
2,2,NormalPredictor,men,-83.302088,3.130148,-91.918975,38.293489,78.973906,0.447151,0.247181
3,3,BaselineOnly,all,-83.108081,3.562,-91.914156,37.634934,77.670031,0.447151,0.183616
4,4,BaselineOnly,women,-82.870861,2.028526,-92.148991,35.723443,74.114849,0.447276,0.183616
5,5,BaselineOnly,men,-83.171336,3.974895,-91.823457,38.214521,78.721728,0.447151,0.183616
6,6,SlopeOne,all,-83.211873,2.770307,-91.974407,37.726948,77.813245,0.447151,0.195959
7,7,SlopeOne,women,-82.92457,1.772855,-92.172687,35.695129,73.986031,0.447276,0.195959
8,8,SlopeOne,men,-83.276109,3.061224,-91.905325,38.317732,79.004219,0.447151,0.195959
9,9,KNNWithMeans,all,-83.12568,3.581702,-91.91532,37.634483,77.660493,0.447151,0.200465


In [231]:
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

user_item = csr_matrix((interactions['play_count'], (interactions['user_id'], interactions['track_id'])),
                          shape=(interactions['user_id'].max() + 1, interactions['track_id'].max() + 1))
item_user = user_item.T.tocsr()


als = AlternatingLeastSquares(factors = 10)
als.fit(item_user)

  0%|          | 0/15 [00:00<?, ?it/s]

In [232]:
recommendations = als.recommend(7, user_item[7], 10)
recommendations

(array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32))

In [233]:
#from sklearn.model_selection import train_test_split
#train_val, test = train_test_split(interactions, test_size = 0.2)
#train, validation = train_test_split(train_val, test_size = 0.25) # 0.25*0.8 = 0.2
#print(len(train))
#print(len(validation))
#print(len(test))

#len(test[test['user_id'].isin(users[users['gender'] == 'f']['user_id'].values)]['user_id'].values)