In [187]:
#our goal is to have top K artist recommendations for a user, so that they would 
from math import log
from __future__ import print_function
import pandas as pd
import implicit
import numpy as np
import scipy
import random
from operator import itemgetter
from tqdm import tqdm_notebook as tqdm

df = pd.read_csv('lastfm_9000_users.csv', na_filter=False)
df = df.drop(['Unnamed: 0'], axis=1)

## TO DO

BIG THINGS: 
- set-up cross-validation
    - Baseline model evaluate
    - Tune our Hyperparameters
- fix precision/recall
- calculate NDCG  
- seeing if it scales

CASES TO EVALUATE:
- sparse vs. non-sparse data
- threshold of relevance
- which users should we hold out (those with > 10/20/30 values?)

PERFORMANCE:
- speed up split_train_per_user, by figuring out how to access zero values faster
- speed up precision, recall evaluation

## Data Preparation|

In [188]:
#create sparse matrix from dataframe object
def create_sparse_matrix(data, user_user = True):
    """
    Creates sparse matrix (csr_matrix) out of pandas dataframe.
    
    Parameters: 
    - data: Dataframe of user/artist data
    - user_user: determines whether output will be for user-to-user or item-to-item collaborative filtering
                 if user_user is True, then rows will be items and columns will be users
    
    Returns: 
    - plays_sparse: a sparse csr_matrix
    
    """
    print("Creating sparse matrix...")
    #grab unique users/artist IDS
    users = list(np.sort(data.user_id.unique()))
    artists = list(data.artist_mbid.unique())
    plays = list(data.plays)

    # user-user set-up
    if (user_user == True):
        rows = data.user_id.astype('category', categories=users).cat.codes
        cols = data.artist_mbid.astype('category', categories=artists).cat.codes
        plays_sparse = scipy.sparse.csr_matrix((plays, (rows, cols)), shape=(len(users),len(artists)))

    #item-item set-up
    else:    
        rows = data.artist_mbid.astype('category', categories=artists).cat.codes
        cols = data.user_id.astype('category', categories=users).cat.codes
        plays_sparse = scipy.sparse.csr_matrix((plays, (rows, cols)), shape=(len(artists),len(users)))
        
    return plays_sparse

In [189]:
#calculate how sparse the matrix is
def calculate_sparsity(M):
    """
    Calculates how sparse this matrix
    """
    matrix_size = float(M.shape[0]*M.shape[1]) # Number of possible interactions in the matrix
    num_plays = len(M.nonzero()[0]) # Number of items interacted with
    sparsity = 100*(1 - float(num_plays/matrix_size))
    return sparsity

## Split Train Test

In [190]:
#split train, test using all user pairs
def make_train_all_user_pairs(data, test_pct):
    """
    params:
        data: data set in csr_matrix format
        test_pct: percentage to be test set
    """
    #create copies of dataset for training and test data
    test = data.copy()
    train = data.copy()
    
    #alter train data, masking/holding-out random user-pair values for some users
    nonzero_idx = train.nonzero() #find indices in data where interaction exists
    
    
    nonzero_pairs = zip(nonzero_idx[0], nonzero_idx[1]) #create pairs of (user, item) index
    
    #determine how many user-pair values we need to mask, according to test_pct
    random.seed(0) #for reproducibility
    num_samples = int(np.ceil(test_pct * len(nonzero_pairs)))
    samples = random.sample(nonzero_pairs, num_samples) #sample random number of user-item pairs without replacement
    print(type(samples))
    #get user, item row and column indices
    user_idx = [index[0] for index in samples] 
    item_idx = [index[1] for index in samples] 
    
    train[user_idx,item_idx] = 0 #mask the randomly chosen user-item pairs
    train.eliminate_zeros() #remove zeros in sparse arrays that was made previously
    
    return train, test, list(set(user_idx)), samples #output unique list of user rows that were altered

In [191]:
#split train, test by user only with interactions#, with test size=total/k
def split_train_test_per_user(data, k, interactions = 20,cross_valid= False,):
    """
    Create train matrix with masked values and dictionary of test values 
    
    Parameters:
    - data: csr_matrix, assuming matrix is user-user (item as rows, columns as users)
    - test_pct: percentage of items to mask per user
    
    Output:
    - train: masked matrix
    - test: list of tuples of held out data ((user_idx, item_idx), plays)
    """
    random.seed(0) #for reproducibility
    
    train = data.copy() #transpose to make procedure easier/more intuitive
    
    test = dict() #dict to keep track of masked user-item values
    
    user_count = 0
    test_list=[]
    train_list=[]
    if cross_valid==True: #initialize
        for i in range(k):
            test_list.append(dict())
            train_list.append(train)
    
    #for each user in the training set
    for user_idx in tqdm(range(train.get_shape()[0])):

        #get indices of interactions of this user
        nonzero_idx = train[user_idx].nonzero()

        #only hold out users that have enough data (greater than interactions #)
        if nonzero_idx[1].shape[0] >= interactions:
            user_count += 1
            #create list of tuples: interaction index (row, col) with the number of plays
            nonzero_pairs = [((user_idx, item_idx), train[user_idx,item_idx]) for item_idx in nonzero_idx[1]]

            #sort tuples descending by value
            nonzero_sorted = sorted(nonzero_pairs, key = itemgetter(1), reverse = True)

            #get top interaction # values, then sample test_pct% randomly from subset
            top_values = nonzero_sorted[0:interactions]

            #sample random number of item_indexes without replacement
            num_samples = int(np.floor(interactions/float(k)))
            if (cross_valid==False): 
                samples = random.sample(top_values, num_samples) 

                #append user_idx, item_
                test[user_idx] = [pair[0][1] for pair in samples]

                #mask the randomly chosen items of this user
                for pair in samples:
                    train[pair[0][0], pair[0][1]] = 0

            else: #Cross Validation Step
                for i in range(k):
                    train = train_list[i]
                    k_test=test_list[i]
                    random.shuffle(top_values) 
                    samples=top_values[0:num_samples]
                    top_values=top_values[num_samples:]
                    #append user_idx, item_
                    k_test[user_idx] = [pair[0][1] for pair in samples]
                    test_list[i]=k_test #update test
                    #mask the randomly chosen items of this user
                    for pair in samples:
                        train[pair[0][0], pair[0][1]] = 0
                    train.eliminate_zeros()
                    train_list[i]=train #update train
    if (cross_valid==False):
        return train.T.tocsr(), test, user_count #convert matrix back
    else:
        for i in range(k):
            train_list[i]=train_list[i].T.tocsr()
        return train_list, test_list, user_count #convert matrix back


In [192]:
#calculate how many interactions are masked compared to previous dataset
def pct_masked(original, altered):
    altered_n = altered.nonzero()[0].shape[0]
    original_n = original.nonzero()[0].shape[0]
    return (original_n - altered_n)/float(altered_n)

## Custom Models

In [193]:
def baseline(k,user_items):
    plays=user_items.toarray()
    totalplays=np.sum(plays,axis=1)    
    idx = (-totalplays).argsort()[:k]
    return idx

## Evaluation/Metrics

In [194]:
def zeros_list(n):
    listofzeros = [0] * n
    return listofzeros

def dcg_at_k(scores):
    assert scores
    return scores[0] + sum(sc / log(ind, 2) for sc, ind in zip(scores[1:], range(2, len(scores)+1)))

def ndcg_at_k(predicted_scores, user_scores):
    """
    predicted_scores: recommended k items from model
    user_scores: held out items
    """
    assert len(predicted_scores) == len(user_scores)

    idcg = dcg_at_k(sorted(user_scores, reverse=True))
    x = (dcg_at_k(predicted_scores) / idcg) if idcg > 0.0 else 0.0
    
    return x

#used to evaluate model
def evaluate(model, test, M, n_rec = 20):
    """
    Calculate precision/recall
    
    parameters:
    - model: fitted implicit model that will perform recommendations
    - test: list containing tuples that are heldout for each user
    - M: csr_matrix of item-users, used in fit
    - n_rec: how many recommendations the system outputs
    
    
    returns:
    - two numpy arrays containing precision and recall
    """
    #TODO: Refactor NDCG and Recall, less dependent on each other
    
    M_rec = M.T.tocsr() #transpose to recommend
    
    tp = float(0)
    test_n = float(0)
    print('Evaluating model...')
    
    ndcg = list()
    #calculate true positives for each user, append results to list
    for user, holdout_items in tqdm(test.items()):
        
        #get list of item recs for each user
        rec = model.recommend(user, M_rec, N=n_rec, filter_already_liked_items=True) #returns (item_id, score)
        rec_items = [pair[0] for pair in rec] #get only item_id
        test_n += len(holdout_items) #total number of heldout items
        
        #for NDCG
        predicted_scores = zeros_list(n_rec)
        user_scores = zeros_list(n_rec)
        i = 0
            
        #count true positives in recommended items
        for item in holdout_items: 
            value = M[user,item]
            user_scores[i] = value
            i += 1
            if item in rec_items:
                predicted_scores[rec_items.index(item)] = value #if holdout items is in recommended
                tp += 1
        #Calculate NDCG
        ndcg.append(ndcg_at_k(predicted_scores, user_scores))

    recall = tp/test_n
    return recall, np.mean(ndcg)

In [202]:
#used to evaluate model
def evaluate_base(rec_items, test, M, n_rec = 20):
    """
    Calculate recall
    
    parameters:
    - model: fitted implicit model that will perform recommendations
    - test: list containing tuples that are heldout for each user
    - M: csr_matrix of item-users, used in fit
    - n_rec: how many recommendations the system outputs
    returns:
    - numpy array containing recall
    """
    M_rec = M.T.tocsr() #transpose to recommend
    
    tp = float(0)
    test_n = float(0)
    rec_items = list(rec_items)
    ndcg = list()

    print('Evaluating model...')
    #calculate true positives for each user, append results to list

    for user, holdout_items in tqdm(test.items()):
        test_n += len(holdout_items)
        #count true positives in recommended items
        predicted_scores = zeros_list(n_rec)
        user_scores = zeros_list(n_rec)
        i = 0
        for item in holdout_items:
            value = M[user,item]
            user_scores[i] = value
            i += 1
            if item in rec_items:
                predicted_scores[rec_items.index(item)] = value #if holdout items is in recommended
                tp += 1

        ndcg.append(ndcg_at_k(predicted_scores, user_scores))
        
    recall = tp/test_n
    return recall,np.mean(ndcg)

# Main Script

#### Create sparse matrix

In [185]:
"""
PREPARE
"""
#create sparse matrix
plays_sparse = create_sparse_matrix(df).astype('float')

#filter out users with < 15 artists/reduce sparsity if needed
print('Matrix Sparsity:', calculate_sparsity(plays_sparse))

"""
SPLIT TRAIN TEST
"""
train, test, user_count = split_train_test_per_user(plays_sparse, 3, 10)
print("Percentage of original data masked:", pct_masked(plays_sparse, train))
print("Users masked:", user_count)

Creating sparse matrix...




Matrix Sparsity: 99.90785209726101


Percentage of original data masked: 0.06849575890522634
Users masked: 8514


#### Run model-based once

In [196]:
"""
ALS MODEL BASED
"""
model = implicit.als.AlternatingLeastSquares(factors=30)

# K-Nearest Neighbors
# model = implicit.nearest_neighbours.BM25Recommender()

# train model 
print("Fitting model...")
model.fit(train, show_progress=True)

recall, ndcg = evaluate(model, test, plays_sparse)
print("Recall:",recall*100,'%')
print("Average NDCG:",ndcg*100,'%')

  3%|▎         | 0.5/15 [00:00<00:04,  3.48it/s]

Fitting model...


100%|██████████| 15.0/15 [00:04<00:00,  3.17it/s]


Evaluating model...


Recall: 21.364810899694618 %
Average NDCG: 11.86640420797939 %


#### Run Baseline once

In [197]:
#BASELINE
user_items = plays_sparse.T.tocsr()
rec_items=baseline(20,user_items)

In [203]:
#Evaluate Baseline
recall,ndcg = evaluate_base(rec_items,test,plays_sparse)
print(recall*100,'%')
print("Average NDCG per User:",ndcg*100,'%')

Evaluating model...


AttributeError: 'numpy.float64' object has no attribute 'append'

# Cross Validation 

### Splitting into test and training sets

In [14]:
#Cross Validation
k=5
train_list, test_list, user_count = split_train_test_per_user(plays_sparse,k,20,cross_valid=True)

A Jupyter Widget




### Evaluate Model-Based

In [15]:
#Evaluate Model-based from Cross Validation 

# train model
recall_list=[]
ndcg_list=[]
for i in range(k):
    print("Fitting model...")
    train=train_list[i]
    test=test_list[i]
    model = implicit.als.AlternatingLeastSquares(factors=50)
    model.fit(train, show_progress=False)
    recall,ndcg = evaluate(model,test,plays_sparse)
    print(recall*100,'%')
    print(ndcg*100,'%')
    recall_list.append(recall)
    ndcg_list.append(ndcg)
    print("---------------------------")
print("The mean recall is ", np.mean(recall_list))
print("The mean ndcg is ",np.mean(ndcg_list))

Fitting model...
Evaluating model...


A Jupyter Widget

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Evaluating model...


A Jupyter Widget

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Evaluating model...


A Jupyter Widget

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Evaluating model...


A Jupyter Widget

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Evaluating model...


A Jupyter Widget

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



ValueError: setting an array element with a sequence.

## Evaluate KNN 

In [None]:
#Evaluate KNN from Cross Validation 

# train model
recall_list=[]
for i in range(k):
    print("Fitting model...")
    train=train_list[i]
    test=test_list[i]
    model = implicit.nearest_neighbours.BM25Recommender()    
    model.fit(train, show_progress=False)
    recall = evaluate(model,test,plays_sparse)
    print(recall*100,'%')
    recall_list.append(recall)
    print("---------------------------")
print("The mean recall is ", np.mean(recall_list))


# Evaluate Baseline

In [None]:
#Evaluate Model-based from Baseline

# train model
recall_list=[]
for i in range(k):
    print("Fitting model...")
    train=train_list[i]
    test=test_list[i]
    user_items = plays_sparse.T.tocsr()
    rec_items=baseline(20,user_items)
    recall = evaluate_base(rec_items,test,plays_sparse)
    print(recall*100,'%')
    recall_list.append(recall)
    print("---------------------------")
print("The mean recall is ", np.mean(recall_list))

In [None]:
model = implicit.als.AlternatingLeastSquares(factors=50)
model.fit(plays_sparse)
catalog = []
for i in range(0,len(users)):
    for x,y in model.recommend(i,plays_sparse.T.tocsr(), N=20, filter_already_liked_items=True):
        if x not in catalog:
            catalog.append(x)
print('Catalog Coverage is', len(catalog)/len(artists))

In [None]:
#AUC https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc
#Primary metric: ROC-based
#calculate Precision, Recall, TPr, FPr, AUC by comparing resulting matrix to test_data
#Secondary metric: DCG

#cross_validate(data, k)
#get k-fold indices on train (mask again)
#for each different k-fold, loop through the indices, masked as train, test as not
#train model
#calculate Precision, Recall, AUC, append result to list
#return list of scores for each fold

#calculate ATOP? DCG? 