In [427]:
#our goal is to have top K artist recommendations for a user, so that they would 
from __future__ import print_function
import pandas as pd
import implicit
import numpy as np
import scipy
import random
from tqdm import tqdm_notebook as tqdm

df = pd.read_csv('lastfm_9000_users.csv', na_filter=False)
df = df[df.plays > 10] #our threshold of relevance is 10 plays
df = df.drop(['Unnamed: 0'], axis=1)

## TO DO

BIG THINGS: 
- set-up cross-validation
- calculate precision/recall/AUC using ALL user-pairs, not only k-recommended values
- calculate NDCG 
- calculate ATOP 
- calculate speed performance

CASES TO EVALUATE:
- sparse vs. non-sparse data
- threshold of relevance
- which users should we hold out (those with > 10/20/30 values?)

PERFORMANCE:
- speed up split_train_per_user, by figuring out how to access zero values faster
- speed up precision, recall evaluation

In [458]:
#create sparse matrix from dataframe object
def create_sparse_matrix(data, user_user = True):
    """
    Creates sparse matrix (csr_matrix) out of pandas dataframe.
    
    Parameters: 
    - data: Dataframe of user/artist data
    - user_user: determines whether output will be for user-to-user or item-to-item collaborative filtering
                 if user_user is True, then rows will be items and columns will be users
    
    Returns: 
    - plays_sparse: a sparse csr_matrix
    
    """
    print("Creating sparse matrix...")
    #grab unique users/artist IDS
    users = list(np.sort(data.user_id.unique()))
    artists = list(data.artist_name.unique())
    plays = list(data.plays)

    # user-user set-up
    if (user_user == True):
        rows = data.artist_name.astype('category', categories=artists).cat.codes
        cols = data.user_id.astype('category', categories=users).cat.codes
        plays_sparse = scipy.sparse.csr_matrix((plays, (rows, cols)), shape=(len(artists),len(users)))

    #item-item set-up
    else:    
        rows = data.user_id.astype('category', categories=users).cat.codes
        cols = data.artist_name.astype('category', categories=artists).cat.codes
        plays_sparse = scipy.sparse.csr_matrix((plays, (rows, cols)), shape=(len(users),len(artists)))
        
    return plays_sparse

#calculate how sparse the matrix is
def calculate_sparsity(M):
    """
    Calculates how many 
    p
    """
    matrix_size = float(M.shape[0]*M.shape[1]) # Number of possible interactions in the matrix
    num_plays = len(M.nonzero()[0]) # Number of items interacted with
    sparsity = 100*(1 - float(num_plays/matrix_size))
    return sparsity

#split train, test using all user pairs
def make_train_all_user_pairs(data, test_pct):
    """
    params:
        data: data set in csr_matrix format
        test_pct: percentage to be test set
    """
    #create copies of dataset for training and test data
    test = data.copy()
    train = data.copy()
    
    #alter train data, masking/holding-out random user-pair values for some users
    nonzero_idx = train.nonzero() #find indices in data where interaction exists
    nonzero_pairs = zip(nonzero_idx[0], nonzero_idx[1]) #create pairs of (user, item) index
    
    #determine how many user-pair values we need to mask, according to test_pct
    random.seed(0) #for reproducibility
    num_samples = int(np.ceil(test_pct * len(nonzero_pairs)))
    samples = random.sample(nonzero_pairs, num_samples) #sample random number of user-item pairs without replacement
    
    #get user, item row and column indices
    user_idx = [index[0] for index in samples] 
    item_idx = [index[1] for index in samples] 
    
    train[user_idx,item_idx] = 0 #mask the randomly chosen user-item pairs
    train.eliminate_zeros() #remove zeros in sparse arrays that was made previously
    
    return train, test, list(set(user_idx)), samples #output unique list of user rows that were altered

#split train, test by user only with k interactions
def split_train_test_per_user(data, test_pct, min_interactions = 0):
    """
    Create train matrix with masked values and dictionary of test values 
    
    data: csr_matrix, items-rows
    test_pct: percentage of items to mask per user
    """
    #create a copy for training data
    train = plays_sparse.copy().T.tocsr() #transpose to make calculation easier

    random.seed(0) #for reproducibilitiiy

    #dictionary with keys as users and values as a list of item-indexes
    #which were masked
    test = dict()
    
    print("Splitting train, test data for each user...")
    #for each user in sparse matrix, get random user-item pairs to mask
    for user_id in tqdm(range(train.get_shape()[0])):

        #find item indices interaction exists
        nonzero_idx = train[user_id].nonzero()

        #only mask users with enough data (greater than 10 interactions)
        if nonzero_idx[1].shape[0] >= min_interactions:
            item_idx = nonzero_idx[1] #get item indexes of interactions

            #sample random number of item_indexes without replacement
            num_samples = int(np.ceil(test_pct * item_idx.shape[0]))
            samples = random.sample(item_idx, num_samples) 

            #append user_id, item_indexes to test dictionary
            test[user_id] = samples
    
            #mask the randomly chosen items of this user
            for items in samples:
                train[user_id, items] = 0

    #remove zeros in sparse arrays to save space
    train.eliminate_zeros()
    
    return train.T.tocsr(), test #convert back

#calculate how many interactions are masked compared to previous dataset
def pct_masked(original, altered):
    altered_n = altered.nonzero()[0].shape[0]
    original_n = original.nonzero()[0].shape[0]
    return (original_n - altered_n)/float(altered_n)

#used to evaluate model
def evaluate(model, test, M, n_rec = 20):
    """
    Calculate precision/recall
    
    parameters:
    - model: fitted implicit model that will perform recommendations
    - test: dict containing items that are heldout for each user
    - M: csr_matrix of item-users, used in fit
    - n_rec: how many recommendations the system outputs
    
    
    returns:
    - two numpy arrays containing precision and recall
    """
    precision= list()
    recall= list()

    M_rec = M.T.tocsr() #transpose to recommend
    
    print('Evaluating model...')
    #calculate precision/recall for each user, append results to list
    for user, holdout_items in tqdm(test.items()):
        rec = model.recommend(user, M_rec, N=n_rec, filter_already_liked_items=True)
        rec_items = [pair[0] for pair in rec]
        
        #count true positives in recommended items
        tp = float(0)
        for item in holdout_items: 
            if item in set(recs_items):
                tp += 1
                
        #calculate precision and recall
        precision.append(tp/n_rec) #fraction of recommendations that are in hold-out set
        recall.append(tp/len(holdout_items)) #fraction of hold-out set that were reocmmended
    
    return np.asarray(precision), np.asarray(recall)

In [457]:
#MAIN SCRIPT

plays_sparse = create_sparse_matrix(df)

#filter out users with < 15 artists/reduce sparsity if needed
print('Matrix Sparsity:', calculate_sparsity(plays_sparse))

#split train,test by masking random values by user
train, test = split_train_test_per_user(plays_sparse, 0.20, 20)
print("Percentage of original data masked:", pct_masked(plays_sparse, train))

# Matrix Factorization
model = implicit.als.AlternatingLeastSquares(factors=50)

# K-Nearest Neighbors
# model = implicit.nearest_neighbours.BM25Recommender()

# train model 
print("Fitting model...")
model.fit(train, show_progress=True)

precision, recall = evaluate(model, test, plays_sparse)
print('Mean of Precision:',np.mean(precision)*100,'%')
print('Std of Precision:',np.std(precision)*100,'%')
print('Mean of Recall:',np.mean(recall)*100,'%')
print('Std of Recall:',np.std(recall)*100,'%')

Creating sparse matrix...


  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


Matrix Sparsity: 99.9078520973
Splitting train, test data for each user...


HBox(children=(IntProgress(value=0, max=8848), HTML(value=u'')))

  0%|          | 0/15 [00:00<?, ?it/s]

Percentage of original data masked: 0.256562846907
Fitting model...


100%|██████████| 15.0/15 [00:05<00:00,  3.01it/s]

Evaluating model...





HBox(children=(IntProgress(value=0, max=8178), HTML(value=u'')))

Mean of Precision: 1.9203961848862805 %
Std of Precision: 4.497427023400114 %
Mean of Recall: 3.845866997462742 %
Std of Recall: 9.035758132828487 %


In [None]:
#AUC https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc
#Primary metric: ROC-based
#calculate Precision, Recall, TPr, FPr, AUC by comparing resulting matrix to test_data
#Secondary metric: DCG

#cross_validate(data, k)
#get k-fold indices on train (mask again)
#for each different k-fold, loop through the indices, masked as train, test as not
#train model
#calculate Precision, Recall, AUC, append result to list
#return list of scores for each fold

#calculate ATOP? DCG? 