In [1]:
# Import relevant packages
from math import log
import pandas as pd
import implicit
import numpy as np
import scipy
import random
from operator import itemgetter
from tqdm import tqdm_notebook as tqdm
import seaborn as sns
from matplotlib import pyplot as plt
from pylab import savefig
import time 

## Data Preparation

create_sparse_matrix and calculate_sparsity are functions that allow us to set up the dataset in sparse matrix form.

In [2]:
# Create sparse matrix from dataframe object
def create_sparse_matrix(data, user_user = True):
    """
    Creates sparse matrix (csr_matrix) of play counts out of a pandas dataframe.

    Row/column positions are the category codes of the ids: users are ordered
    by sorted ``user_id``; artists are ordered by first appearance of
    ``artist_mbid`` in ``data``.

    Parameters:
    - data: Dataframe with ``user_id``, ``artist_mbid`` and ``plays`` columns
    - user_user: selects the orientation of the output.
                 If True, rows are users and columns are artists;
                 otherwise rows are artists and columns are users.

    Returns:
    - plays_sparse: a sparse csr_matrix of plays. Repeated (user, artist)
      pairs are summed by the csr_matrix constructor.
    """
    print("Creating sparse matrix...")
    # Unique ids define the matrix axes.
    users = np.sort(data.user_id.unique())
    artists = data.artist_mbid.unique()
    plays = data.plays.values

    # `Series.astype('category', categories=...)` was removed from pandas;
    # pd.Categorical with explicit categories yields the same integer codes.
    user_codes = pd.Categorical(data.user_id, categories=users).codes
    artist_codes = pd.Categorical(data.artist_mbid, categories=artists).codes

    if user_user:
        # user-user set-up: users x artists
        rows, cols = user_codes, artist_codes
        shape = (len(users), len(artists))
    else:
        # item-item set-up: artists x users
        rows, cols = artist_codes, user_codes
        shape = (len(artists), len(users))

    plays_sparse = scipy.sparse.csr_matrix((plays, (rows, cols)), shape=shape)
    return plays_sparse

In [3]:
# Calculate sparsity of matrix
def calculate_sparsity(M):
    """
    Return the sparsity of a matrix as a percentage (0-100).

    params:
        M: 2-D matrix exposing ``shape`` and ``nonzero()``

    returns:
        100 * (1 - nonzero_entries / total_entries)
    """
    n_rows, n_cols = M.shape[0], M.shape[1]
    # Every cell of the matrix is a possible interaction.
    total_cells = float(n_rows * n_cols)
    # nonzero() yields one row index per observed interaction.
    observed_rows = M.nonzero()[0]
    filled_fraction = float(len(observed_rows) / total_cells)
    return 100 * (1 - filled_fraction)

## Split Train Test

These two functions are used to split the dataset into test/train/tune in various ways. 
`make_train_all_user_pairs` splits the data into a test and a training set using a proportion taken over all users, while `split_train_test_per_user` holds out a fixed share of each user's interactions for the test set.

split_train_test_per_user also allows for the option of k-fold cross validation. 

In [5]:
# Split train/test per user, holding out only from users with at least `interactions` interactions; per-user test size = interactions/k
def split_train_test_per_user(data, k, interactions = 20,cross_valid= False):
    """
    Create masked train matrix/matrices and dictionaries of held-out items.

    For every row (treated as a user) with at least `interactions` non-zero
    entries, the `interactions` largest entries are selected and
    floor(interactions / k) of them are held out. Users with fewer non-zero
    entries are left untouched and do not appear in the test dictionaries.

    Parameters:
    - data: csr_matrix whose rows are iterated as users and whose columns are
            items. It is not modified.
    - k: the held-out sample per user has size floor(interactions / k); with
         cross_valid=True it is also the number of folds.
    - interactions: minimum number of non-zero entries a user needs, and the
                    size of the top-valued subset that samples are drawn from
    - cross_valid: if True, build k folds with disjoint held-out samples

    Output (cross_valid=False):
    - train: copy of `data` with the held-out entries removed, TRANSPOSED
             (items as rows, users as columns)
    - test: dict mapping user_idx -> list of held-out item indices
    - user_count: number of users that had enough interactions

    Output (cross_valid=True):
    - train_list: list of k transposed masked matrices, one per fold
    - test_list: list of k dicts (same layout as `test`), one per fold
    - user_count: as above
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    random.seed(0) #for reproducibility

    # Number of items held out per user (per fold when cross validating).
    num_samples = int(np.floor(interactions/float(k)))
    user_count = 0

    if cross_valid:
        # Each fold needs its OWN copy: sharing one matrix object between
        # folds would mask every fold's held-out items in every train matrix.
        train_list = [data.copy() for _ in range(k)]
        test_list = [dict() for _ in range(k)]
    else:
        train = data.copy()
        test = dict() #dict to keep track of masked user-item values

    for user_idx in tqdm(range(data.get_shape()[0])):

        # Column indices of this user's interactions, read from the
        # unmodified input so masking never influences the selection.
        item_indices = data[user_idx].nonzero()[1]

        # Only hold out users that have enough data
        if item_indices.shape[0] < interactions:
            continue
        user_count += 1

        # ((row, col), plays) tuples, sorted by descending plays
        nonzero_pairs = [((user_idx, item_idx), data[user_idx, item_idx]) for item_idx in item_indices]
        nonzero_sorted = sorted(nonzero_pairs, key = itemgetter(1), reverse = True)

        # Candidates for hold-out: the user's top `interactions` entries
        top_values = nonzero_sorted[0:interactions]

        if not cross_valid:
            # Sample without replacement and mask the sampled entries
            samples = random.sample(top_values, num_samples)
            test[user_idx] = [pair[0][1] for pair in samples]
            for (row, col), _ in samples:
                train[row, col] = 0
        else:
            # Successive folds draw from the remaining candidates, so the
            # held-out samples of different folds never overlap.
            for i in range(k):
                random.shuffle(top_values)
                samples = top_values[0:num_samples]
                top_values = top_values[num_samples:]
                test_list[i][user_idx] = [pair[0][1] for pair in samples]
                for (row, col), _ in samples:
                    train_list[i][row, col] = 0

    if not cross_valid:
        # Drop the explicit zeros left by masking (once, not per user) so
        # both modes return matrices with the same storage semantics.
        train.eliminate_zeros()
        return train.T.tocsr(), test, user_count

    for fold_train in train_list:
        fold_train.eliminate_zeros()
    train_list = [fold_train.T.tocsr() for fold_train in train_list]
    return train_list, test_list, user_count


In [6]:
# Calculate how many interactions are masked compared to previous dataset
def pct_masked(original, altered):
    """
    Fraction of the original interactions that are missing from `altered`.

    Parameters:
    - original: matrix before masking (must expose ``nonzero()``)
    - altered: matrix after masking

    Returns:
    - (nonzero(original) - nonzero(altered)) / nonzero(original) as a float
      in [0, 1] when `altered` is a masked version of `original`;
      0.0 when `original` has no interactions at all.
    """
    original_n = original.nonzero()[0].shape[0]
    altered_n = altered.nonzero()[0].shape[0]
    if original_n == 0:
        # Nothing could have been masked; avoid dividing by zero.
        return 0.0
    # Normalise by the ORIGINAL count: dividing by the altered count would
    # give the masked-to-remaining ratio rather than the share masked.
    return (original_n - altered_n)/float(original_n)

### Baseline Implementation

Below is how we generated our baseline recommendations (taking the most popular artists across the entire dataset and recommending them to everyone)

In [210]:
class Baseline():
    """
    Baseline model. Take most popular artist across entire dataset.

    The same `n_recs` row indices (those with the largest row sums of the
    fitted matrix) are recommended to every user.
    """
    def __init__(self, n_recs):
        # Number of items to recommend.
        self.n_recs = n_recs
        # Row indices of the most played items; populated by fit().
        self.idx = None

    def fit(self, item_user):
        """
        Rank rows of `item_user` by their total plays.

        Parameters:
        - item_user: matrix with one row per item (sparse matrix or ndarray)
        """
        print("Fitting baseline...")
        # Sum along rows without densifying the matrix; the result is
        # flattened to a 1-D array for both sparse and dense input.
        total_plays = np.asarray(item_user.sum(axis = 1)).ravel()
        #get index of most popular artists
        self.idx = (-total_plays).argsort()[:self.n_recs]

    def predict(self, X=None):
        """
        Return the indices computed by fit(); `X` is ignored.

        Raises:
        - AttributeError: if fit() has not been called yet
        """
        if self.idx is None:
            raise AttributeError("Baseline.fit() must be called before predict()")
        return self.idx

## Evaluation/Metrics

The following are the functions we wrote to determine the NDCG/recall of the baseline, ALS, and KNN recommendations. 

auto_tune_parameter is a function written to determine the best hyperparameters to use for a given model. 

NDCG Metrics

In [8]:
def zeros_list(n):
    """Return a new list containing `n` integer zeros."""
    return [0 for _ in range(n)]

def dcg_at_k(scores):
    """
    Discounted cumulative gain of a ranked list of gains.

    The first gain is taken as is; the gain at 1-based rank r >= 2 is divided
    by log2(r). `scores` must be non-empty.
    """
    assert scores
    top_gain, lower_gains = scores[0], scores[1:]
    # Ranks are 1-based, so the first discounted position is rank 2.
    discounted = (gain / log(rank, 2) for rank, gain in enumerate(lower_gains, start=2))
    return top_gain + sum(discounted)

def ndcg_at_k(rec_items, holdout_items):
    """
    Normalised DCG of a recommendation list.

    rec_items: gains of the k recommended items, in recommendation order
    holdout_items: gains of the held out items (same length as rec_items);
                   their descending order defines the ideal ranking

    Returns DCG(rec_items) / DCG(ideal), or 0.0 when the ideal DCG is not
    positive.
    """
    assert len(rec_items) == len(holdout_items)
    ideal_ranking = sorted(holdout_items, reverse=True)
    idcg = dcg_at_k(ideal_ranking)
    if idcg > 0.0:
        return dcg_at_k(rec_items) / idcg
    return 0.0

In [214]:
# Used to evaluate model
def evaluate(model, model_name, test, M_train, n_rec = 20):
    """
    Calculate evaluation metrics (coverage, precision@k, recall@k, NDCG@k)

    parameters:
    - model: fitted model that will perform recommendations
    - model_name: one of "baseline", "implicit" or "lightfm"; selects how
                  recommendations are requested from `model`
    - test: dict mapping user index -> list of held-out item indices
    - M_train: csr_matrix indexed as [user, item]. The play value of each
               held-out item is read from this matrix, so the held-out
               entries must still be present in it for NDCG to be non-zero.
    - n_rec: how many recommendations the system outputs

    returns:
    - (coverage, precision, recall, avg_ndcg): coverage is the share of items
      (columns of M_train) recommended at least once; precision and recall
      are pooled over all users; avg_ndcg is the mean per-user NDCG.
      All four are 0.0 when there is nothing to evaluate.

    raises:
    - ValueError: if `model_name` is not supported
    """
    print('Evaluating model...')

    #to store results
    ndcg = []
    catalog = set() #distinct recommended items (set: O(1) membership)
    user_n = 0.0
    test_n = 0.0 #keep track of number of heldout items
    tp = 0.0 #true positive

    n_items = M_train.shape[1]
    # The transposed matrix handed to implicit is loop-invariant: build once.
    # NOTE(review): the orientation expected by `model.recommend` depends on
    # the installed implicit version - confirm against the library docs.
    train_transposed = M_train.T.tocsr() if model_name == "implicit" else None

    print(model_name)
    for user, holdout_items in tqdm(test.items()):

        user_n += 1
        test_n += len(holdout_items)

        #get recommended items from models for user
        if model_name == "baseline":
            rec_items = model.predict()

        elif model_name == "implicit":
            rec_items = model.recommend(user, train_transposed, N=n_rec, filter_already_liked_items=True) #returns (item_id, score)
            rec_items = [pair[0] for pair in rec_items] #get only item_

        elif model_name == "lightfm":
            #sort scores, find top recommended items
            rec_items = (-model.predict([user],range(0,n_items))).argsort()[:n_rec]

        else:
            raise ValueError("Model may not be supported. Check if model name is correct.")

        #if np array change to list
        if isinstance(rec_items, np.ndarray):
            rec_items = rec_items.tolist()
        # Metrics are @n_rec: ignore anything a model returns beyond that
        # (also keeps the NDCG position lookup below within bounds).
        rec_items = list(rec_items)[:n_rec]

        #coverage calculation
        catalog.update(rec_items)

        #calculate True Positive and NDCG Placement
        predicted_items = zeros_list(n_rec)
        holdout_values = []
        for item in holdout_items:
            value = M_train[user,item] #get plays value of this holdout item
            holdout_values.append(value)

            if item in rec_items:
                tp += 1
                predicted_items[rec_items.index(item)] = value #get plays value of true positive

        # Ideal ranking: the n_rec largest held-out values, zero-padded so it
        # has exactly n_rec entries even when the holdout is shorter/longer.
        true_items = sorted(holdout_values, reverse=True)[:n_rec]
        true_items = true_items + zeros_list(n_rec - len(true_items))

        ndcg.append(ndcg_at_k(predicted_items, true_items))

    # Guard every ratio so an empty test set yields zeros instead of raising.
    recall = tp/test_n if test_n else 0.0
    precision = tp/(n_rec * user_n) if (n_rec and user_n) else 0.0
    avg_ndcg = np.mean(ndcg) if ndcg else 0.0
    coverage = len(catalog)/float(n_items) if n_items else 0.0

    return coverage, precision, recall, avg_ndcg

In [211]:
"""
Function that identifies optimal parameter values given relevant models and arrays of parameters

input: 
    - k: # of folds within the training set (split into tuning sets)
    - interactions: size of recommendation list
    - model: model that is being optimized   
    - data: sparse user-item matrix
    - param1: list of values to try for hyperparameter 1.
    - param2: list of values to try for hyperparameter 2. 
  
output:
    - max_ndcg_list: a list of k lists, one for each fold. 
        each list is in the form [max_ndcg,max_first_param,max_second_param,max_recall,max_precision,max_coverage]
        which records the best ndcg, and the two params that achieved it, 
        and the max recall, precision and coverage achieved (which may be from different param values).
    - heatmap_list: a list of k heatmaps of the NDCG values for the two tested 
        parameters (one heatmap per fold). Useful for visualizations
"""

def auto_tune_parameter(k,interactions,model,model_name,data,param1,param2,user_features=None,artist_features = None):
    """
    Grid-search two hyperparameters over k tuning folds, then refit with the
    best pair and report metrics on a held-out test set.

    input:
        - k: number of tuning folds built inside the train+tune set
        - interactions: forwarded to split_train_test_per_user as the
          per-user interaction threshold / candidate-set size
        - model: callable `model(value1, value2)` returning an unfitted model
        - model_name: name forwarded to evaluate() ("baseline", "implicit",
          "lightfm")
        - data: sparse matrix passed to split_train_test_per_user and used by
          evaluate() to look up held-out play values
        - param1: list of values to try for hyperparameter 1
        - param2: list of values to try for hyperparameter 2
        - user_features, artist_features: optional feature matrices; when
          user_features is given the model is fitted with
          `fit(train, user_features, artist_features)`, otherwise with
          `fit(train, show_progress=...)`

    output:
        - max_ndcg_list: one list per fold of the form
          [max_ndcg, max_first_param, max_second_param,
           max_recall, max_precision, max_coverage]; the params are those
          achieving max_ndcg, the other maxima may stem from other params
        - heatmap_list: one len(param1) x len(param2) nested list of NDCG
          values per fold

    The parameters used for the final fit are those of the single best
    (fold, param1, param2) NDCG; final test metrics are printed, not returned.
    """
    def _fit(candidate, matrix, show_progress):
        # Use `is None`: `== None` is evaluated element-wise on array/sparse
        # feature matrices and cannot be used as a condition.
        if user_features is None:
            candidate.fit(matrix, show_progress=show_progress)
        else:
            candidate.fit(matrix,user_features,artist_features)

    max_ndcg_list=[] #will end up being length k list of best param values
    heatmap_list=[]

    # First split: hold out the final test set.
    train_and_tune,test,user_count=split_train_test_per_user(data, k+1, interactions,cross_valid= False)
    # Second split: k tuning folds inside the remaining data. The first split
    # returns a transposed matrix, so transpose back before splitting again.
    tune_interactions=int(np.ceil(((k-1)/float(k))*interactions))
    train_list, tune_list, user_count = split_train_test_per_user(train_and_tune.T.tocsr(),k,tune_interactions,cross_valid=True)

    #to be updated via max: to determine final params to use on test set
    test_ndcg=0
    test_first_param=param1[0]
    test_second_param=param2[0]

    for fold in range(k):
        #NDCG for each combination of params in this fold
        ndcg_heatmap=[[0 for _ in range(len(param2))] for _ in range(len(param1))]
        train=train_list[fold]
        tune=tune_list[fold]
        max_first_param=param1[0] #initialize best value of first_param for this fold
        max_second_param=param2[0] #initialize best value of second_param for this fold
        max_recall=0
        max_ndcg=0
        max_precision=0
        max_coverage=0
        print("Fitting fold number...",fold)
        for value1_index, value1 in enumerate(param1):
            for value2_index, value2 in enumerate(param2):
                print("Trying ",(value1,value2))
                usemodel=model(value1,value2)
                _fit(usemodel, train, False)
                coverage, precision,recall,ndcg = evaluate(usemodel,model_name,tune,data)
                ndcg_heatmap[value1_index][value2_index]=ndcg #update heatmap
                #update maximum values
                max_recall=max(max_recall,recall)
                max_precision=max(max_precision,precision)
                max_coverage=max(max_coverage,coverage)
                if ndcg>max_ndcg:
                    max_ndcg=ndcg
                    max_first_param=value1
                    max_second_param=value2
        max_ndcg_list.append([max_ndcg,max_first_param,max_second_param,max_recall,max_precision,max_coverage])
        if max_ndcg>test_ndcg:
            print("Fold ",fold," beat the record for ncdg!")
            print("New best ndcg is ",max_ndcg)
            print("New best params are ",(max_first_param,max_second_param))
            test_ndcg=max_ndcg
            test_first_param=max_first_param
            test_second_param=max_second_param
        heatmap_list.append(ndcg_heatmap)
        print("end of fold---------------------------")

    #Now, test_first_param and test_second_param should be optimized.
    #Fit the final model the same way as during tuning (with features if given).
    usemodel=model(test_first_param,test_second_param)
    _fit(usemodel, train_and_tune, True)
    final_coverage,final_precision,final_recall,final_ndcg = evaluate(usemodel,model_name,test,data)

    print("The precision on the test set is ",final_precision,", after hyperparameter optimization")
    print("The recall on the test set is ", final_recall,", after hyperparameter optimization")
    print("The ndcg on the test set is ",final_ndcg,", after hyperparameter optimization")
    print("The coverage on the test set is ",final_coverage,", after hyperparameter optimization")

    return max_ndcg_list,heatmap_list