In [1]:
run analysis_functions.ipynb #import all helper functions

In [70]:
def active_users(data, threshold = 0.5):
    """
    Identifies and returns CSR matrices representing the most/least active users,
    determined by the number of plays they have logged
    
    Parameters:
    - data: csr_matrix (no_users, no_items) with users as rows and items as columns
    - threshold: a fraction between 0 and 1 for 'activeness', e.g. threshold=0.1 identifies 
    top 10% (and bottom 90%) of the most active users
    
    Output:
    - most_active: csr_matrix of the most active users as specified by threshold
    - least_active: csr_matrix of the least active users as specified by threshold
    
    Both outputs keep the column count of `data`, even when one of them has no rows.
    Users whose total plays equal the cut-off are counted as 'most active'.
    """
    data = data.tocsr()
    if data.shape[0] == 0:
        # A percentile of an empty sample is undefined; there is nothing to split.
        return data.copy(), data.copy()

    # Total plays per user; sum(axis=1) yields a (no_users, 1) result, so flatten it.
    plays_sum = np.asarray(data.sum(axis=1)).ravel()
    min_plays = np.percentile(plays_sum, (1 - threshold) * 100)

    # Select rows directly on the sparse matrix instead of densifying every user.
    is_least_active = plays_sum < min_plays
    most_active = data[np.flatnonzero(~is_least_active), :]
    least_active = data[np.flatnonzero(is_least_active), :]

    return most_active, least_active

In [71]:
def diverse_users(data, threshold = 0.5):
    """
    Identifies and returns CSR matrices representing the most/least diverse users,
    determined by the diversity of artists listened to.
    
    Diversity of each user is calculated by the spread of listens across all artists played:
    each user's row is min-max rescaled to [0, 1] and the rescaled values are summed.
    
    Parameters:
    - data: csr_matrix (no_users, no_items) with users as rows and items as columns
    - threshold: a fraction between 0 and 1 for diversity, e.g. threshold=0.1 identifies 
    top 10% (and bottom 90%) of users with the most diverse music taste
    
    Output:
    - most_diverse: csr_matrix of the most 'diverse' users as specified by threshold
    - least_diverse: csr_matrix of the least 'diverse' users as specified by threshold
    
    Both outputs keep the column count of `data`, even when one of them has no rows.
    Users whose score equals the cut-off are counted as 'most diverse'.
    """
    # Work on the matrix that was passed in, not on a notebook-level global.
    data = data.tocsr()
    no_users = data.shape[0]
    if no_users == 0:
        # A percentile of an empty sample is undefined; there is nothing to split.
        return data.copy(), data.copy()

    # NOTE(review): when a row is constant (min == max, e.g. a user with no plays)
    # the interpolation range is degenerate -- confirm the resulting score is intended.
    plays_norm = np.empty(no_users)
    for user in range(no_users):
        row = data.getrow(user).toarray().ravel()
        plays_norm[user] = np.interp(row, (row.min(), row.max()), (0, 1)).sum()
    min_score = np.percentile(plays_norm, (1 - threshold) * 100)

    # Select rows directly on the sparse matrix instead of densifying them again.
    is_least_diverse = plays_norm < min_score
    most_diverse = data[np.flatnonzero(~is_least_diverse), :]
    least_diverse = data[np.flatnonzero(is_least_diverse), :]

    return most_diverse, least_diverse

In [None]:
def mainstream_users(data, n = 20, threshold = 0.5):
    """
    Identifies and returns CSR matrices representing the most/least mainstream users,
    determined by the popularity of the artists they listened to.
    
    Popularity of each artist is calculated by the total plays across all users. The indicator
    for how mainstream a user is is computed by the number of top n artists they listen to.
    
    Parameters:
    - data: csr_matrix (no_users, no_items) with users as rows and items as columns
    - n: number of most popular artists considered when scoring a user (default 20)
    - threshold: a percentage threshold for mainstreamness, e.g. threshold=0.1 identifies 
    top 10% (and bottom 90%) of users with the most mainstream music taste
    
    Output:
    - most_mainstream: csr_matrix of the most mainstream users as specified by threshold
    - least_mainstream: csr_matrix of the least mainstream users as specified by threshold
    """