In [1]:
run analysis_functions.ipynb #import all helper functions

In [2]:
# Load the main Last.fm dataset. na_filter=False stops pandas from turning
# empty / "NA"-like strings into NaN. The 'Unnamed: 0' column is dropped
# (presumably a leftover index column from an earlier export -- TODO confirm).
df = pd.read_csv('lastfm_9000_users.csv', na_filter=False).drop(columns=['Unnamed: 0'])

# Build the sparse play matrix with the shared helper and report how sparse it is.
plays_sparse = create_sparse_matrix(df).astype('float')
print('Matrix Sparsity:', calculate_sparsity(plays_sparse))

Creating sparse matrix...
Matrix Sparsity: 99.8965986346416




In [3]:
def active_users(data, n = 5):
    """
    Splits users into activity groups, determined by the number of plays they have logged.

    Each user's activity is their total play count (the sum of their row). The
    totals are cut into n percentile bands of equal width (0 to 100/n percent,
    100/n to 2*100/n percent, ...). Every user is placed in exactly one band; a
    user whose total falls exactly on a band boundary goes to the lower band.

    Parameters:
    - data: csr_matrix (no_users, no_items) with users as rows and items as columns
    - n: number of distinct groups (classified by level of activity) to output

    Output:
    - groups: a list of n CSR matrices ordered from least to most active, each
      holding the rows of the users in that group (original row order is kept).
      A band that no user falls into yields a matrix with zero rows.
    """
    if n <= 0:
        return []

    data = scipy.sparse.csr_matrix(data)

    # Total plays per user as a flat 1-D array.
    plays_sum = np.asarray(data.sum(axis=1)).ravel()

    # n + 1 percentile edges delimit the n bands.
    edges = np.percentile(plays_sum, np.linspace(0, 100, n + 1))

    # Count how many interior edges lie strictly below each total: that count is
    # the band index, and it guarantees a user is never placed in two bands.
    group_of_user = np.searchsorted(edges[1:-1], plays_sum, side='left')

    # Row-index the sparse matrix directly instead of densifying every row.
    return [data[np.flatnonzero(group_of_user == j)] for j in range(n)]

In [4]:
def diverse_users(data, threshold = 0.5):
    """
    Identifies and returns CSR matrices representing the most/least diverse users,
    determined by the diversity of artists listened to.

    Diversity of each user is calculated by the spread of listens across all artists
    played: the user's row is min-max scaled to [0, 1] and summed, so a user whose
    plays are spread over many artists at levels close to their top artist scores high.
    A row whose values are all identical (e.g. a user with no plays) scores 0.

    Parameters:
    - data: csr_matrix (no_users, no_items) with users as rows and items as columns
    - threshold: a percentage threshold for diversity, e.g. threshold=0.1 identifies
    top 10% (and bottom 90%) of users with the most diverse music taste

    Output:
    - most_diverse: csr_matrix of the most 'diverse' users as specified by threshold
    - least_diverse: csr_matrix of the least 'diverse' users as specified by threshold
    """
    data = scipy.sparse.csr_matrix(data)

    plays_norm = []
    for user in range(data.shape[0]):
        row = data.getrow(user).toarray()[0]
        lo, hi = row.min(), row.max()
        if hi > lo:
            plays_norm.append(np.interp(row, (lo, hi), (0, 1)).sum())
        else:
            # np.interp requires increasing x-coordinates; with lo == hi the
            # scaling is undefined, so treat a constant row as having no spread.
            plays_norm.append(0.0)
    plays_norm = np.asarray(plays_norm)

    # Users scoring at or above this percentile form the "most diverse" group.
    min_score = np.percentile(plays_norm, (1-threshold) * 100)
    is_most = plays_norm >= min_score

    # Select rows straight from the sparse matrix; no dense copies are built.
    most_diverse = data[np.flatnonzero(is_most)]
    least_diverse = data[np.flatnonzero(~is_most)]

    return most_diverse, least_diverse

In [None]:
def mainstream_users(data, n = 20, threshold = 0.5):
    """
    Identifies and returns CSR matrices representing the most/least mainstream users,
    determined by the popularity of the artists they listened to.

    Popularity of each artist is calculated by the total plays across all users. The
    indicator for how mainstream a user is is the number of the top n artists they
    have listened to (at least one play).

    Parameters:
    - data: csr_matrix (no_users, no_items) with users as rows and items as columns
    - n: number of most-played artists that are considered 'mainstream'
    - threshold: a percentage threshold for mainstreamness, e.g. threshold=0.1 identifies
    top 10% (and bottom 90%) of users with the most mainstream music taste

    Output:
    - most_mainstream: csr_matrix of the most 'mainstream' users as specified by threshold
    - least_mainstream: csr_matrix of the least 'mainstream' users as specified by threshold
    """
    data = scipy.sparse.csr_matrix(data)

    # Column indices of the n artists with the highest total plays.
    # Reversing before slicing keeps n = 0 meaning "no artists" (a plain [-0:]
    # slice would select every column).
    artist_sum = np.asarray(data.sum(axis=0)).ravel()
    popular = np.argsort(artist_sum)[::-1][:n]

    # Only the n popular columns are densified, so this stays small.
    listened = data[:, popular].toarray() > 0
    mainstream_score = listened.sum(axis=1)

    # Same convention as diverse_users: users scoring at or above the
    # (1 - threshold) percentile form the "most" group.
    min_score = np.percentile(mainstream_score, (1-threshold) * 100)
    is_most = mainstream_score >= min_score

    most_mainstream = data[np.flatnonzero(is_most)]
    least_mainstream = data[np.flatnonzero(~is_most)]

    return most_mainstream, least_mainstream

In [11]:
# Scratch computation of the mainstream / non-mainstream split, run directly on
# `plays_sparse`. This is a notebook cell, not a function body, so it must not `return`.
top_n = 20        # number of most-played artists treated as "mainstream"
threshold = 0.5   # fraction of users to label as most mainstream

plays_csr = scipy.sparse.csr_matrix(plays_sparse)

# Total plays per artist, then the column indices of the top_n most-played artists.
artist_sum = np.asarray(plays_csr.sum(axis=0))[0]
popular = artist_sum.argsort()[-top_n:]

# Mainstream score of a user = how many of the popular artists they have played at all.
mainstream_score = (plays_csr[:, popular].toarray() > 0).sum(axis=1)
min_score = np.percentile(mainstream_score, (1 - threshold) * 100)

is_mainstream = mainstream_score >= min_score
most_mainstream = plays_csr[np.flatnonzero(is_mainstream)]
least_mainstream = plays_csr[np.flatnonzero(~is_mainstream)]

# Display the indices of the popular artists as the cell output.
popular

array([  16,  658,  821,  811, 1144,   49,  745,  664,  129,  583,   70,
         47,  671,  106,    0,  831,   56,  149,  331,   83])