## User-user similarity

User-user cosine similarity is computed within each subcluster, based on each user's counts of book publication years and category tags.
The purpose of this part is to select the most similar users within each group for the final output of the application.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from itertools import product

In [2]:
# Load the two inputs: the per-user book records (needs the 'user_idx',
# 'pub_year' and 'popular_shelves' columns) and the user -> group assignments
# (needs the 'user_idx' and 'group' columns).
USER_BOOKS_CSV = 'user_id_rating_book_all.csv'
CLUSTERS_CSV = 'clusters_final.csv'

user_all = pd.read_csv(USER_BOOKS_CSV)
cluster = pd.read_csv(CLUSTERS_CSV)

In [3]:
# get each group from user_all
def clust_group(dfin, dfclusters, clustid, min_tag_users=10):
    """ Build the user vs (publication year, category tag) count matrix of one group

    The rows of ``dfin`` that belong to users of group ``clustid`` are counted
    per (user, publication year) and per (user, category tag). Both count
    tables are pivoted to one row per user and joined on user_idx.

    Args:
        dfin (:obj:`DataFrame`): pandas DataFrame of user and features; must
            contain the columns 'user_idx', 'pub_year' and 'popular_shelves'
        dfclusters (:obj:`DataFrame`): pandas DataFrame of user_idx and group number
        clustid (:obj:`int`): number of group
        min_tag_users (:obj:`int`, optional): minimum number of distinct users
            of the group that must carry a tag for the tag to be kept as a
            feature column. Defaults to 10.

    Returns:
        :obj:`tag_year`: pandas DataFrame pivoted with user vs book year and tag,
            indexed by user_idx, with missing counts filled with 0. Only users
            present in both the year table and the tag table are kept
            (``pd.merge`` performs an inner join by default).
    """

    # users assigned to the requested group
    in_group = dfclusters['group'] == clustid
    group_users = dfclusters.loc[in_group, 'user_idx'].unique()

    # Keep only the columns used below, selected by name rather than by
    # position so the function does not depend on the column order (or the
    # number of columns) of the input frame.
    needed_columns = ['user_idx', 'pub_year', 'popular_shelves']
    df_clust_filtered = dfin.loc[dfin['user_idx'].isin(group_users), needed_columns]

    # count the number of rows per (user, publication year) and pivot
    count_user_year = (
        df_clust_filtered.groupby(['user_idx', 'pub_year'])
        .size()
        .reset_index(name='year_count')
    )
    user_year_pivot = pd.pivot_table(
        count_user_year, index='user_idx', columns='pub_year', values='year_count'
    )

    # count the number of rows per (user, category tag)
    count_user_tag = (
        df_clust_filtered.groupby(['user_idx', 'popular_shelves'])
        .size()
        .reset_index(name='tag_count')
    )

    # count_user_tag holds one row per (user, tag) pair, so the number of
    # occurrences of a tag here is the number of distinct users carrying it.
    users_per_tag = count_user_tag['popular_shelves'].value_counts()
    frequent_tags = users_per_tag[users_per_tag >= min_tag_users].index
    df_u_tags = count_user_tag[count_user_tag['popular_shelves'].isin(frequent_tags)]
    user_tag_pivot = pd.pivot_table(
        df_u_tags, index='user_idx', columns='popular_shelves', values='tag_count'
    )

    # merge the two pivots on their shared index level 'user_idx';
    # a (user, feature) combination that never occurred becomes a count of 0
    tag_year = pd.merge(user_year_pivot, user_tag_pivot, on='user_idx')
    return tag_year.fillna(0)

def cosine_similarity_clusters(dfin):
    """ Compute the pairwise cosine similarity between the rows (users) of a feature matrix

    Args:
         dfin (:obj:`DataFrame`): pandas DataFrame pivoted with user vs book year and tag

    Returns:
        :obj:`similarity_with_tag_year`: square pandas DataFrame of user-user
            similarity whose rows and columns are both labelled with the index
            of ``dfin``; the diagonal (self-similarity) is set to 0
    """
    user_labels = dfin.index

    scores = cosine_similarity(dfin)
    # a user is not a candidate match for themselves: blank out the diagonal
    np.fill_diagonal(scores, 0)

    return pd.DataFrame(scores, index=user_labels, columns=user_labels)

def get_similarity_flat(similarity, group):
    """ Flatten similarity matrix into row, col, similarity, group format

    Every ordered pair of distinct users (u1, u2) yields one entry, in
    row-major order: all pairs of the first user, then of the second, and so on.

    Args:
         similarity (:obj:`DataFrame`): square pandas DataFrame of user-user
             similarity whose columns carry the same labels as its index.
             Labels are assumed to be unique.
         group (:obj:`int`): number of group

    Returns:
        :obj:`user1`: list of user 1
        :obj:`user2`: list of user 2
        :obj:`sim`: list of user1/user2 similarity
        :obj:`groupusers`: list of groupusers
    """
    labels = similarity.index

    # Order the columns like the rows so that positional indexing below reads
    # the same cell as a label lookup similarity.loc[u1, u2] would.
    matrix = similarity.loc[:, labels].to_numpy()

    # Positions of all off-diagonal cells; np.nonzero returns them in
    # row-major order, i.e. the order of a nested loop over (row, column).
    off_diagonal = ~np.eye(len(labels), dtype=bool)
    row_pos, col_pos = np.nonzero(off_diagonal)

    user1 = labels[row_pos].tolist()
    user2 = labels[col_pos].tolist()
    sim = matrix[row_pos, col_pos].tolist()
    groupusers = [group] * len(sim)
    return user1, user2, sim, groupusers

In [4]:
# Per-group accumulators: each receives one inner list per cluster group.
allu1, allu2, allsim, allgroups = [], [], [], []

# Distinct group ids found in the cluster assignment table.
listgroups = list(set(cluster['group']))


def _flatten(nested):
    """Concatenate a list of lists into one flat list, preserving order."""
    flat = []
    for chunk in nested:
        flat.extend(chunk)
    return flat


for group in listgroups:
    # feature matrix -> similarity matrix -> flat (user1, user2, sim, group) lists
    tag_year = clust_group(user_all, cluster, group)
    similarity_matrix = cosine_similarity_clusters(tag_year)
    user1, user2, sim, groupusers = get_similarity_flat(similarity_matrix, group)

    per_group = (user1, user2, sim, groupusers)
    for bucket, values in zip((allu1, allu2, allsim, allgroups), per_group):
        bucket.append(values)

# Flatten the per-group lists into one list per output column
allu1_flat = _flatten(allu1)
allu2_flat = _flatten(allu2)
allsim_flat = _flatten(allsim)
allgroups_flat = _flatten(allgroups)

# One row per ordered (User1, User2) pair, tagged with the group it came from
users_similarity = pd.DataFrame(
    zip(allu1_flat, allu2_flat, allsim_flat, allgroups_flat),
    columns=['User1', 'User2', 'Similarity', 'Group'],
)

In [None]:
# Write the pairwise similarity table to disk. The DataFrame built above is
# named `users_similarity`; the previous `user_similarity` was undefined and
# raised a NameError.
users_similarity.to_csv("user_user_similarity.csv")