In [1]:
import pickle
import sklearn.metrics
import numpy as np

class CFRecommender:
    def __init__(self):
        
        # Get sparse utility matrix
        with open("utility_matrix.pkl", "rb") as f:
            utility = pickle.load(f)
            
        # Convert utility matrix for efficient row- and column- lookups
        self.U_csc = utility.tocsc()
        self.U_csr = utility.tocsr()
        
        # Get lookup tables
        with open("id2title.pkl", "rb") as f:
            self.id2title = pickle.load(f)

        with open("id2name.pkl", "rb") as f:
            self.id2name = pickle.load(f)

        with open("name2id.pkl", "rb") as f:
            self.name2id = pickle.load(f)

        with open("title2id.pkl", "rb") as f:
            self.title2id = pickle.load(f)
        
        # Get item encoders and user encoders
        # They are used to convert user and item ids into utility matrix entries
        with open("items_encoder.pkl", "rb") as f:
            self.items_encoder = pickle.load(f)

        with open("users_encoder.pkl", "rb") as f:
            self.users_encoder = pickle.load(f)

        
    def user_based_recommendations(self, user_name="Yann LeCun", user_neighborhood_k=30, n_recommendations=5):
    
        # find the paper's vector (row in utility matrix)
        user_id = self.name2id.get(user_name)
        user_idx = self.users_encoder.transform([user_id])
        user_vec = self.U_csr.getrow(user_idx)

        # calculate similarities with all the other users
        sims = sklearn.metrics.pairwise.cosine_similarity(user_vec, self.U_csr)
        sims = np.squeeze(sims)

        # get users most similar to the queried user
        asort = sims.argsort()
        k = user_neighborhood_k
        k_most_similar_idx = asort[::-1][1:k+1]
        k_sims = sims[k_most_similar_idx]

        # calculate missing utility matrix entries based on the neighborhood
        ratings_k_similar = np.squeeze(np.array([self.U_csr.getrow(r).todense() for r in k_most_similar_idx]))
        ratings_k_similar_weighted = (k_sims[:,np.newaxis] * ratings_k_similar)
        predicted_ratings = ratings_k_similar_weighted.mean(axis=0)

        # get b best recommendations
        b = n_recommendations
        b_best_items_idx = np.argsort(predicted_ratings)[::-1][:b]
        b_best_paper_ids = self.items_encoder.inverse_transform(b_best_items_idx)
        
        
        recommender_paper_titles = [self.id2title[i] for i in b_best_paper_ids]
        
        return recommender_paper_titles
        
        
    def item_based_recommendations(self, user_name="Yann LeCun", user_item_neighborhood_k=30, n_recommendations=5):
    
        user_id = self.name2id.get(user_name)
        user_idx = self.users_encoder.transform([user_id])
        user_vec = self.U_csr.getrow(user_idx)
        _, cited_item_idxs = user_vec.nonzero()

        pred_rtgs = []
        for idx in cited_item_idxs:
            curr_item = self.U_csc.getcol(idx).T

            # calculate similarities
            sims = sklearn.metrics.pairwise.cosine_similarity(curr_item, self.U_csc.T)
            sims = np.squeeze(sims) 

            # get the most similar items
            asort = sims.argsort()
            asort = asort[::-1] # to descending order

            # get the neighborhood
            k = item_neighborhood_k
            k_most_similar_idx = asort[1:k+1] # the highest-similarity idx will be the user herself, so we skip that
            k_sims = sims[k_most_similar_idx]
            cols_k_similar = np.squeeze(np.array([self.U_csr.getcol(r).todense() for r in k_most_similar_idx]))
            pred_from_one_item = cols_k_similar.mean(axis=0)
            pred_rtgs.append(pred_from_one_item)

        predicted_ratings = np.array(pred_rtgs).mean(axis=0)

        asort = predicted_ratings.argsort()
        asort = asort[::-1] # to descending order

        b = n_recommendations
        b = 5
        predicted_idxs = asort[:b]

        predicted_paper_ids = self.items_encoder.inverse_transform(predicted_idxs)
        recommended_paper_titles = [self.id2title[i] for i in predicted_paper_ids]

        return recommended_paper_titles

In [2]:
recommender = CFRecommender()

In [3]:
recommender.user_based_recommendations("Yann LeCun", user_neighborhood_k=30, n_recommendations=5)

['A novel graph matching based approach for land-cover classification of multi-temporal images',
 'On error models for RTL security evaluations',
 'Integrating Security Policies in Federated Database Systems',
 'FINITE VOLUME PERSPECTIVES ON FINITE DIFFERENCE SCHEMES AND BOUNDARY FORMULATIONS FOR WAVE SIMULATION',
 'Graph-merged detection and decoding of polar-coded MIMO systems']

In [10]:
recommender.item_based_recommendations("Yann LeCun", item_neighborhood_k=30, n_recommendations=5)

['Algebraic attacks on stream ciphers with linear feedback',
 'Algebraic Attacks and Decomposition of Boolean Functions',
 'Basic Theory in Construction of Boolean Functions with Maximum Possible Annihilator Immunity',
 'Algebraic immunity for cryptographically significant Boolean functions: analysis and construction',
 'On the algebraic immunity of symmetric boolean functions']

In [12]:
recommender.item_based_recommendations("Andrej Karpathy", item_neighborhood_k=30, n_recommendations=5)

['Smart-M3 information sharing platform',
 'How smart are our environments? An updated look at the state of the art',
 'A conceptual framework and a toolkit for supporting the rapid prototyping of context-aware applications',
 'Key Properties in the Development of Smart Spaces',
 'Mobile application for guiding tourist activities: tourist assistant - TAIS']

In [17]:
recommender.user_based_recommendations("Kaikai Liu", user_neighborhood_k=30, n_recommendations=5)

['Multimode spatiotemporal background modeling for complex scenes',
 'Fast Parallel Connected Components Algorithms on GPUs',
 'On the Effect of Previous Technological Experience on the Usability of a Virtual Rehabilitation Tool for the Physical Activation and Cognitive Stimulation of Elders',
 'An environment to develop parallel code for solving partial differential equations based-problems',
 'Approximating power indices: theoretical and empirical analysis']

In [22]:
recommender.item_based_recommendations("Wojciech Zaremba", item_neighborhood_k=30, n_recommendations=5)

['Software pipelining: an effective scheduling technique for VLIW machines',
 'Some scheduling techniques and an easily schedulable horizontal architecture for high performance scientific computing',
 'Trace Scheduling: A Technique for Global Microcode Compaction',
 'Trace scheduling: a technique for global microcode compaction',
 'Software pipelining']

In [23]:
recommender.item_based_recommendations("Magdalini Eirinaki", item_neighborhood_k=30, n_recommendations=5)

['RESOLUTION OF COMPOSITE FUZZY RELATION EQUATIONS',
 'Introduction to Fuzzy Sets',
 'Toward a theory of fuzzy information granulation and its centrality in human reasoning and fuzzy logic',
 'On T -congruence L -relations on groups and rings',
 'Flexible neuro-fuzzy systems']