In [13]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

# Load the combined patent table and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv call -- verify).
patent_sample = (
    pd.read_csv('Data/all_combined.csv')
    .drop(columns=['Unnamed: 0'])
)

"""
Build the sparse user-by-patent click matrix (rows: users, columns: patents).
"""

def get_user_clicks_csr_matrix(patents, users, clicks):
    """Build a sparse user-by-patent click matrix.

    Parameters
    ----------
    patents : sequence
        Patent ids; the entry at position ``j`` defines column ``j``.
    users : sequence
        User keys; the entry at position ``i`` defines row ``i``.
    clicks : mapping
        Maps every user in ``users`` to a collection of clicked patent ids.
        Ids are matched against ``patents`` with ``in``, so both sides must
        use the same representation (e.g. both strings).

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(len(users), len(patents))`` holding 1 wherever the
        user clicked the patent. Clicked ids that do not occur in ``patents``
        are ignored.

    Raises
    ------
    KeyError
        If a user in ``users`` is missing from ``clicks``.
    """
    n_patents = len(patents)
    n_users = len(users)

    indptr = [0]
    column_indices = []
    for user in users:
        user_clicks = clicks[user]
        matched = [j for j, patent in enumerate(patents) if patent in user_clicks]
        column_indices.extend(matched)
        # Row boundaries follow the number of *matched* columns, not
        # len(user_clicks): a click on a patent absent from `patents` (or a
        # duplicated patent id) would otherwise leave indptr, indices and
        # data with inconsistent lengths.
        indptr.append(len(column_indices))

    # CSR index arrays must be integral; build them as integers directly.
    indices = np.array(column_indices, dtype=np.int64)
    indptr = np.array(indptr, dtype=np.int64)
    data = np.ones(len(indices), dtype=int)
    return csr_matrix((data, indices, indptr), shape=(n_users, n_patents))

"""
Compute the pairwise Pearson correlation between users.
"""
def get_user_corr(clicks_matrix):
    """Return the user-by-user Pearson correlation of click vectors.

    ``clicks_matrix`` is a sparse matrix with one row per user. It is
    densified and transposed so each user becomes a DataFrame column, because
    ``DataFrame.corr`` correlates columns. The result is a square DataFrame
    indexed and labelled by user row position.
    """
    per_user_columns = clicks_matrix.toarray().transpose()
    return pd.DataFrame(per_user_columns).corr()

"""
Get the user's closest neighbors (Pearson correlation above the threshold used in
the code, currently -0.9); if there are no similar neighbors, return [].
Recommend the nearest neighbor's clicked patents to the user.
"""
def user_based_recommender(user, clicks, min_corr=-0.9):
    """Recommend the patents clicked by the user's most correlated neighbor.

    Parameters
    ----------
    user : hashable
        Key in ``clicks`` identifying the user to recommend for.
    clicks : mapping
        Maps each user to a collection of clicked patent ids. Every id must be
        convertible with ``float`` (e.g. ``4886794``, ``'4886794'``,
        ``4886794.0``). The mapping is not modified.
    min_corr : float, optional
        A neighbor is considered only if its Pearson correlation with ``user``
        is strictly greater than this value. The default of -0.9 keeps the
        previous hard-coded behavior; pass ``0`` to require a positive
        correlation. Neighbors whose correlation is NaN are never selected.

    Returns
    -------
    list of str
        Ids clicked by the nearest neighbor but not by ``user``, in ascending
        numeric order; ``[]`` when no neighbor passes the threshold.

    Raises
    ------
    ValueError
        If ``user`` is not a key of ``clicks``.
    """
    # Normalise every id to the string form of its integer value in a *new*
    # mapping, so the caller's dictionary is left untouched.
    normalized = {
        key: {str(int(float(patent_id))) for patent_id in patent_ids}
        for key, patent_ids in clicks.items()
    }

    users = list(normalized)
    patents = list(patent_sample['id'].astype(str))

    clicks_matrix = get_user_clicks_csr_matrix(patents, users, normalized)
    corr = get_user_corr(clicks_matrix)

    ui = users.index(user)

    neighbors = [
        (i, score)
        for i, score in enumerate(corr[ui])
        if score > min_corr and i != ui
    ]
    if not neighbors:
        return []

    # max() returns the first of equally correlated neighbors, which is the
    # same element a stable descending sort would put first.
    nn = max(neighbors, key=lambda pair: pair[1])[0]

    already_clicked = normalized[user]
    candidates = (pid for pid in normalized[users[nn]] if pid not in already_clicked)
    # Sets have no defined order; sort numerically so the result is reproducible.
    return sorted(candidates, key=int)

"""
Get the result DataFrame: the rows of patent_sample recommended for the user.
"""
def search_similar_user(user, clicks):
    """Return the ``patent_sample`` rows recommended for ``user``.

    Parameters
    ----------
    user : hashable
        Key in ``clicks`` identifying the user to recommend for.
    clicks : mapping
        Maps each user to a collection of clicked patent ids; passed through
        to ``user_based_recommender``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows of ``patent_sample`` whose ``id`` is in
        the recommendation list (empty when there is nothing to recommend).
    """
    idlist = user_based_recommender(user, clicks)
    # The recommender returns ids as strings, so compare against the string
    # form of the id column (the same conversion the recommender applies).
    # Matching strings against a numeric column finds nothing in current pandas.
    recommended = patent_sample['id'].astype(str).isin(idlist)
    return patent_sample[recommended].copy()


In [14]:
# Test for functions

# Sample click history: user name -> set of clicked patent ids (integers here).
test ={'qgjuc': {4886794, 5732480, 7072753, 7189198, 8006808, 8029913, 8419399},
    'pwqvk': {4045704, 4176847}}
# User to produce recommendations for.
name = 'pwqvk'

# Last expression of the cell: displays the DataFrame of patents recommended to `name`.
search_similar_user(name, test)

Unnamed: 0,id,date,abstract,title,kind,num_claims,A,B,C,D,E,F,G,H,Y,inventor_name,lawyer_name,assignee_name
1185,4886794,1989-12-12,"4-[(.alpha.,.alpha.-Diaryl)-hydroxymethyl]-1-...","4-[(.alpha.,.alpha.-diaryl)-hydroxymethyl]-1-p...",A,60.0,,,1.0,,,,,,,['David A. Walsh'],unknown,"[' ,A. H. Robins Company, Incorporated']"
2024,5732480,1998-03-31,The invention is a waterproof shoe which is c...,Water shoe,A,7.0,1.0,1.0,,,,,,,,['Gunter Notzold'],['Gary A. ...,"[' ,W. L. Gore & Associates, Inc.']"
3357,7072753,2006-07-04,The invention relates to a hazard-prevention s...,Hazard-prevention system for a vehicle,B2,13.0,,1.0,,,,,1.0,,,"['Markus Maria Hess', 'Siegfried Rothe', 'Walt...","[' ,Crowell & Moring LLP']","[' ,DaimlerChrysler AG']"
3473,7189198,2007-03-13,A method of delivering a substance to targeted...,Magnetically guidable carriers and methods for...,B2,24.0,1.0,,,,,,,,,"['Rogers C. Ritter', 'Jonathan Harburn']","[' ,Harness, Dickey & Pierce, P.L.C.']","[' ,Stereotaxis, Inc.']"
4287,8006808,2011-08-30,An encoder failure in an elevator drive system...,Managing an encoder malfunction in an elevator...,B2,23.0,,1.0,,,,,,,,"['Ismail Agirman', 'Jeffrey M. Izard', 'Edward...","[' ,Kinney & Lange, P.A.']","[' ,Otis Elevator Company']"
4310,8029913,2011-10-04,"A panel is disclosed formed of sheet material,...",Waveform panel,B2,13.0,,1.0,,,1.0,,,1.0,1.0,['Ronald Leslie Mann'],"[' ,Pearne & Gordon LLP']","[' ,Gram Engineering Pty Limited']"
4697,8419399,2013-04-16,A gear compressor or supercharger for compress...,Roots type gear compressor with helical lobes ...,B2,14.0,,,,,,1.0,,,,"['Jirka Kaplan', 'Les Davenport']","[' ,Gowling Lafleur Henderson LLP']","[' ,Acceleration Enterprises Ltd.', ' ,592301 ..."
