In [1]:
'''
Function reference:

tfidf_weighted_words(sentence) -- ranks the patents by the combined tf-idf score of a single word or of every word in a sentence.
    Example: tfidf_weighted_words('bicycle')
             tfidf_weighted_words('big data analysis')

tfidf_similarity(patent_id) -- returns the patents most similar to the given patent. The input should be a patent id number.
    Example: tfidf_similarity(10000000)

'''

"\nFunction reference:\n\ntfidf_weighted_words(sentence) -- ranks the patents by the combined tf-idf score of a single word or of every word in a sentence.\n    Example: tfidf_weighted_words('bicycle')\n             tfidf_weighted_words('big data analysis')\n\ntfidf_similarity(patent_id) -- returns the patents most similar to the given patent. The input should be a patent id number.\n    Example: tfidf_similarity(10000000)\n\n"

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the patent table and build one TF-IDF matrix for the titles and one for
# the abstracts. `df` keeps the untouched text, `dft` holds the cleaned text.
df = pd.read_csv("Data/all_combined.csv").drop(['Unnamed: 0'], axis=1)
dft = df.copy()
stop = stopwords.words('english')
_stop_set = set(stop)  # set membership is O(1); the NLTK list is scanned linearly


def _clean_text(column):
    """Lower-case a text column, strip digit runs and drop English stop words.

    Parameters
    ----------
    column : pandas.Series
        Raw text; missing values are treated as empty strings so that the
        token filter below never receives a float NaN.

    Returns
    -------
    pandas.Series
        One string per row with the surviving tokens joined by ', '.
    """
    lowered = column.fillna('').str.lower()
    # regex=True is required: since pandas 2.0 the default is a literal match,
    # which would leave every digit in place.
    without_digits = lowered.str.replace(r'\d+', '', regex=True)
    tokens = without_digits.str.split(' ')
    kept = tokens.apply(lambda words: [w for w in words if w not in _stop_set])
    return kept.apply(', '.join)


def _vocabulary(vectorizer):
    """Return the fitted vocabulary on both old and new scikit-learn releases."""
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# Titles
dft['title'] = _clean_text(dft['title'])
v = TfidfVectorizer()
x = v.fit_transform(dft['title'])
dftx = pd.DataFrame(x.toarray(), columns=_vocabulary(v))
tfidf = pd.concat([df, dftx], axis=1)

# Abstracts (the same vectorizer object is re-fitted, so `v` ends up holding
# the abstract vocabulary; the title matrix was already materialised above)
dft['abstract'] = _clean_text(dft['abstract'])
x1 = v.fit_transform(dft['abstract'])
dftx1 = pd.DataFrame(x1.toarray(), columns=_vocabulary(v))
tfidf1 = pd.concat([df, dftx1], axis=1)

In [3]:
# Pairwise cosine similarity between all patents, once on the title TF-IDF
# matrix and once on the abstract TF-IDF matrix.
sim_t = cosine_similarity(dftx)
sim_a = cosine_similarity(dftx1)
# Vocabulary of each matrix as a plain list. Index.get_values() was removed in
# pandas 1.0; Index.tolist() yields the same list on old and new releases.
word_list_title = dftx.columns.tolist()
word_list_abstract = dftx1.columns.tolist()

In [4]:
def tfidf_weighted(word, title_weight=0.8, abstract_weight=0.2):
    """Score every patent for a single word using its TF-IDF weights.

    The score is ``title_weight * tfidf(title) + abstract_weight * tfidf(abstract)``.
    A word missing from one vocabulary contributes 0 for that part.

    Parameters
    ----------
    word : str
        Query word; matched case-insensitively (it is lower-cased first).
    title_weight : float, optional
        Multiplier for the title TF-IDF value (default 0.8).
    abstract_weight : float, optional
        Multiplier for the abstract TF-IDF value (default 0.2).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an extra ``result_weighted`` column, sorted by
        that column in descending order. Rows scoring 0 are kept.
    """
    word = word.lower()

    # Read the scores from the pure TF-IDF frames rather than from the frames
    # concatenated with `df`: a query such as 'date' or 'title' is both a
    # metadata column and a vocabulary term, and the duplicated label made the
    # previous sort_values(by=word) raise.
    title_score = dftx[word] if word in dftx.columns else 0.0
    abstract_score = dftx1[word] if word in dftx1.columns else 0.0

    ranked = df.copy()
    # Assignment aligns on the index, so no pre-sorting of the scores is needed.
    ranked['result_weighted'] = title_weight * title_score + abstract_weight * abstract_score
    return ranked.sort_values(by=['result_weighted'], ascending=False)

def tfidf_weighted_words(sentence):
    """Rank patents by the summed weighted TF-IDF score of a sentence's words.

    The sentence is lower-cased and split on every non-word character. Each
    distinct word is scored with ``tfidf_weighted`` and the per-word scores
    are added up per patent. A repeated word counts once, regardless of case.

    Parameters
    ----------
    sentence : str
        A single word or a free-text query.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus a ``Total`` column, restricted to rows with ``Total > 0``
        and sorted by ``Total`` in descending order. The frame is empty when
        the sentence has no words or none of them scores.
    """
    # Lower-casing before de-duplication keeps 'Data data' and 'data data'
    # equivalent; dict.fromkeys preserves the first-seen order.
    words = list(dict.fromkeys(re.findall(r"\w+", sentence.lower())))

    total = pd.Series(0.0, index=df.index)
    for word in words:
        # tfidf_weighted returns rows in score order; add() realigns on the index.
        total = total.add(tfidf_weighted(word)['result_weighted'], fill_value=0.0)

    res = df.assign(Total=total).sort_values(by="Total", ascending=False)
    return res[res['Total'] > 0]

In [5]:
def tfidf_similarity(patent_id, top_n=5):
    """Return the patents most similar to the given patent.

    Similarity is ``0.8 * title cosine similarity + 0.2 * abstract cosine
    similarity``, read from the precomputed ``sim_t`` and ``sim_a`` matrices.

    Parameters
    ----------
    patent_id : int or str
        Value to look up in ``df['id']``. If there is no exact match, the ids
        are compared as strings, so ``'10000000'`` finds ``10000000``.
    top_n : int, optional
        Maximum number of similar patents to return (default 5).

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df`` with a ``sim_temp`` column, sorted by
        ``sim_temp`` in descending order. The queried patent itself and rows
        with a similarity of 0 are excluded.

    Raises
    ------
    IndexError
        If ``patent_id`` does not occur in ``df['id']``.
    """
    ids = df['id']
    # Positional lookup: the similarity matrices are indexed by row position,
    # not by the DataFrame's index labels.
    hits = np.flatnonzero((ids == patent_id).values)
    if hits.size == 0:
        hits = np.flatnonzero((ids.astype(str) == str(patent_id)).values)
    if hits.size == 0:
        raise IndexError("patent id {!r} not found in df['id']".format(patent_id))
    pos = int(hits[0])

    temp = df.copy()
    temp['sim_temp'] = 0.8 * np.asarray(sim_t[pos]) + 0.2 * np.asarray(sim_a[pos])

    # Remove the queried patent explicitly. Skipping "the first sorted row"
    # drops the best real match when the query's self-similarity is 0, and may
    # keep the query itself when an exact duplicate ties with it.
    temp = temp[np.arange(len(temp)) != pos]

    temp = temp.sort_values(by="sim_temp", ascending=False)
    temp = temp[temp['sim_temp'] > 0]
    return temp.head(top_n)

In [6]:
# Smoke test: list the patents most similar to patent id 10000000
# (ranked by the weighted title/abstract cosine similarity in `sim_temp`).
tfidf_similarity(10000000)

Unnamed: 0,id,date,abstract,title,kind,num_claims,A,B,C,D,E,F,G,H,Y,inventor_name,lawyer_name,assignee_name,sim_temp
1297,4999006,1991-03-12,A coherent optical apparatus of this inventio...,Coherent optical apparatus,A,6.0,,,,,,,1.0,,,['Kenjiro Hamanaka'],"[' ,Woodcock Washburn Kurtz Mackiewicz & Norris']","[' ,Nippon Sheet Glass Co., Ltd.']",0.278483
1661,5365184,1994-11-15,A phase modulated signal is split into quadra...,Quadrature phase processing,A,19.0,,,,,,,1.0,,,"['Alan B. Callender', 'Robert A. Bondurant']","['Robert E. Greenstien,']","[' ,United Technologies Corporation']",0.267854
4445,8165388,2012-04-24,A system and method for pixel and object level...,Neutral pixel detection in an image path,B2,21.0,,,,,,,,1.0,,['Xing Li'],"[' ,Fleit Gibbons Gutman Bongini & Bianco P.L....","[' ,Xerox Corporation']",0.245545
4374,8094146,2012-01-10,Disclosed here is a driving method for a pixel...,Driving method for pixel circuit and display a...,B2,6.0,,,,,,,1.0,,,"['Tetsuya Yamamoto', 'Katsuhide Uchino']","[' ,Rader, Fishman & Grauer PLLC']","[' ,Sony Corporation']",0.206256
1239,4940893,1990-07-10,Method and apparatus for forming coherent clu...,Method and apparatus for forming coherent clus...,A,31.0,,,,,,,1.0,1.0,,['Shui-Yin Lo'],"[' ,Ladas & Parry']","[' ,Apricot S.A.']",0.201075


In [9]:
# Smoke test: keyword-based recommendations for the query 'computer system';
# only the first rows of the ranking (highest `Total` first) are displayed.
tfidf_weighted_words('computer system').head()

Unnamed: 0,id,date,abstract,title,kind,num_claims,A,B,C,D,E,F,G,H,Y,inventor_name,lawyer_name,assignee_name,Total
2973,6687354,2004-02-03,The present invention refers to a telephone ex...,Method and arrangement for connection of a com...,B1,28.0,,,,,,,,1.0,,['Per Tomas Andreason'],unknown,"[' ,Telefonaktiebolaget LM Ericsson (publ)']",0.532415
3411,7126968,2006-10-24,In a data compression system for compressing a...,"Data compression system, data decompression sy...",B2,20.0,,,,,,,,1.0,,"['Yasuo Takagi', 'Wataro Shinohara']","[' ,Oblon, Spivak, McClelland, Maier & Neustad...","[' ,Kabushiki Kaisha Toshiba']",0.525685
3253,6968398,2005-11-22,A method of virtualizing hardware resources in...,Method of virtualizing I/O resources in a comp...,B2,20.0,,,,,,,1.0,,,"['Brad A. Davis', 'Thomas E. Malone']","[' ,Lieberman & Brandsdorfer, LLC']","[' ,International Business Machines Corporation']",0.511489
2236,5948054,1999-09-07,In a networked computer system including a cu...,Method and system for facilitating the exchang...,A,18.0,,,,,,,1.0,1.0,,['Jakob Peter Nielsen'],"[' ,Sabath & Truong']","[' ,Sun Microsystems, Inc.']",0.503783
2379,6091395,2000-07-18,A computer system and method manipulate a win...,Computer system and method of manipulating a g...,A,51.0,,,,,,,1.0,,,['George Francis DeStefano'],"[' ,Wood, Herron & Evans, L.L.P.']","[' ,International Business Machines Corporation']",0.502276
