## This code matches each patent to its most similar Wikipedia article by cosine similarity.

In [None]:
import pandas as pd
import os 
import re
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
pd.set_option('mode.chained_assignment', None)

# Corpora used further down: the English stop-word list (remove_stopwords) and
# WordNet (WordNetLemmatizer raises LookupError when the corpus is not installed).
# nltk.download() skips resources that are already up to date.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
#Uploading file with patent abstracts
patents_text = pd.read_csv('brf_sum_text.csv', encoding="ISO-8859-1")
#Uploading file with patent titles
patents_title = pd.read_csv('pat_ass_title.csv')

#Truncating abstracts for computational purposes
patents_text['abstract'] = patents_text['abstract'].str[0:5000]

# Coerce the patent ids to numbers with one vectorized call per column (instead
# of calling pd.to_numeric once per element through Series.apply). Ids that do
# not parse become NaN and are removed by the dropna() calls below.
patents_title['patent'] = pd.to_numeric(patents_title['patent'], errors='coerce')
patents_text['patent'] = pd.to_numeric(patents_text['patent'], errors='coerce')

# dropna() without arguments removes every row that has a missing value in ANY
# column, not only rows whose patent id failed to parse.
patents_title = patents_title.dropna()
patents_text = patents_text.dropna()

#Merging titles and abstracts together
patents_text = pd.merge(patents_title, patents_text, how='inner', on='patent')
patents_text = patents_text.rename(columns={'abstract': 'text'})

# Basic cleaning of the abstract: lower-case, turn every run of characters
# outside a-z into a single space, then keep only words of two or more letters.
# The result of replace() is assigned back to the column: calling
# replace(..., inplace=True) on `patents_text['text']` acts on a temporary
# Series and leaves the DataFrame untouched under pandas Copy-on-Write.
patents_text['text'] = patents_text['text'].str.lower()
patents_text['text'] = patents_text['text'].replace(to_replace=r'[^a-z]+', value=' ', regex=True)
patents_text['text'] = patents_text['text'].str.findall(r'\w{2,}').str.join(' ')

# Same cleaning for the title, with a three-character minimum word length.
# NOTE(review): here the length filter runs BEFORE lower-casing and stripping
# non-letters (the abstract does it last), so fragments shorter than three
# letters can survive, e.g. "A1B" -> "a b". Order kept as in the original;
# confirm whether that is intended.
patents_text['title'] = patents_text['title'].str.findall(r'\w{3,}').str.join(' ')
patents_text['title'] = patents_text['title'].str.lower()
patents_text['title'] = patents_text['title'].replace(to_replace=r'[^a-z]+', value=' ', regex=True)

# Build the text used for matching: the title (followed by a space) repeated
# three times, then the abstract. Repeating the title gives its words more weight.
patent_title_prefix = (patents_text['title'] + ' ').str.repeat(3)
patents_text['patent_text'] = (patent_title_prefix + patents_text['text']).astype(str)

# Keep only the id, the cleaned title and the combined text.
patents_text = patents_text.rename(columns={'title': 'patent_title'})
patents_text = patents_text[['patent', 'patent_title', 'patent_text']]


In [None]:
#Upload selected Wiki articles (after defining the relevant ones)
articles = pd.read_csv('selected_wiki_articles.csv')

# Basic cleaning. Missing values are replaced by an empty string and the column
# is cast to str BEFORE any .str operation: casting afterwards (as the code
# previously did) turns a missing value into the literal word "nan", which then
# takes part in the similarity computation.
articles['text'] = articles['text'].fillna('').astype(str).str.lower()
# Keep only words of three or more characters.
articles['text'] = articles['text'].str.findall(r'\w{3,}').str.join(' ')
#Truncating article text at the same level as patent text
articles['text'] = articles['text'].str[0:5000]

articles['title'] = articles['title'].fillna('').astype(str).str.lower()
articles['title'] = articles['title'].str.findall(r'\w{3,}').str.join(' ')

# Build the text used for matching: the title (followed by a space) repeated
# three times, then the article body.
article_title_prefix = (articles['title'] + ' ').str.repeat(3)
articles['article_text'] = article_title_prefix + articles['text']

# Keep only the cleaned title and the combined text.
articles = articles.rename(columns={'title': 'article_title'})
articles = articles[['article_title', 'article_text']]


######## Both patent and Wikipedia articles are now prepared for the match

##### The goal now is to find, for each patent, the most similar Wikipedia article by cosine similarity (provided the similarity exceeds a lower-bound threshold)



####### The parts of the following code that compute the cosine similarity matrix quickly are taken from van den Blog: https://bergvca.github.io/2017/10/14/super-fast-string-matching.html

In [None]:
#Functions for tokenizing and lemmatizing text

# Shared tokenizer: every maximal run of word characters (\w+) becomes one token.
tokenizer = RegexpTokenizer(r'\w+')
_ENGLISH_STOPWORDS = None  # filled on first use by remove_stopwords()


def remove_stopwords(text, stop_words=None):
    """Return the tokens of *text* that are not stop words, in their original order.

    Args:
        text: iterable of tokens (e.g. the list produced by the tokenizer).
        stop_words: optional collection of words to drop. When omitted, NLTK's
            English stop-word list is used.

    Returns:
        A new list with the remaining tokens.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        # Load the corpus once and keep it as a frozenset. Calling
        # stopwords.words('english') for every token, as the code previously
        # did, re-reads the list and scans it linearly for each word.
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS
    return [w for w in text if w not in stop_words]

# Shared WordNet lemmatizer instance used by word_lemmatizer().
lemmatizer = WordNetLemmatizer()


def word_lemmatizer(text):
    """Return a list with lemmatizer.lemmatize() applied to every token of *text*."""
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas
  

In [None]:
def _normalize_text(text):
    """Tokenize *text*, drop English stop words, lemmatize, and re-join with spaces."""
    tokens = tokenizer.tokenize(text)
    tokens = remove_stopwords(tokens)
    return " ".join(word_lemmatizer(tokens))


# Drop rows without text on the DataFrame itself: `df[col].dropna(inplace=True)`
# only affects a temporary Series and never removes rows from the frame.
# reset_index keeps the row index contiguous for any later index-based grouping.
patents_text = patents_text.dropna(subset=['patent_text']).reset_index(drop=True)
patents_text['patent_text'] = patents_text['patent_text'].apply(_normalize_text)

articles = articles.dropna(subset=['article_text']).reset_index(drop=True)
articles['article_text'] = articles['article_text'].apply(_normalize_text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Sparse product of A and B that keeps only the best entries of each row.

    Per the notes that accompany this routine in the notebook: for every row,
    only the `ntop` highest scores are kept, and pairs scoring below
    `lower_bound` are ignored. The scores are cosine similarities when the rows
    of A and the columns of B are L2-normalised vectors.

    Args:
        A: sparse matrix of shape (M, K); converted to CSR.
        B: sparse matrix of shape (K, N); converted to CSR.
        ntop: number of result slots reserved per row of A.
        lower_bound: threshold handed to the C routine.

    Returns:
        scipy.sparse.csr_matrix of shape (M, N).
    """
    left = A.tocsr()
    right = B.tocsr()
    n_rows, _ = left.shape
    _, n_cols = right.shape

    # The C routine is handed 32-bit index arrays.
    index_type = np.int32

    # Output buffers: room for ntop entries in every row.
    capacity = n_rows * ntop
    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_data = np.zeros(capacity, dtype=left.dtype)

    # The three output buffers are passed last; they are used right after the
    # call to assemble the result, so the routine is expected to fill them in place.
    ct.sparse_dot_topn(
        n_rows,
        n_cols,
        np.asarray(left.indptr, dtype=index_type),
        np.asarray(left.indices, dtype=index_type),
        left.data,
        np.asarray(right.indptr, dtype=index_type),
        np.asarray(right.indices, dtype=index_type),
        right.data,
        ntop,
        lower_bound,
        out_indptr,
        out_indices,
        out_data,
    )

    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))

import time



In [None]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Turn a sparse similarity matrix into a long DataFrame of matched pairs.

    Every stored non-zero entry (i, j) of `sparse_matrix` yields one output row
    holding name_vector[i], name_vector[j] and the entry's value.

    Args:
        sparse_matrix: scipy sparse matrix of similarity scores.
        name_vector: sequence of texts; position k is the text behind row k and
            column k of the matrix.
        top: accepted for backward compatibility and ignored; all matches are
            always returned. To inspect a subsample (for instance while tuning
            lower_bound), call `.head(n)` on the result.

    Returns:
        DataFrame with columns 'patent_text' (row text), 'article_text'
        (column text) and 'similarity'.
    """
    # Take rows, columns and values from the same COO view so they stay
    # aligned. Pairing nonzero() positions with sparse_matrix.data[index]
    # shifts the scores as soon as the matrix stores an explicit zero.
    coo = sparse_matrix.tocoo()
    is_match = coo.data != 0

    # Positional lookup of the texts, done in one vectorized step.
    names = np.asarray(name_vector, dtype=object)

    return pd.DataFrame({'patent_text': names[coo.row[is_match]],
                         'article_text': names[coo.col[is_match]],
                         'similarity': coo.data[is_match].astype(float)})

In [None]:
# You can split the dataset into groups if you want to parallelize the work or
# just to track progress. Each row's group number is its index value
# integer-divided by 10000.
patents_text['row'] = patents_text.index // 10000
groups = patents_text['row'].nunique()

In [None]:
# For every group of patents, find the single most similar Wikipedia article
# (similarity of at least 0.3) and collect the per-group results.
#
# Iterating over the group ids actually present in the 'row' column covers the
# first group as well: `range(1, groups)` skipped group 0 and never ran at all
# when there was only one group, leaving `all_matches` undefined.
#
# NOTE(review): the similarity search runs over patents and articles together,
# so the 100 slots kept per text can be taken up by other patents (or by the
# text itself) before any article is reached; patent-patent and article-article
# pairs are only discarded afterwards by the two merges. Consider multiplying
# the patent rows by the article rows only.
group_results = []
for i in sorted(patents_text['row'].unique()):
    print("Processing group {}".format(i))
    t1 = time.time()
    df_patents = patents_text[patents_text['row'] == i].drop(['row'], axis=1)

    # Fit TF-IDF on this group's patents plus all articles. The articles live
    # in `articles`; `df_articles` is not defined anywhere in this notebook.
    all_text = np.concatenate((df_patents['patent_text'], articles['article_text']))
    vectorizer = TfidfVectorizer(min_df=1)
    tf_idf_matrix = vectorizer.fit_transform(all_text)
    matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 100, 0.3)
    matches_df = get_matches_df(matches, all_text)

    # Keep only pairs whose right-hand text is an article and whose left-hand
    # text is a patent of this group, attaching titles and patent ids.
    matches_df = pd.merge(matches_df, articles, how='inner', on='article_text')
    matches_df = pd.merge(matches_df, df_patents, how='inner', on='patent_text')
    matches_df = matches_df.drop_duplicates()
    matches_df = matches_df.sort_values(by=['patent', 'similarity'], ascending=[True, False])
    #Choose top-1 article for each patent by similarity
    matches_df = matches_df.drop_duplicates(subset=['patent'], keep='first')
    group_results.append(matches_df.reset_index(drop=True))

    t = time.time() - t1
    print("SELFTIMED:", t)

# DataFrame.append was removed in pandas 2.0; concatenate all groups once.
# `group_results` still holds the finished groups if the loop is interrupted.
all_matches = pd.concat(group_results, ignore_index=True) if group_results else pd.DataFrame()