### Name matching can be tricky. In many cases, different datasets have slightly different spellings of the same person's or the same company's name. Patent data also quite often contain typos in names. This code uses cosine similarity over TF-IDF-weighted character n-grams to match company names from two different datasets.

In [None]:
import pandas as pd
import os 
import re
import numpy as np
#import spacy
import string
pd.set_option('mode.chained_assignment', None)

In [None]:
#Load the first set of names to match (in my example, Compustat Capital IQ)
compustat = pd.read_csv('compustat.csv').rename(columns={'conm': 'comp_name'})

#Normalise the names so both name lists are formatted the same way:
#lower-case, trimmed, cast to text, punctuation removed
compustat['comp_name'] = (
    compustat['comp_name']
    .str.lower()
    .str.strip()
    .astype(str)
    .apply(lambda name: ''.join(ch for ch in name if ch not in string.punctuation))
)

#Keep only names longer than 5 characters, then drop rows with a missing value in any column
compustat['length'] = compustat['comp_name'].str.len()
compustat = compustat[compustat['length'] > 5].dropna(how='any')

In [None]:
#Load the second set of names to match (in my example, Assignees from USPTO patent data)
assignees = pd.read_csv('assignees_list.csv')

#Normalise the names so both name lists are formatted the same way:
#lower-case, trimmed, cast to text, punctuation removed
assignees['ass_name'] = (
    assignees['ass_name']
    .str.lower()
    .str.strip()
    .astype(str)
    .apply(lambda name: ''.join(ch for ch in name if ch not in string.punctuation))
)

#Keep only names longer than 5 characters, then drop rows with a missing value in any column
assignees['length'] = assignees['ass_name'].str.len()
assignees = assignees[assignees['length'] > 5].dropna(how='any')



####### The parts of the following code that compute the cosine similarity matrix quickly are taken from Van den Blog: https://bergvca.github.io/2017/10/14/super-fast-string-matching.html

In [None]:
#The n-gram is the unit of vocabulary used to compare names
def ngrams(string, n=4):
    """Return every run of ``n`` consecutive characters of ``string``, in order.

    A string shorter than ``n`` (or a non-positive ``n``) yields an empty list.
    """
    if n <= 0:
        return []
    window_count = len(string) - n + 1
    return [''.join(string[start:start + n]) for start in range(window_count)]

#Uploading the TF-IDF library
from sklearn.feature_extraction.text import TfidfVectorizer

#Stack the cleaned names from both datasets into a single np array
all_names = np.concatenate([compustat['comp_name'], assignees['ass_name']])

#Vectorization - every name becomes a TF-IDF vector over the n-grams produced by ngrams() (4-grams by default)
vectorizer = TfidfVectorizer(analyzer=ngrams, min_df=1)
tf_idf_matrix = vectorizer.fit_transform(all_names)

In [None]:
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct # use pip install sparse_dot_topn if you don't have it

#Computes cosine similarity and keeps only the ntop best-scoring pairs per row
#Pairs whose similarity score is below lower_bound are ignored
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Multiply sparse ``A`` by sparse ``B`` keeping only the top results per row.

    Parameters
    ----------
    A, B : scipy sparse matrices
        Operands of the product; both are converted to CSR before use.
    ntop : int
        Number of result slots reserved for each row of ``A``.
    lower_bound : float, optional
        Score threshold forwarded to ``sparse_dot_topn``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(A.shape[0], B.shape[1])`` built from the buffers
        filled by ``sparse_dot_topn``.

    Notes
    -----
    NOTE(review): the product is a cosine similarity only if the rows being
    compared are L2-normalised -- confirm for the vectors passed in.
    """
    left = A.tocsr()
    right = B.tocsr()
    n_rows = left.shape[0]
    n_cols = right.shape[1]

    index_type = np.int32

    def as_index(values):
        # The extension is handed int32 index arrays
        return np.asarray(values, dtype=index_type)

    # Preallocated output buffers: ntop slots for each row of the left operand
    capacity = n_rows * ntop
    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_data = np.zeros(capacity, dtype=left.dtype)

    ct.sparse_dot_topn(
        n_rows, n_cols,
        as_index(left.indptr), as_index(left.indices), left.data,
        as_index(right.indptr), as_index(right.indices), right.data,
        ntop,
        lower_bound,
        out_indptr, out_indices, out_data)

    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))

In [None]:
#For names, I would put the lower_bound pretty high
import time

#Time the similarity computation of the combined name matrix against its own transpose
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.T, ntop=10, lower_bound=0.8)
t = time.time() - t1
print("SELFTIMED:", t)

In [None]:
#Creating a dataframe of matched name pairs from the sparse similarity matrix
#Inspecting the scores is helpful when trying to understand the optimal lower_bound
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Return one row per non-zero similarity stored in ``sparse_matrix``.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Similarity scores; entry ``(i, j)`` is reported as the score between
        ``name_vector[i]`` and ``name_vector[j]``.
    name_vector : sequence of str
        Names, aligned by position with the rows and columns of
        ``sparse_matrix``.
    top : int, optional
        Accepted for backward compatibility but ignored: every non-zero
        entry is returned regardless of its value.

    Returns
    -------
    pandas.DataFrame
        Columns ``ass_name`` (name at the row index), ``comp_name`` (name at
        the column index) and ``similarity`` (the stored score).
    """
    # Take rows, columns and scores from the same COO view so each score
    # stays attached to its own pair even if the matrix stores explicit zeros
    # (indexing ``.data`` by the position in ``nonzero()`` does not guarantee that).
    coo = sparse_matrix.tocoo()
    keep = coo.data != 0

    names = np.asarray(name_vector, dtype=object)

    return pd.DataFrame({'ass_name': names[coo.row[keep]],
                         'comp_name': names[coo.col[keep]],
                         'similarity': np.asarray(coo.data[keep], dtype=float)})

#Turn the sparse similarity matrix into a dataframe of (name, name, similarity) rows
matches_df = get_matches_df(sparse_matrix=matches, name_vector=all_names)

In [None]:
#The matrix holds similarity scores for pairs drawn from the combined name vector,
#so a pair can also link two names from the same dataset.
#Keep only pairs whose first name is an assignee name and whose second name is a Compustat name.
assignees = assignees[['ass_name']]
compustat = compustat[['comp_name']]
#Membership tests are used instead of inner merges: merging on a name that occurs several
#times in a source table repeats every matching pair once per occurrence, only for
#drop_duplicates to remove the copies again.
is_assignee = matches_df['ass_name'].isin(assignees['ass_name'])
is_company = matches_df['comp_name'].isin(compustat['comp_name'])
matches_df = matches_df[is_assignee & is_company].drop_duplicates()
#You might want to double check that the companies in the pairs start with the same letters
same_prefix = matches_df['ass_name'].str[0:3] == matches_df['comp_name'].str[0:3]
matches_df = matches_df.loc[same_prefix, ['comp_name', 'ass_name', 'similarity']]