In [42]:
import pandas as pd
names =  pd.read_csv("company.csv")
print('The shape: %d x %d' % names.shape)
names.head()

The shape: 663000 x 3


Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512
3,4,#1 PAINTBALL CORP,1433777
4,5,$ LLC,1427189


In [43]:
names=names[:10000]

In [44]:
import re

def ngrams(string, n=4):
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

print('All 3-grams in "McDonalds":')
ngrams('McDonalds')

All 3-grams in "McDonalds":


['McDo', 'cDon', 'Dona', 'onal', 'nald', 'alds']

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

company_names = names['Company Name']
vectorizer = TfidfVectorizer(min_df=3, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(company_names)

In [46]:
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [47]:
import time
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 100, 0.75)
t = time.time()-t1
print("SELFTIMED:", t)

SELFTIMED: 0.3454611301422119


In [48]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similairity': similairity})

In [49]:
matches_df = get_matches_df(matches, company_names, top=1000)

matches_df

Unnamed: 0,left_side,right_side,similairity
0,!J INC,!J INC,1.000000
1,"#1 A LIFESAFER HOLDINGS, INC.","#1 A LIFESAFER HOLDINGS, INC.",1.000000
2,#1 ARIZONA DISCOUNT PROPERTIES LLC,#1 ARIZONA DISCOUNT PROPERTIES LLC,1.000000
3,#1 PAINTBALL CORP,#1 PAINTBALL CORP,1.000000
4,$ LLC,ABJJ LLC,1.000000
5,$ LLC,"3FLOZ, LLC",1.000000
6,$ LLC,34FMGJ LLC,1.000000
7,$ LLC,$ LLC,1.000000
8,& S MEDIA GROUP LLC,& S MEDIA GROUP LLC,1.000000
9,& S MEDIA GROUP LLC,2AM MEDIA GROUP LLC,0.764606
