In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [3]:
# https://www.kaggle.com/dattapiy/sec-edgar-companies-list
# Only the first 100,000 rows are used, so let the parser stop there instead
# of loading the whole file and slicing it afterwards.
names = pd.read_csv('data/sec__edgar_company_info.csv', nrows=100000)
names.head(3)

Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512


In [4]:
import re

# Matches every character that is NOT a lowercase letter, digit, whitespace
# or one of ! @ # $ % & (applied after lowercasing).
irrelevant_regex = re.compile(r'[^a-z0-9\s\!\@\#\$\%\&]')
# Matches runs of two or more whitespace characters.
multispace_regex = re.compile(r'\s\s+')

def assign_no_symbols_name(df):
    """Return a new frame with a normalized ``name`` column added.

    The ``Company Name`` column is lowercased, characters matched by
    ``irrelevant_regex`` are replaced by a space, whitespace runs are
    collapsed to a single space, and the result is stripped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column called ``Company Name``.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the ``name`` column; the input frame is not modified.
    """
    # regex=True is required: pandas >= 2.0 defaults to regex=False and
    # rejects a compiled pattern in that mode with a ValueError.
    return df.assign(
        name=df['Company Name']
             .str.lower()
             .str.replace(irrelevant_regex, ' ', regex=True)
             .str.replace(multispace_regex, ' ', regex=True)
             .str.strip())

# Attach the normalized `name` column; all matching below works on it.
names = assign_no_symbols_name(names)
names.head(9)

Unnamed: 0,Line Number,Company Name,Company CIK Key,name
0,1,!J INC,1438823,!j inc
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607,#1 a lifesafer holdings inc
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512,#1 arizona discount properties llc
3,4,#1 PAINTBALL CORP,1433777,#1 paintball corp
4,5,$ LLC,1427189,$ llc
5,6,& S MEDIA GROUP LLC,1447162,& s media group llc
6,7,&TV COMMUNICATIONS INC.,1479357,&tv communications inc
7,8,"'MKTG, INC.'",886475,mktg inc
8,9,'OHANA LABS INC.,1703629,ohana labs inc


In [5]:
# Keep only the CIK keys that occur on more than one row, then collect the
# normalized names of each such key into a set. A set may still end up with a
# single element when all rows of a key normalize to the same name.
names_grouped = (
    names[['name', 'Company CIK Key']]
    .groupby('Company CIK Key')
    .filter(lambda x: len(x) > 1)
    .groupby('Company CIK Key')
    .agg(set)
)
names_grouped

Unnamed: 0_level_0,name
Company CIK Key,Unnamed: 1_level_1
1841,"{abel noser corp, abel noser corp bd}"
1904,"{abraham & co inc, abraham & co inc bd}"
2093,"{acme metals inc, acme metals inc de}"
2178,{adams resources & energy inc}
2230,"{adams diversified equity fund, adams diversif..."
...,...
1701914,"{buchta gary g, buctha gary g}"
1701916,"{argos funds sicav argos funds argonaut fund, ..."
1701917,{argos funds sicav in respect of argos funds y...
1701918,{argos funds sicav argos funds the bamboo fund...


In [6]:
import itertools
from fuzzywuzzy import fuzz

# Ground truth: every 2-combination of names sharing a CIK key whose
# fuzz.token_sort_ratio score is at least 60. Each pair is stored as a sorted
# tuple so that (a, b) and (b, a) are the same entry.
true_pairs = [
    tuple(sorted(pair))
    for duplicate_names in names_grouped['name'].values
    for pair in itertools.combinations(duplicate_names, 2)
    if fuzz.token_sort_ratio(*pair) >= 60
    # alternative filter that was tried: fuzz.partial_ratio(*pair) >= 60
]
display(true_pairs[:10])
# This count is taken on the list, i.e. before the set() deduplication below.
display(len(true_pairs))
true_pairs = set(true_pairs)

[('abel noser corp', 'abel noser corp bd'),
 ('abraham & co inc', 'abraham & co inc bd'),
 ('acme metals inc', 'acme metals inc de'),
 ('adams diversified equity fund', 'adams diversified equity fund inc'),
 ('aei securities inc', 'aei securities inc bd'),
 ('aeroflex inc', 'arx inc'),
 ('aetna life & casualty co', 'aetna services inc ct'),
 ('aetna services inc', 'aetna services inc ct'),
 ('aetna variable encore fund', 'aetna variable encore fund inc'),
 ('agf securities inc', 'agf securities inc bd')]

3989

In [7]:
# Distinct names that take part in at least one true pair, in sorted order.
names_with_duplicates = sorted(set(itertools.chain.from_iterable(true_pairs)))
len(names_with_duplicates)

6735

In [8]:
# Inputs shared by the blocking runs and the debugging cells below.
string_list = names_with_duplicates  # alias: the strings to be blocked
k = 50  # passed as `k` to the blockers and as `ntop` to awesome_cossim_topn
threshold = 0.6  # passed as `threshold` to the blockers and as `lower_bound` to awesome_cossim_topn

In [9]:
%%time

from helpers.tfidf_blocker import block_with_tfidf_ann

# Labels from the `_ann` blocker; downstream they are used as one block label
# per entry of string_list.
ann_labels = block_with_tfidf_ann(string_list, threshold=threshold, k=k)
display(ann_labels)

array([2902, 2606, 1450, ...,  491, 2312, 2312])

CPU times: user 9.17 s, sys: 580 ms, total: 9.75 s
Wall time: 4.59 s


In [10]:
%%time

from helpers.tfidf_blocker import block_with_tfidf_brute

# Labels from the `_brute` blocker, run with the same inputs as the `_ann`
# blocker so the two can be compared.
brute_labels = block_with_tfidf_brute(string_list, threshold=threshold, k=k)
display(brute_labels)

array([2836, 2541, 2835, ...,  964, 2343, 2343])

CPU times: user 2.16 s, sys: 522 ms, total: 2.68 s
Wall time: 2.15 s


In [12]:
from collections import defaultdict

def blocks_from_labels(dataset, labels):
    """Group records by block label.

    ``labels[i]`` is taken as the label of ``dataset[i]``. Returns a
    ``defaultdict(list)`` mapping each label to its records, in the order
    they appear in ``dataset``.
    """
    grouped = defaultdict(list)
    for position, block_id in enumerate(labels):
        grouped[block_id].append(dataset[position])
    return grouped

def pairs_from_blocks(blocks):
    """Return the set of all within-block record pairs.

    Every 2-combination of the records in each block is emitted as a sorted
    tuple, so a pair is represented the same way regardless of record order.
    """
    found = set()
    for members in blocks.values():
        for left, right in itertools.combinations(members, 2):
            found.add(tuple(sorted((left, right))))
    return found

In [13]:
def precision(found_pairs, true_pairs):
    """Fraction of ``found_pairs`` that are also in ``true_pairs``.

    Both arguments are sets of pairs. Returns 0.0 when ``found_pairs`` is
    empty instead of raising ``ZeroDivisionError``.
    """
    if not found_pairs:
        return 0.0
    return len(true_pairs & found_pairs) / len(found_pairs)

def recall(found_pairs, true_pairs):
    """Fraction of ``true_pairs`` that are also in ``found_pairs``.

    Both arguments are sets of pairs. Returns 0.0 when ``true_pairs`` is
    empty instead of raising ``ZeroDivisionError``.
    """
    if not true_pairs:
        return 0.0
    # |true & found| + |true - found| is simply |true|.
    return len(true_pairs & found_pairs) / len(true_pairs)

In [14]:
# Group the names by the label the `_ann` blocker assigned to them.
ann_blocks = blocks_from_labels(names_with_duplicates, ann_labels)
# ann_blocks  (uncomment to inspect the blocks)

In [15]:
# Score the `_ann` blocking: every within-block pair counts as a found pair.
ann_pairs = pairs_from_blocks(ann_blocks)
display(precision(ann_pairs, true_pairs))
display(recall(ann_pairs, true_pairs))

0.4115193109944826

0.7671851480180633

In [16]:
# Same evaluation for the `_brute` blocking: labels -> blocks -> within-block
# pairs -> precision and recall against true_pairs.
brute_blocks = blocks_from_labels(names_with_duplicates, brute_labels)
brute_pairs = pairs_from_blocks(brute_blocks)
display(precision(brute_pairs, true_pairs))
display(recall(brute_pairs, true_pairs))

0.41153238546603477

0.7842448569994982

In [17]:
# True pairs that the `_ann` blocking did not put in the same block.
true_pairs - ann_pairs

{('012 smile communications ltd', 'b communications ltd'),
 ('1 800 attorney inc', 'attorneys com inc'),
 ('1st miracle entertainment inc', '1st miracle group inc'),
 ('20 20 gene systems inc', '20 20 genesystems inc'),
 ('20th century industries', '21st century insurance group'),
 ('2100 capital managed futures offshore fund ltd',
  '2100 xenon managed futures offshore fund ltd'),
 ('2tor inc', '2u inc'),
 ('360network inc', '360networks inc'),
 ('3g capital partners lp', '3g capital partners ltd'),
 ('4front software international inc co', '4front technologies inc'),
 ('6922767 holding s a r l', '6922767 holding s r l'),
 ('79 capital securities llc', 'capital innovations securities llc'),
 ('@road inc', 'at road inc'),
 ('a c r securities inc', 'acr securities inc bd'),
 ('a l laboratories inc', 'a l pharma inc'),
 ('a t a p financial services inc', 'atap financial services inc bd'),
 ('a123 systems inc', 'b456 systems inc'),
 ('ab bond fund inc', 'alliance bond fund inc'),
 ('ab bo

In [18]:
# debugging
from helpers.tfidf_blocker import vectorize, index_on_approx_knn, compute_knn_similarity_matrix

# Build the intermediate artifacts by hand with the helper functions so that
# individual pair similarities can be inspected.
# NOTE(review): presumably these are the steps block_with_tfidf_ann runs
# internally -- confirm in helpers.tfidf_blocker.
tfidf_matrix = vectorize(string_list, ngram_range=(2, 2), analyzer='char_wb')
# The index and similarity helpers are fed the dense form of the matrix.
tfidf_dense_matrix = tfidf_matrix.toarray()
approx_knn_index = index_on_approx_knn(tfidf_dense_matrix)
pairwise_similarity_csr_matrix = compute_knn_similarity_matrix(approx_knn_index, tfidf_dense_matrix, k=k, threshold=threshold)
# Position <-> name lookups over names_with_duplicates.
i_to_name = dict(enumerate(names_with_duplicates))
# The comprehension's `k` is local to it and does not overwrite the module-level `k`.
name_to_i = {v: k for k, v in i_to_name.items()}

In [19]:
def name_pair_to_i_pair(name_pair):
    """Translate a pair of names into the pair of their indices in ``name_to_i``."""
    first_name, second_name = name_pair
    first_index = name_to_i[first_name]
    second_index = name_to_i[second_name]
    return first_index, second_index

# Entry of the k-NN similarity matrix at the (row, column) of this name pair.
pairwise_similarity_csr_matrix[name_pair_to_i_pair(('blackey jeffrey', 'blakey jeff'))]

0.7754569

In [20]:
# Entry of the k-NN similarity matrix for a pair differing by a transposition.
pairwise_similarity_csr_matrix[name_pair_to_i_pair(('bonny paul v', 'bonny pual v'))]

0.702332

In [21]:
# Row and column indices of the non-zero entries of the similarity matrix.
pairwise_similarity_csr_matrix.nonzero()

(array([   0,    1,    2, ..., 6733, 6734, 6734], dtype=int32),
 array([   0,    1,    2, ..., 6734, 6733, 6734], dtype=int32))

In [22]:
# Name pairs with a non-zero entry in the similarity matrix. Only entries with
# row index < column index are kept, so each unordered pair appears once and
# the diagonal is skipped.
row_indices, col_indices = pairwise_similarity_csr_matrix.nonzero()
non_zero_pairs = {
    (i_to_name[i], i_to_name[j])
    for i, j in zip(row_indices, col_indices)
    if i < j
}
non_zero_pairs

{('alden global emerging markets fund offshore lp',
  'allen global partners offshore'),
 ('atlas securities llc', 'bdc securities llc'),
 ('breed technologies inc', 'bytewatch technologies inc'),
 ('advisor disciplined trust 902', 'advisors disciplined trust 64'),
 ('advisors disciplind trust 641', 'advisors disciplined trust 64'),
 ('blue harbour group l p', 'blue harbour group lp'),
 ('agora securities corp', 'alliance securities corp'),
 ('calamos financial services inc', 'calypso financial services inc'),
 ('arch communications group inc', 'arch communications group inc old'),
 ('artisan therapeutics inc', 'avanyx therapeutics inc'),
 ('american classic securities inc bd', 'american landmark securities inc'),
 ('alliance growth & income fund inc', 'alliance mid cap growth fund inc'),
 ('alliance securities corp', 'bullaro securities corp bd'),
 ('berens capital partners lp', 'capital partnership lp'),
 ('alpha equity multi strategy offshore fund ltd',
  'aris multi strategy fund l

In [23]:
# Precision and recall of the pairs that have a non-zero similarity entry.
display(precision(non_zero_pairs, true_pairs))
display(recall(non_zero_pairs, true_pairs))

0.12682345936290562

0.8549924736578023

In [24]:
# True pairs that have no non-zero entry in the similarity matrix.
true_pairs - non_zero_pairs

{('012 smile communications ltd', 'b communications ltd'),
 ('1 800 attorney inc', 'attorneys com inc'),
 ('20 20 gene systems inc', '20 20 genesystems inc'),
 ('201 bjf descendants trust b', '2010 bjf descendants trust b'),
 ('20th century industries', '21st century insurance group'),
 ('2100 capital managed futures offshore fund ltd',
  '2100 xenon managed futures offshore fund ltd'),
 ('2tor inc', '2u inc'),
 ('360network inc', '360networks inc'),
 ('3g capital partners lp', '3g capital partners ltd'),
 ('4front software international inc co', '4front technologies inc'),
 ('79 capital securities llc', 'capital innovations securities llc'),
 ('@road inc', 'at road inc'),
 ('a l laboratories inc', 'a l pharma inc'),
 ('a123 systems inc', 'b456 systems inc'),
 ('ab bond fund inc', 'alliancebernstein bond fund inc'),
 ('ab cap fund inc', 'alliancebernstein cap fund inc'),
 ('ab corporate shares', 'alliancebernstein corporatee shares'),
 ('ab discovery growth fund inc', 'alliance mid cap

In [25]:
# Pairs with a non-zero similarity entry that are not true pairs.
non_zero_pairs - true_pairs

{('alden global emerging markets fund offshore lp',
  'allen global partners offshore'),
 ('atlas securities llc', 'bdc securities llc'),
 ('breed technologies inc', 'bytewatch technologies inc'),
 ('advisors disciplined trust 63', 'advisors disciplined trust series 69'),
 ('brandywine global investment management llc',
  'bt investment management re ltd'),
 ('advisor disciplined trust 902', 'advisors disciplined trust 64'),
 ('advisors disciplind trust 641', 'advisors disciplined trust 64'),
 ('agora securities corp', 'alliance securities corp'),
 ('calamos financial services inc', 'calypso financial services inc'),
 ('artisan therapeutics inc', 'avanyx therapeutics inc'),
 ('american classic securities inc bd', 'american landmark securities inc'),
 ('alliance growth & income fund inc', 'alliance mid cap growth fund inc'),
 ('alliance securities corp', 'bullaro securities corp bd'),
 ('berens capital partners lp', 'capital partnership lp'),
 ('alpha equity multi strategy offshore fund

In [26]:
# Entry of the k-NN similarity matrix for a pair differing by one character.
pairwise_similarity_csr_matrix[name_pair_to_i_pair(('aikins mark r', 'atkins mark r'))]

0.64621115

In [27]:
# Entry of the k-NN similarity matrix for this pair; the same pair is looked
# up in the brute-force matrix further down.
pairwise_similarity_csr_matrix[name_pair_to_i_pair(('allen corinne e', 'allen corrine'))]

0.720903

In [28]:
# Entry of the k-NN similarity matrix for this pair; the same pair is looked
# up in the brute-force matrix further down.
pairwise_similarity_csr_matrix[name_pair_to_i_pair(('ball eric j', 'ball eric r'))]

0.6239901

In [29]:
import os
from sparse_dot_topn import awesome_cossim_topn

# Similarity matrix computed directly from the TF-IDF matrix and its
# transpose, for comparison with the k-NN based matrix. `ntop` and
# `lower_bound` receive the same k and threshold used above.
# NOTE(review): per the sparse_dot_topn docs, ntop caps the entries kept per
# row and lower_bound drops smaller products -- confirm for the pinned version.
brute_force_pairwise_similarity_csr_matrix = awesome_cossim_topn(
    tfidf_matrix,
    tfidf_matrix.T,
    ntop=k,
    lower_bound=threshold,
    use_threads=True,
    n_jobs=os.cpu_count())

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

# Cross-check: cosine similarity computed directly from the two TF-IDF rows,
# to compare with the value stored in the k-NN similarity matrix.
cosine_similarity(tfidf_matrix[name_to_i['aikins mark r']], tfidf_matrix[name_to_i['atkins mark r']])

array([[0.64621107]])

In [31]:
# Same pair as probed in the k-NN matrix, now read from the brute-force matrix.
brute_force_pairwise_similarity_csr_matrix[name_pair_to_i_pair(('allen corinne e', 'allen corrine'))]

0.7209028641286321

In [32]:
# Same pair as probed in the k-NN matrix, now read from the brute-force matrix.
brute_force_pairwise_similarity_csr_matrix[name_pair_to_i_pair(('ball eric j', 'ball eric r'))]

0.6239901176596676