In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [3]:
# Source data: https://www.kaggle.com/dattapiy/sec-edgar-companies-list
subset_size = 100000  # change to 1000000 to see ANN become faster than brute force
# nrows stops parsing after subset_size rows, instead of loading the whole CSV
# and slicing it afterwards.
names_df = pd.read_csv('data/sec__edgar_company_info.csv', nrows=subset_size)
names_df.head(3)

Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512


In [4]:
import re

# Matches every character that is NOT a lowercase letter, a digit, whitespace
# or one of the symbols ! @ # $ % &.
irrelevant_regex = re.compile(r'[^a-z0-9\s\!\@\#\$\%\&]')
# Matches runs of two or more whitespace characters.
multispace_regex = re.compile(r'\s\s+')

def assign_no_symbols_name(df):
    """Return a new DataFrame equal to ``df`` plus a normalized ``name`` column.

    ``name`` is derived from ``df['Company Name']`` by lowercasing, replacing
    every character matched by ``irrelevant_regex`` with a space, collapsing
    whitespace runs matched by ``multispace_regex`` into one space, and
    stripping leading/trailing whitespace. ``df`` itself is not modified.
    """
    return df.assign(
        name=df['Company Name']
             .str.lower()
             # regex=True is explicit: newer pandas defaults to regex=False and
             # rejects a compiled pattern in that mode.
             .str.replace(irrelevant_regex, ' ', regex=True)
             .str.replace(multispace_regex, ' ', regex=True)
             .str.strip())

names_df = assign_no_symbols_name(names_df)
names_df.head(9)

Unnamed: 0,Line Number,Company Name,Company CIK Key,name
0,1,!J INC,1438823,!j inc
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607,#1 a lifesafer holdings inc
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512,#1 arizona discount properties llc
3,4,#1 PAINTBALL CORP,1433777,#1 paintball corp
4,5,$ LLC,1427189,$ llc
5,6,& S MEDIA GROUP LLC,1447162,& s media group llc
6,7,&TV COMMUNICATIONS INC.,1479357,&tv communications inc
7,8,"'MKTG, INC.'",886475,mktg inc
8,9,'OHANA LABS INC.,1703629,ohana labs inc


In [5]:
# Row label -> normalized name, plus the reverse lookup. A name that appears on
# several rows keeps only the last of its row labels in the reverse mapping.
index_to_name = names_df['name'].to_dict()
name_to_index = {name: row_label for row_label, name in index_to_name.items()}

In [6]:
import more_itertools

# Group rows by CIK: each value is an array of row indices sharing that CIK.
# NOTE(review): these indices are later used as keys into index_to_name, which
# is keyed by index label; this presumably relies on names_df still having its
# default 0..n-1 index so that positions and labels coincide -- confirm.
cik_clusters = names_df.groupby('Company CIK Key').indices
more_itertools.take(10, cik_clusters.values())

[array([5878]),
 array([6287]),
 array([6375]),
 array([6459, 6468]),
 array([6521]),
 array([6578]),
 array([7053, 7055]),
 array([7209]),
 array([7175]),
 array([7525])]

In [7]:
import itertools

# Every unordered pair of distinct normalized names registered under one CIK.
# Names are resolved once per cluster member, then paired.
cik_name_pairs = [
    (name_x, name_y)
    for cik_cluster in cik_clusters.values()
    for name_x, name_y in itertools.combinations(
        (index_to_name[row_id] for row_id in cik_cluster), 2)
    if name_x != name_y
]
cik_name_pairs[:10]

[('abel noser corp bd', 'abel noser corp'),
 ('abraham & co inc bd', 'abraham & co inc'),
 ('acme metals inc de', 'acme metals inc'),
 ('adams diversified equity fund inc', 'adams diversified equity fund'),
 ('adams diversified equity fund inc', 'adams express co'),
 ('adams diversified equity fund', 'adams express co'),
 ('alliance gaming corp', 'bally technologies inc'),
 ('aei securities inc bd', 'aei securities inc'),
 ('aeroflex inc', 'arx inc'),
 ('aetna life & casualty co', 'aetna services inc ct')]

In [8]:
import more_itertools
from fuzzywuzzy import fuzz

# Ground truth: same-CIK name pairs whose token_sort_ratio is at least 60.
# Each pair is stored as a sorted tuple so (a, b) and (b, a) collapse to one
# entry. (fuzz.partial_ratio with the same cut-off was the alternative tried.)
true_pairs = set(
    tuple(sorted((name_x, name_y)))
    for name_x, name_y in cik_name_pairs
    if fuzz.token_sort_ratio(name_x, name_y) >= 60
)
display(more_itertools.take(10, true_pairs))
display(len(true_pairs))

[('candela capital l p', 'candela capital lp'),
 ('buffalo equity fund inc', 'buffalo large cap fund inc'),
 ('better place llc', 'better plc llc'),
 ('advest bank & trust co', 'advest trust co'),
 ('brookwood securities partners llc', 'brookwood securities partners lp bd'),
 ('americredit automobile receivables trust 2002 d',
  'americredit financial services inc automobile rec tr 2002 d'),
 ('azimuth diversified fund ii llc', 'azimuth diversified fund llc'),
 ('ab cap fund inc', 'alliancebernstein cap fund inc'),
 ('anangel american shipholdings ltd',
  'anangel american shipholdings ltd adr'),
 ('ag separate account a of agl', 'aga separate account a')]

3986

In [9]:
# Distinct names that take part in at least one true pair, in sorted order.
names_with_duplicates = sorted(set(itertools.chain.from_iterable(true_pairs)))
len(names_with_duplicates)

6735

In [10]:
# Inputs shared by the ANN and brute-force blocking runs.
string_list = names_with_duplicates
# Forwarded as `k` to the blockers; presumably the number of nearest
# neighbours considered per string -- TODO confirm in helpers.tfidf_blocker.
k = 50
# Forwarded as `threshold` to the blockers; presumably the minimum similarity
# for two strings to be linked -- TODO confirm in helpers.tfidf_blocker.
threshold = 0.6

In [11]:
%%time

from helpers.tfidf_blocker import block_with_tfidf_ann

ann_labels = block_with_tfidf_ann(string_list, threshold=threshold, k=k)
display(ann_labels)

array([2847, 2557, 2846, ...,  971, 1159, 1159])

CPU times: user 17.7 s, sys: 543 ms, total: 18.2 s
Wall time: 6.51 s


In [12]:
%%time

from helpers.tfidf_blocker import block_with_tfidf_brute

brute_labels = block_with_tfidf_brute(string_list, threshold=threshold, k=k)
display(brute_labels)

array([2836, 2541, 2835, ...,  964, 2343, 2343])

CPU times: user 2.07 s, sys: 440 ms, total: 2.51 s
Wall time: 1.88 s


In [13]:
from collections import defaultdict

def blocks_from_labels(dataset, labels):
    """Group the records of ``dataset`` by their block label.

    ``labels[i]`` is the label of ``dataset[i]``. Returns a ``defaultdict``
    mapping each label to the list of its records, in dataset order.
    """
    grouped = defaultdict(list)
    for position, block_label in enumerate(labels):
        grouped[block_label].append(dataset[position])
    return grouped

def pairs_from_blocks(blocks):
    """Return the set of unordered record pairs that share a block.

    ``blocks`` maps a label to a list of records. Every 2-combination inside a
    block is emitted as a sorted tuple, so each pair appears once regardless of
    the order of its members.
    """
    found = set()
    for members in blocks.values():
        for left, right in itertools.combinations(members, 2):
            found.add(tuple(sorted((left, right))))
    return found

In [14]:
def precision(found_pairs, true_pairs):
    """Fraction of ``found_pairs`` that are also in ``true_pairs``.

    Both arguments are sets of pairs. Returns 0.0 when ``found_pairs`` is
    empty, instead of raising ``ZeroDivisionError``.
    """
    if not found_pairs:
        return 0.0
    return len(true_pairs & found_pairs) / len(found_pairs)

def recall(found_pairs, true_pairs):
    """Fraction of ``true_pairs`` that are also in ``found_pairs``.

    Both arguments are sets of pairs. Returns 0.0 when ``true_pairs`` is empty,
    instead of raising ``ZeroDivisionError``.
    """
    if not true_pairs:
        return 0.0
    # Hits plus misses always add up to len(true_pairs) for sets.
    return len(true_pairs & found_pairs) / len(true_pairs)

In [15]:
ann_blocks = blocks_from_labels(string_list, ann_labels)
# ann_blocks

In [16]:
ann_pairs = pairs_from_blocks(ann_blocks)
display(precision(ann_pairs, true_pairs))
display(recall(ann_pairs, true_pairs))

0.410547132498352

0.7812343201204215

In [17]:
brute_blocks = blocks_from_labels(string_list, brute_labels)
brute_pairs = pairs_from_blocks(brute_blocks)
display(precision(brute_pairs, true_pairs))
display(recall(brute_pairs, true_pairs))

0.41153238546603477

0.7842448569994982

In [18]:
true_pairs - ann_pairs

{('012 smile communications ltd', 'b communications ltd'),
 ('1 800 attorney inc', 'attorneys com inc'),
 ('20th century industries', '21st century insurance group'),
 ('2tor inc', '2u inc'),
 ('4front software international inc co', '4front technologies inc'),
 ('79 capital securities llc', 'capital innovations securities llc'),
 ('@road inc', 'at road inc'),
 ('a c r securities inc', 'acr securities inc bd'),
 ('a l laboratories inc', 'a l pharma inc'),
 ('a t a p financial services inc', 'atap financial services inc bd'),
 ('a123 systems inc', 'b456 systems inc'),
 ('ab bond fund inc', 'alliance bond fund inc'),
 ('ab bond fund inc', 'alliancebernstein bond fund inc'),
 ('ab cap fund inc', 'alliancebernstein cap fund inc'),
 ('ab corporate shares', 'alliancebernstein corporate shares'),
 ('ab corporate shares', 'alliancebernstein corporatee shares'),
 ('ab discovery growth fund inc', 'alliance mid cap growth fund inc'),
 ('ab discovery growth fund inc',
  'alliancebernstein discover

In [19]:
ann_pairs - true_pairs

{('american express financial advisors', 'american express financial corp ta'),
 ('a t a p financial services inc', 'bolton financial services llc bd'),
 ('1 lane technologies corp', 'calient technologies inc'),
 ('barclays bank plc', 'barclays private bank & trust cayman ltd'),
 ('american realty capital trust ii inc', 'american realty trust inc'),
 ('bencmark capital management co llc', 'berkeley capital management funds'),
 ('advisors disciplined trust 50', 'advisors discliplined trust series 20'),
 ('amalgamated bank', 'amalgamated bank longview funds'),
 ('advisors disciplined trust 52', 'advisors disciplined trust series 57'),
 ('ammon robert j', 'beckman robert j'),
 ('alpine associates l p', 'amici associates l p'),
 ('capital guardian international all countries equity fund for tax exempt trusts',
  'capital guardian international non us equity fund for tax tr'),
 ('ascent acquisition corp', 'ascent acquisition psc llc'),
 ('aegis investments inc bd', 'altegris investments llc

In [20]:
# debugging
from helpers.tfidf_blocker import vectorize, index_on_approx_knn, compute_knn_similarity_matrix

# Build the intermediate artefacts one step at a time so they can be inspected.
# NOTE(review): the vectorizer settings here presumably mirror what
# block_with_tfidf_ann uses internally -- confirm in helpers.tfidf_blocker.
tfidf_matrix = vectorize(string_list, ngram_range=(2, 2), analyzer='char_wb')
tfidf_dense_matrix = tfidf_matrix.toarray()
approx_knn_index = index_on_approx_knn(tfidf_dense_matrix)
pairwise_similarity_csr_matrix = compute_knn_similarity_matrix(
    approx_knn_index,
    tfidf_dense_matrix,
    k=k,
    threshold=threshold,
)
# Position in string_list <-> string, in both directions.
i_to_string = dict(enumerate(string_list))
string_to_i = {text: position for position, text in i_to_string.items()}

In [21]:
def string_pair_to_i_pair(string_pair):
    """Translate a 2-tuple of strings into their ``string_to_i`` positions.

    Raises ``KeyError`` if either string is missing from ``string_to_i``.
    """
    first, second = string_pair
    return (string_to_i[first], string_to_i[second])

pairwise_similarity_csr_matrix[string_pair_to_i_pair(('blackey jeffrey', 'blakey jeff'))]

0.7754569

In [22]:
pairwise_similarity_csr_matrix.nonzero()

(array([   0,    1,    2, ..., 6733, 6734, 6734], dtype=int32),
 array([   0,    1,    2, ..., 6734, 6733, 6734], dtype=int32))

In [23]:
# String pairs with a stored (non-zero) similarity. Only row < col entries are
# kept, which drops the diagonal and the mirrored duplicate of each pair.
non_zero_pairs = {
    (i_to_string[row], i_to_string[col])
    for row, col in zip(*pairwise_similarity_csr_matrix.nonzero())
    if row < col
}
non_zero_pairs

{('apollo capital group inc', 'blue capital group llc bd'),
 ('advisors disciplined trust 551', 'advisors disciplined trust 63'),
 ('apollo distressed investment fund lp', 'brar investment fund lp'),
 ('apollo real estate opportunity fund vi lp',
  'capital structures opportunity fund lp'),
 ('alliancebernstein global bond fund inc',
  'allliancebernstein greater china 97 fund inc'),
 ('acadian asset management llc', 'boston partners asset management llc'),
 ('capital growth financial llc', 'capital growth securities llc'),
 ('achelios therapeutics inc', 'agilis biotherapeutics inc'),
 ('atlantic coast airlines holdings inc', 'c t holdings inc'),
 ('abi capital management llc', 'berkeley capital management llc'),
 ('banc of america mort sec inc mort pass thr certs ser 2003 1',
  'banc one mortgage sec inc mort pass thr certs ser 2003 9'),
 ('accipiter capital management llc', 'capital markets mangement llc'),
 ('brar investment capital llc', 'brar investment fund lp'),
 ('altair nanote

In [24]:
display(precision(non_zero_pairs, true_pairs))
display(recall(non_zero_pairs, true_pairs))

0.12789753432747675

0.8692925238334169

In [25]:
true_pairs - non_zero_pairs

{('012 smile communications ltd', 'b communications ltd'),
 ('1 800 attorney inc', 'attorneys com inc'),
 ('20th century industries', '21st century insurance group'),
 ('2tor inc', '2u inc'),
 ('4front software international inc co', '4front technologies inc'),
 ('6922767 holding s a r l', '6922767 holding s r l'),
 ('79 capital securities llc', 'capital innovations securities llc'),
 ('@road inc', 'at road inc'),
 ('a l laboratories inc', 'a l pharma inc'),
 ('a123 systems inc', 'b456 systems inc'),
 ('ab bond fund inc', 'alliancebernstein bond fund inc'),
 ('ab cap fund inc', 'alliancebernstein cap fund inc'),
 ('ab discovery growth fund inc', 'alliance mid cap growth fund inc'),
 ('ab discovery growth fund inc', 'alliancebernstein mid cap growth fund inc'),
 ('ab equity income fund inc', 'alliance utility income fund inc'),
 ('ab equity income fund inc', 'alliancebernstein utility income fund inc'),
 ('ab funds trust de', 'annuity board funds trust'),
 ('ab large cap growth fund inc

In [26]:
non_zero_pairs - true_pairs

{('apollo capital group inc', 'blue capital group llc bd'),
 ('american energy capital partners lp', 'breitburn energy partners l p'),
 ('advisors disciplined trust 551', 'advisors disciplined trust 63'),
 ('apollo distressed investment fund lp', 'brar investment fund lp'),
 ('apollo real estate opportunity fund vi lp',
  'capital structures opportunity fund lp'),
 ('alliancebernstein global bond fund inc',
  'allliancebernstein greater china 97 fund inc'),
 ('basso capital management lp', 'bencmark capital management co llc'),
 ('acadian asset management llc', 'boston partners asset management llc'),
 ('achelios therapeutics inc', 'agilis biotherapeutics inc'),
 ('atlantic coast airlines holdings inc', 'c t holdings inc'),
 ('adamas opportunities l p',
  'american securities opportunities fund iii b l p'),
 ('abi capital management llc', 'berkeley capital management llc'),
 ('banc of america mort sec inc mort pass thr certs ser 2003 1',
  'banc one mortgage sec inc mort pass thr certs

In [27]:
pairwise_similarity_csr_matrix[string_pair_to_i_pair(('aikins mark r', 'atkins mark r'))]

0.64621115