In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [3]:
# https://www.kaggle.com/dattapiy/sec-edgar-companies-list
names_df = pd.read_csv('data/sec__edgar_company_info.csv')[:100000]
names_df.head(3)

Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512


In [4]:
import re

irrelevant_regex = re.compile(r'[^a-z0-9\s\!\@\#\$\%\&]')
multispace_regex = re.compile(r'\s\s+')

def assign_no_symbols_name(df):
    return df.assign(
        name=df['Company Name']
             .str.lower()
             .str.replace(irrelevant_regex, ' ')
             .str.replace(multispace_regex, ' ')
             .str.strip())

names_df = assign_no_symbols_name(names_df)
names_df.head(9)

Unnamed: 0,Line Number,Company Name,Company CIK Key,name
0,1,!J INC,1438823,!j inc
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607,#1 a lifesafer holdings inc
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512,#1 arizona discount properties llc
3,4,#1 PAINTBALL CORP,1433777,#1 paintball corp
4,5,$ LLC,1427189,$ llc
5,6,& S MEDIA GROUP LLC,1447162,& s media group llc
6,7,&TV COMMUNICATIONS INC.,1479357,&tv communications inc
7,8,"'MKTG, INC.'",886475,mktg inc
8,9,'OHANA LABS INC.,1703629,ohana labs inc


In [5]:
index_to_name = names_df['name'].to_dict()
name_to_index = {v: k for k, v in index_to_name.items()}

In [6]:
import more_itertools

cik_clusters = (
    names_df[['name', 'Company CIK Key']]
    .groupby('Company CIK Key')
    .indices
)
more_itertools.take(10, cik_clusters.values())

[array([5878]),
 array([6287]),
 array([6375]),
 array([6459, 6468]),
 array([6521]),
 array([6578]),
 array([7053, 7055]),
 array([7209]),
 array([7175]),
 array([7525])]

In [7]:
import itertools

cik_name_pairs = [
    (index_to_name[id_x], index_to_name[id_y])
    for cik_cluster in cik_clusters.values()
    for id_x, id_y in itertools.combinations(cik_cluster, 2)
    if index_to_name[id_x] != index_to_name[id_y]
]
cik_name_pairs[:10]

[('abel noser corp bd', 'abel noser corp'),
 ('abraham & co inc bd', 'abraham & co inc'),
 ('acme metals inc de', 'acme metals inc'),
 ('adams diversified equity fund inc', 'adams diversified equity fund'),
 ('adams diversified equity fund inc', 'adams express co'),
 ('adams diversified equity fund', 'adams express co'),
 ('alliance gaming corp', 'bally technologies inc'),
 ('aei securities inc bd', 'aei securities inc'),
 ('aeroflex inc', 'arx inc'),
 ('aetna life & casualty co', 'aetna services inc ct')]

In [88]:
import more_itertools
from fuzzywuzzy import fuzz

true_pairs = {
    tuple(sorted(pair))
    for pair in cik_name_pairs
    if fuzz.token_sort_ratio(*pair) >= 60
    # if fuzz.partial_ratio(*pair) >= 60  
}
display(more_itertools.take(10, true_pairs))
display(len(true_pairs))

[('blackstone madison ave offshore fund ltd',
  'blackstone madison avenue offshore fund ltd class a b c & g shares'),
 ('biolex inc', 'biolex therapeutics inc'),
 ('alanar inc bd', 'alanar incorporated'),
 ('advanced glassfiber yarns llc', 'advanced glassfiber yarus llc'),
 ('agic equity & convertible income fund',
  'allianzgi equity & convertible income fund'),
 ('aqr global asset allocation offshore fund usd ii ltd',
  'aqr global asset allocationoffshore fund usd ii ltd'),
 ('american energy partners inc de', 'american energy production inc'),
 ('across america real estate corp',
  'across america real estate development corp'),
 ('bonny paul a', 'bonny pual v'),
 ('ad systems communications', 'ad systems communications inc')]

3986

In [89]:
names_with_duplicates = list(set(itertools.chain.from_iterable(true_pairs)))
names_with_duplicates.sort()
len(names_with_duplicates)

6735

In [101]:
string_list = names_with_duplicates
k = 50
threshold = 0.8

In [91]:
from pymagnitude import Magnitude
vectors = Magnitude("~/Downloads/wiki-news-300d-1M-subword.magnitude")

In [92]:
# https://stackoverflow.com/a/50237379/145349
import numpy as np

def fasttext_vectorize_string_list(string_list):
    all_string_vectors = []
    for string in string_list:
        string_vectors = []
        for word in string.split():
            string_vectors.append(vectors.query(word))
        all_string_vectors.append(np.mean(string_vectors, axis=0))
    fasttext_matrix = np.array(all_string_vectors)
    return fasttext_matrix
    
fasttext_matrix = fasttext_vectorize_string_list(string_list)
fasttext_matrix

array([[ 0.00940245,  0.00476717, -0.02096105, ...,  0.01635175,
         0.00679325, -0.00542458],
       [-0.00581288,  0.01224012,  0.04869105, ...,  0.00975113,
         0.0136479 , -0.00815085],
       [ 0.00097756, -0.015883  ,  0.02517906, ..., -0.0164187 ,
        -0.00445986, -0.0137378 ],
       ...,
       [ 0.00769634, -0.04264872,  0.01508736, ...,  0.0393482 ,
         0.00134668, -0.02387204],
       [ 0.02172828, -0.03250718,  0.00367697, ...,  0.00083562,
         0.01238225,  0.01873857],
       [ 0.01227023, -0.01549435, -0.00300307, ..., -0.00753788,
         0.01825282,  0.01084107]])

In [110]:
from sklearn.cluster import AgglomerativeClustering

linkage = 'average'
fasttext_labels = AgglomerativeClustering(
    affinity='l2',
    linkage=linkage,
    distance_threshold=1 - threshold,
    n_clusters=None
).fit_predict(fasttext_matrix)
fasttext_labels

array([1474, 1861,  832, ...,  718,  752,  752])

In [111]:
len(set(fasttext_labels))

2234

In [112]:
from collections import defaultdict

def blocks_from_labels(dataset, labels):
    blocks = defaultdict(list)
    for i, label in enumerate(labels):
        record = dataset[i]
        blocks[label].append(record)
    return blocks

def pairs_from_blocks(blocks):
    return set(
        tuple(sorted(pair))
        for block in blocks.values()
        for pair in itertools.combinations(block, 2))

In [113]:
def precision(found_pairs, true_pairs):
    return len(true_pairs & found_pairs) / len(found_pairs)

def recall(found_pairs, true_pairs):
    return len(true_pairs & found_pairs) / (len(true_pairs & found_pairs) + len(true_pairs - found_pairs))

In [114]:
fasttext_blocks = blocks_from_labels(string_list, fasttext_labels)
# fasttext_blocks

In [115]:
fastext_pairs = pairs_from_blocks(fasttext_blocks)
display(precision(fastext_pairs, true_pairs))
display(recall(fastext_pairs, true_pairs))

0.030782301049740474

0.6635725037631711

In [116]:
true_pairs - fastext_pairs

{('blackstone madison ave offshore fund ltd',
  'blackstone madison avenue offshore fund ltd class a b c & g shares'),
 ('alanar inc bd', 'alanar incorporated'),
 ('american energy partners inc de', 'american energy production inc'),
 ('bonny paul a', 'bonny pual v'),
 ('abengoa bioenergy new technologies inc', 'abengoa bioenergy r&d inc'),
 ('berens credit opportunities fund l p',
  'berens distressed opportunities lp'),
 ('becker christipher', 'becker christopher'),
 ('at&t capital corp de', 'capita corp'),
 ('anaphore inc', 'anaphore nc'),
 ('california limited maturity municipals portfolio',
  'california ltd maturity municipals portfolio'),
 ('alupka absolute return fund l p', 'alupka absolute return fund lp'),
 ('brummett burcar alison', 'burcar alison d'),
 ('birchtree financial services inc bd', 'birchtree financial services llc'),
 ('aq holdings inc', 'aq holidings inc'),
 ('advisors disciplined trsut 590', 'advisors disciplined trust 590'),
 ('arnheim walter r', 'arnheim walt

In [117]:
fastext_pairs - true_pairs

{('alliancebernstein multi manager alternative fund',
  'blackrock multi strategy fund ltd'),
 ('aegis asset backed securities llc', 'bonds direct securities llc bd'),
 ('banc of america capital management llc',
  'california university of protection & intelligence management'),
 ('adelphi europe partners l p', 'alloy global fund l p'),
 ('ascend u s market neutral fund bpo ltd',
  'blackstone partners investment fund l p'),
 ('blue rock long duration plus fund l p', 'canyon balanced equity fund l p'),
 ('birchtree financial services llc',
  'capital research brokerage services llc bd'),
 ('advanced magnetics inc', 'advanced medical inc'),
 ('athena capital 2002 l p',
  'blue point capital partners ii executive fund l p'),
 ('blackstone credit opportunities fund l p',
  'blackstone participation partnership cayman v nq l p'),
 ('aqr style premia emerging bond master account l p',
  'blue water total return fund i l p'),
 ('ab high income fund inc', 'battery alternative income fund llc'