In [1]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

/kaggle/input/gov-names/Gov Orgs ONS.xlsx
/kaggle/input/sec-edgar-companies-list/sec__edgar_company_info.csv
/kaggle/input/sec-edgar-companies-list/database.sqlite


In [2]:
from random import choices
import gc
import time

import numpy as np
import pandas as pd
# Display settings: never truncate cell contents, columns, or rows.
# `None` is the supported "no limit" value for max_colwidth; a negative integer
# is deprecated since pandas 1.0 and rejected by newer versions.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from tqdm.notebook import tqdm

  import sys


In [3]:
# Directory that holds the SEC EDGAR company list inside the Kaggle input tree.
root = '/kaggle/input/sec-edgar-companies-list/'
csv_path = os.path.join(root, 'sec__edgar_company_info.csv')
# The file is decoded with the 'latin' codec (presumably it is not valid UTF-8 — confirm).
data = pd.read_csv(csv_path, encoding='latin')
data.head()

Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512
3,4,#1 PAINTBALL CORP,1433777
4,5,$ LLC,1427189


In [4]:
# Print the dataset dimensions as (rows, columns); the label reads "dataset size".
print('データセットのサイズ', data.shape)

データセットのサイズ (663000, 3)


In [5]:
# Select every object-dtype column and count its distinct values.
object_cols = data.select_dtypes(include='object')
object_cols.apply(lambda column: column.nunique())

Company Name    657160
dtype: int64

## 文字列照合によるアプローチ

In [6]:
# あいまい文字列マッチング
!pip install fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [7]:
# Show the last rows of the dataset.
data.tail()

Unnamed: 0,Line Number,Company Name,Company CIK Key
662995,662996,ZZ GLOBAL LLC,1501460
662996,662997,ZZIF 2008 INVESTMENT LLC,1448632
662997,662998,"ZZLL INFORMATION TECHNOLOGY, INC",1365357
662998,662999,"ZZX, LLC",1691924
662999,663000,ZZYZX ZZAZX ZZOZX INC,1184274


In [8]:
# Test: compute the string match ratio of one company name against two others.
for candidate in ('ZZLL INFORMATION TECHNOLOGY, INC', 'ZZX, LLC'):
    print(fuzz.ratio('ZZ GLOBAL LLC', candidate))

36
57


In [9]:
# Test: partial-match score of one company name against two others.
for candidate in ('ZZLL INFORMATION TECHNOLOGY, INC', 'ZZX, LLC'):
    print(fuzz.partial_ratio('ZZ GLOBAL LLC', candidate))

38
50


In [10]:
# Test: token-sort match score of one company name against two others.
for candidate in ('ZZLL INFORMATION TECHNOLOGY, INC', 'ZZX, LLC'):
    print(fuzz.token_sort_ratio('ZZ GLOBAL LLC', candidate))

32
60


In [11]:
# Test: token-set match score (duplicates ignored) of one company name against two others.
for candidate in ('ZZLL INFORMATION TECHNOLOGY, INC', 'ZZX, LLC'):
    print(fuzz.token_set_ratio('ZZ GLOBAL LLC', candidate))

32
60


In [12]:
# 文字化けの修正
!pip install ftfy

Collecting ftfy
  Downloading ftfy-5.9.tar.gz (66 kB)
[K     |████████████████████████████████| 66 kB 1.4 MB/s eta 0:00:011
Building wheels for collected packages: ftfy
  Building wheel for ftfy (setup.py) ... [?25ldone
[?25h  Created wheel for ftfy: filename=ftfy-5.9-py3-none-any.whl size=46451 sha256=e538ff2472d70fadda1628c1d986faaed01573522744727cf014409c3a7555e8
  Stored in directory: /root/.cache/pip/wheels/4f/6b/a5/84880e9435707659c6b96d3aadeb9a87a41f61ec9ede469f41
Successfully built ftfy
Installing collected packages: ftfy
Successfully installed ftfy-5.9


In [13]:
import re
from ftfy import fix_text


def ngrams(string, n=3):
    """Clean a name and break it into overlapping character n-grams.

    The text is passed through ``fix_text``, stripped of non-ASCII characters,
    lower-cased, cleared of brackets/dots/pipes/apostrophes, has ``&`` spelled
    out as ``and`` and commas/hyphens turned into spaces, is title-cased, has
    runs of spaces collapsed, and is finally padded with one space on each side.

    Args:
        string: The raw text to tokenise.
        n: Length of each n-gram (default 3).

    Returns:
        A list with every run of ``n`` consecutive characters of the cleaned,
        padded text, in order of appearance.
    """
    cleaned = fix_text(string)
    # Drop every character that cannot be represented in ASCII.
    cleaned = cleaned.encode("ascii", errors="ignore").decode()
    cleaned = cleaned.lower()

    # Delete brackets, dots, pipes and apostrophes.
    unwanted = ")(.|[]{}'"
    cleaned = re.sub('[' + re.escape(unwanted) + ']', '', cleaned)

    for old, new in (('&', 'and'), (',', ' '), ('-', ' ')):
        cleaned = cleaned.replace(old, new)

    # Normalise case: capital letter at the start of each word.
    cleaned = cleaned.title()
    # Collapse runs of spaces, trim, then pad so word boundaries form n-grams too.
    cleaned = re.sub(' +', ' ', cleaned).strip()
    cleaned = ' ' + cleaned + ' '
    # The class `[,-./]` matches ',', '-', '.' and '/'.
    # NOTE(review): after .title() an upper-case "BD" token looks unable to
    # occur, so the `\sBD` alternative may be dead — confirm before removing.
    cleaned = re.sub(r'[,-./]|\sBD', r'', cleaned)

    # Zip the text against copies of itself shifted by 0..n-1 characters.
    shifted = [cleaned[offset:] for offset in range(n)]
    return list(map(''.join, zip(*shifted)))

In [14]:
# Demonstrate the n-gram analyzer on a single name.
print('All 3-grams in "McDonalds":', ngrams('McDonalds'), sep='\n')

All 3-grams in "McDonalds":
[' Mc', 'Mcd', 'cdo', 'don', 'ona', 'nal', 'ald', 'lds', 'ds ']


In [15]:
%%time
# Tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

# Deduplicated company names; these are the inputs that get vectorized.
company_names = data['Company Name'].unique()
# Apply the tf-idf transformation, using `ngrams` as the analyzer.
# min_df=1 sets the minimum frequency threshold for a term to 1.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(company_names)

CPU times: user 37 s, sys: 685 ms, total: 37.7 s
Wall time: 38 s


In [16]:
# Inspect one row of the tf-idf matrix next to the n-grams of the SAME company
# name. Both lookups go through `sample_idx`, so the printed vector and the
# printed n-grams always describe the same entry of `company_names` (a
# hard-coded name can silently drift from the row index).
sample_idx = 2
print(tf_idf_matrix.shape, tf_idf_matrix[sample_idx])
print(ngrams(company_names[sample_idx]))

(657160, 37119)   (0, 27459)	0.08354503109262025
  (0, 12077)	0.08396008410324765
  (0, 905)	0.08342625072048299
  (0, 32422)	0.1037256786436347
  (0, 23534)	0.08939502489135233
  (0, 25811)	0.11122024402243162
  (0, 33677)	0.13902041143340646
  (0, 32250)	0.16722308086155893
  (0, 23527)	0.13239013628447852
  (0, 30833)	0.16133926755549224
  (0, 30205)	0.17005935834583902
  (0, 32094)	0.1721640261119851
  (0, 14761)	0.15280106636921786
  (0, 1075)	0.13284135865628227
  (0, 33255)	0.16300644231521
  (0, 29429)	0.11323843556886455
  (0, 34467)	0.1711593373129311
  (0, 30367)	0.16471851032372145
  (0, 21798)	0.19491566910014013
  (0, 32698)	0.1859439852372793
  (0, 26226)	0.18566159866786178
  (0, 7270)	0.1797067090900766
  (0, 570)	0.14994085911964772
  (0, 19627)	0.20561581420041686
  (0, 28897)	0.16324399375369777
  (0, 30135)	0.1354409067525283
  (0, 36984)	0.2190889973500009
  (0, 26405)	0.22296469860446785
  (0, 31949)	0.21957254388868913
  (0, 5328)	0.23288998763199392
  (0, 437)	

In [17]:
# process.extractOne(query, choices) picks the entry of `choices` closest to
# `query` and returns it together with its score.
# Take exactly the first 1000 names so the sample matches the message printed
# below, and slice once so the timed call and the estimate use the same list.
candidates = company_names[:1000]
t1 = time.time()
print(process.extractOne('Ministry of Justice', candidates))
t = time.time()-t1
print("SELFTIMED:", t)
# One lookup took `t` seconds; running a lookup for each of the candidates
# against the same list takes roughly t * len(candidates) seconds (shown in hours).
print("Estimated hours to complete for 1000 rows of  dataset:", (t*len(candidates))/60/60)

('07 GRAEME HALL/VINES OF JUSTICE, LLC', 86)
SELFTIMED: 0.05625629425048828
Estimated hours to complete for 1000 rows of  dataset: 0.015611121654510498
