## Использование Doc2Vec для получения эмбеддингов названий компаний

Загрузим тренировочный датасет

In [1]:
import pandas as pd
import numpy as np
train_ds = pd.read_csv('https://raw.githubusercontent.com/vladseve7n/ITMO.DUBL/main/dataset/train.csv').drop('pair_id', axis=1)
train_ds

Unnamed: 0,name_1,name_2,is_duplicate
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
2,"Rishichem Distributors Pvt., Ltd.",Dsa,0
3,Powermax Rubber Factory,Co. One,0
4,Tress A/S,Longyou Industries Park Zhejiang,0
...,...,...,...
497814,BIT-MAT PRODUCTS,The Goodyear Tire and Rubber Company,0
497815,"Bnd Trading Co., Ltd.",Zhong Shan Yue Liang Economy& Trade Imp. & Exp...,0
497816,"Xeikon Industrial Co., Ltd. Of Dongguan City","Yi Cheng Trading Co., Ltd. Of Dongguan City",0
497817,"Shanghai Kechuan Trading Co., Ltd.",Shanghai M&G Stationery Inc.,0


Загрузим предварительно запарсенные кластеры

In [2]:
import requests
clusters = requests.get('https://raw.githubusercontent.com/vladseve7n/ITMO.DUBL/main/dataset/clusters.json').json()['clusters']

In [64]:
clusters

[['ALFAGOMMA INDUSTRIAL SPA', ' Alfagomma'],
 [' SO.F.TER. SPA', 'Softer Us Inc.'],
 ['A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.',
  'A.P.I. Applicazioni Plastiche Industriali SPA',
  'API',
  'A.P.I.',
  'Trinseo API'],
 ['ABRO INDUSTRIES, INC.', 'ООО "АБРО ИНДАСТРИС"'],
 ['ADI COMMERCE', 'ADI commerce ltd', 'ADI (SALAMBO)'],
 ['ООО "АДЖИЛЕНТ ТЕКНОЛОДЖИЗ"',
  'AGILENT TECHNOLOGIES MFG GMBH & SHIPPING DEPARTMENT'],
 ['agip spa',
  'Azienda Generale Italiana Petroli',
  'AGIP',
  'AGIP  ( ENI GROUP)'],
 ['Andaluza De Ligantes Bituminosos Y Betunes Sa (En Liquidacion)',
  'Andaluza de Ligante Bituminosos y Betunes S.A.',
  'ALIBESA'],
 ['Alpha Trading S.p.a', 'ALPHA TRADING'],
 ['ANKARA INSAAT',
  'ANKARA İNŞAAT ',
  'ANKARA INSAAT TICARET ve SANAYI LIMITED SIRKETI'],
 ['SA APLIX', 'Aplix S.A.', 'APLIX'],
 ['APPIA LIANTS OUEST', 'APPIA LIANTS OUEST ( ALO)'],
 ['APS Paving&Stone Inc', 'APS', 'APS Paving Stone Inc'],
 ['Yalteks Yalitim Malzemeleri Sanayi Ve Ticaret Anonim Sirketi',


Инициализируем модель D2V

In [40]:
unique_names = list(np.unique(np.append(train_ds.name_1.values, train_ds.name_2.values)))
print(f'Всего уникальных имен {len(unique_names)}')

Всего уникальных имен 18022


In [41]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/usersp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [42]:
tagged_company_names = [TaggedDocument(nltk.word_tokenize(" ".join(name)), [i]) for i, name in enumerate(unique_names)]

In [57]:
model = Doc2Vec(vector_size=512, window=2, min_count=1, workers=16, epochs = 128)
model.build_vocab(tagged_company_names)

In [58]:
model.train(tagged_company_names, total_examples=model.corpus_count, epochs=model.epochs)

Получим эмбеддинг для каждого уникального названия в датасете

In [59]:
embed = lambda x: np.array(model.infer_vector(nltk.word_tokenize(x)))
embeddings_dict = {name: embed(name) for name in unique_names}

Посмотрим на расстояние между парами в датасете

In [60]:
train_ds['l1'] = (abs(train_ds.name_2.apply(lambda x: embeddings_dict[x]) - train_ds.name_1.apply(lambda x: embeddings_dict[x]))).apply(sum)
train_ds['l2'] = ((train_ds.name_2.apply(lambda x: embeddings_dict[x]) - train_ds.name_1.apply(lambda x: embeddings_dict[x]))**2).apply(sum)**(1/2)

In [61]:
train_ds[train_ds['is_duplicate'] == 1]

Unnamed: 0,name_1,name_2,is_duplicate,l1,l2
161,JX Nippon Oil & Gas Exploration (Brasil) Ltda,JX Nippon Oil & Gas Exploration Technical Serv...,1,59.358015,3.210978
603,Pirelli Neumaticos S.A.I.C.,"Pirelli Tyre Co., Ltd.",1,15.263008,0.838041
835,Brenntag Australia (Pty) Ltd.,Brenntag Group,1,45.552270,2.517950
1328,"PAUL BAUDER GMBH & CO KG, BOCHUM PLANT",Paul Bauder ag,1,52.679408,2.910244
1562,TOTAL CESKA REPUBLIKA s.r.o.,TOTAL FRANCE (ARNAY LE DUC),1,41.229374,2.240990
...,...,...,...,...,...
496574,"Bridgestone （Huizhou）Synthetic Rubber Co., Ltd.","Bridgestone India Pvt., Ltd.",1,7.120265,0.388083
496760,Arlanxeo International Group,Arlanxeo Corp.,1,15.303410,0.845075
497015,Brenntag Peru S.A.C.,Brenntag Chile Comercial E Industrial Ltda,1,23.193217,1.301358
497083,Dow Chemical International Private Ltd.,Dow Chemical Pacific,1,15.794774,0.879468


In [62]:
train_ds[train_ds['is_duplicate'] == 1].l1.mean(), train_ds[train_ds['is_duplicate'] == 1].l2.mean()

(20.2497689859702, 1.1131544279714507)

In [63]:
train_ds[train_ds['is_duplicate'] == 0].l1.mean(), train_ds[train_ds['is_duplicate'] == 0].l2.mean()

(21.333131028018073, 1.1736903110121337)

Создадим датасет со средним вектором каждого кластера и попробуем найти соответствие по минимальному расстоянию

In [50]:
import numpy as np
from tqdm import tqdm

In [51]:
cluster_embedding = {}
for num, cluster in enumerate(tqdm(clusters)):
    cluster_embedding[num] = {}
    cluster_embedding[num]['cluster'] = cluster
    embed_cluster = np.mean(np.array([embeddings_dict[x] for x in cluster]), axis=0)
    cluster_embedding[num]['embedding'] = embed_cluster

100%|██████████████████████████████████████| 450/450 [00:00<00:00, 74086.86it/s]


In [52]:
def find_closest_cluster(cluster_embedding: dict, name_of_company: str, embed, distance: str = 'l1'):
    name_embedding = embed(name_of_company)
    min_distance = 10e12
    result_cluster = None
    for key in cluster_embedding:
        cluster = cluster_embedding[key]['cluster']
        cluster_embed = cluster_embedding[key]['embedding']
        if distance == 'l1':
            distance = np.abs(cluster_embed - name_embedding).sum()
        else:
            distance = (((cluster_embed - name_embedding)**2).sum())**(1/2)
        if distance < min_distance:
            min_distance = distance
            result_cluster = cluster
    return result_cluster, min_distance

In [56]:
find_closest_cluster(cluster_embedding, 'Paul Bauder ag	', embed, distance='l2')

(['Sumitomo Corporation Of America',
  'Sumitomo Industrias Pesadas Do Brasil Ltda',
  'Sumitomo Corporation Do Brasil Sa',
  'Sumitomo Corp. Of America',
  'Sumitomo Rubber Do Brasil Ltda'],
 0.013719932994406842)

In [54]:
find_closest_cluster(cluster_embedding, 'Bridgestone International Group', embed, distance='l2')

(['Sumitomo Corporation Of America',
  'Sumitomo Industrias Pesadas Do Brasil Ltda',
  'Sumitomo Corporation Do Brasil Sa',
  'Sumitomo Corp. Of America',
  'Sumitomo Rubber Do Brasil Ltda'],
 0.013825186022104422)