### 编辑距离（Levenshtein距离）

In [20]:
%%time
import numpy as np

def calculate_levenshtein_distance(text1, text2, t_type=None):
    """Return the Levenshtein distance between the brand parts of two names.

    Each input is passed through ``companynameparser.parse`` and only the
    ``'brand'`` entry of the result is compared, so the distance reflects the
    brand and not the rest of the name.

    Args:
        text1: First company name.
        text2: Second company name.
        t_type: Accepted for interface compatibility; currently unused.
            NOTE(review): the previous body contained an unfinished
            ``if t_type`` statement (a SyntaxError). It presumably was meant
            to make the brand parsing optional -- confirm the intended
            semantics before wiring this flag in.

    Returns:
        The edit distance (insertion, deletion and substitution each cost 1)
        as a NumPy float.
    """
    text1 = companynameparser.parse(text1)['brand']
    text2 = companynameparser.parse(text2)['brand']
    m, n = len(text1), len(text2)

    # dp[i, j] = distance between text1[:i] and text2[:j].
    dp = np.zeros((m + 1, n + 1))
    dp[:, 0] = np.arange(m + 1)  # turning a prefix into "" takes i deletions
    dp[0, :] = np.arange(n + 1)  # turning "" into a prefix takes j insertions

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if text1[i - 1] == text2[j - 1]:
                dp[i, j] = dp[i - 1, j - 1]
            else:
                # Cheapest of deletion, insertion and substitution.
                dp[i, j] = min(dp[i - 1, j], dp[i, j - 1], dp[i - 1, j - 1]) + 1

    return dp[m, n]

# Try the distance function on two English sentences.
text1, text2 = "I love Python programming", "Python programming is great"

levenshtein_distance = calculate_levenshtein_distance(text1, text2)
print(levenshtein_distance)

0.0
CPU times: total: 0 ns
Wall time: 995 µs


### BERT
BERT（Bidirectional Encoder Representations from Transformers）是一种基于Transformer的预训练模型，用于捕捉上下文相关的单词表示。可以先用BERT模型将两段文本分别编码为向量，再计算这两个向量之间的余弦相似度。

In [None]:
from sentence_transformers import SentenceTransformer

# Loaded SentenceTransformer instances keyed by model name, so repeated calls
# reuse the model instead of loading it again.
_BERT_MODEL_CACHE = {}


def calculate_bert_similarity(text1, text2, model_name="bert-base-nli-mean-tokens"):
    """Return the cosine similarity between the sentence embeddings of two texts.

    Args:
        text1: First text.
        text2: Second text.
        model_name: Name passed to ``SentenceTransformer``. The default is the
            model this function originally hard-coded.

    Returns:
        The off-diagonal entry of the 2x2 cosine-similarity matrix of the two
        embeddings.
    """
    # Imported here so the function does not rely on another cell having
    # imported cosine_similarity first.
    from sklearn.metrics.pairwise import cosine_similarity

    model = _BERT_MODEL_CACHE.get(model_name)
    if model is None:
        model = _BERT_MODEL_CACHE[model_name] = SentenceTransformer(model_name)

    embeddings = model.encode([text1, text2])
    similarity = cosine_similarity(embeddings)
    return similarity[0][1]

# Try the BERT similarity on two English sentences.
text1, text2 = "I love Python programming", "Python programming is great"

bert_similarity = calculate_bert_similarity(text1, text2)
print(bert_similarity)

## 执行

In [14]:
import json

# NOTE(review): no explicit encoding is given, so the platform default is used
# -- confirm the encoding of the JSON file.
with open('nmpa_company_name.json', 'r') as file:
    company_records = json.load(file)  # the `with` block closes the file on exit

# Keep only the 'Entity_Name_ZH' field of every record.
names = [record['Entity_Name_ZH'] for record in company_records]

names[:10]

['深圳迈瑞生物医疗电子股份有限公司',
 '迈克生物股份有限公司',
 '深圳市亚辉龙生物科技股份有限公司',
 '郑州安图生物工程股份有限公司',
 '深圳市新产业生物医学工程股份有限公司',
 '桂林优利特医疗电子有限公司',
 '迪瑞医疗科技股份有限公司',
 '山东康华生物医疗科技股份有限公司',
 '广州万孚生物技术股份有限公司',
 '中元汇吉生物技术股份有限公司']

In [23]:
%%time
# Explicit-loop variant: compare each of the first 10 names with every other name.
out_data = []
for x in names[:10]:
    for y in names:
        if x == y:
            continue  # identical names are not compared
        levenshtein_distance = calculate_levenshtein_distance(x, y)
        out_data.append(dict(x=x, y=y, levenshtein_distance=levenshtein_distance))
        

CPU times: total: 23.5 s
Wall time: 40.1 s


In [24]:
%%time
# List-comprehension variant of the same pairwise comparison.
out_data = [
    dict(x=x, y=y, levenshtein_distance=calculate_levenshtein_distance(x, y))
    for x in names[:10]
    for y in names
    if x != y
]

CPU times: total: 20.1 s
Wall time: 46.7 s


In [25]:
%%time

import concurrent.futures

out_data = []


def process_names(x):
    """Compare `x` with every entry of the global `names` list that differs from it.

    Returns:
        A list of dicts with keys 'x', 'y' and 'levenshtein_distance', one per
        compared name, in the order of `names`.
    """
    return [
        {'x': x, 'y': other, 'levenshtein_distance': calculate_levenshtein_distance(x, other)}
        for other in names
        if x != other
    ]

# Run one task per query name. Executor.map yields the per-name result lists in
# the order of names[:10], so out_data has the same row order on every run
# (collecting with as_completed would append them in thread-completion order).
# NOTE(review): the recorded wall time of this cell is not better than the
# sequential variants; the work is presumably CPU-bound Python, which threads
# do not parallelise -- confirm before relying on this for speed.
with concurrent.futures.ThreadPoolExecutor() as executor:
    for rows in executor.map(process_names, names[:10]):
        out_data.extend(rows)


CPU times: total: 10.9 s
Wall time: 54.1 s


### 结果展示

In [22]:
import pandas as pd

# Tabulate the pairwise results, smallest distance first.
df = pd.DataFrame(out_data)
df_sorted = df.sort_values('levenshtein_distance')

df_sorted.head(20)

Unnamed: 0,x,y,levenshtein_distance
509,深圳迈瑞生物医疗电子股份有限公司,武汉迈瑞科技有限公司,0.0
1118,深圳迈瑞生物医疗电子股份有限公司,南京迈瑞生物医疗电子有限公司,0.0
2302,深圳迈瑞生物医疗电子股份有限公司,苏州迈瑞科技有限公司,0.0
16761,深圳迈瑞生物医疗电子股份有限公司,山东迈瑞医疗器械有限公司,0.0
119,深圳迈瑞生物医疗电子股份有限公司,湖南迈瑞医疗科技有限公司,0.0
855,深圳迈瑞生物医疗电子股份有限公司,深圳迈瑞科技有限公司,0.0
2150,深圳迈瑞生物医疗电子股份有限公司,北京迈瑞医疗器械有限公司,0.0
18253,深圳迈瑞生物医疗电子股份有限公司,上海赞瑞实业有限公司,1.0
11929,深圳迈瑞生物医疗电子股份有限公司,北京迈淩医疗技术发展有限公司,1.0
6047,深圳迈瑞生物医疗电子股份有限公司,山西洁瑞医疗器械股份有限公司,1.0


## 聚类

In [None]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity

# Company names to cluster
company_names = [
    "深圳迈瑞生物医疗电子股份有限公司",
    "杭州迈瑞医疗电子有限公司",
    "重庆迈瑞城市建设投资有限责任公司"
]

# Text preprocessing
def preprocess_text(text):
    """Normalise a company name before tokenisation (currently lower-casing only)."""
    # Extend as needed, e.g. strip punctuation.
    return text.lower()

# Preprocess every company name
processed_names = [preprocess_text(name) for name in company_names]

# Build the dictionary and the TF-IDF vector space.
# Tokenise per character: these names contain no whitespace, so str.split()
# would return each whole name as a single token and no two names would ever
# share a term.
documents = [[ch for ch in name if not ch.isspace()] for name in processed_names]
dictionary = Dictionary(documents)
corpus = [dictionary.doc2bow(document) for document in documents]
tfidf_model = TfidfModel(corpus)
tfidf_vectors = tfidf_model[corpus]

# gensim yields sparse (term_id, weight) lists; scikit-learn needs a matrix.
# corpus2csc returns terms x documents, hence the transpose.
tfidf_matrix = corpus2csc(tfidf_vectors, num_terms=len(dictionary)).T

# Similarity matrix
similarity_matrix = cosine_similarity(tfidf_matrix)

# A 'precomputed' matrix is interpreted as distances, so convert similarity to
# distance; clamp tiny negatives caused by floating-point rounding.
distance_matrix = 1.0 - similarity_matrix
distance_matrix[distance_matrix < 0] = 0.0

# Clustering
try:
    clustering = AgglomerativeClustering(n_clusters=2, metric='precomputed', linkage='average')
except TypeError:
    # Older scikit-learn releases name this parameter `affinity`.
    clustering = AgglomerativeClustering(n_clusters=2, affinity='precomputed', linkage='average')
clusters = clustering.fit_predict(distance_matrix)

# Print the cluster label of every company name
for i, name in enumerate(company_names):
    cluster_label = clusters[i]
    print(f"企业名称：{name}，聚类标签：{cluster_label}")


## Embeddings

In [5]:
%%time
import numpy as np
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Embed one query string on CPU and show the shape of the resulting vector.
model_name = "GanymedeNil/text2vec-large-chinese"
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': "cpu"},
)
query_result = embeddings.embed_query("阶级跳跃")

np.shape(query_result)

No sentence-transformers model found with name C:\Users\SanforZhou/.cache\torch\sentence_transformers\GanymedeNil_text2vec-large-chinese. Creating a new one with MEAN pooling.


CPU times: total: 2.72 s
Wall time: 8.59 s


(1024,)

In [None]:
import torch
from nlptool import name_embedding
from nlptool import TableBuilder

def decomposed_simi(query, decomposed_ems, top_n=5, max_length=100, language=None, tf_idf=False):
    """Score `query` against `decomposed_ems` and return the `top_n` best matches.

    The query is tokenised, embedded with ``name_embedding``, transformed by
    ``svd_model``, normalised and padded; the result is multiplied against the
    candidate embeddings, reduced with a max over dim 1, weighted per token and
    summed over dim 1.

    Args:
        query: Company name to look up. It is tokenised as-is; the cleaning
            and language-detection steps are disabled in this version.
        decomposed_ems: Tensor of candidate embeddings. Its size along dim 1
            is the padding length given to ``TableBuilder.pad_embedding``.
            NOTE(review): the expected shape/layout is not visible here --
            confirm against the code that builds it.
        top_n: Number of best scores to return.
        max_length: Maximum number of tokens kept; extra tokens are dropped
            and a warning is printed.
        language: Forwarded to ``name_embedding``.
        tf_idf: Use ``calculate_tf_idf_weights`` when true, otherwise
            ``calculate_inverse_term_frequency_weights``.

    Returns:
        A ``(scores, indices)`` pair of lists taken from ``torch.topk``.
    """
    # Tokenise the query and cap the number of tokens.
    tokens, num_tokens = tokenize(query)
    if num_tokens > max_length:
        tokens = tokens[:max_length]
        print("警告：公司名分词数超过了最大长度，已截断!")

    # Tokens -> embeddings -> SVD transform -> tensor -> normalise -> pad.
    query_em = torch.tensor(svd_model.transform(name_embedding(tokens, language=language)))
    query_em = normalize_embedding(query_em)
    query_em = TableBuilder.pad_embedding(query_em, decomposed_ems.size(1))

    # Per-token weights, normalised.
    weight_fn = calculate_tf_idf_weights if tf_idf else calculate_inverse_term_frequency_weights
    token_weights = normalize_weights(weight_fn(tokens))

    # Similarity of the query against the candidates, max over dim 1, then weighted.
    similarity = torch.matmul(query_em, decomposed_ems.transpose(0, 1))
    weighted_best = torch.max(similarity, dim=1).values * token_weights

    # Sum the weighted similarities and keep the top_n largest.
    totals = torch.sum(weighted_best, dim=1)
    best_scores, best_indices = torch.topk(totals, k=top_n)
    return best_scores.tolist(), best_indices.tolist()
