# data loading

In [1]:
import pandas as pd

from similarity_cluster import HierarchicalCluster
from myllm import TextSimilarityCalculator
from cache_manager import CacheManager

In [2]:
df = pd.read_csv('./data_cache/prod_historical_records1.csv')

In [5]:
class TextSimilarity:

    def __init__(self, txts, cache_location='./data_cache/text_similarity_cache'):

        self.txts = txts
        self.sim = TextSimilarityCalculator()
        self.emb_d = {}
        self.embedding_cache_manager = CacheManager(
                                            target_path=cache_location,
                                            cache_tag="emb",
                                            generate_func=self._get_embedding,
                                            cache_key_type='hashable',
                                            format_str="{base}.npy") 
        
        self.hcluster = HierarchicalCluster(data=txts,
                                            caption_func=lambda x:x,
                                            embedding_func=self.embedding_cache_manager.load,
                                            similarity_func=self.sim.similarity_func,
                                           )
        
        # self._load_all_embedding()

    def _get_embedding(self, txt):
        return self.emb_d[txt]

    def _load_all_embedding(self, batch_size=32):
        for i in range(0, len(self.txts), batch_size):
            
            batch = self.txts[i:i + batch_size]
            batch_embeddings = self.sim.get_embeddings(batch, batch_size=batch_size, show_progress_bar=True)

            for txt, emb in zip(batch, batch_embeddings):
                self.emb_d[txt] = emb
                _ = self.embedding_cache_manager.load(txt)

        

In [6]:
txts = df.question.to_list()[:200]

In [7]:
ts = TextSimilarity(txts)

In [8]:
d = ts.hcluster.cluster()

KeyError: 'ceshi'

In [10]:
ts.embedding_cache_manager._to_cache_path_func('ceshi')

'/Users/chengyanru/repos/ai_album/app/data_cache/.similarity_cache/text_similarity_cache/2458370899305644052.npy'

In [3]:
df

Unnamed: 0,created_at,question,conversation_id,cluster_key,project_id
0,2022/4/28 15:06,ceshi,7,,329
1,2022/4/28 15:07,yuyuy,8,,329
2,2022/4/28 15:08,权限,9,,329
3,2022/4/28 15:08,qewe,10,,329
4,2022/4/28 15:09,权限,11,,329
...,...,...,...,...,...
9995,2022/8/13 11:56,拍抖音白名单接口无法删除和添加,10002,,324
9996,2022/8/13 12:16,直播间没有多人k歌功能,10003,,311
9997,2022/8/13 13:01,修改订单状态,10004,,324
9998,2022/8/13 13:21,直播伴侣画面每次调试好之后依然画面会乱掉 已经万能操作过了,10005,,311


# test

In [1]:
import sys

sys.path.append('..')
from similarities import BertSimilarity

sentences = ['如何更换花呗绑定银行卡',
             '花呗更改绑定银行卡']
corpus = [
    '花呗更改绑定银行卡',
    '我什么时候开通了花呗',
    '俄罗斯警告乌克兰反对欧盟协议',
    '暴风雨掩埋了东北部；新泽西16英寸的降雪',
    '中央情报局局长访问以色列叙利亚会谈',
    '人在巴基斯坦基地的炸弹袭击中丧生',
]

In [2]:
model = BertSimilarity(model_name_or_path="shibing624/text2vec-base-chinese")
print(model)
similarity_score = model.similarity(sentences[0], sentences[1])
print(f"{sentences[0]} vs {sentences[1]}, score: {float(similarity_score):.4f}")

tokenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/409M [00:00<?, ?B/s]

[32m2024-01-08 10:24:05.679[0m | [34m[1mDEBUG   [0m | [36mtext2vec.sentence_model[0m:[36m__init__[0m:[36m80[0m - [34m[1mUse device: cpu[0m


Similarity: BertSimilarity, matching_model: <SentenceModel: shibing624/text2vec-base-chinese, encoder_type: MEAN, max_seq_length: 256, emb_dim: 768>
如何更换花呗绑定银行卡 vs 花呗更改绑定银行卡, score: 0.8551


In [3]:
print('-' * 50 + '\n')
# 2.Compute similarity between two list
similarity_scores = model.similarity(sentences, corpus)
print(similarity_scores.numpy())
for i in range(len(sentences)):
    for j in range(len(corpus)):
        print(f"{sentences[i]} vs {corpus[j]}, score: {similarity_scores.numpy()[i][j]:.4f}")

print('-' * 50 + '\n')
# 3.Semantic Search
model.add_corpus(corpus)
res = model.most_similar(queries=sentences, topn=3)
print(res)
for q_id, id_score_dict in res.items():
    print('query:', sentences[q_id])
    print("search top 3:")
    for corpus_id, s in id_score_dict.items():
        print(f'\t{model.corpus[corpus_id]}: {s:.4f}')

print('-' * 50 + '\n')
print(model.search(sentences[0], topn=3))

--------------------------------------------------



[32m2024-01-08 10:24:12.742[0m | [1mINFO    [0m | [36msimilarities.bert_similarity[0m:[36madd_corpus[0m:[36m108[0m - [1mStart computing corpus embeddings, new docs: 6[0m


[[0.85514647 0.72119576 0.14502521 0.21666762 0.25171375 0.08089051]
 [1.0000001  0.6807437  0.17136604 0.21621692 0.27282718 0.12791362]]
如何更换花呗绑定银行卡 vs 花呗更改绑定银行卡, score: 0.8551
如何更换花呗绑定银行卡 vs 我什么时候开通了花呗, score: 0.7212
如何更换花呗绑定银行卡 vs 俄罗斯警告乌克兰反对欧盟协议, score: 0.1450
如何更换花呗绑定银行卡 vs 暴风雨掩埋了东北部；新泽西16英寸的降雪, score: 0.2167
如何更换花呗绑定银行卡 vs 中央情报局局长访问以色列叙利亚会谈, score: 0.2517
如何更换花呗绑定银行卡 vs 人在巴基斯坦基地的炸弹袭击中丧生, score: 0.0809
花呗更改绑定银行卡 vs 花呗更改绑定银行卡, score: 1.0000
花呗更改绑定银行卡 vs 我什么时候开通了花呗, score: 0.6807
花呗更改绑定银行卡 vs 俄罗斯警告乌克兰反对欧盟协议, score: 0.1714
花呗更改绑定银行卡 vs 暴风雨掩埋了东北部；新泽西16英寸的降雪, score: 0.2162
花呗更改绑定银行卡 vs 中央情报局局长访问以色列叙利亚会谈, score: 0.2728
花呗更改绑定银行卡 vs 人在巴基斯坦基地的炸弹袭击中丧生, score: 0.1279
--------------------------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2024-01-08 10:24:13.046[0m | [1mINFO    [0m | [36msimilarities.bert_similarity[0m:[36madd_corpus[0m:[36m119[0m - [1mAdd 6 docs, total: 6, emb len: 6[0m


{0: {0: 0.8551464676856995, 1: 0.7211957573890686, 4: 0.25171375274658203}, 1: {0: 1.0000001192092896, 1: 0.6807436943054199, 4: 0.2728271782398224}}
query: 如何更换花呗绑定银行卡
search top 3:
	花呗更改绑定银行卡: 0.8551
	我什么时候开通了花呗: 0.7212
	中央情报局局长访问以色列叙利亚会谈: 0.2517
query: 花呗更改绑定银行卡
search top 3:
	花呗更改绑定银行卡: 1.0000
	我什么时候开通了花呗: 0.6807
	中央情报局局长访问以色列叙利亚会谈: 0.2728
--------------------------------------------------

{0: {0: 0.8551464080810547, 1: 0.7211955785751343, 4: 0.2517136335372925}}
