In [1]:
# Shell escape: report the GPU model, driver/CUDA version and current memory usage.
!nvidia-smi

Wed Sep 27 09:10:26 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A40                     Off | 00000000:AF:00.0 Off |                    0 |
|  0%   30C    P0              69W / 300W |  15303MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document

In [2]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Local checkpoint directory of the text2vec-large-chinese model.
text2vec_model_dir = "/root/.cache/torch/sentence_transformers/GanymedeNil_text2vec-large-chinese"

# Wrap the checkpoint in LangChain's Instructor embedding class and place it on the GPU.
# NOTE(review): the wrapper is the Instructor one, while the checkpoint name suggests a
# plain text2vec model - confirm this pairing is intended.
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name=text2vec_model_dir,
    # local_files_only=True,
    model_kwargs={"device": "cuda"},
)

  from tqdm.autonotebook import trange
No sentence-transformers model found with name /root/.cache/torch/sentence_transformers/GanymedeNil_text2vec-large-chinese. Creating a new one with MEAN pooling.
2023-09-27 11:42:48.573891: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Directory in which the Chroma collection is persisted.
persist_directory = 'db_cn'

# Distance metric requested for the HNSW index ("l2" is the alternative the author noted).
# NOTE(review): presumably this metadata only applies when the collection is first created;
# a collection that already exists on disk may keep its original metric - confirm.
hnsw_settings = {"hnsw:space": "cosine"}

# Open the persisted store, embedding queries and documents with `instructor_embeddings`.
vectordb = Chroma(
    embedding_function=instructor_embeddings,
    persist_directory=persist_directory,
    collection_metadata=hnsw_settings,
)

# Reference: source of Chroma.similarity_search_with_score
# https://api.python.langchain.com/en/latest/_modules/langchain/vectorstores/chroma.html#Chroma.similarity_search_with_score

In [4]:
# Sample greeting phrases (a mix of Simplified and Traditional Chinese) together with
# the category label that is later attached to each document as metadata.
greetings = dict(
    category="greetings",
    greetings=["你好吗?", "一切都好嗎?", "很高興認識你", "早上好"],
)

In [5]:
# Turn every greeting phrase into a Document tagged with the shared category.
greeting_docs = [
    Document(page_content=phrase, metadata={"category": greetings["category"]})
    for phrase in greetings["greetings"]
]

In [6]:
# Display the first generated Document as a sanity check.
greeting_docs[0]

Document(page_content='你好吗?', metadata={'category': 'greetings'})

In [7]:
# Number of entries in the collection before the greetings are added.
vectordb._collection.count()

31

In [9]:
# Build one deterministic id per greeting document: "<category>_<position>".
id_prefix = greetings["category"]
new_ids = [f"{id_prefix}_{position}" for position, _ in enumerate(greeting_docs)]

In [10]:
# Display the generated ids.
new_ids

['greetings_0', 'greetings_1', 'greetings_2', 'greetings_3']

In [11]:
# Insert the greeting documents under the explicit ids built for them;
# the call's return value is kept in `return_ids`.
return_ids = vectordb.add_documents(greeting_docs, ids=new_ids)

In [5]:
# Number of entries in the collection after the insert.
# NOTE(review): the recorded output equals the count recorded before the insert and
# carries an out-of-sequence execution number; presumably it comes from another kernel
# session or the ids already existed - re-run on a fresh kernel to confirm.
vectordb._collection.count()

31

In [13]:
# Persist the store (opened with persist_directory='db_cn').
vectordb.persist()

In [14]:
# Fetch two of the greeting entries by id to confirm they are stored.
# Goes through the public Chroma.get wrapper instead of the private `_collection`
# attribute, consistent with the metadata lookup done elsewhere in this notebook.
vectordb.get(ids=["greetings_0", "greetings_1"])

{'ids': ['greetings_0', 'greetings_1'],
 'embeddings': None,
 'documents': ['你好吗?', '一切都好嗎?'],
 'metadatas': [{'category': 'greetings'}, {'category': 'greetings'}]}

In [16]:
# Fetch every entry whose metadata carries the greetings category.
category_filter = {"category": greetings["category"]}
vectordb.get(where=category_filter)

{'ids': ['greetings_0', 'greetings_1', 'greetings_2', 'greetings_3'],
 'embeddings': None,
 'documents': ['你好吗?', '一切都好嗎?', '很高興認識你', '早上好'],
 'metadatas': [{'category': 'greetings'},
  {'category': 'greetings'},
  {'category': 'greetings'},
  {'category': 'greetings'}]}

### Cosine distance: lower is better (Chroma returns 1 − cosine similarity, so the range is [0, 2])

### 相似問候語

In [5]:
# Similar greeting (the author tagged this run "l2")
similar_greeting = "你過得如何呀?"
vectordb.similarity_search_with_score(similar_greeting, k=3)

[(Document(page_content='你好吗?', metadata={'category': 'greetings'}),
  330.80352783203125),
 (Document(page_content='很高興認識你', metadata={'category': 'greetings'}),
  427.43768310546875),
 (Document(page_content='一切都好嗎?', metadata={'category': 'greetings'}),
  434.787841796875)]

In [6]:
# Similar greeting
similar_greeting = "你過得如何呀?"
vectordb.similarity_search_with_score(similar_greeting, k=3)

[(Document(page_content='你好吗?', metadata={'category': 'greetings'}),
  -17.13308334350586),
 (Document(page_content='很高興認識你', metadata={'category': 'greetings'}),
  -15.54990005493164),
 (Document(page_content='一切都好嗎?', metadata={'category': 'greetings'}),
  -14.909040451049805)]

In [14]:
# Embed the query phrase by calling the wrapped model directly.
# NOTE(review): this bypasses the LangChain wrapper, so any query instruction the wrapper
# would prepend is presumably not applied - confirm whether scores obtained from this
# vector are meant to be comparable with the text-based search.
query_text = "你過得如何呀?"
embed = instructor_embeddings.client.encode(query_text)
print(embed)

[ 1.0175216  -0.31168896 -1.0131493  ...  1.0608912   0.0557702
 -0.25220978]


In [15]:
# Similar greeting, searched with the precomputed vector instead of the raw text
query_vector = embed.tolist()
vectordb.similarity_search_by_vector_with_relevance_scores(query_vector, k=3)

[(Document(page_content='你好吗?', metadata={'category': 'greetings'}),
  -9.387154579162598),
 (Document(page_content='很高興認識你', metadata={'category': 'greetings'}),
  -8.423574447631836),
 (Document(page_content='一切都好嗎?', metadata={'category': 'greetings'}),
  -7.270807266235352)]

### 非數據庫問候語

In [7]:
# Greeting that is not in the database
unseen_greeting = "晚安!"
vectordb.similarity_search_with_score(unseen_greeting, k=3)

[(Document(page_content='早上好', metadata={'category': 'greetings'}),
  -15.772579193115234),
 (Document(page_content='很高興認識你', metadata={'category': 'greetings'}),
  -14.076789855957031),
 (Document(page_content='你好吗?', metadata={'category': 'greetings'}),
  -14.006009101867676)]

### 非問候語

In [8]:
# Not a greeting: rank every stored entry against a non-greeting question
total_docs = vectordb._collection.count()
vectordb.similarity_search_with_score("自动核准后还需要进行单号提交吗?", k=total_docs)

[(Document(page_content='2. 单号分配  \n厂家保修管理 后台->保修作业 ->单号分配 , 勾选送修项目 , 点击【分配 Ware no. 】分\n配单号,同客户同申请人的送修清单 , 可一次分配 Ware no.;  \n \n勾选送修项目 , 点击【指派 Ware no.】, 系统根据产品型号、退运原因、维修地址、服\n务类型等对产品分类并分配  Ware no.;  \n备注 : 若客户需使用自主 RMA no.，亦可在【自定单号】添加客户指定 RMA no.', metadata={'source': 'chinese_pdf/保修管理-廠家sop-repair(去圖標).pdf', 'page': 21}),
  -14.272348403930664),
 (Document(page_content='选定不良代码 , 点击进入客户绑定页面 ; 勾选客户 , 点击【保存】完成不良判定代码\n与客户的绑定 ; \n \n \n保修设定  \n1. 定义产品 (整机料号信息 ) \n单笔或批量将产品信息导入系统 ,包含机种、厂家料号、客户料号、供应商料号、替代\n料号等信息  \n1.1 机种管理  \n若产品涉及机种 , 需先将机种信息新增至系统 , 以便后续选择 . \na. 单笔新增机种  \n厂家保修管理 后台 ->保修设定 ->定义产品 , 点击【机种管理】按钮 , 点击【新增】填写\n机种名称 , 点击【保存】完成 ;', metadata={'source': 'chinese_pdf/保修管理-廠家sop-repair(去圖標).pdf', 'page': 10}),
  -13.937506675720215),
 (Document(page_content='1.4 添加客户料号  \n点击【批量绑定附件】下拉框 , 选择【下载模板】 , 填写产品料号及所绑附件的名称和数\n量, 点击【批量上传】添加所编辑档案 , 可完成附件的新增和绑定 . \n \n \n1.5 添加客户料号  \n若厂家PN与客户料号不同 , 为方便客户和厂家识别产品 , 可添加客户料号 , 系统自动\n进行转换 ;  \na. 单笔添加客户料号  \n厂家保修管理 后台->保修设定 ->定义产品 , 点击【料号对照 】按钮; 选定

### 完全相同問候語

In [9]:
# Exact-match greeting: rank every stored entry against a phrase that is in the database
total_docs = vectordb._collection.count()
vectordb.similarity_search_with_score("早上好", k=total_docs)

[(Document(page_content='早上好', metadata={'category': 'greetings'}),
  -20.439983367919922),
 (Document(page_content='一切都好嗎?', metadata={'category': 'greetings'}),
  -13.827183723449707),
 (Document(page_content='你好吗?', metadata={'category': 'greetings'}),
  -13.778666496276855),
 (Document(page_content='很高興認識你', metadata={'category': 'greetings'}),
  -13.566693305969238),
 (Document(page_content='点击【添加代码】 , 选择原因类别 , 编辑原因代码及保修原因 , 选填解决办法 , 勾选\n适用客户 , 点击【保存】完成保修代码设定  \n \n5. 定义附件  \n单笔或批量将 附件信息导入系统 . \n厂家保修管理后台 ->保修设定 ->定义附件 , 点击【新增附件】输入附件名称及备注 , 完\n成单笔新增附件 ;', metadata={'source': 'chinese_pdf/保修管理-廠家sop-repair(去圖標).pdf', 'page': 19}),
  -11.46774673461914),
 (Document(page_content='2. 单号分配  \n厂家保修管理 后台->保修作业 ->单号分配 , 勾选送修项目 , 点击【分配 Ware no. 】分\n配单号,同客户同申请人的送修清单 , 可一次分配 Ware no.;  \n \n勾选送修项目 , 点击【指派 Ware no.】, 系统根据产品型号、退运原因、维修地址、服\n务类型等对产品分类并分配  Ware no.;  \n备注 : 若客户需使用自主 RMA no.，亦可在【自定单号】添加客户指定 RMA no.', metadata={'source': 'chinese_pdf/保修管理-廠家sop-repair(去圖標).pdf', 'page': 21}),
  -11.

In [22]:
# Exact-match greeting (the author tagged this run "l2"), restricted to the greetings category
greetings_only = {'category': 'greetings'}
total_docs = vectordb._collection.count()
vectordb.similarity_search_with_score("早上好", k=total_docs, filter=greetings_only)

[(Document(page_content='早上好', metadata={'category': 'greetings'}),
  232.26988220214844),
 (Document(page_content='一切都好嗎?', metadata={'category': 'greetings'}),
  509.4013671875),
 (Document(page_content='你好吗?', metadata={'category': 'greetings'}),
  518.322021484375),
 (Document(page_content='很高興認識你', metadata={'category': 'greetings'}),
  547.3131103515625)]