In [1]:
from llama_index.embeddings.text_embeddings_inference import TextEmbeddingsInference
from llama_index.core import Settings

# Build the embedding client for the endpoint at base_url and register it on
# Settings in the same statement.
Settings.embed_model = embed_model = TextEmbeddingsInference(
    model_name="/root/autodl-tmp/models/bge-large-en-v1.5",  # needed to format the inference text
    base_url="http://localhost:8000",
    embed_batch_size=10,  # batch size for embedding
    timeout=60,  # seconds
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import requests
import json
import torch

def fhy_default_sparse_encoder(url: str, timeout: float = 60.0):
    """Build a sparse-encoding callable backed by an HTTP embedding endpoint.

    The returned function POSTs a batch of texts to ``url`` and converts each
    vector in the JSON reply into sparse form by keeping only its non-zero
    entries.

    Args:
        url: Endpoint that accepts a JSON body with ``inputs``, ``normalize``
            and ``truncate`` keys. It is expected to answer with a JSON list
            of equal-length numeric vectors (one per input), since the reply
            is loaded into a single tensor.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled server would block the caller forever.

    Returns:
        ``compute_vectors(texts) -> (indices, values)`` where ``indices[i]``
        lists the positions of the non-zero entries of the i-th vector and
        ``values[i]`` holds the entries at those positions.

    Raises:
        RuntimeError: (from the returned callable) if the endpoint answers
            with a status code other than 200.
    """
    def compute_vectors(texts):
        # Nothing to encode: skip the network round-trip entirely.
        if not texts:
            return [], []

        payload = {"inputs": texts, "normalize": False, "truncate": True}
        # `json=` serialises the payload and sets the JSON Content-Type header.
        response = requests.post(url, json=payload, timeout=timeout)

        if response.status_code != 200:
            raise RuntimeError(
                "sparse encoder request to {} failed with status {}: {}".format(
                    url, response.status_code, response.text
                )
            )

        # The result is converted straight back to Python lists, so the tensor
        # is kept on the CPU; a device transfer would only add copies.
        outputs = torch.tensor(response.json())

        # Keep only the non-zero entries of every vector, plus their positions.
        indices = []
        vecs = []
        for row in outputs:
            nonzero_idx = row.nonzero(as_tuple=True)[0]
            indices.append(nonzero_idx.tolist())
            vecs.append(row[nonzero_idx].tolist())
        return indices, vecs

    return compute_vectors
    

In [3]:
import ast

import pandas as pd
from llama_index.core.schema import TextNode


# Spreadsheet read below: every row supplies a topic label ('Label-Zh') and a
# stringified list of three representative documents ('Representative_Docs').
LABELED_TOPICS_PATH = '/root/autodl-tmp/kong_work_space/qdrant_hybrid/data/label_papers_topic.xlsx'

nodes = []      # TextNodes built from the first two documents of every row
eval_docs = []  # (third document, label) pairs collected separately

df = pd.read_excel(LABELED_TOPICS_PATH)

for docs, label in zip(df['Representative_Docs'], df['Label-Zh']):
    # The cell stores a Python list literal as text. literal_eval parses it
    # without executing arbitrary code from the spreadsheet, unlike eval().
    doc_1, doc_2, doc_3 = ast.literal_eval(docs)
    nodes.append(TextNode(text=doc_1, metadata={"label": label}))
    nodes.append(TextNode(text=doc_2, metadata={"label": label}))
    eval_docs.append((doc_3, label))

nodes[:6]

[TextNode(id_='39026fff-2343-428b-abbb-4b06587598e6', embedding=None, metadata={'label': '催化'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='We present here a joint experimental and computational study on the formation of benzothiazoles. Our investigation reveals a green protocol for accessing benzothiazoles from acyl chlorides using either water alongside a reducing agent as the reaction medium or in combination with stoichiometric amounts of a weak acid, instead of the harsh conditions and catalysts previously reported. Specifically, we show that a protic solvent, particularly water, enables the formation of 2-substituted benzothiazoles from N-acyl 1,2-aminothiophenols already at room temperature, without the need for strong acids or metal catalysts. DFT Molecular Dynamics simulations coupled with advanced enhanced sampling techniques provide a clear understanding of the catalytic role of water. We demonstrate how bulk water 鈥?due to its ext

In [4]:
import qdrant_client
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore


COLLECTION_NAME = "abstract_collection"

client = qdrant_client.QdrantClient(url="localhost")
# Delete the collection first, before the store below is created.
client.delete_collection(collection_name=COLLECTION_NAME)

# Hybrid store: documents and queries each get their own sparse-encoder
# endpoint (ports 8001 and 8002 respectively).
vector_store = QdrantVectorStore(
    collection_name=COLLECTION_NAME,
    client=client,
    enable_hybrid=True,
    batch_size=32,
    sparse_doc_fn=fhy_default_sparse_encoder("http://localhost:8001/embed"),
    sparse_query_fn=fhy_default_sparse_encoder("http://localhost:8002/embed"),
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(nodes, storage_context=storage_context)

# Hybrid retrieval settings: similarity_top_k=5, sparse_top_k=15.
retriever = index.as_retriever(
    vector_store_query_mode="hybrid",
    similarity_top_k=5,
    sparse_top_k=15,
)

In [5]:
# Top-1 accuracy: a document counts as correct when the best-ranked retrieved
# node carries its true label. Misses are printed with every retrieved
# label/score pair.
num_right = 0
for doc_3, label in eval_docs:
    response = retriever.retrieve(doc_3)
    if response[0].metadata['label'] == label:
        num_right += 1
        continue

    # Each "label: score" entry is followed by a single space, exactly as when
    # printed one by one with end=' '.
    ranked = ''.join(
        f"{node.metadata['label']}: {node.score} " for node in response
    )
    print(f'True label is {label}, {ranked}')

print(f'{num_right} / {len(eval_docs)}')

True label is 吸附技术, 电催化: 0.8045306661658713 热电: 0.5 光催化: 0.49653380784608986 电池技术: 0.3630125760097395 层析重建: 0.3455790663187003 
True label is 纳米复合材料, 光催化: 1.0 纳米复合材料: 0.701514495646155 光伏: 0.5508564746701721 光催化: 0.45148178205009104 光伏: 0.3773208965307339 
True label is 复合材料, 纳米复合材料: 1.0 复合材料: 0.40090179366225503 MXene材料: 0.3109226147213543 电催化: 0.2845296272574742 循环经济: 0.2639786541843684 
87 / 90


In [6]:
# Top-3 accuracy over distinct labels: walk the ranked results, keep the first
# score seen for each label, and stop once three different labels are collected.
num_right = 0
for doc_3, label in eval_docs:
    response = retriever.retrieve(doc_3)
    label_dict = {}
    for node in response:
        pre_label = node.metadata['label']
        if pre_label in label_dict:
            # A better-ranked node already claimed this label.
            continue
        label_dict[pre_label] = node.score
        if len(label_dict) == 3:
            break

    if label not in label_dict:
        print(f'True label is {label}, ', end='')
        print(label_dict)
    else:
        num_right += 1

print(f'{num_right} / {len(eval_docs)}')

True label is 吸附技术, {'电催化': 0.8045306661658713, '热电': 0.5, '光催化': 0.49653380784608986}
89 / 90
