In [2]:
import uuid

import os
from langchain.retrievers import MultiVectorRetriever
from langchain_community.document_loaders import CSVLoader, UnstructuredExcelLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.stores import InMemoryByteStore
import time
from langchain_community.vectorstores import FAISS
import faiss

# 定义 xlsx 文件路径
csv_file_path = './data'

def generate_path(folder_path: str) -> list:
    file_paths = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            file_paths.append(file_path)
    return file_paths


def generate_loaders(file_paths: list) -> list:
    loaders = []
    for file_path in file_paths:
        file_type = file_path.split('.')[-1]
        if file_type == 'csv':
            loaders.append(CSVLoader(file_path, encoding='utf-8'))
    return loaders

def exec_load(loaders: list) -> list:
    texts = []
    for loader in loaders:
        texts.extend(loader.load())
    return texts


In [4]:
embedding = SentenceTransformerEmbeddings(model_name=r"D:\Model\LLM\cache\bge-large-zh-v1.5")
vectorstore = FAISS.load_local("./database/faiss", embeddings=embedding, allow_dangerous_deserialization = True)

  embedding = SentenceTransformerEmbeddings(model_name=r"D:\Model\LLM\cache\bge-large-zh-v1.5")
  from .autonotebook import tqdm as notebook_tqdm


In [7]:
vectorstore.similarity_search('健康')

[]

In [8]:
path = generate_path(csv_file_path)
loders = generate_loaders(path)

In [9]:
print(loders[0].load()[0])

page_content='﻿item: 蜜汁鸡腿超级碗
cate_1_name: 色拉轻食
cate_2_name: 健康餐
cate_3_name: ' metadata={'source': './data\\商品.csv', 'row': 0}


In [10]:
from langchain_community.docstore.in_memory import InMemoryDocstore
def summ_retriever(texts):
    # The vectorstore to use to index the child chunks
    embedding = SentenceTransformerEmbeddings(model_name=r"D:\Model\LLM\cache\bge-large-zh-v1.5")
    # vectorstore = Chroma(collection_name= "summaries", embedding_function=embedding, persist_directory="./database/chroma")
    
    index = faiss.IndexFlatL2(len(embedding.embed_query("hello world")))
    vectorstore = FAISS(embedding_function=embedding,docstore=InMemoryDocstore(),index_to_docstore_id={},index=index)

    store = InMemoryByteStore()
    id_key = "doc_id"
    # The retriever (empty to start)
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        byte_store=store,
        id_key=id_key,
    )

    doc_ids = [str(uuid.uuid4()) for _ in texts]

    summary_docs = [
        Document(page_content=get_sum(s), metadata={id_key: doc_ids[i]})
        for i, s in enumerate(texts)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    vectorstore.save_local("./database/faiss")
    with open('.\data\doc_ids.txt', 'w') as file:
        for doc_id in doc_ids:
            file.write(doc_id + '\n')

    retriever.docstore.mset(list(zip(doc_ids, texts)))

    return retriever

def get_sum(doc):
    # 提取 department 和 title
    cate_1_name = next(line for line in doc.page_content.splitlines() if line.startswith("cate_1_name:"))
    cate_2_name = next(line for line in doc.page_content.splitlines() if line.startswith("cate_2_name:"))
    cate_3_name = next(line for line in doc.page_content.splitlines() if line.startswith("cate_3_name:"))

    # 获取具体内容
    cate_1_name = cate_1_name.split(": ")[1].strip()
    cate_2_name = cate_2_name.split(": ")[1].strip()
    cate_3_name = cate_3_name.split(": ")[1].strip()

    # 组合成摘要
    if cate_3_name:
        summary = f"{cate_1_name},{cate_2_name},{cate_3_name}"
    else:
        summary = f"{cate_1_name},{cate_2_name}"
    return summary

In [11]:
get_sum(loders[0].load()[123])

'主食,其他主食'

In [12]:
text = exec_load(loders)
re = summ_retriever(text)

In [60]:
re.invoke("色拉轻食,健康餐")

[Document(metadata={'source': './data\\商品.csv', 'row': 0}, page_content='\ufeffitem: 蜜汁鸡腿超级碗\ncate_1_name: 色拉轻食\ncate_2_name: 健康餐\ncate_3_name: '),
 Document(metadata={'source': './data\\商品.csv', 'row': 1}, page_content='\ufeffitem: （招牌）香料烤鸡胸健康餐\ncate_1_name: 色拉轻食\ncate_2_name: 健康餐\ncate_3_name: '),
 Document(metadata={'source': './data\\商品.csv', 'row': 2}, page_content='\ufeffitem: 低卡鸡胸肉减脂沙拉\ncate_1_name: 色拉轻食\ncate_2_name: 健康餐\ncate_3_name: '),
 Document(metadata={'source': './data\\商品.csv', 'row': 3}, page_content='\ufeffitem: 果木烟熏鸡胸超级碗\ncate_1_name: 色拉轻食\ncate_2_name: 健康餐\ncate_3_name: ')]

In [1]:
re.vectorstore.similarity_search("色拉轻食,健康餐") 

NameError: name 're' is not defined