In [1]:
'''RAG的utils'''
import jsonlines
from langchain.indexes import SQLRecordManager, index
from langchain.text_splitter import RecursiveJsonSplitter,CharacterTextSplitter
import sentence_transformers
from typing import List
from langchain_core.documents import Document
from langchain.vectorstores.chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PDFMinerLoader, TextLoader
import os
import re
import json

# 获取文件地址
def acq_filenames(folder_path):
    """List the entries of a folder as ``folder_path + "/" + name`` strings.

    Args:
        folder_path: Directory to list; it is concatenated with ``"/"`` and
            each entry name, so it is expected to be a ``str``.

    Returns:
        A list with one string per name returned by ``os.listdir``, in the
        order ``os.listdir`` produced them. No filtering is applied, so any
        sub-directory names are included as well.
    """
    return [folder_path + "/" + entry for entry in os.listdir(folder_path)]
    
# 读取jsonl文件
def read_jsonl(jsonl_file):
    """Merge every record of a JSON Lines file into one dictionary.

    The file is opened as UTF-8 text and iterated with ``jsonlines.Reader``.
    The key/value pairs of each record are copied into a single dict, so
    when the same key appears in several records the value from the last
    record wins.

    Args:
        jsonl_file: Path of the ``.jsonl`` file to read.

    Returns:
        A dict holding the union of all records' key/value pairs.
    """
    merged = {}
    with open(jsonl_file, "r", encoding="utf-8") as handle:
        reader = jsonlines.Reader(handle)
        for record in reader:
            # .items() keeps the requirement that each record is a mapping.
            merged.update(record.items())
    return merged
    
# 将jsonl转换为document格式
def jsonl2doc(file_path):
    """Convert a JSON Lines file into a list of documents.

    The file is first collapsed into one dict by ``read_jsonl`` and that
    dict is then split with ``RecursiveJsonSplitter(max_chunk_size=2000)``.

    Args:
        file_path: Path of the ``.jsonl`` file to convert.

    Returns:
        The documents produced by ``RecursiveJsonSplitter.create_documents``.
    """
    merged_records = read_jsonl(file_path)
    json_splitter = RecursiveJsonSplitter(max_chunk_size=2000)
    return json_splitter.create_documents(texts=[merged_records])
    
# 将json转换为document格式
def json2doc(file_path):
    """Convert a JSON file into a list of documents.

    Args:
        file_path: Path of the ``.json`` file to convert. The file is read
            as UTF-8 text.

    Returns:
        The documents produced by ``RecursiveJsonSplitter.create_documents``
        with ``max_chunk_size=2000``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Read and parse the file's content. json.loads(file_path) would try to
    # parse the path string itself as JSON and fail for any real path.
    with open(file_path, "r", encoding="utf-8") as json_file:
        json_data = json.load(json_file)
    # NOTE(review): presumably the top-level JSON value is an object (dict),
    # as the splitter is handed it as a single entry of `texts` - confirm.
    splitter = RecursiveJsonSplitter(max_chunk_size=2000)
    docs = splitter.create_documents(texts=[json_data])
    return docs
    
 # 将txt转换为document格式  
def txt2doc(file_path):
    """Convert a plain-text file into a list of chunked documents.

    The file is loaded with ``TextLoader`` and then split on newlines by
    ``CharacterTextSplitter`` using ``chunk_size=1024`` and
    ``chunk_overlap=64``; ``add_start_index=True`` is passed to the splitter.

    Args:
        file_path: Path of the text file to convert.

    Returns:
        The documents returned by ``CharacterTextSplitter.split_documents``.
    """
    # Use the function's own parameter; the previous code referenced an
    # undefined name (`file`) and raised NameError on every call.
    docs = TextLoader(file_path).load()
    splitter = CharacterTextSplitter(
        separator="\n", chunk_size=1024, chunk_overlap=64, add_start_index=True
    )
    return splitter.split_documents(documents=docs)
    
# 将pdf转换为document格式
def pdf2doc(file_path):
    """Convert a PDF file into a list of chunked documents.

    The PDF is loaded with ``PDFMinerLoader`` and the result is split on
    newlines by ``CharacterTextSplitter`` using ``chunk_size=1024`` and
    ``chunk_overlap=64``; ``add_start_index=True`` is passed to the splitter.

    Args:
        file_path: Path of the PDF file to convert.

    Returns:
        The documents returned by ``CharacterTextSplitter.split_documents``.
    """
    loaded_pages = PDFMinerLoader(file_path).load()
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1024,
        chunk_overlap=64,
        add_start_index=True,
    )
    return text_splitter.split_documents(documents=loaded_pages)

# 将文本永久储存在向量库
def permanent_stored(
    chunks,
    *,
    embedding_model="bce-embedding-base_v1",
    device="cuda:1",
    persist_directory="chroma_db_demo",
):
    """Embed documents and store them in a Chroma vector store.

    Args:
        chunks: Documents to embed and store; passed to
            ``Chroma.from_documents`` as ``documents``.
        embedding_model: Model name given to ``HuggingFaceEmbeddings`` and
            ``SentenceTransformer``. Defaults to the previously hard-coded
            ``"bce-embedding-base_v1"``.
        device: Device string used for the embedding model. Defaults to the
            previously hard-coded ``"cuda:1"``.
        persist_directory: Value passed to ``Chroma.from_documents`` as
            ``persist_directory``. Defaults to the previously hard-coded
            ``"chroma_db_demo"``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model, model_kwargs={"device": device}
    )
    # The client is replaced with an explicitly constructed SentenceTransformer.
    # NOTE(review): presumably to force the model onto `device` - confirm
    # whether this second model load is still needed.
    embeddings.client = sentence_transformers.SentenceTransformer(
        embeddings.model_name, device=device
    )

    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=persist_directory,  # where the DB is stored
    )

    return vectorstore

  from tqdm.autonotebook import tqdm, trange
