In [25]:
"""
RAG的基本流程和方法
1、使用langchain的document_loaders来加载文本
2、使用text_splitter来分割文本
3、加载为document类型数据
4、embedding成向量
5、使用Chroma和FAISS来存储向量（非持久性，存在内存中）
6、可以将向量存在SQLite中做永久性储存
"""


In [20]:
'''RAG的utils'''
import jsonlines
from langchain.indexes import SQLRecordManager, index
from langchain.text_splitter import RecursiveJsonSplitter,CharacterTextSplitter
import sentence_transformers
from typing import List
from langchain_core.documents import Document
from langchain.vectorstores.chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PDFMinerLoader
import os

def acq_filenames(folder_path):
    """Return the path of every entry found directly inside ``folder_path``.

    Each path is built as ``folder_path + "/" + name``. Entries are returned in
    whatever order ``os.listdir`` yields them.
    """
    return [folder_path + "/" + entry for entry in os.listdir(folder_path)]

def read_jsonl(jsonl_file):
    """Read a JSON Lines file and merge all of its records into one dict.

    Records are applied in file order, so when the same key appears in more
    than one record the value from the later record wins.
    """
    merged = {}
    with open(jsonl_file, "r", encoding="utf-8") as handle:
        reader = jsonlines.Reader(handle)
        for record in reader:
            merged.update(record.items())
    return merged

def json2doc(file_path):
    """Load a JSON Lines file and split its content into documents.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        The documents produced by ``RecursiveJsonSplitter.create_documents``
        for the merged JSON data, using ``max_chunk_size=500``.
    """
    # Read the file that was actually passed in. The previous version read the
    # notebook-level ``text_path`` variable and ignored this argument.
    json_data = read_jsonl(file_path)

    splitter = RecursiveJsonSplitter(max_chunk_size=500)
    # ``splitter.split_json(json_data=json_data)`` is the alternative when the
    # smaller JSON chunks themselves are needed; here documents are emitted directly.
    return splitter.create_documents(texts=[json_data])

def pdf2doc(file_path):
    """Load ``file_path`` with ``PDFMinerLoader`` and return what it loads."""
    loader = PDFMinerLoader(file_path)
    return loader.load()
    
    

def text2embedding(docs):
    """Chunk the documents, embed them, and index them in a Chroma store.

    Args:
        docs: Documents to split and index.

    Returns:
        The vector store returned by ``Chroma.from_documents``. No persist
        directory is passed, so nothing is requested to be written to disk here.
    """
    # Split on newlines with chunk_size=1000 and chunk_overlap=10.
    splitter = CharacterTextSplitter(
        separator="\n", chunk_size=1000, chunk_overlap=10, add_start_index=True
    )
    pieces = splitter.split_documents(documents=docs)

    # Build the embedding wrapper on the first CUDA device.
    embedder = HuggingFaceEmbeddings(
        model_name="localmodel/bce-embedding-base_v1",
        model_kwargs={'device': "cuda:0"},
    )
    # Swap in an explicitly constructed SentenceTransformer as the client.
    embedder.client = sentence_transformers.SentenceTransformer(
        embedder.model_name, device="cuda:0"
    )

    return Chroma.from_documents(documents=pieces, embedding=embedder)
    
def permanent_stored(chunks):
    """Chunk, embed, and index documents in a Chroma store with a persist directory.

    Args:
        chunks: Documents to index. Despite the name, they are split again
            here before being embedded.

    Returns:
        The vector store returned by ``Chroma.from_documents``, created with
        ``persist_directory='chroma_db_demo'``.
    """
    # Split the documents that were passed in. The previous version split the
    # notebook-level ``docs`` variable and silently ignored this argument.
    chunks = CharacterTextSplitter(
        separator="\n", chunk_size=1000, chunk_overlap=10, add_start_index=True
    ).split_documents(documents=chunks)

    # Build the embedding wrapper on the first CUDA device.
    # NOTE(review): text2embedding loads "localmodel/bce-embedding-base_v1";
    # confirm which model path is the intended one.
    embedding_model = "bce-embedding-base_v1"
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model, model_kwargs={'device': "cuda:0"})
    embeddings.client = sentence_transformers.SentenceTransformer(embeddings.model_name, device="cuda:0")

    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory='chroma_db_demo',  # directory handed to Chroma for storage
    )

    return vectorstore



In [25]:
'''stored embedding'''
from langchain.document_loaders.pdf import PDFMinerLoader
from langchain_community.document_loaders import JSONLoader
import time
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import torch
from tqdm.notebook import tqdm
from time import sleep


# Gather documents from every .jsonl and .pdf file found in the data folder.
text_path = "files"
file_list = acq_filenames(text_path)
docs = []

for file in tqdm(file_list):
    # Choose the loader by file extension; other file types are skipped.
    if file.endswith(".jsonl"):
        doc = json2doc(file)
    elif file.endswith(".pdf"):
        doc = pdf2doc(file)
    else:
        continue
    docs.extend(doc)





  0%|          | 0/2 [00:00<?, ?it/s]

In [26]:
# Build the Chroma vector store (persist directory 'chroma_db_demo') from the loaded documents.
vector_store = permanent_stored(docs)