In [1]:
import os

from dotenv import load_dotenv
#加载 .env文件中的环境变量。windows下，conda会将新的环境安装到 %USERFROFILE%/.cond/envs下
load_dotenv()

True

In [2]:
#定义llm函数，链接本地部署的ChatGLM3, 支持各种推理和判断
import openai
from openai import OpenAI
from langchain.schema import HumanMessage
from langchain_openai import ChatOpenAI
import json
#用langchain提供的openai接口连接chatglm3API
#llm= ChatOpenAI(model_name="chatglm3-6b", temperature=0.0)
#如果需要支持流模式
# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
# llm= ChatOpenAI(model_name="chatglm3-6b",streaming=True, callbacks=[StreamingStdOutCallbackHandler()],)

In [3]:
#用OPENAI的API连接chatglm3API，这个client将是embedding的基础
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"], base_url=os.environ["OPENAI_API_BASE"])
#定义本地部署chatGLM3的生成函数
def get_completion(prompt, model="chatglm3-6b"):
	messages=[{"role":"user", "content":prompt}]
	response=client.chat.completions.create(
		model=model,
		messages=messages,
		temperature=0
	)
	return response.choices[0].message.content

In [4]:
#用openai的client定义Embedding函数，使用模型bge-large-zh-1.5
def openai_embedding(text):
    response = client.embeddings.create(
    model="bge-large-zh-1.5",
    input=[text],
    )
    return response.data[0].embedding

In [6]:
#测试openai client的embedding接口
result=openai_embedding('这是一个测试')
print(result)

[0.008789479732513428, -0.025418104603886604, -0.01684509590268135, 0.02325964719057083, 0.02803938277065754, -0.004015109036117792, 0.005794750526547432, -0.028220778331160545, -0.01670771650969982, 0.023248163983225822, 0.011479878798127174, -0.008791911415755749, -0.010179485194385052, -0.01919841766357422, 0.007210812531411648, 0.02419707365334034, 0.0393543541431427, -0.00832420028746128, 0.014231963083148003, -0.0488472543656826, 0.048904627561569214, -0.01777009293437004, -0.03577018156647682, -0.051787350326776505, 0.0504925362765789, -0.04979908466339111, -0.044894829392433167, -5.909622632316314e-05, 0.0220244862139225, -0.005367631558328867, -0.024169225245714188, 0.031042277812957764, 0.016757095232605934, 0.01182562205940485, -0.016673225909471512, 0.017949871718883514, -0.04391251131892204, 0.006360325496643782, 0.008264102973043919, 0.0033351003658026457, -0.002400108613073826, -0.0080823739990592, 0.008888650685548782, 0.0049156490713357925, 0.042663007974624634, 0.0590

In [7]:
#测试，将openai的embedding到封装到langchain提供的embedding类里面，链接本地模型，分别支持字符串和列表的embedding
from typing import List, Optional
from chromadb.types import Vector
from langchain_core.embeddings import Embeddings
class langchain_EmbeddingFunction(Embeddings):
    def embed_query(self, text: str) -> List[Vector]:
        openai.base_url = 'http://192.168.1.8:8000/v1/'
        self.model="bge-large-zh-1.5"
        db_embeddings = openai.embeddings.create(
            input=[text], model=self.model
            ).data[0].embedding
        return db_embeddings
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        openai.base_url = 'http://192.168.1.8:8000/v1/'
        self.model="bge-large-zh-1.5"
        db_embeddings = [openai.embeddings.create(
            input=[text], model=self.model
            ).data[0].embedding
            for text in texts
        ]
        return db_embeddings

In [8]:
langchain_ef=langchain_EmbeddingFunction()
embedding_result=langchain_ef.embed_query('这是一个测试')
print(embedding_result)

[0.008789479732513428, -0.025418104603886604, -0.01684509590268135, 0.02325964719057083, 0.02803938277065754, -0.004015109036117792, 0.005794750526547432, -0.028220778331160545, -0.01670771650969982, 0.023248163983225822, 0.011479878798127174, -0.008791911415755749, -0.010179485194385052, -0.01919841766357422, 0.007210812531411648, 0.02419707365334034, 0.0393543541431427, -0.00832420028746128, 0.014231963083148003, -0.0488472543656826, 0.048904627561569214, -0.01777009293437004, -0.03577018156647682, -0.051787350326776505, 0.0504925362765789, -0.04979908466339111, -0.044894829392433167, -5.909622632316314e-05, 0.0220244862139225, -0.005367631558328867, -0.024169225245714188, 0.031042277812957764, 0.016757095232605934, 0.01182562205940485, -0.016673225909471512, 0.017949871718883514, -0.04391251131892204, 0.006360325496643782, 0.008264102973043919, 0.0033351003658026457, -0.002400108613073826, -0.0080823739990592, 0.008888650685548782, 0.0049156490713357925, 0.042663007974624634, 0.0590

In [9]:
import chromadb
from chromadb.utils import embedding_functions
# 指定chromadb提供的embedding function使用openai接口能力
bge_embeddings = embedding_functions.OpenAIEmbeddingFunction(
    api_base=os.environ["OPENAI_API_BASE"],
    api_key=os.environ["OPENAI_API_KEY"],
    model_name="bge-large-zh-1.5"
)

In [1]:
#用python内置的with open语句 读写文件是最常见的IO操作
with open("c:\\Users\\lenovo\\生活百科.txt") as file:
    test_text = file.read() 

print(type(test_text),test_text)

<class 'str'> 
------------

第1节

    :bookben

    附：本作品来自互联网,本人不做任何负责内容版权归作者所有

    “快乐生活一点通”部分内容摘录

    微波炉烹制菊花粥	6

    好喝又滋补的糯米百合粥的做法	7

    啤酒果冻的做法	7

    蓑衣黄瓜的做法	7

    凉粉的制作方法	7

    豉椒鲳鱼	8

    酸辣瓜条	8

    730串烧鸡柳制作法	9

    微波炉嫩蛋羹	9

    微波炉做丝糕	10

    蔬菜蘸酱蚝油甜酱的做法	10

    微波炉**翅	10

    朝鲜冷面的做法	11

    朝鲜泡菜	11

    朝鲜冷面专用的面汁的做法	11

    89豆腐烧鱼	11

    89豆腐饼	12

    810打茄酱	13

    810做凉菜	13

    老北京风味凉菜独咸茄的制作方法	13

    快速制作豆腐脑的方法	14

    蜜汁杏仁豆腐制作法	14

    山药	15

    巧做五味酱	15

    脆皮豆腐	15

    好吃又营养的元宝肉的做法	16

    泡菜炒肥牛的做法	16

    蜜豆	16

    小炸蛋	17

    用微波炉做美容保健粥	17

    什锦泡菜	17

    玉兔什锦菜	18

    百果松糕	18

    椒盐土豆饼	19

    小糖饼	19

    咖喱土豆鸡块	19

    茶水豆腐	20

    珍珠丸子的做法	20

    自制天津小吃“面茶”	20

    四川凉面的做法	21

    油炸冰淇淋	21

    水煮鱼的制作方法	22

    炸酱面	23

    下酒菜麻辣鸡丝拌面皮制作法	23

    美味的五香豆腐肉卷	24

    糊塌子	24

    干烧鱼	25

    巧做农家饼	26

    泡椒凤爪的做法	26

    自制河南冻肉	27

    团圆饼的做法	27

    梅干菜扣肉的制作方法	28

    萝卜丝饼	28

    巴基斯坦小羊腿	29

    巴基斯坦风味香煎鱼块	29

    美味排骨	30

    美味的老人乐	30

    火腿香菇鱼	31


In [11]:
#langchain提供的多格式loaders
from langchain.document_loaders.text import TextLoader
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_community.document_loaders.web_base import WebBaseLoader
#加载两部书（生活百科txt 和 明朝简史pdf） 以及知乎的网页
# WebBaseLoader requires installing BeautifulSoup: pip install beautifulsoup4
#loaders = [
#    TextLoader("c:\\Users\\lenovo\\生活百科.txt",encoding='utf8'),
#    PyPDFLoader("c:\\Users\\lenovo\\明朝简史.pdf"),
#    WebBaseLoader("https://www.zhihu.com/question/555165534/answer/3356950812")
#]
#docs = []
#for loader in loaders:
#    docs.extend(loader.load())

loader= TextLoader("c:\\Users\\lenovo\\生活百科.txt",encoding='utf8')
text= loader.load()

In [12]:
print(type(text))
print(text)
chn_text=text[0].page_content
print(chn_text)

<class 'list'>
[Document(page_content='\n------------\n\n第1节\n\n    :bookben\n\n    附：本作品来自互联网,本人不做任何负责内容版权归作者所有\n\n    “快乐生活一点通”部分内容摘录\n\n    微波炉烹制菊花粥\t6\n\n    好喝又滋补的糯米百合粥的做法\t7\n\n    啤酒果冻的做法\t7\n\n    蓑衣黄瓜的做法\t7\n\n    凉粉的制作方法\t7\n\n    豉椒鲳鱼\t8\n\n    酸辣瓜条\t8\n\n    730串烧鸡柳制作法\t9\n\n    微波炉嫩蛋羹\t9\n\n    微波炉做丝糕\t10\n\n    蔬菜蘸酱蚝油甜酱的做法\t10\n\n    微波炉**翅\t10\n\n    朝鲜冷面的做法\t11\n\n    朝鲜泡菜\t11\n\n    朝鲜冷面专用的面汁的做法\t11\n\n    89豆腐烧鱼\t11\n\n    89豆腐饼\t12\n\n    810打茄酱\t13\n\n    810做凉菜\t13\n\n    老北京风味凉菜独咸茄的制作方法\t13\n\n    快速制作豆腐脑的方法\t14\n\n    蜜汁杏仁豆腐制作法\t14\n\n    山药\t15\n\n    巧做五味酱\t15\n\n    脆皮豆腐\t15\n\n    好吃又营养的元宝肉的做法\t16\n\n    泡菜炒肥牛的做法\t16\n\n    蜜豆\t16\n\n    小炸蛋\t17\n\n    用微波炉做美容保健粥\t17\n\n    什锦泡菜\t17\n\n    玉兔什锦菜\t18\n\n    百果松糕\t18\n\n    椒盐土豆饼\t19\n\n    小糖饼\t19\n\n    咖喱土豆鸡块\t19\n\n    茶水豆腐\t20\n\n    珍珠丸子的做法\t20\n\n    自制天津小吃“面茶”\t20\n\n    四川凉面的做法\t21\n\n    油炸冰淇淋\t21\n\n    水煮鱼的制作方法\t22\n\n    炸酱面\t23\n\n    下酒菜麻辣鸡丝拌面皮制作法\t23\n\n    美味的五香豆腐肉卷\t24\n\n    糊塌子\t24\n\n    干烧

In [9]:
#测试，PyPDF提供的PDF文件加载器，对metadata有更好的编辑能力
from PyPDF2 import PdfReader

# 打开 PDF 文件
with open('c:\\Users\\lenovo\\PROLONGADA.pdf', 'rb') as file:
    # 创建 PdfReader 对象
    pdf_reader = PdfReader(file)
    #读取meta data
    meta = pdf_reader.metadata
    print('meta数据:',meta)
    # 获取页面数
    num_pages = len(pdf_reader.pages)
    print(f'Total number of pages: {num_pages}')
 
    # 逐页提取文本
    eng_text=''
    for page_number in range(num_pages):
        page = pdf_reader.pages[page_number]
        eng_text += page.extract_text()
        #print(f'Page {page_number + 1} Text:\n{text}')
    

meta数据: {'/CreationDate': "D:20160502103552-03'00'", '/Creator': 'Adobe InDesign CC (Windows)', '/ModDate': "D:20160502103554-03'00'", '/Producer': 'Adobe PDF Library 10.0.1', '/Trapped': '/False', '/rgid': 'PB:303568104_AS:366525790867456@1464397966883'}
Total number of pages: 6


In [7]:
#打印文字部分
print(eng_text)

See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/303568104
Prolonged sitting and physical discomfort in u niversity students
Article    in  Acta Fisiátric a · Dec ember 2015
DOI: 10.5935/0104-7795.20150034
CITATIONS
21READS
8,875
8 author s, including:
Fátima Ap arecida Car omano
Univ ersity of São P aulo
173 PUBLICA TIONS    1,169  CITATIONS    
SEE PROFILE
Francis F avero
Univ ersidade F eder al de São P aulo
118 PUBLICA TIONS    726 CITATIONS    
SEE PROFILE
Jecilene R Cost a
Univ ersidade F eder al de São P aulo
29 PUBLICA TIONS    202 CITATIONS    
SEE PROFILE
Milena Kaw ai
Univ ersity of São P aulo
2 PUBLICA TIONS    23 CITATIONS    
SEE PROFILE
All c ontent f ollo wing this p age was uplo aded b y Fátima Ap arecida Car omano  on 28 May 2016.
The user has r equest ed enhanc ement of the do wnlo aded file.ORIGINAL ARTICLE176Prolonged sitting and physical discomfort in 
university students
1 PhD, Professor, Depart

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator = '\n\n',
    chunk_size=450,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)
chunks = text_splitter.split_text(chn_text)

In [11]:
print(type(chn_text),len(chn_text))
print(len(chunks),"\n\n")
print('*0*', chunks[0],"\n\n")
print('*1*', chunks[11],"\n\n")
print('*2*', chunks[12],"\n\n")
print('*3*', chunks[13],"\n\n")
print('*4*', chunks[14],"\n\n")
print('*5*', chunks[15],"\n\n")
print('*6*', chunks[16],"\n\n")
print('*7*', chunks[17],"\n\n")
print('*8*', chunks[18],"\n\n")
print('*9*', chunks[19],"\n\n")

<class 'str'> 86778
237 


*0* ------------

第1节

    :bookben

    附：本作品来自互联网,本人不做任何负责内容版权归作者所有

    “快乐生活一点通”部分内容摘录

    微波炉烹制菊花粥	6

    好喝又滋补的糯米百合粥的做法	7

    啤酒果冻的做法	7

    蓑衣黄瓜的做法	7

    凉粉的制作方法	7

    豉椒鲳鱼	8

    酸辣瓜条	8

    730串烧鸡柳制作法	9

    微波炉嫩蛋羹	9

    微波炉做丝糕	10

    蔬菜蘸酱蚝油甜酱的做法	10

    微波炉**翅	10

    朝鲜冷面的做法	11

    朝鲜泡菜	11

    朝鲜冷面专用的面汁的做法	11

    89豆腐烧鱼	11

    89豆腐饼	12

    810打茄酱	13

    810做凉菜	13

    老北京风味凉菜独咸茄的制作方法	13

    快速制作豆腐脑的方法	14

    蜜汁杏仁豆腐制作法	14 


*1* 步骤：

    1、糯米用水洗净，放大碗里，接一碗水，加盖放进微波炉，高火8分钟；

    2、菊花用擀面杖碾碎、去蒂；枸杞用温水浸泡10分钟；

    3、8分钟过去，取出半成品粥，加入适量枸杞，再入微波炉，中火5分钟；

    4、5分钟后加入碾碎的菊花，再中火2分钟

    5、取出放入冰糖

    好喝又滋补的糯米百合粥的做法

    原料：糯米、百合、莲子。

    作法：首先，将所有原料一一洗净，然后上锅点火，将水烧到半开时，倒入所有原料。糯米、百合、莲子的比例大概为4：1：1。将水烧到半开在将原料放入的原因是为了避免粘锅，而且可以利用这段时间泡一下米，当锅被烧开之后，将火调至小火，再慢慢熬制。而当锅再次被烧开的时候，这样好喝又具有滋补供销的糯米百合粥就做好了

    啤酒果冻的做法

    原料：鱼胶粉农贸市场就能买到,苹果。 


*2* 啤酒果冻的做法

    原料：鱼胶粉农贸市场就能买到,苹果。

    作法：将鱼胶粉加入苹果汁中搅拌，鱼胶粉与苹果汁的比例为1：5，搅拌均匀后，将混有鱼胶粉的果汁放入微波炉加热30秒。30秒过后，将果汁取出，然后将其放在冷水中冷却至常温，再放

In [15]:
meta_datas = [{"document": 1}, {"document": 2}]
documents = text_splitter.create_documents(
    [chn_text, eng_text], metadatas=meta_datas
)
print(len(documents))
print(documents[0])
print(len(documents[0].page_content))
print(documents[237])
print(len(documents[237].page_content))

238
page_content='------------\n\n第1节\n\n    :bookben\n\n    附：本作品来自互联网,本人不做任何负责内容版权归作者所有\n\n    “快乐生活一点通”部分内容摘录\n\n    微波炉烹制菊花粥\t6\n\n    好喝又滋补的糯米百合粥的做法\t7\n\n    啤酒果冻的做法\t7\n\n    蓑衣黄瓜的做法\t7\n\n    凉粉的制作方法\t7\n\n    豉椒鲳鱼\t8\n\n    酸辣瓜条\t8\n\n    730串烧鸡柳制作法\t9\n\n    微波炉嫩蛋羹\t9\n\n    微波炉做丝糕\t10\n\n    蔬菜蘸酱蚝油甜酱的做法\t10\n\n    微波炉**翅\t10\n\n    朝鲜冷面的做法\t11\n\n    朝鲜泡菜\t11\n\n    朝鲜冷面专用的面汁的做法\t11\n\n    89豆腐烧鱼\t11\n\n    89豆腐饼\t12\n\n    810打茄酱\t13\n\n    810做凉菜\t13\n\n    老北京风味凉菜独咸茄的制作方法\t13\n\n    快速制作豆腐脑的方法\t14\n\n    蜜汁杏仁豆腐制作法\t14' metadata={'document': 1}
445
page_content='See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/303568104\nProlonged sitting and physical discomfort in u niversity students\nArticle \xa0\xa0 in\xa0\xa0Acta Fisiátric a · Dec ember 2015\nDOI: 10.5935/0104-7795.20150034\nCITATIONS\n21READS\n8,875\n8 author s, including:\nFátima Ap arecida Car omano\nUniv ersity of São P aulo\n173 PUBLICA TIONS \x

In [12]:
#测试，加载langchain的文本递归分割器
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

#RecursiveCharacterTextSplitter默认使用[“\n\n” ,"\n" ," ",""] 这四个特殊符号作为分割文本的标记, 按不同的字符递归地分割文档，同时要兼顾被分割文本的长度和重叠字符
#递归分块使用一组分隔符以分层和迭代的方式将输入文本分成更小的块。如果分割文本开始的时候没有产生所需大小或结构的块，那么这个方法会使用不同的分隔符或标准对生成的块递归调用，直到获得所需的块大小或结构。
#这意味着虽然这些块的大小并不完全相同，但它们仍然会逼近差不多的大小。
 
#参数说明如下：
#chunk_size：被切割的字符串的最大长度
#chunk_overlap：如果仅仅使用chunk_size来切割时，前后两段字符串重叠的字符数量。
#length_function:如何计算块的长度。默认情况下，只计算字符数，但通常在此处传递令牌计数器

def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=450,
        chunk_overlap=50,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks

In [13]:
chunks=get_text_chunks(eng_text)

In [20]:
#检验文本块
print(len(chunks),"\n\n")
for chunk in chunks:
    print(chunk,"\n_______",len(chunk),"________\n")

88 


See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/303568104
Prolonged sitting and physical discomfort in u niversity students
Article    in  Acta Fisiátric a · Dec ember 2015
DOI: 10.5935/0104-7795.20150034
CITATIONS
21READS
8,875
8 author s, including:
Fátima Ap arecida Car omano
Univ ersity of São P aulo
173 PUBLICA TIONS    1,169  CITATIONS    
SEE PROFILE
Francis F avero 
_______ 445 ________

SEE PROFILE
Francis F avero
Univ ersidade F eder al de São P aulo
118 PUBLICA TIONS    726 CITATIONS    
SEE PROFILE
Jecilene R Cost a
Univ ersidade F eder al de São P aulo
29 PUBLICA TIONS    202 CITATIONS    
SEE PROFILE
Milena Kaw ai
Univ ersity of São P aulo
2 PUBLICA TIONS    23 CITATIONS    
SEE PROFILE
All c ontent f ollo wing this p age was uplo aded b y Fátima Ap arecida Car omano  on 28 May 2016. 
_______ 409 ________

The user has r equest ed enhanc ement of the do wnlo aded file.ORIGINAL ARTICLE176Prolonged 

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
#text_splitter = SemanticChunker(OpenAIEmbeddings(tiktoken_enabled=False), breakpoint_threshold_type="interquartile")
text_splitter = SemanticChunker(langchain_EmbeddingFunction(), breakpoint_threshold_type="interquartile")
docs = text_splitter.create_documents([text])
print(docs)

In [16]:
#基于相近句子语意判断的文本切割器
#【【RAG DEMO 优化】如何按照语义来优化段落分割及Chunk】https://www.bilibili.com/video/BV1YZ421v76n
#先要pip install ltp, 哈工大语言技术平台
import numpy as np
from ltp import StnSplit
#here is the setting for the size of chunk, 100 is one article only one chunk
THRESHOLD = 70

class SemanticParagraphSplitter:
    def __init__(self, threshold=THRESHOLD):
        self.threshold = threshold
        self.model =  bge_embeddings

    @staticmethod
    def cut_sentences(text):
        sentences = StnSplit().split(text)
        return sentences

    @staticmethod
    def combine_sentences(sentences, buffer_size=2):
        # Go through each sentence dict
        for i in range(len(sentences)):

            # Create a string that will hold the sentences which are joined
            combined_sentence = ''

            # Add sentences before the current one, based on the buffer size.
            for j in range(i - buffer_size, i):
                # Check if the index j is not negative (to avoid index out of range like on the first one)
                if j >= 0:
                    # Add the sentence at index j to the combined_sentence string
                    combined_sentence += sentences[j]['sentence'] + ' '

            # Add the current sentence
            combined_sentence += sentences[i]['sentence']

            # Add sentences after the current one, based on the buffer size
            for j in range(i + 1, i + 1 + buffer_size):
                # Check if the index j is within the range of the sentences list
                if j < len(sentences):
                    # Add the sentence at index j to the combined_sentence string
                    combined_sentence += ' ' + sentences[j]['sentence']

            # Then add the whole thing to your dict
            # Store the combined sentence in the current sentence dict
            sentences[i]['combined_sentence'] = combined_sentence

        return sentences

    def build_sentences_dict(self, sentences):
        indexed_sentences = [{'sentence': x, 'index': i} for i, x in enumerate(sentences)]
        combined_sentences = self.combine_sentences(indexed_sentences)

        embeddings = self.model([x['combined_sentence'] for x in combined_sentences])

        for i, sentence in enumerate(combined_sentences):
            sentence['combined_sentence_embedding'] = embeddings[i]

        return combined_sentences

    @staticmethod
    def calculate_cosine_distances(sentences):
        distances = []
        for i in range(len(sentences) - 1):
            embedding_current = sentences[i]['combined_sentence_embedding']
            embedding_next = sentences[i + 1]['combined_sentence_embedding']

            # Calculate cosine similarity
            # similarity = cosine_similarity([embedding_current], [embedding_next])[0][0]
            similarity = np.array(embedding_current) @ np.array(embedding_next).T
            # Convert to cosine distance
            distance = 1 - similarity

            # Append cosine distance to the list
            distances.append(distance)

            # Store distance in the dictionary
            sentences[i]['distance_to_next'] = distance

        # Optionally handle the last sentence
        # sentences[-1]['distance_to_next'] = None  # or a default value

        return distances, sentences

    def calculate_indices_above_thresh(self, distances):
        breakpoint_distance_threshold = np.percentile(distances, self.threshold)
        # The indices of those breakpoints on your list
        indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]
        return indices_above_thresh

    @staticmethod
    def cut_chunks(indices_above_thresh, sentences):
        # Initialize the start index
        start_index = 0

        # Create a list to hold the grouped sentences
        chunks = []

        # Iterate through the breakpoints to slice the sentences
        for index in indices_above_thresh:
            # The end index is the current breakpoint
            end_index = index

            # Slice the sentence_dicts from the current start index to the end index
            group = sentences[start_index:end_index + 1]
            combined_text = ' '.join([d['sentence'] for d in group])
            chunks.append(combined_text)

            # Update the start index for the next group
            start_index = index + 1

        # The last group, if any sentences remain
        if start_index < len(sentences):
            combined_text = ' '.join([d['sentence'] for d in sentences[start_index:]])
            chunks.append(combined_text)

        return chunks

    def split(self, text):
        single_sentences = (self.cut_sentences(text)) #Pre-split with standard function
        print(f"{len(single_sentences)} single sentences were found")
        chunks = self.split_passages(single_sentences)
        return chunks

    def split_passages(self, passages):
        combined_sentences = self.build_sentences_dict(passages)
        distances, sentences = self.calculate_cosine_distances(combined_sentences)

        indices_above_thresh = self.calculate_indices_above_thresh(distances)
        chunks = self.cut_chunks(indices_above_thresh, sentences)
        return chunks


In [17]:
#用语意文本切割器将之前加载的知乎页面内容切成文本块
text_splitter = SemanticParagraphSplitter(threshold=THRESHOLD)
chunks=text_splitter.split(chn_text)

#检验文本块
print(len(chunks),"\n\n")
print('*0*', chunks[0],"\n\n")
print('*1*', chunks[11],"\n\n")
print('*2*', chunks[12],"\n\n")
print('*3*', chunks[13],"\n\n")
print('*4*', chunks[14],"\n\n")
print('*5*', chunks[15],"\n\n")
print('*6*', chunks[16],"\n\n")
print('*7*', chunks[17],"\n\n")
print('*8*', chunks[18],"\n\n")
print('*9*', chunks[19],"\n\n")

2625 single sentences were found
788 


------------ 


第1节 


    :bookben 


    附：本作品来自互联网,本人不做任何负责内容版权归作者所有 


    “快乐生活一点通”部分内容摘录     微波炉烹制菊花粥	6     好喝又滋补的糯米百合粥的做法	7 


    啤酒果冻的做法	7     蓑衣黄瓜的做法	7 


    凉粉的制作方法	7     豉椒鲳鱼	8 


    酸辣瓜条	8 


    730串烧鸡柳制作法	9     微波炉嫩蛋羹	9     微波炉做丝糕	10 


    蔬菜蘸酱蚝油甜酱的做法	10     微波炉**翅	10     朝鲜冷面的做法	11     朝鲜泡菜	11 




In [None]:
#测试，读取一个文件夹里的PDF文件并存入向量数据库

path_docfolder = "/Users/qicao/Documents/GitHub/RAG_langchain/data/AutomobileIndustry_raw"
path_db = "/Users/qicao/Documents/GitHub/RAG_simp_DEMO/data/DB"

import os
from langchain_community.document_loaders import (PyPDFLoader)
from langchain_community.vectorstores import Chroma

def read_pdf_files_in_folder_onebyone_and_Store(path_docfolder, path_db, embedding):
    # Iterate over all files in the folder
    for filename in os.listdir(path_docfolder):
        #print(filename)
        if filename.endswith('.pdf'):  # Check if the file is a PDF
            file_path = os.path.join(path_docfolder, filename)
            print(f"Reading file: {file_path}")

            # Open the PDF file
            with open(file_path, 'rb') as file:
                loader = PyPDFLoader(file_path)
                pages_pypdf = loader.load()
                pages = pages_pypdf[0].page_content

                text_splitter = SemanticParagraphSplitter(threshold=THRESHOLD)
                # text_splitter = RecursiveCharacterTextSplitter(
                #     chunk_size=260,
                #     chunk_overlap=20,
                # )
                docs = text_splitter.split(pages)

                # Facility Step 3:用特定模型做embedding
                #db2 = Chroma.from_documents(docs, embedding, persist_directory=path_db)
                db2 = Chroma.from_texts(docs, embedding, persist_directory=path_db)
                print("Successfully save the embedding into DB")

    return True

read_pdf_files_in_folder_onebyone_and_Store(path_docfolder, path_db, EMBEDDING)