In [1]:
import os
from dotenv import load_dotenv
from pathlib import Path

import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

import tiktoken
from llama_index import SimpleDirectoryReader, GPTVectorStoreIndex
from llama_index import LangchainEmbedding, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter

load_dotenv()

True

In [2]:
# Root folder of the source documents. It can be overridden with the
# ASK_GPT_DOCS_DIR environment variable (for example via the .env file read by
# load_dotenv()), so the notebook is not tied to one machine's home directory.
# The original location is kept as the default.
docs_dir = Path(os.environ.get('ASK_GPT_DOCS_DIR',
                               '/home/jinzy/work/automation/ask_gpt/documents'))

# Splitter used later by the node parser to cut documents into overlapping chunks.
# NOTE(review): the model load output below reports max_seq_length 512 while
# chunk_size is 1000 - if both are token counts, the embedding may only cover the
# first part of each chunk. Confirm and consider a smaller chunk_size.
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=200)

# Instructor embedding model, loaded onto the GPU ("cuda" must be available).
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
                                                      model_kwargs={"device": "cuda"})

load INSTRUCTOR_Transformer
max_seq_length  512


In [3]:
import PyPDF2

#filename = '/home/jinzy/work/automation/ask_gpt/documents/process-mining-0529/process-discovery-algorithms/Leemans_thesis_Hierarchical Process Mining.pdf'

def find_unprocessable_pdfs(directory):
    """Return the paths of PDF files under ``directory`` that PyPDF2 cannot open.

    The directory tree is walked recursively. Every file whose name ends in
    ``.pdf`` (compared case-insensitively) is opened in binary mode and handed
    to ``PyPDF2.PdfReader``. If that raises, the path is printed and collected.
    Only construction of the reader is checked; no page text is extracted.

    Args:
        directory: Root of the tree to scan (``str`` or path-like).

    Returns:
        list: Paths of the PDFs that failed, in the order they were visited.
        Empty when every PDF could be opened or no PDF was found.
    """
    unprocessable_pdfs = []

    # Walk through the directory tree
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # Compare in lower case so '.Pdf', '.pDF', ... are checked as well
            if not file.lower().endswith('.pdf'):
                continue

            pdf_path = os.path.join(root, file)

            # Attempt to open the file with PyPDF2
            try:
                with open(pdf_path, 'rb') as f:
                    PyPDF2.PdfReader(f)
            except Exception:
                # `Exception` rather than a bare `except` so that a kernel
                # interrupt (KeyboardInterrupt) or SystemExit still propagates.
                print(pdf_path)
                unprocessable_pdfs.append(pdf_path)

    return unprocessable_pdfs

In [4]:
# Sanity check before loading: list the PDFs under docs_dir that PyPDF2 fails to open.
# An empty list means every PDF found could be opened.
find_unprocessable_pdfs(docs_dir)

[]

In [5]:
# Load the .txt and .pdf files found under docs_dir (sub-folders included) into a
# list of llama_index documents. Nothing is embedded or written to disk in this cell.
# NOTE(review): errors='ignore' presumably controls how text-decoding errors are
# handled - confirm against the SimpleDirectoryReader documentation.
documents = SimpleDirectoryReader(docs_dir, recursive=True, required_exts=['.txt', '.pdf'], errors='ignore').load_data()
print(len(documents))  # number of loaded document objects

Multiple definitions in dictionary at byte 0x1e96c6 for key /ToUnicode
Multiple definitions in dictionary at byte 0x1e9f9a for key /ToUnicode
Multiple definitions in dictionary at byte 0x1e851d for key /ToUnicode
Multiple definitions in dictionary at byte 0x1e925b for key /ToUnicode
Multiple definitions in dictionary at byte 0x1e9b2f for key /ToUnicode
Multiple definitions in dictionary at byte 0x1ea31a for key /ToUnicode
Multiple definitions in dictionary at byte 0x1ebc64 for key /ToUnicode
Multiple definitions in dictionary at byte 0x1eaad5 for key /ToUnicode
Multiple definitions in dictionary at byte 0x1ec62a for key /ToUnicode
Multiple definitions in dictionary at byte 0x1e7c4b for key /ToUnicode
Multiple definitions in dictionary at byte 0x1ed4d7 for key /ToUnicode
Multiple definitions in dictionary at byte 0x1ecff3 for key /ToUnicode
Multiple definitions in dictionary at byte 0x1eaf10 for key /ToUnicode
Multiple definitions in dictionary at byte 0x1e8986 for key /ToUnicode
Multip

226


In [6]:
# Chunking: documents are split into nodes with the token splitter configured earlier.
node_parser = SimpleNodeParser(text_splitter=text_splitter)

# Embedding: adapt the LangChain Instructor embeddings for use by llama_index.
instructor_embed_model = LangchainEmbedding(instructor_embeddings)

# Bundle both settings and build the in-memory vector index from the loaded documents.
service_context = ServiceContext.from_defaults(
    embed_model=instructor_embed_model,
    node_parser=node_parser,
)
index = GPTVectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

In [7]:
# Create a query engine with default settings on top of the index and ask a question.
# The query is Chinese for "Please explain what task mining is?".
query_engine = index.as_query_engine()
r = query_engine.query('请解释一下什么是任务挖掘?')
r.response  # last expression of the cell, so the notebook displays the answer text

'\n任务挖掘是一种机器学习技术，它可以从大量的数据中提取有用的信息，以发现有价值的模式和规律。它可以帮助企业更好地理解客户的行为，并为企业提供有用的洞察，以改善客户体验和提高企业的效率。'

In [8]:
# Inspect the retrieved nodes (with similarity scores) behind the answer.
# NOTE(review): in the saved output both hits are one-character '©' chunks, so the
# answer is not backed by real document content - check the chunking and the
# embedding model before trusting the responses.
r.source_nodes

[NodeWithScore(node=Node(text='©', doc_id='42403477-d039-4535-adb7-8012c28cc2eb', embedding=None, doc_hash='cadd241f8828aab05b32b0dd8eba71ecf51cc548de4d7cc1c87218d09995cb56', extra_info=None, node_info={'start': 44165, 'end': 44166}, relationships={<DocumentRelationship.SOURCE: '1'>: '82e4ed06-4cbe-43f8-b438-bffcdb55946c', <DocumentRelationship.PREVIOUS: '2'>: 'ac92a0f3-c5da-4c50-aa2e-767a9785fe33', <DocumentRelationship.NEXT: '3'>: '06497e14-fca1-4fc4-8dad-9ef75c9f00e5'}), score=0.8781919393015661),
 NodeWithScore(node=Node(text='©', doc_id='06497e14-fca1-4fc4-8dad-9ef75c9f00e5', embedding=None, doc_hash='cadd241f8828aab05b32b0dd8eba71ecf51cc548de4d7cc1c87218d09995cb56', extra_info=None, node_info={'start': 44166, 'end': 44167}, relationships={<DocumentRelationship.SOURCE: '1'>: '82e4ed06-4cbe-43f8-b438-bffcdb55946c', <DocumentRelationship.PREVIOUS: '2'>: '42403477-d039-4535-adb7-8012c28cc2eb', <DocumentRelationship.NEXT: '3'>: 'ae946c3b-eb48-494d-9f43-76d5b5ea1b35'}), score=0.8781919

In [9]:
# Extra info of the response, keyed by the doc_id of each source node.
# In the saved output every value is None, i.e. the nodes carry no extra metadata.
r.extra_info

{'42403477-d039-4535-adb7-8012c28cc2eb': None,
 '06497e14-fca1-4fc4-8dad-9ef75c9f00e5': None}

In [10]:
# Write the index's storage context to the 'rpmv1' folder. The path is relative,
# so it is resolved against the notebook's current working directory.
# NOTE(review): presumably this allows reloading the index later without
# re-embedding the documents - confirm against the llama_index storage docs.
persist_directory = 'rpmv1'
index.storage_context.persist(persist_dir=persist_directory)

In [11]:
# Second question against the same query_engine created in an earlier cell.
# The query is Chinese for "Please explain what process mining is?".
# Rebinds `r`, so the earlier response is no longer reachable under that name.
r = query_engine.query('请解释一下什么是流程挖掘?')
r.response  # displayed as the cell output

'\n流程挖掘是一种数据挖掘技术，它可以从历史事件日志中提取有用的信息，以发现业务流程的模式和规律。它可以帮助企业更好地理解业务流程，并有助于优化业务流程，以提高企业的效率和效果。'