In [9]:
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import hashlib
import time
from dotenv import load_dotenv
embeddings = OpenAIEmbeddings()

In [12]:
load_dotenv()

True

In [2]:
def convert_bytes(bytes, precision=2):
    """Converts bytes into a human-friendly format."""
    abbreviations = ['B', 'KB', 'MB']
    if bytes <= 0:
        return '0 B'
    size = bytes
    index = 0
    while size >= 1024 and index < len(abbreviations) - 1:
        size /= 1024
        index += 1
    return f'{size:.{precision}f} {abbreviations[index]}'

def get_file_size(filepath):
    file_size = os.path.getsize(filepath)
    return file_size

def compute_sha1_from_file(file_path):
    with open(file_path, "rb") as file:
        bytes = file.read()
        readable_hash = compute_sha1_from_content(bytes)
    return readable_hash


def compute_sha1_from_content(content):
    readable_hash = hashlib.sha1(content).hexdigest()
    return readable_hash    
def loadFile(filepath):
    loader = UnstructuredMarkdownLoader(filepath)
    documents = loader.load()
    file_sha1 = compute_sha1_from_file(filepath)
    file_size = get_file_size(filepath)
    return documents, file_sha1, file_size


In [3]:
chunk_size = 500
chunk_overlap = 0
file_path = './api_docs/aiRecognize.md'
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap)
file_documents, file_sha1, file_size = loadFile(file_path)
documents = text_splitter.split_documents(file_documents)
file_name = os.path.basename(file_path)
dateshort = time.strftime("%Y%m%d")
enable_summarization = False
print(len(documents))
#db = Chroma.from_documents(documents, embeddings)
#print(db)
CHROMA_DB_PATH = './db/chroma/api_doc'
for doc in documents:
    metadata = {
        "file_sha1": file_sha1,
        "file_size": file_size,
        "file_name": file_name,
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "date": dateshort,
        "summarization": "true" if enable_summarization else "false"
    }
    doc_with_metadata = Document(page_content=doc.page_content, metadata=metadata)


6


In [5]:
from langchain.evaluation.qa import QAGenerateChain
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
import langchain
langchain.debug = True

In [14]:
example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI(), verbose=True)
new_examples = example_gen_chain.apply_and_parse(
    [{"doc": t} for t in documents]
)
print(new_examples)

6


[1m> Entering new QAGenerateChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a teacher coming up with questions to ask on a quiz. 
Given the following document, please generate a question and answer based on that document.

Example Format:
<Begin Document>
...
<End Document>
QUESTION: question here
ANSWER: answer here

These questions should be detailed and be based explicitly on information in the document. Begin!

<Begin Document>
page_content='kl.aiRecognize\n\nAPI名称\n\naiRecognize\n\n功能描述\n\n快驴容器增加视觉识别能力\n\n参数说明\n\n输入参数' metadata={'source': './api_docs/aiRecognize.md'}
<End Document>[0m
Prompt after formatting:
[32;1m[1;3mYou are a teacher coming up with questions to ask on a quiz. 
Given the following document, please generate a question and answer based on that document.

Example Format:
<Begin Document>
...
<End Document>
QUESTION: question here
ANSWER: answer here

These questions should be detailed and be based explicitly on information in the document.