In [9]:
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import hashlib
import time
from dotenv import load_dotenv
import langchain
import glob
import time

In [10]:
# Load key/value pairs from a .env file into the process environment so that
# the os.getenv() lookups in later cells can see them.
load_dotenv()
# Embedding client shared by the cells below.
# NOTE(review): presumably picks up its OpenAI credentials from the
# environment populated above — confirm.
embeddings = OpenAIEmbeddings()

In [11]:
def convert_bytes(bytes, precision=2):
    """Convert a byte count into a human-friendly string.

    The value is divided by 1024 until it drops below 1024 or the largest
    known unit is reached, then formatted with ``precision`` decimals.

    Args:
        bytes: Size in bytes. Zero and negative values yield ``'0 B'``.
        precision: Number of digits shown after the decimal point.

    Returns:
        A string such as ``'1.50 KB'`` or ``'5.00 GB'``.
    """
    # Units above MB are listed so that large sizes are not reported as
    # thousands of megabytes.
    abbreviations = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
    if bytes <= 0:
        return '0 B'
    size = bytes
    index = 0
    # Stop at the last unit even if the value is still >= 1024.
    while size >= 1024 and index < len(abbreviations) - 1:
        size /= 1024
        index += 1
    return f'{size:.{precision}f} {abbreviations[index]}'

def get_file_size(filepath):
    """Return the size, in bytes, of the file located at ``filepath``."""
    return os.stat(filepath).st_size

def compute_sha1_from_file(file_path):
    """Return the SHA-1 hex digest of a file's raw bytes.

    The file is hashed in fixed-size chunks so memory use stays bounded
    regardless of file size; the digest is the same as hashing the whole
    content in a single call.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The hexadecimal SHA-1 digest as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.sha1()
    with open(file_path, "rb") as file:
        # iter() with a sentinel keeps calling read() until it returns b"" (EOF).
        for chunk in iter(lambda: file.read(65536), b""):
            hasher.update(chunk)
    return hasher.hexdigest()


def compute_sha1_from_content(content):
    """Return the SHA-1 hex digest of ``content``.

    ``content`` is fed directly to ``hashlib``, so it must be a bytes-like
    object.
    """
    digest = hashlib.sha1()
    digest.update(content)
    return digest.hexdigest()

def loadFile(filepath):
    """Load a markdown file and gather its identifying attributes.

    Args:
        filepath: Path of the markdown file to load.

    Returns:
        A 3-tuple ``(documents, file_sha1, file_size)``: the result of
        ``UnstructuredMarkdownLoader(filepath).load()``, the SHA-1 hex digest
        of the file, and the file size in bytes.
    """
    parsed_documents = UnstructuredMarkdownLoader(filepath).load()
    sha1_digest = compute_sha1_from_file(filepath)
    size_in_bytes = get_file_size(filepath)
    return parsed_documents, sha1_digest, size_in_bytes


def find_md_files(directory):
    """Recursively collect the paths of all ``.md`` files under ``directory``.

    Args:
        directory: Root directory to walk.

    Returns:
        A list with one path per file whose name ends in ``.md``, each joined
        onto the directory it was found in. The list is empty when nothing
        matches or ``directory`` does not exist.
    """
    md_files = []
    print('---->S')
    # The accumulator must not share a name with the per-directory file list
    # yielded by os.walk: reusing it overwrites the accumulator and then
    # appends to the very list being iterated.
    for root, _dirs, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith(".md"):
                md_files.append(os.path.join(root, filename))
    return md_files


In [12]:
# Splitter settings; both values are handed to the tiktoken-based splitter
# built at the end of this cell.
# NOTE(review): with from_tiktoken_encoder these are presumably token counts,
# not characters — confirm.
chunk_size, chunk_overlap = 500, 0

# Directory that holds the markdown API docs to ingest.
file_path = './api_docs/'

# Vector-store location and collection name come from the environment
# (either may be None if the variable is unset).
CHROMA_DB_PATH = os.getenv('CHROMA_DB_PATH')
collection_name = os.getenv('COLLECTION_NAME')
print(CHROMA_DB_PATH)

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

db/chroma/api_doc


In [13]:
def splite_file_to_db(file_path):
    """Split one markdown file into chunks and attach file-level metadata.

    The file is loaded with ``loadFile``, split with the module-level
    ``text_splitter``, and every chunk is wrapped in a new ``Document`` whose
    metadata describes the source file and the splitter settings. Nothing is
    written to the vector store here; the prepared documents are returned.

    Progress is printed: the file path first, then one line per chunk with
    its 1-based position.

    Args:
        file_path: Path of the markdown file to process.

    Returns:
        A list of ``Document`` objects, one per chunk, in splitter order.
    """
    print(file_path)
    loaded_documents, sha1_digest, size_in_bytes = loadFile(file_path)
    chunks = text_splitter.split_documents(loaded_documents)
    base_name = os.path.basename(file_path)

    # Local date stamp (YYYYMMDD), taken once so all chunks of a file agree.
    date_stamp = time.strftime("%Y%m%d")
    summarization_enabled = False

    prepared_docs = []
    for position, chunk in enumerate(chunks, start=1):
        # A fresh dict per chunk so documents never share a metadata object.
        chunk_metadata = dict(
            file_sha1=sha1_digest,
            file_size=size_in_bytes,
            file_name=base_name,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            date=date_stamp,
            summarization="true" if summarization_enabled else "false",
        )
        prepared_docs.append(
            Document(page_content=chunk.page_content, metadata=chunk_metadata)
        )
        print(f" {file_path} ------> {position}")
    return prepared_docs

In [14]:
# Every markdown file under file_path, including nested directories.
md_glob_pattern = os.path.join(file_path, '**/*.md')
all_md_files = glob.glob(md_glob_pattern, recursive=True)

# Gather the chunked documents of all files before touching the vector store.
all_files_docs = []
for file in all_md_files:
    # NOTE(review): this pause looks like a leftover rate limit; nothing in
    # this loop appears to call a remote API — confirm before removing.
    time.sleep(4)
    docs = splite_file_to_db(file)
    all_files_docs += docs

# Embed and store everything in a single call, then flush to disk.
db = Chroma.from_documents(
    documents=all_files_docs,
    embedding=embeddings,
    collection_name=collection_name,
    persist_directory=CHROMA_DB_PATH,
)
db.persist()
# Drop the reference to the store once it has been persisted.
db = None
print('finished')

./api_docs/klReportMetrics.md
 ./api_docs/klReportMetrics.md ------> 1
 ./api_docs/klReportMetrics.md ------> 2
./api_docs/aiRecognize.md
 ./api_docs/aiRecognize.md ------> 1
 ./api_docs/aiRecognize.md ------> 2
 ./api_docs/aiRecognize.md ------> 3
 ./api_docs/aiRecognize.md ------> 4
 ./api_docs/aiRecognize.md ------> 5
 ./api_docs/aiRecognize.md ------> 6
./api_docs/msi.md
 ./api_docs/msi.md ------> 1
./api_docs/isStepCounterExist.md
 ./api_docs/isStepCounterExist.md ------> 1
 ./api_docs/isStepCounterExist.md ------> 2
./api_docs/startStepCounter.md
 ./api_docs/startStepCounter.md ------> 1
 ./api_docs/startStepCounter.md ------> 2
./api_docs/checkSession.md
 ./api_docs/checkSession.md ------> 1
 ./api_docs/checkSession.md ------> 2
./api_docs/startStepCounterForTimer.md
 ./api_docs/startStepCounterForTimer.md ------> 1
 ./api_docs/startStepCounterForTimer.md ------> 2
./api_docs/stopStepCounter.md
 ./api_docs/stopStepCounter.md ------> 1
 ./api_docs/stopStepCounter.md ------> 2
./a