# Purpose
A chatbot answers questions (in a dialogue) based on local (private) documents in different formats. Local hosting and an extension towards PA are planned for a later stage.

# Open Point List & Ideas

## OPL
- include reference

## Ideas 
- Change from OpenAI to another model
- Establish dialogue instead of question only
- increase document base
- involve OpenAI in case no local answer is found (e.g. based on similarity in vectorstore) - avoid hallucination

# Set the project environment

## Import all packages

In [1]:
import os
import openai
import uuid

import pandas as pd

from dotenv import load_dotenv

from typing import List
import magic

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import VectorDBQA
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader

from chromadb.config import Settings


openai.api_key = os.getenv("OPENAI_API_KEY")

from constants import CHROMA_SETTINGS


In [2]:
from langchain.document_loaders import (
    CSVLoader,
    EverNoteLoader,
    PDFMinerLoader,
    TextLoader,
    UnstructuredEmailLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)

In [3]:
# Registry of supported file types: each key is a file extension (with its
# leading dot), each value a pair of (loader class, keyword arguments passed
# to the loader's constructor). Commented-out entries are currently disabled.
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    # ".docx": (Docx2txtLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    # ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    # ".eml": (MyElmLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PDFMinerLoader, {}),
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}

# Extensions that have a registered loader, listed without the leading dot
available_loader_types = [suffix.lstrip(".") for suffix in LOADER_MAPPING]

## Define the variables 

In [4]:
# Load environment variables from the .env file (if any) into os.environ
load_dotenv()

# Assign the API key only now: a key that is stored in .env is not visible in
# os.environ before load_dotenv() has run.
openai.api_key = os.getenv("OPENAI_API_KEY")

# directory definition (each value is None when the variable is not set)
output_directory = os.environ.get("OUTPUT_DIRECTORY")
doc_source = os.environ.get("DOC_SOURCE_DIRECTORY")
persist_directory = os.environ.get('SIMPLEGPT_PERSIST_DIRECTORY')
embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')

# Fail early with a readable message; without this check the os.path.join()
# call below raises an opaque TypeError when the variable is missing.
if persist_directory is None:
    raise EnvironmentError(
        "SIMPLEGPT_PERSIST_DIRECTORY is not set (check the environment or the .env file)."
    )

# Define the Chroma settings
chroma_settings = Settings(chroma_db_impl='duckdb+parquet',persist_directory=persist_directory,anonymized_telemetry=False)

# CSV file that records which documents were already imported; it is kept
# inside the persist directory
import_tracking_file = os.path.join(persist_directory, "simpleGPT_import_tracking.csv")

# Name of the language-model backend in use
used_model = "OPENAI"
# used_model = ""

# Definition of essential functions

## Generate a list of all files in one directory and its subdirectories

In [5]:
def get_all_file_paths(directory_path):
    """Collect the paths of all files below ``directory_path``.

    The directory tree is walked recursively with ``os.walk``; every file
    found is returned as its folder joined with its file name.

    Args:
        directory_path: Root directory to scan.

    Returns:
        A list of file paths in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(directory_path)
        for name in names
    ]

## Database to track already imported documents

In [6]:
### Open the import-tracking table, or start an empty one if no file exists yet
def open_import_tracking(IN_tracking_file:str):
    """Return the import-tracking table as a pandas DataFrame.

    Args:
        IN_tracking_file: Path of the CSV file listing imported documents.

    Returns:
        The content of the CSV file if it exists; otherwise an empty
        DataFrame with the columns "uuid", "file_path" and "type of file".
    """
    if not os.path.isfile(IN_tracking_file):
        print("[MISSING] import tracking file. No worries, will be generated later automatically!")
        return pd.DataFrame(columns=["uuid", "file_path","type of file"])

    print("[LOADING...] import tracking file.")
    return pd.read_csv(IN_tracking_file)


# derive a reproducible identifier for a document
def get_uuid5(IN_file):
    """Return the version-5 UUID (DNS namespace) of ``IN_file`` as a string.

    The same input string always yields the same identifier.
    """
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, IN_file))



In [7]:
# persist a dataframe as a CSV file
def save_dataframe_to_file(dataframe, filename):
    """Write ``dataframe`` to ``filename`` as CSV and report success.

    The row index is not written to the file.

    Args:
        dataframe: The pandas DataFrame to store.
        filename: Target path of the CSV file.
    """
    dataframe.to_csv(filename, index=False)
    print(f"[SUCCESS] Dataframe saved to {filename}.")

## Read a file using the defined document loader

In [8]:
def read_single_file(link_to_file: str):
    """Load one file with the loader registered for its extension.

    The file is identified by a uuid derived from its path (not from its
    content). Files whose uuid is already listed in the global
    ``import_tracking_df`` are skipped; newly loaded files are added to it.

    Args:
        link_to_file: Path of the file to load.

    Returns:
        The list returned by the loader's ``load()`` call, or an empty list
        if the extension has no registered loader or the file was already
        imported.
    """
    global import_tracking_df

    # splitext copes with paths that have no extension; lower-casing lets
    # e.g. "REPORT.PDF" use the loader registered for ".pdf".
    ext = os.path.splitext(link_to_file)[1].lower()

    if ext not in LOADER_MAPPING:
        print(f"[ERROR] Document NOT loaded: {link_to_file} with extension {ext} skipped due to unknown extension.")
        return []

    uuid_value = get_uuid5(link_to_file)
    if uuid_value in import_tracking_df["uuid"].values:
        print(f"[SKIPPED] Document {link_to_file} already exists in import-record, hence in database. Skipped.")
        return []

    # load the document
    loader_class, loader_args = LOADER_MAPPING[ext]
    loader = loader_class(link_to_file, **loader_args)
    document = loader.load()
    print(f"[INFO] Document loaded: {link_to_file}.")

    # Record the file only after it was loaded successfully, so that a file
    # whose loader raised is not marked as imported. The "type of file"
    # column stores the extension (previously it stored the Python type of
    # the path, which was always str).
    new_row_df = pd.DataFrame({"file_path": [link_to_file], "uuid": [uuid_value], "type of file": [ext]})
    import_tracking_df = pd.concat([new_row_df, import_tracking_df], ignore_index=True)

    return document


## Load all documents from a list containing the full path to files

In [9]:
def load_all_documents(IN_doc_source: str):
    """Load every file found below ``IN_doc_source``.

    Args:
        IN_doc_source: Root directory that is scanned recursively for files.

    Returns:
        One flat list with everything ``read_single_file`` returned for the
        individual files.
    """
    loaded_documents = []
    for path in get_all_file_paths(IN_doc_source):
        loaded_documents += read_single_file(path)
    return loaded_documents

## Split documents into chunks

In [10]:
def split_doc_into_chunks(in_documents):
    """Split the given documents into chunks.

    Args:
        in_documents: The loaded documents to split.

    Returns:
        The chunks produced by a RecursiveCharacterTextSplitter with a chunk
        size of 1000 and no overlap, or an empty list (after printing an
        error message) if no documents were passed in.
    """
    if len(in_documents) == 0:
        print("[ERROR] No documents loaded!")
        return []

    # Split the documents
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return splitter.split_documents(in_documents)

# Vectorstore processing

## Check if the vectorstore exists

In [11]:
def does_vectorstore_exist(persist_directory: str) -> bool:
    """Check whether a usable vectorstore exists in ``persist_directory``.

    A vectorstore counts as existing when the directory contains an
    ``index`` sub-directory, both parquet files
    (``chroma-collections.parquet`` and ``chroma-embeddings.parquet``) and
    more than three ``*.bin`` / ``*.pkl`` files inside ``index``.

    Args:
        persist_directory: Directory in which the vectorstore is persisted.

    Returns:
        True if all of the conditions above hold, False otherwise.
    """
    # Local import: glob is not imported at file level.
    from glob import glob

    index_dir = os.path.join(persist_directory, 'index')
    if not os.path.exists(index_dir):
        return False

    required_files = ('chroma-collections.parquet', 'chroma-embeddings.parquet')
    if not all(os.path.exists(os.path.join(persist_directory, name)) for name in required_files):
        return False

    # Expand the patterns into the actual file lists. Concatenating the
    # pattern strings (as done before) measured the length of a string and
    # therefore always passed the check below.
    list_index_files = glob(os.path.join(index_dir, '*.bin'))
    list_index_files += glob(os.path.join(index_dir, '*.pkl'))

    # A working vectorstore needs more than 3 index files
    return len(list_index_files) > 3

# MAIN part

In [12]:
# Main script: import new documents into the vectorstore, persist it, and
# answer one hard-coded query with a retrieval QA chain.

# create the embedding function that the vectorstore uses
embeddings = OpenAIEmbeddings()

# load the import-tracking table (an empty one is created if the file is missing);
# read_single_file() reads and updates this module-level dataframe
import_tracking_df = open_import_tracking(IN_tracking_file= import_tracking_file)

# load every not-yet-imported file below doc_source
documents = load_all_documents(doc_source)
print(f"[INFO] {len(documents)} documents were loaded")


# split the loaded documents into chunks
# NOTE(review): text_chunks is an empty list when every file was skipped; it is
# still handed to the vectorstore below - confirm that Chroma accepts an empty list.
text_chunks = split_doc_into_chunks(documents)
print(f"[INFO] Number of chunks {len(text_chunks)}")


# check, if the vectorstore exists
if does_vectorstore_exist(persist_directory=persist_directory) is True:
    print("[INFO] A vectorstore exists. I will append to this one!")

    # loading the vectorstore
    vectordb = Chroma(persist_directory=persist_directory, 
                      embedding_function=embeddings,
                      client_settings=chroma_settings)
    
    # adding documents
    vectordb.add_documents(text_chunks)

else:
    print("[INFO] No vectorstore exists. I will create a new one for you!")
    vectordb = Chroma.from_documents(documents=text_chunks, 
                               embedding=embeddings, 
                               persist_directory=persist_directory, 
                               client_settings=chroma_settings)

# saving the vectorstore
vectordb.persist()
vectordb = None

# save import log (only reached if all steps above ran without raising)
save_dataframe_to_file(dataframe=import_tracking_df,
                       filename=import_tracking_file)

# loading the vectorstore
# NOTE(review): unlike the Chroma(...) call above, this one does not pass
# client_settings=chroma_settings - confirm that this is intended.
vectordb = Chroma(persist_directory=persist_directory,
                   embedding_function=embeddings)

# Create retriever and the chain; return_source_documents=True makes the chain
# return the retrieved source documents together with the answer
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=vectordb.as_retriever(), return_source_documents=True)

# hard-coded example question
query = "How do you change an organization?"
# qa.run(query)



# run the chain; the result is read below via the keys "query", "result" and "source_documents"
result = qa({"query": query})

#print(qa)

print("[INFO] Query")
print(result["query"])

print("[INFO] Result")
print(result["result"])

print("\n[INFO] Source")
print(result["source_documents"])

Unable to connect optimized C data functions [No module named 'clickhouse_connect.driverc.buffer'], falling back to pure Python
Unable to connect ClickHouse Connect C to Numpy API [No module named 'clickhouse_connect.driverc.npconv'], falling back to pure Python


[LOADING...] import tracking file.
[SKIPPED] Document /Users/swmoeller/python/2023/NLP/BrainGPT/data/00_doc2scan/Die Dritte Dimension - Martin Pfiffner.pdf already exists in import-record, hence in database. Skipped.
[ERROR] Document NOT loaded: /Users/swmoeller/python/2023/NLP/BrainGPT/data/00_doc2scan/No ETA Maverick 400 Specifications.docx with extension .docx skipped due to unknown extension.
[SKIPPED] Document /Users/swmoeller/python/2023/NLP/BrainGPT/data/00_doc2scan/Bosch_Washing_Machine.pdf already exists in import-record, hence in database. Skipped.
[SKIPPED] Document /Users/swmoeller/python/2023/NLP/BrainGPT/data/00_doc2scan/state_of_the_union copy.txt already exists in import-record, hence in database. Skipped.
[SKIPPED] Document /Users/swmoeller/python/2023/NLP/BrainGPT/data/00_doc2scan/state_of_the_union.txt already exists in import-record, hence in database. Skipped.
[SKIPPED] Document /Users/swmoeller/python/2023/NLP/BrainGPT/data/00_doc2scan/Regulierung_von_Large_Langua