In [None]:
#Libraries

!pip install pandoc langchain gradio chromadb tiktoken clean-text
!pip install "unstructured[local-inference]"
!pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
!pip install layoutparser pypdf unidecode

import glob
import os
import re
import time
from html.parser import HTMLParser
from io import StringIO

import chromadb
import gradio as gr
import pandoc

from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders import UnstructuredEPubLoader
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

In [15]:
# Prefer a key that is already exported in the environment so a real credential
# is never clobbered by the placeholder below (and never has to be committed in
# the notebook). The literal is only a fallback for first-time setup.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or "your openai api key goes here"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


In [68]:
def _loader_for(file_name, extension):
    """Return the document loader that handles ``extension`` for ``file_name``.

    Returns None when the extension has no matching loader.
    """
    if extension in ('txt', 'md'):
        return TextLoader(file_name)
    if extension in ('doc', 'docx'):
        return UnstructuredWordDocumentLoader(file_name)
    if extension == 'pdf':
        return PyPDFLoader(file_name)
    if extension == 'epub':
        return UnstructuredEPubLoader(file_name)
    return None


def examinelibrary(books_dir="books"):
    """Load every supported document found directly inside ``books_dir``.

    Args:
        books_dir: Folder to scan (non-recursive). Defaults to ``"books"``,
            resolved relative to the current working directory.

    Returns:
        A ``(library, counter)`` tuple where ``library`` maps each file path to
        the list of documents its loader produced, and ``counter`` is the
        number of files that were loaded.
    """
    # Define the extensions we're looking for. 'epub' is listed so that the
    # EPUB loader is actually reachable.
    extensions = ['txt', 'md', 'pdf', 'doc', 'docx', 'epub']

    # Initialize library as dict and load documents into it
    library = {}
    for extension in extensions:
        pattern = os.path.join(books_dir, "*." + extension)
        # sorted() keeps the load order stable across runs and platforms.
        for file_name in sorted(glob.glob(pattern)):
            loader = _loader_for(file_name, extension)
            if loader is None:
                continue
            # Load and store inside the per-file loop so every file is kept,
            # not just the last one matched for each extension.
            library[file_name] = loader.load()

    # Count exactly the files that were loaded, so the number reported to the
    # user always refers to the same folder that was scanned.
    counter = len(library)
    return (library, counter)

# Split the texts in the library into chunks
def processtext(library):
    """Split every document in ``library`` into chunks and return them as one flat list.

    ``library`` maps a file name to the documents loaded from that file. The
    splitter is configured with ``chunk_size=1000`` and ``chunk_overlap=0``.
    """
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = []
    for documents in library.values():
        chunks.extend(splitter.split_documents(documents))
    return chunks

# Produce a librarian (database of embedded texts)
def prodlibrarian():
    """Build and return the Chroma vector store for the current library.

    Loads the documents with ``examinelibrary``, chunks them with
    ``processtext`` and embeds the chunks with ``OpenAIEmbeddings``. The file
    count returned by ``examinelibrary`` is not used here.
    """
    documents_by_file, _file_count = examinelibrary()
    chunks = processtext(documents_by_file)
    return Chroma.from_documents(chunks, OpenAIEmbeddings())

# Build the vector store once at startup; `ask` reads this module-level `db`.
db = prodlibrarian()

# Initialize language model and qa chain.
# temperature=0 is passed to the OpenAI LLM wrapper; chain_type="stuff" selects
# the question-answering chain variant used by `ask`.
llm = OpenAI(temperature=0)
chain = load_qa_chain(llm, chain_type="stuff")

# Re-scan the books folder, rebuild the librarian and display progress
def scan(progress=gr.Progress()):
    """Rebuild the vector store from the books folder and report how many files were found.

    The load / split / embed steps are run inline (the same steps
    ``prodlibrarian`` performs) so that each progress update corresponds to
    real work and the document count is available for the status message.

    Args:
        progress: Gradio progress tracker injected by the UI.

    Returns:
        A status string containing the number of documents found.
    """
    # `ask` searches the module-level `db`; without this declaration the
    # rebuilt store would be bound to a local and silently discarded.
    global db

    progress(0.2, desc="Examining /books folder ...")
    library, counter = examinelibrary()

    progress(0.4, desc="Counting documents ...")
    texts = processtext(library)

    progress(0.6, desc="Generating embeddings ...")
    db = Chroma.from_documents(texts, OpenAIEmbeddings())

    return "Librarian found and embedded " + str(counter) + " documents."

# Main query code.
def _format_citations(docs):
    """Return one citation line per retrieved document.

    The page number is included only when the document's metadata has a
    ``'page'`` key; indexing it unconditionally would raise ``KeyError`` for
    any document that lacks one.
    """
    lines = []
    for doc in docs:
        source = str(doc.metadata.get('source', 'unknown source'))
        if 'page' in doc.metadata:
            lines.append(source + " in page: " + str(doc.metadata['page']) + "\n")
        else:
            lines.append(source + "\n")
    return "".join(lines)


def ask(query, progress=gr.Progress()):
    """Answer ``query`` from the embedded library and list the sources used.

    Args:
        query: The user's question as typed into the UI.
        progress: Gradio progress tracker injected by the UI.

    Returns:
        A string with the LLM's answer followed by the citations of the
        documents that were passed to it.
    """
    # Nothing to search for: avoid spending an embedding + LLM call on it.
    if not query or not query.strip():
        return "Please enter a question."

    # Each progress update now precedes the real step it describes; the
    # artificial sleeps that used to delay every query were removed.
    progress(0.1, desc="Scanning embedded documents for matches ...")
    #generate docs which are texts relevant to the query
    docs = db.similarity_search(query)

    progress(0.2, desc="Assembling request...")
    progress(0.4, desc="Appending citations and metadata ...")
    #some muckwork to log sources
    citations = _format_citations(docs)

    progress(0.5, desc="Talking to LLM for answers ...")
    #calls llm with the query and relevant docs in hand and returns both the response and the sources
    librarianoutput = str(chain.run(input_documents=docs, question=query))
    output = "Answer: \n" + librarianoutput + "\n\nI found this in: \n" + citations
    return output

#Gradio UI
# Build the Gradio interface. Components are declared in the order they should
# appear, so statement order inside this block matters.
with gr.Blocks() as app:
    
    # Header row: title plus the button that triggers a re-scan.
    with gr.Row():
        gr.Markdown("# Welcome to your antilibrary!")
        scan_btn = gr.Button("Scan the library again.")
        
    # Question input, shared response box, and the button that submits the question.
    query = gr.Textbox(label="What can I help you find?")
    output = gr.Textbox(label="Response:")
    ask_btn = gr.Button("Ask Librarian")
    
    # Both handlers write their returned string into the same `output` textbox:
    # `ask` receives the text of `query`; `scan` takes no UI inputs.
    ask_btn.click(fn=ask, inputs=query, outputs=output)
    scan_btn.click(fn=scan, outputs=output)
    gr.Markdown("*...a private library is not an ego-boosting appendage but a research tool. The library should contain as much of what you do not know ... You will accumulate more knowledge and more books as you grow older, and the growing number of unread books on the shelves will look at you menacingly. Indeed, the more you know, the larger the rows of unread books. Let us call this collection of unread books an antilibrary.* \n - Nassim Nicholas Taleb, The Black Swan")
# Enable the request queue and start the app.
# NOTE(review): `concurrency_count` presumably caps parallel workers; confirm it
# is still accepted by the installed Gradio version.
app.queue(concurrency_count=10).launch()

Created a chunk of size 1081, which is longer than the specified 1000
Using embedded DuckDB without persistence: data will be transient


Running on local URL:  http://127.0.0.1:7896

To create a public link, set `share=True` in `launch()`.


