Scientific Publications as PDFs, Text Extraction, Storage SQL database, Chunks, Embedding and Vectorization, Retrieval, GPT

In [None]:
# Not required, but can be useful if a cell needs a local proxy connection to Docker or LM Studio
'''
import os
os.environ['http_proxy']="http://localhost:1238"
os.environ['https_proxy']="http://localhost:1238"
'''

## 1<sup>st</sup> Setting Up PDF Collection and DATABASE Paths

In [None]:
import json
import os
%run assets/func_inputoutput.py
# Everything is resolved relative to the directory the notebook is started from.
cwd = os.getcwd()
print("Current working directory:", cwd)

# Name of the PDF collection folder (below ./docs) and of the SQLite file stored inside it.
pdf_collection_path = 'PET_Methods'
base_name = 'PET_Methods.db'

# Absolute locations of the PDF folder and of the database file.
pdf_path = os.path.join(cwd, "docs", pdf_collection_path)
database_name = os.path.join(cwd, "docs", pdf_collection_path, base_name)

# Bundle the values so they can be written out and read back by the helper functions.
settings = dict(
    working_directory=cwd,
    pdf_collection_path=pdf_collection_path,
    pdf_path=pdf_path,
    base_name=base_name,
    database_name=database_name,
    additional='value1',
)

# Write the settings out ...
save_settings(settings)

# ... and read them straight back; the sixth value collects whatever is not a path setting.
(
    working_directory,
    pdf_collection_path,
    pdf_path,
    base_name,
    database_name,
    remaining_settings,
) = load_settings()

# Echo what was loaded so a wrong path is spotted immediately.
print(f'\nworking_directory {working_directory}')
print(f"\nPDF Collection Path: {pdf_collection_path}")
print(f"\npdf_path: {pdf_path}")
print(f"\nbase_name: {base_name}")
print(f"\ndatabase_name: {database_name}")
print(f"\nRemaining Settings: {remaining_settings}")

In [None]:
# Re-read the path settings through load_settings(); the six values are unpacked in the same
# order as in the setup cell, so this cell alone restores the path variables.
working_directory, pdf_collection_path, pdf_path, base_name, database_name, remaining_settings = load_settings()

## Read PDF and Extract TEXT, Stores in a SQLite3 database (db): GROBID based approach

# Instructions for Running GROBID Docker
1. Download the small GROBID docker image and copy `config.json` into the Jupyter notebook working directory.
2. Run local Docker:
    - Start Docker for Windows.
    - Use CMD.
3. Execute the following command to run the docker container:
    ```bash
    docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.0
    ```
4. Check the running docker with [http://localhost:8070](http://localhost:8070) and ensure this matches the one in your `config.json`.
5. The paths to the PDF collection and the database are set above.
6. Choose to recreate the SQL (y/n) or just reload the current one.
7. Note that processing 70 PDFs can take over 10 minutes.
8. Wait for the console message: "Finished".
9. If you encounter an XML error or some "localhost" mix up, ensure you stay connected with the docker by avoiding proxy configurations elsewhere.
10. todo: add new pdfs to database


In [None]:
import os
import sqlite3
import pandas as pd
from grobid_client.grobid_client import GrobidClient
import json
import grobid_tei_xml



class GrobidAuthor:
    """Minimal author record holding only the author's full name."""
    def __init__(self, full_name):
        # Stored unchanged; extract_bibliographic_details reads it back as `.full_name`.
        self.full_name = full_name

class GrobidBiblio:
    """Plain container for the bibliographic fields of one reference.

    Every constructor argument is stored unchanged on the attribute of the same name.
    `authors` is iterated by extract_bibliographic_details, which reads `.full_name`
    from each element.
    """
    def __init__(self, index, authors, title, date, volume, pages, issue, journal, doi):
        self.index = index
        self.authors = authors
        self.title = title
        self.date = date
        self.volume = volume
        self.pages = pages
        self.issue = issue
        self.journal = journal
        self.doi = doi

def extract_bibliographic_details(biblios, print_choice=False):
    """Flatten a list of references into one '*'-separated string.

    Args:
        biblios: Iterable of reference objects exposing ``index``, ``authors``
            (iterable of objects with ``full_name``, or ``None``), ``title``,
            ``date``, ``volume``, ``pages``, ``journal`` and ``doi``.
        print_choice: When true, also print every reference field by field.

    Returns:
        One entry per reference, each ending in a literal ``\\|`` marker, joined
        with ``'*'``. An empty input yields an empty string.
    """
    entries = []
    # Single pass, so generators work too (a second pass over one would be empty).
    for biblio in biblios:
        # A reference without an author list is treated as having no authors.
        author_names = [author.full_name for author in (biblio.authors or [])]

        if print_choice:
            print("Index", biblio.index, "| Title:", biblio.title)
            print("Authors:")
            for name in author_names:
                print("-", name)

            print("Date:", biblio.date)
            print("Volume:", biblio.volume)
            print("Pages:", biblio.pages)
            print("Journal:", biblio.journal)
            print("Doi:", biblio.doi)
            print()

        # The trailing backslash is written as an explicit "\\" so the emitted text is
        # unchanged while avoiding the invalid "\|" escape sequence.
        entries.append(
            f" * Index: {biblio.index} | Title: {biblio.title} | Authors: {', '.join(author_names)} | Date: {biblio.date} | Volume: {biblio.volume} | Pages: {biblio.pages} | Journal: {biblio.journal} | Doi: {biblio.doi}\\|"
        )

    return '*'.join(entries)
    
    
# Client for the GROBID REST service; its connection details are read from config.json.
client = GrobidClient(config_path="config.json")

print("\n\ndatabase_pathname",database_name)

# GROBID service used for every PDF.
service_name = "processFulltextDocument"

# With no database file on disk there is nothing to reload, so the PDFs have to be processed.
# (Without this default `choice` stayed undefined in that case and the cell failed further down.)
choice = 'y'

# An existing database is either rebuilt from scratch ('y') or kept as it is ('n').
if os.path.exists(database_name):
    while True:
        choice = input("\nThe database file already exists. Do you want to create a new one? (y/n): ").strip().lower()
        if choice == 'y':
            # A connection left over from an earlier run of this cell must be closed first,
            # otherwise the file may still be locked when it is deleted.
            stale_conn = globals().get('conn')
            if stale_conn is not None:
                stale_conn.close()
            conn = None  # Reset conn variable

            # Delete the existing database file
            os.remove(database_name)
            break
        elif choice == 'n':
            break
        else:
            print("Invalid choice! Please enter 'y' or 'n'.")

# Connect to the SQLite database (the file is created if it does not exist).
conn = sqlite3.connect(database_name)
try:
    cursor = conn.cursor()

    # Reuse the first table found in the file; create the table only for a fresh database.
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    result = cursor.fetchone()

    if result is None:
        table_name = "your_table_name"  # Provide a table name of your choice
        cursor.execute('''CREATE TABLE {table_name}
                    (ID INTEGER PRIMARY KEY,
                    Title TEXT,
                    Authors TEXT,
                    DOI TEXT,
                    Citations INTEGER,
                    Abstract TEXT,
                    Body TEXT,
                    Refs TEXT)'''.format(table_name=table_name))
    else:
        table_name = result[0]

    # Continue numbering after the highest existing ID (start at 1 for an empty table).
    cursor.execute("SELECT MAX(ID) FROM {table_name}".format(table_name=table_name))
    max_id = cursor.fetchone()[0]
    id_key = max_id + 1 if max_id is not None else 1

    if choice == 'y':
        # Process every PDF in the collection folder configured in the settings cell
        # (`pdf_path`; the previous name `path` is not defined anywhere in this notebook).
        for filename in os.listdir(pdf_path):
            if not filename.endswith(".pdf"):
                continue

            print(f"filename: {filename}")
            file_path = os.path.join(pdf_path, filename)

            pdf_file, status, text = client.process_pdf(service_name,
                                     file_path,
                                     generateIDs=True,
                                     consolidate_header=True,
                                     consolidate_citations=True,
                                     include_raw_citations=True,
                                     include_raw_affiliations=True,
                                     tei_coordinates=True,
                                     segment_sentences=True)

            # A failed request carries no TEI XML; skip the file instead of letting the
            # XML parser abort the whole run.
            if status != 200 or not text:
                print(f"Skipping {filename}: GROBID returned status {status}")
                continue

            # Reference list of the paper, flattened into one string for the Refs column.
            grobid_biblios=grobid_tei_xml.parse_citation_list_xml(text)
            ref_list=extract_bibliographic_details(grobid_biblios, print_choice=False)

            # Header metadata, abstract and body text of the paper.
            doc = grobid_tei_xml.parse_document_xml(text)
            title = doc.header.title
            authors = ';'.join([a.full_name for a in doc.header.authors])
            doi = str(doc.header.doi)
            citations = str(len(doc.citations))
            abstract = doc.abstract
            body = doc.body

            #----------------------------------------
            print(f"id_key: {id_key}")
            # Insert the information into the database
            cursor.execute('''
                INSERT INTO {table_name} (ID, Title, Authors, DOI, Citations, Abstract, Body, Refs)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            '''.format(table_name=table_name), (id_key, title, authors, doi, citations, abstract, body, ref_list))

            # Commit after every paper so an interrupted run keeps what was already processed.
            conn.commit()
            # Increment the ID key for the next iteration
            id_key += 1

            # Retrieve the count of records from the table
            cursor.execute("SELECT COUNT(*) FROM {table_name}".format(table_name=table_name))
            count = cursor.fetchone()[0]
            print("record loop:", count)
    else:
        # choice == 'n': keep the existing database untouched.
        print("\n\nActive database_pathname", database_name)

    # Retrieve the count of records from the table
    cursor.execute("SELECT COUNT(*) FROM {table_name}".format(table_name=table_name))
    count = cursor.fetchone()[0]
    print("Total records:", count)
finally:
    # Always release the file, also when GROBID or the parser raised.
    conn.close()
print("finished")

# Check records of SQL database: 
### Can also be used alone without Docker, once SQL database was created
 * if start from here, then run settings in second cell at start 
 * database
    * base_name='sample.db'
    * database_name=os.path.join(cwd, "docs", pdf_collection_path, base_name) #database
 * Enter the record number or 'q' to quit. Afterwards, enter 'n' for the next choice or 'q' to quit.

In [None]:
import panel as pn
import sqlite3
import pandas as pd

pn.extension()

# --- Database Connection ---
try:
    # Open the SQLite file named by `database_name` and look up its tables.
    conn = sqlite3.connect(database_name)
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    if not tables:
        raise ValueError("No tables found in the database.")
    # Only the first table returned is used by the rest of this cell.
    table_name = tables[0][0]

    # --- Check if the table has any rows ---
    cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
    row_count = cursor.fetchone()[0]
    if row_count == 0:
        raise ValueError(f"Table '{table_name}' is empty.")

except Exception as e:
    # Any failure above (including the two ValueErrors) is reported and ends execution here.
    # NOTE(review): `conn` is not closed on this path.
    print(f"Database Error: {e}")
    import sys
    sys.exit(1)


# Ask whether the record viewer should be built; anything other than 'y' skips the dashboard.
user_input = input("dashboard: 'n' : 'y'")

if user_input.lower() == 'y':

    # --- Record Selection ---
    record_ids_select = pn.widgets.Select(
        name='Record ID',
        sizing_mode='stretch_width',
        styles={
            'background-color': 'white',  # White background
            'color': 'black',  # Black text
            'border-color': '#555'
        }
    )

    def populate_record_ids():
        """Fill the dropdown with one entry per table row, labelled with a short body preview."""
        try:
            cursor.execute(f"SELECT rowid, body FROM {table_name}")
            records = cursor.fetchall()
            if records:
                # Create options with a truncated preview and full record ID
                record_options = {}
                for row_id, body in records:
                    # A NULL body (paper without extracted text) is shown as an empty preview
                    # instead of raising and leaving the whole dropdown unpopulated.
                    body_text = "" if body is None else str(body)
                    preview = body_text[:100] + '...' if len(body_text) > 100 else body_text
                    record_options[f"ID {row_id}: {preview}"] = row_id

                record_ids_select.options = record_options
                # Set initial value to the first record
                record_ids_select.value = list(record_options.values())[0]
            else:
                print("No records found.")
                record_ids_select.options = []
                output_area.value = "No records found in the table."

        except Exception as e:
            # Show the problem in the dashboard as well as on the console.
            print(f"Error populating record IDs: {e}")
            output_area.value = f"Error: {e}"

    # --- Output Area ---
    # NOTE(review): the background was switched to white while the text colour stayed light
    # grey ('#e0e0e0'); confirm the text is still readable.
    output_area = pn.widgets.TextAreaInput(
        value="",
        sizing_mode='stretch_width',  # Stretch to full width
        height=600,  # Increased height to 600 pixels
        styles={
            'background-color': 'white', #'#1a1a1a',  # Very dark background
            'color': '#e0e0e0',  # Light gray text for better readability
            'font-family': 'monospace',
            'font-size': '14px',
            'border-color': '#555',
            'padding': '10px'
        },
        disabled=True
    )

    @pn.depends(record_ids_select.param.value, watch=True)
    def update_output(record_id):
        """Show the body of the selected record; runs whenever the dropdown value changes."""
        if record_id is None:
            output_area.value = "No record selected or table is empty."
            return

        try:
            cursor.execute(f"SELECT body FROM {table_name} WHERE rowid = ?", (record_id,))
            record_body = cursor.fetchone()

            if record_body:
                # Ensure text is fully visible
                output_area.value = str(record_body[0])
            else:
                output_area.value = f"Record with ID {record_id} not found."

        except Exception as e:
            output_area.value = f"Error retrieving record: {e}"

    # Initial population and update
    populate_record_ids()
    update_output(record_ids_select.value)

    # --- Dashboard Layout ---
    dashboard_layout = pn.Column(
        pn.pane.Markdown(
            "## Record Viewer",
            styles={'color': 'white', 'background-color': '#222'}
        ),
        record_ids_select,
        output_area,
        sizing_mode='stretch_width',
        styles={
            'background-color': '#222',  # Dark background for the entire layout
        }
    )

    # Add global dark theme CSS
    pn.config.raw_css.append("""
        body {
            background-color: #222 !important;
            color: #e0e0e0 !important;
        }
        .bk-root {
            background-color: #222;
        }
        .bk-input-container {
            background-color: #333 !important;
        }
    """)

    # Display the dashboard. The callbacks above keep using `cursor`, so the database
    # connection is intentionally left open here.
    dashboard = pn.panel(dashboard_layout)
    dashboard.show()
    
# Close the connection


# Running LM Studio and Creating or Loading Vector Database with or without previous steps
based on: Chroma version: 0.5.0 | Langchain version: 0.2.3

1. Start LM Studio 0.2.27.
2. Activate the Local Server Tab (" <-> ").
3. Select a model, e.g., "Meta-Llama-3-8B-Instruct-Q8_0.gguf".
    - Choose the preset "Llama3".
4. Choose a server port that does not conflict with the docker port.
5. Load the "Embedding Model", e.g., "nomic embed text v1 5 Q8_0.gguf".
    - Models were previously downloaded manually from Huggingface/Models/.
6. Press "Start Server".
7. Define if required the following paths in line 80-81:
    ```python
    
    user_path = os.path.join(os.getcwd(), "docs", pdf_collection_path, "sample")
    temp_dir = os.path.join(os.getcwd(), "docs", "temp")
    
    ```
8. Run the cell:
    - If there is an error indicating the base is in use, ensure the path is correct and start from the cell "Check SQL Database" after restarting the kernel.
9. If there is a connection error to LM Studio, remove any other open proxy settings (including the proxy settings under "Settings" in the browser), restart the kernel, and start from "Check SQL Database"; alternatively, disable warnings and proxy settings in line 130.
10. After `embed_and_chunk` has added the metadata and the vector database has been created, set `mode` in line 186 to `load` or `append`.
11. Also choose `new_vectordb = False` or `True`.

In [1]:
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from openai import OpenAI
from typing import List
import os
import shutil
import tempfile
import time
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Import here
import sqlite3

%run assets/func_inputoutput.py

class CustomEmbedding2:
    """Embedding adapter that delegates to an OpenAI-style client.

    The client is injected by the caller. The vectors produced by the most recent
    ``embed_documents`` / ``embed_query`` call are kept on ``self.embeddings``.
    """

    def __init__(self, client):
        self.embeddings = []
        self.client = client

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed each text in order and return the list of vectors."""
        self.embeddings = list(map(self.get_embedding, texts))
        return self.embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string and return its vector."""
        vector = self.get_embedding(text)
        self.embeddings = [vector]
        return vector

    def get_embedding(self, text, model="TheBloke/nomic-embed-text"):
        """Request the embedding of ``text`` (newlines turned into spaces) from the client."""
        flattened = " ".join(text.split("\n"))
        response = self.client.embeddings.create(input=[flattened], model=model)
        return response.data[0].embedding



def embed_and_chunk2(docs, embedding, chunk_size=2000, chunk_overlap=200):
    """Split the Documents into chunks and embed every chunk.

    Returns a tuple ``(chunks, vectors_by_index)`` where the dictionary maps the
    position of a chunk in ``chunks`` to the embedding of that chunk.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(docs)

    # One embedding call for all chunk texts, in chunk order.
    chunk_texts = [chunk.page_content for chunk in chunks]
    vectors = embedding.embed_documents(chunk_texts)

    return chunks, dict(enumerate(vectors))

def embed_and_chunk(docs, embedding, chunk_size=2000, chunk_overlap=200):
    """Split the Documents into chunks, embed them and tag each chunk with IDs.

    Every chunk's metadata receives ``doc_id`` (copied from its ``ID`` entry) and
    ``chunk_id`` (the chunk's position in the returned list).

    Returns a tuple ``(chunks, vectors_by_index)`` where the dictionary maps the
    position of a chunk to its embedding.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(docs)

    # One embedding call for all chunk texts, in chunk order.
    chunk_texts = [chunk.page_content for chunk in chunks]
    vectors = embedding.embed_documents(chunk_texts)
    vectors_by_index = dict(enumerate(vectors))

    # Record which source document a chunk came from and where it sits in the list.
    for position, chunk in enumerate(chunks):
        chunk.metadata.update(doc_id=chunk.metadata['ID'], chunk_id=position)

    return chunks, vectors_by_index

def create_vectordb(texts, embedding, user_path=None, persist_directory=None, mode="create", force_overwrite=True):
    """Create, load, or append to a Chroma vector database.

    Args:
        texts: Document chunks to store (used by 'create' and 'append').
        embedding: Embedding object handed to Chroma.
        user_path: Folder in which a 'chroma_db' directory is used when
            ``persist_directory`` is not given.
        persist_directory: Explicit database directory; takes precedence over ``user_path``.
        mode: 'create', 'load' or 'append'.
        force_overwrite: In 'create' mode, whether a non-empty directory may be deleted.

    Returns:
        Tuple ``(vectordb, persist_directory)``.

    Raises:
        ValueError: Unknown mode, or neither directory argument was supplied.
        FileExistsError: 'create' on a non-empty directory with ``force_overwrite=False``.
        FileNotFoundError: 'load' / 'append' on a directory that does not exist.
    """
    cwd = os.getcwd()

    if mode not in ("create", "load", "append"):
        raise ValueError(f"Invalid mode: {mode}. Choose 'create', 'load', or 'append'.")

    # Work out where the database lives; an explicit persist_directory always wins.
    if persist_directory is None:
        if user_path:
            persist_directory = os.path.join(cwd, user_path, 'chroma_db')
        elif mode == "create":
            persist_directory = os.path.join(cwd,'chroma_db')
            raise ValueError(f"persist_directory or user_path must be provided when creating, now used {persist_directory}")
        elif mode == "load":
            raise ValueError("persist_directory or user_path must be provided when loading")
        else:
            raise ValueError("persist_directory or user_path must be provided when appending")

    if mode == "create":
        if os.path.exists(persist_directory):
            has_content = bool(os.listdir(persist_directory))
            if has_content and not force_overwrite:
                raise FileExistsError(f"Vector database directory '{persist_directory}' is not empty. Use mode='append' or set force_overwrite=True to overwrite.")
            # Empty directories are removed as well so Chroma starts from a clean location.
            shutil.rmtree(persist_directory)

        print("Creating a new vector database...")
        started = time.time()
        vectordb = Chroma.from_documents(texts, embedding, persist_directory=persist_directory)
        elapsed = time.time() - started
        print(f"New vector database created in {elapsed:.2f} seconds. Directory: {persist_directory}")
        return vectordb, persist_directory

    # 'load' and 'append' both need a database that is already on disk.
    if not os.path.exists(persist_directory):
        raise FileNotFoundError(f"Vector database not found: {persist_directory}")

    if mode == "load":
        print(f"Loading vector database from {persist_directory}...")
        started = time.time()
        vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
        elapsed = time.time() - started
        print(f"Vector database loaded in {elapsed:.2f} seconds.")
    else:
        print(f"Appending to vector database at {persist_directory}...")
        started = time.time()
        vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
        vectordb.add_documents(texts)
        elapsed = time.time() - started
        print(f"Documents appended in {elapsed:.2f} seconds.")

    return vectordb, persist_directory

import os
import urllib3

# Exclude the local servers from any configured proxy and silence urllib3 warnings.
os.environ['NO_PROXY'] = 'localhost,127.0.0.1'
urllib3.disable_warnings()
# Replaces urllib3's dropped-connection check with a function that always answers False.
urllib3.util.connection.is_connection_dropped = lambda conn: False

# Configure requests to ignore proxies
import requests
# NOTE(review): this sets trust_env on a Session object that is created here and not kept,
# so no session used later is affected by this line.
requests.Session().trust_env = False

# Path settings saved by the setup cell; the sixth return value is not needed here.
working_directory, pdf_collection_path, pdf_path, base_name, database_name,_ = load_settings()

# Candidate locations for the vector database.
user_path = os.path.join(os.getcwd(), "docs", pdf_collection_path, "sample")
temp_dir = os.path.join(os.getcwd(), "docs", "temp")

# Chunking parameters passed to embed_and_chunk below.
desired_chunk_size = 2000
chunk_overlap = 450

# An explicit persist_directory is passed to create_vectordb below, which only falls back to
# user_path when persist_directory is None.
persist_directory=temp_dir

client = OpenAI(base_url="http://localhost:1238/v1", api_key="lm-studio") # Important: Initialize client before embedding
embedding = CustomEmbedding2(client=client)


# Database connection and document loading:
conn = sqlite3.connect(database_name)
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
if not tables:
    raise ValueError("No tables found in the database.")
# Only the first table returned is read.
table_name = tables[0][0]

cursor.execute(f"SELECT ID, Title, Abstract, Body FROM {table_name}")  # Simplified query (removed Authors, DOI if not needed)
#cursor.execute("SELECT ID, Title, Authors, DOI, Abstract, Body FROM {table_name}".format(table_name=table_name))
results = cursor.fetchall()

num_original_docs = 0  # Initialize counter
docs = []  # List to store the Document objects *before* chunking

for result in results:
    record_id, title, abstract, body = result
    
    #print(f"Record_id {record_id} | title {title}")
    num_original_docs += 1  # Increment counter
    # NULL columns become empty strings so the concatenation below cannot fail.
    if abstract is None:
        abstract = ""
    if body is None:
        body = ""

    # One Document per paper: abstract and body joined, with the record ID and title as metadata.
    combined_text = abstract + " " + body
    doc = Document(page_content=combined_text, metadata={'ID': record_id, 'Title': title})
    docs.append(doc)  # Add the whole document
print(f"\n Finished reading and adding Metadata to database {database_name} records ")

conn.close()
#---------------------------------------------------------------------------------

# Mode handed to create_vectordb: "create", "load" or "append".
mode="load"
force_overwrite=True


# NOTE(review): chunking and embedding run for every mode, including "load".
print(f'\ncreates texts, embedding_dict and verctordb if mode is "load": Current Mode: {mode}')
texts, embedding_dict = embed_and_chunk(docs, embedding, chunk_size=desired_chunk_size, chunk_overlap=chunk_overlap)


print(f'mode {mode} | force_overwrite {force_overwrite}')
vectordb, _ = create_vectordb(texts, embedding, user_path=user_path, persist_directory=persist_directory, mode=mode, force_overwrite=force_overwrite)  # Pass force_overwrite

#vectordb.persist()  # Persist the database and release locks
print(f"Number of original documents: {num_original_docs} | chunks {vectordb._collection.count()}")  # Now you have the original count


print("\nfinished", persist_directory, "\n chunks: ", len(vectordb))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thomas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Settings loaded from settings.json.

path and name settings from above were loaded

  There are additional settings that could be added:
  additional: value1

 Finished reading and adding Metadata to database e:\users\behnisch\python\envs\GPT\openai-quickstart-node-master\PDF_screen\Langchain\LangChain-Chat-with-Your-Data-main\docs\PET_Methods\PET_Methods.db records 

creates texts, embedding_dict and verctordb if mode is "load": Current Mode: load
mode load | force_overwrite True
Loading vector database from e:\users\behnisch\python\envs\GPT\openai-quickstart-node-master\PDF_screen\Langchain\LangChain-Chat-with-Your-Data-main\docs\temp...
Vector database loaded in 1.22 seconds.
Number of original documents: 17 | chunks 332

finished e:\users\behnisch\python\envs\GPT\openai-quickstart-node-master\PDF_screen\Langchain\LangChain-Chat-with-Your-Data-main\docs\temp 
 chunks:  332


In [2]:
# Optional check: set skip=False to print the stored chunks that belong to one document.
skip=True
if not skip:
    # To retrieve and display chunks for a specific document ID:
    doc_id_to_retrieve = 14  # Example: retrieve chunks for the document with ID 14

    # Method 1: Filter chunks after retrieval (simpler if your vector database doesn't support metadata filtering during search)
    # NOTE(review): `chunks` is not used further because the filter line below is commented out.
    chunks = vectordb.similarity_search("your query", k=100) # retrieve a larger number of chunks than needed
    #relevant_chunks = [c for c in chunks if c.metadata.get('doc_id') == doc_id_to_retrieve]


    # Method 2: Filter during retrieval (if your vector database supports it – more efficient):
    relevant_chunks = vectordb.similarity_search("your query", k=100, filter={"doc_id": doc_id_to_retrieve})  # This depends on vectordb capabilities


    # Print the IDs and the text of every matching chunk.
    for chunk in relevant_chunks:
        print(f"Chunk ID: {chunk.metadata['chunk_id']}")
        print(f"Document ID: {chunk.metadata['doc_id']}")
        print(chunk.page_content)
        print("-" * 30)

# Required to Run: Alternative Retrieval

In [3]:
#from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import OpenAI

# `OpenAI` is the langchain_openai class imported in this cell, pointed at the local server.
llm  = OpenAI(base_url="http://localhost:1238/v1", api_key="lm-studio")
#llm = ChatOpenAI(temperature=0)

# Wrap the vector store's default retriever in a MultiQueryRetriever driven by `llm`.
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectordb.as_retriever(), 
    llm=llm
)
# Set logging for the queries
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

#unique_docs = retriever_from_llm.get_relevant_documents(query=question)

# PDF retrieval that keeps metadata as citations alongside the text contents
# PANEL DASHBOARD
## 1st Based on  client.chat.completions.create  : 
    * Browser TAB
    * Prompt Input
    * Query
    * Response
    * Whole message sent to Model
## Choose Retrieval: Standard or Multi (LangChain): line 42
    * retriever_from_llm

In [7]:
# this change original is above seems to wrok with correct template
import panel as pn
from openai import OpenAI
from langchain_community.vectorstores import Chroma
#from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
import re
import time
%run assets/func_inputoutput.py
# Exclude the local servers from any configured proxy and silence urllib3 warnings.
# NOTE(review): `os` and `urllib3` are not among this cell's visible imports; presumably they
# come from an earlier cell — confirm before running this cell on a fresh kernel.
os.environ['NO_PROXY'] = 'localhost,127.0.0.1'
urllib3.disable_warnings()
# Replaces urllib3's dropped-connection check with a function that always answers False.
urllib3.util.connection.is_connection_dropped = lambda conn: False

# Configure requests to ignore proxies
import requests
# NOTE(review): this sets trust_env on a Session object that is created here and not kept,
# so no session used later is affected by this line.
requests.Session().trust_env = False

# Prompt variant t1: plain answer from history, query and retrieved documents.
# Not referenced by the PromptTemplate built from `template` in this cell.
template1 = """
Use the following retrieved documents and the previous conversation to answer the query. Incorporate your own knowledge and reasoning as an AI assistant:
Previous Conversation: {history}
Query: {query}
Retrieved Documents: {retrieved_docs}
Answer:
{answer}
"""

# Prompt variant t2: as t1, but asks for document and chunk numbers in brackets.
# Not referenced by the PromptTemplate built from `template` in this cell.
template2 = """
Use the following Retrieved Documents and the previous conversation to answer the query. Incorporate your own knowledge and reasoning as an AI assistant.  Keep the content together with document and chunk numbers at the end of sentences in brackets. If content from different documents is combined into a new sentence, place the document and chunk numbers at the end of the sentence, separated by semicolons.
Previous Conversation: {history}
Query: {query}
Retrieved Documents: {retrieved_docs}
Answer:
{answer}
"""

# Active prompt: strict citation instructions. It has placeholders for history, query and
# retrieved_docs only (no {answer} placeholder).
template = """
CRITICAL CITATION INSTRUCTIONS:
1. MANDATORY: Cite the EXACT Document and Chunk number for EVERY piece of information retrieved from Retrieved Documents
2. Format citations EXACTLY like this: 
   - Single source: "(Document Number, Chunk Number)"
   - Multiple sources: "(Document Number, Chunk Number; Document Number, Chunk Number)"
3. Place citations IMMEDIATELY after the relevant information, inside parentheses
4. NEVER omit citations
5. If rephrasing content, still include original source citation

Use the following Retrieved Documents and the previous conversation to answer the query:

Previous Conversation: {history}
Query: {query}
Retrieved Documents: {retrieved_docs}

RESPONSE GUIDELINES:
- Create a cohesive, well-structured paragraph
- Ensure EVERY fact is explicitly cited
- Maintain scientific rigor and precision
- Include citations WITHOUT FAIL

Answer:
"""
# Create the prompt template
# NOTE(review): "answer" is listed as an input variable although `template` contains no
# {answer} placeholder — confirm whether it is still needed.
prompt_template = PromptTemplate(
    input_variables=["history", "query", "retrieved_docs", "answer"],  # Removed 'context'
    template=template
)


# Conversation history shared with handle_input2, which appends the assistant replies.
history = []

# Point to the local OpenAI server
client = OpenAI(base_url="http://localhost:1238/v1", api_key="lm-studio")


# 'Multi' selects retriever_from_llm; any other value uses the vector store's default retriever.
which_retriever='Standard'
#which_retriever='Standard'

if which_retriever =='Multi':
    retriever=retriever_from_llm
else:
    # Set up the vector retriever
    retriever = vectordb.as_retriever()




    

# Define a function to handle the user's input
def handle_input2(vectordb, embedding, llm, retriever): # Pass as arguments
    """Answer the current query with retrieval-augmented chat and update the dashboard panes.

    On the first run a fixed greeting query is used; afterwards the query is read from
    `input_box`. Documents are retrieved through a `DocumentRetriever` built on `vectordb`,
    the prompt is rendered from `prompt_template`, and the streamed reply from `client` is
    shown in `output_box` and appended to the module-level `history`.

    Args:
        vectordb: Vector store handed to `DocumentRetriever`.
        embedding: Not used inside this function (kept for the existing call sites).
        llm: Not used inside this function (kept for the existing call sites).
        retriever: Not used inside this function; retrieval always goes through
            `DocumentRetriever` (kept for the existing call sites).

    Side effects:
        Updates `output_box`, `output_box2`, `output_box3`, `output_box_retrievalTime`,
        `output_box_responseTime`, clears `input_box`, appends to `history` and sets the
        global `is_first_run` to False.
    """
    global is_first_run  # Access global variables
    start_time = time.time()

    if is_first_run:
        query = "Hello, introduce yourself to someone opening this program for the first time. Be concise. Do not add citations in this message."
        output_box_retrievalTime.object = f"Processing Retrieval {is_first_run}"
    else:
        # Show a status text in the response-time pane while the model is working
        # (previously the text was assigned to an unused local variable and never shown).
        output_box_responseTime.object = "Waiting for GPT"
        query = input_box.value.strip()

    if query:
        managed_history = manage_conversation_history(history)

        selected_method ='combined' #'keywords' #llm, combined

        # Separate local name so the `retriever` argument is no longer overwritten.
        doc_retriever = DocumentRetriever(vectordb)
        retrieved_docs, used_query, method_retquery = doc_retriever.retrieve_documents(query, is_first_run, k=4, method=selected_method)

        output_box3.object = f"**Used Retrieval Query:**  \n {used_query}\n"

        prompt = prompt_template.format(
            history=managed_history,
            query=query,
            retrieved_docs=retrieved_docs,
            answer=""
        )
        output_box2.object = prompt

        # Time spent on retrieval and prompt construction.
        elapsed_timere = time.time() - start_time

        retrieved_tokens=word_count(retrieved_docs)
        output_box_retrievalTime.object = f"Time for Retrieval with: {elapsed_timere:.2f} seconds | retrieved_docs_length {retrieved_tokens} | selected_method {selected_method}\n | {method_retquery}"

        # Conversation so far, then the system instructions, then the rendered prompt.
        messages = managed_history + [{
            "role": "system", 
            "content": """You are a precise scientific document analysis AI. 
            ABSOLUTE REQUIREMENTS:
            - Cite sources for EVERY piece of information recieved from Retrieved_Docs
            - Use EXACT Document and Chunk citation Numbers indicated in front of each text paragraphs in Retrieved_Docs
            - Never generate unsourced claims
            - If taken a citation from text like (Lee et al.,) list at the end, but do not invent title or other data
            - Be scientifically accurate and rigorous and write with excelent transfer sentences in the Sytle of Eric Kandel"""},
            {"role": "user", "content": prompt}
            ]

        completion = client.chat.completions.create(
            model="LLMA/Meta-Llama-3-8B",  # Correct model name/path here!
            messages=messages,
            temperature=0.7,
            stream=True,
        )

        # Collect the streamed pieces; a chunk without choices or without text is skipped.
        response_parts = []
        for chunk in completion:
            if not chunk.choices:
                continue
            piece = chunk.choices[0].delta.content
            if piece:
                response_parts.append(piece)
        full_response = "".join(response_parts)
        new_message = {"role": "assistant", "content": full_response}

        # Combine the query and the model response in the displayed output
        final_output = f"Final Answer:\n{full_response}"  # Adjust formatting as needed
        formatted_text = f"**Query:**  \n{query}\n\n**Final Answer (with Summaries):**  \n{final_output}"
        output_box.object = formatted_text  # Update output box

        # Sizes are measured before the new reply is added to the history.
        message_tokens=word_count(messages)
        history_tokens=word_count(managed_history)

        history.append(new_message)

        input_box.value = ""
        elapsed_time = time.time() - start_time
        output_box_responseTime.object = f"Time taken for Model: {elapsed_time:.2f} seconds | message_length: {message_tokens} | history {history_tokens}"

    is_first_run = False


# Define a function to handle user input changes
def handle_input_change(event):
    """Watcher-style callback that re-runs the chat handler.

    `event` is accepted so the function can be registered with
    `param.watch`, but it is not used. The module-level `vectordb`,
    `embedding`, `llm` and `retriever` objects are forwarded unchanged to
    `handle_input2`.
    """
    handle_input2(vectordb, embedding, llm, retriever)

# Apply debounce to input handling
# NOTE(review): the previous inline comment claimed a "500 ms delay" while the
# code passes wait=0.1 - confirm which unit `debounce` expects and which delay
# is actually intended.
@debounce(wait=0.1)
def handle_input_debounced(event):
    """Debounced variant of the input callback.

    Forwards the module-level `vectordb`, `embedding`, `llm` and `retriever`
    to `handle_input2`; `event` is not used.
    """
    handle_input2(vectordb, embedding, llm, retriever)

    
# Load the Panel extension and set notebook-wide defaults
pn.extension(
    design='bootstrap',
    theme='default', #'dark'
    sizing_mode='stretch_both'  # Default sizing: stretch in both width and height
)

# Query input: a multi-line text area whose `value` parameter is watched
# further down to trigger the chat handler.
input_box = pn.widgets.TextAreaInput(
    width=None, 
    height=300, 
    placeholder="Enter your query here...",
    styles={
        'color': '#333333',
        'background-color': '#ffffff',
        'border': '1px solid #cccccc',
        'border-radius': '6px',
        'padding': '10px',
        'font-family': 'Arial, sans-serif',
        'font-size': '16px',
        'line-height': '1.5',  # Consistent line height
        'overflow-y': 'auto',
        'margin-top': '20px',
        'box-sizing': 'border-box',  # Include padding in width calculation
        'width': '100%', # Ensure the box stretches to the full width of its container
        'resize': 'none',  # Prevent manual resizing
        'word-wrap': 'break-word',
        'word-break': 'break-word',
        'white-space': 'pre-wrap'
    }
)

# Loading spinner shown next to the title; created with value=False and
# not switched on anywhere in the visible part of this cell.
loading_indicator = pn.indicators.LoadingSpinner(
    value=False, 
    width=20, 
    height=20,
    styles={
        'margin-left': '10px',
        'color': '#007bff'  # Bootstrap primary color
    }
)

# Style template shared by the two Markdown panes below. Only the font size
# differs between them, so each pane derives its own dict from this one.
_markdown_box_styles = {
    'width': '100%',
    'color': '#212529',
    'background-color': '#f0f2f4',
    'border': '1px solid #dee2e6',
    'border-radius': '6px',
    'font-size': '14px',
    'font-weight': '400',
    'font-family': 'Arial, sans-serif',
    'line-height': '1.6',
    'padding': '10px',
    'box-sizing': 'border-box',

    # Text wrapping properties
    'word-wrap': 'break-word',
    'word-break': 'break-word',
    'overflow-wrap': 'break-word',
    'white-space': 'normal',
    'text-align': 'left',
    'hyphens': 'auto',
    'overflow-y': 'auto',

    'display': 'block',
    'max-width': '100%',
}

# Markdown pane that receives the formatted query and final answer.
output_box = pn.pane.Markdown(
    width=None,
    height=300,
    styles=dict(_markdown_box_styles),
)

# Shorter HTML pane with the same colours but no text-wrapping rules.
output_box3 = pn.pane.HTML(
    width=None,
    height=100,
    styles={
        'color': '#212529',
        'background-color': '#f0f2f4',
        'border': '1px solid #dee2e6',
        'border-radius': '6px',
        'font-size': '18px',
        'font-weight': '400',
        'font-family': 'Arial, sans-serif',
        'line-height': '1.6',
        'padding': '10px',
        'box-sizing': 'border-box',
        'overflow-y': 'auto',
        'max-width': '100%',
    },
)

# Second Markdown pane: same styling as output_box with a larger font.
output_box2 = pn.pane.Markdown(
    width=None,
    height=300,
    styles={**_markdown_box_styles, 'font-size': '16px'},
)

# Subtle single-line styling shared by both timing panes: light gray
# background, thin border, and overflowing text cut off with an ellipsis.
_timing_box_styles = {
    'color': '#666666',
    'background-color': '#e9ecef',
    'padding': '4px 8px',
    'border-radius': '4px',
    'font-size': '12px',
    'width': '100%',
    'overflow': 'hidden',
    'text-overflow': 'ellipsis',
    'white-space': 'nowrap',
    'border': '1px solid #ced4da',
    'font-family': 'Arial, sans-serif',
}

# Each pane gets its own copy so the two style dicts stay independent.
output_box_retrievalTime = pn.pane.Markdown(styles=dict(_timing_box_styles))

output_box_responseTime = pn.pane.Markdown(styles=dict(_timing_box_styles))

# Re-run the chat handler every time the text area's value changes.
input_box.param.watch(handle_input_change, 'value')
# Debounced alternative: input_box.param.watch(handle_input_debounced, 'value')

# Trigger the handler once at start-up with the first-run flag raised, then
# lower the flag again.
is_first_run = True
handle_input2(vectordb, embedding, llm, retriever)
is_first_run = False


# Create the dashboard layout with improved responsiveness.
# Rows, top to bottom: title + spinner, input/answer, timing panes,
# the HTML pane (output_box3) and the second Markdown pane (output_box2).
dashboard_layout = pn.Column(
    # Header row: title and loading spinner
    pn.Row(
        pn.pane.Markdown(
            "# ChatGPT-like Conversation: RAG",
            styles={
                'color': '#2c3e50',
                'text-align': 'center',
                'font-weight': 'bold',
                'background-color': '#ecf0f1',
                'padding': '5px',  # Reduced padding
                'border-radius': '5px',
                'width': '100%',
                'margin-bottom': '5px',  # Reduced margin
                'line-height': '1.2',  # Tighter line height
                'font-size': '20px'  # Adjust font size if needed
            }
        ),
        loading_indicator,
        sizing_mode='stretch_width',
        align='center',
        height=50  # Set a fixed, smaller height
    ),

    # Input / answer row: flex weights 1 and 2 split the width 1/3 : 2/3
    pn.Row(
        pn.Column(
            input_box,
            width_policy='max',
            sizing_mode='stretch_width',
            styles={
                'flex': '1'  # 1/3 of the row
            }
        ),
        pn.Column(
            output_box,
            width_policy='max',
            sizing_mode='stretch_width',
            styles={
                'flex': '2'  # 2/3 of the row
            }
        ),
        sizing_mode='stretch_width',
        styles={
            'gap': '2px',
            'column-gap': '10px',
            'width': '100%'  # Ensure full width of the row
        }
    ),

    # Compact Time Output Row
    pn.Row(
        pn.Column(
            output_box_retrievalTime,
            width_policy='max',
            sizing_mode='stretch_width'
        ),
        pn.Column(
            output_box_responseTime,
            width_policy='max',
            sizing_mode='stretch_width'
        ),
        sizing_mode='stretch_width',
        styles={
            'gap': '10px',  # Add gap between columns
            'overflow': 'hidden',
            # 'max-height' used to be listed twice ('40px', then '80px'); in a
            # dict literal the later entry wins, so only the effective value
            # is kept and the contradictory 40px (< min-height) is gone.
            'min-height': '60px',
            'max-height': '80px',
            'height': '60px',
        }
    ),
    # Additional Output Row (HTML pane)
    pn.Row(
        output_box3,
        sizing_mode='stretch_width'
    ),
    # Additional Output Row (second Markdown pane)
    pn.Row(
        output_box2,
        sizing_mode='stretch_width'
    ),

    # Global Column Styles
    styles={
        'background-color': '#f1f3f5',
        'width': '100%',
        'max-width': '100%',
        'padding': '10px',
        'gap': '10px'  # Add vertical gap between rows
    },
    sizing_mode='stretch_width'
)



# Wrap the layout in a Panel object that is allowed to stretch in both
# directions.
dashboard = pn.panel(
    dashboard_layout, 
    sizing_mode='stretch_both'  # Ensure dashboard itself stretches
)

# Serve the dashboard; the cell output below shows the local server URL
# that show() launches.
dashboard.show()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thomas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Launching server at http://localhost:51059


<panel.io.server.Server at 0x2435fbe6b80>

# Stop: Next, a similar but more general chat

## 2nd Based on  client.chat.completions.create  : 
    * Browser TAB
    * Prompt Input
    * Query
    * Response
    * Whole message sent to Model
## Choose Retrieval: Standard or Multi (LangChain): line 32
 * Overall, the responses are quite slow because of the retrieval of useful text snippets

In [None]:
#2nd
import panel as pn
from openai import OpenAI
from langchain_community.vectorstores import Chroma
#from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
import re
import time


# Define the template for the prompt.
# Placeholders: {history} (previous conversation), {context} (text retrieved
# from the vector database), {query} (the user's question) and {answer}
# (formatted with an empty string by handle_input2 so the model continues
# after "Answer:").
template = """
Use the following context from the vector database and the previous conversation to answer the query. Incorporate your own knowledge and reasoning as an AI assistant:
Previous Conversation: {history}
Context: {context}
Query: {query}
Answer:
{answer}
"""



# Create the prompt template; input_variables must list every placeholder
# used in `template`.
prompt_template = PromptTemplate(
    input_variables=["history", "context", "query", "answer"],
    template=template
)



# Assistant replies accumulated across turns; sent along with every request.
history = []

# OpenAI-compatible client aimed at the local server.
client = OpenAI(base_url="http://localhost:1238/v1", api_key="lm-studio")

# Retrieval strategy switch: 'Multi' uses the multi-query retriever built
# earlier (retriever_from_llm); anything else falls back to the plain vector
# store retriever.
which_retriever = 'Multi'
#which_retriever='Standard'

retriever = retriever_from_llm if which_retriever == 'Multi' else vectordb.as_retriever()

    
# Define a function to handle the user's input
def handle_input2():
    """Answer the current query with retrieved context and the local LLM.

    On the first call (module-level ``is_first_run`` is true) a fixed
    greeting request is sent without any retrieval. On later calls the text
    of ``input_box`` is the query: matching documents are fetched through
    ``retriever``, merged into ``prompt_template`` together with ``history``,
    and the streamed completion is shown in ``output_box``. The full prompt
    is mirrored into ``output_box2`` and the retrieval / model timings into
    the two timing panes.

    Works entirely on module-level state (widgets, ``history``, ``client``,
    ``retriever``, ``prompt_template``); takes no arguments and returns
    ``None``.
    """
    global is_first_run
    wrap_width = 100  # width (in ch units) of the <pre> block shown in output_box

    if is_first_run:
        query = "Hello, introduce yourself to someone opening this program for the first time. Be concise"
        is_first_run = False
        some_context = ""
    else:
        query = input_box.value.strip()
        if not query:
            # Clearing the input box at the end of a run fires the watcher
            # again with an empty value; bail out before the (slow)
            # retrieval instead of searching for an empty query.
            return

        output_box_retrievalTime.object = f"Processing Retrieval {which_retriever}"
        # Update the response-time pane itself; this used to assign the text
        # to an unused local variable, so the status never appeared.
        output_box_responseTime.object = "Waiting for GPT"

        start_time = time.time()
        search_results = retriever.get_relevant_documents(query)
        #search_results = retriever_from_llm.get_relevant_documents(query)
        # A missing or None title must not abort the chat turn, so fall back
        # to an empty title for such documents.
        some_context = "".join(
            result.page_content
            + "'Title-- '"
            + (result.metadata.get('Title') or "")
            + "--EndTitle | \n\n"
            for result in search_results
        )
        elapsed_time = time.time() - start_time
        output_box_retrievalTime.object = f"Time taken for Retrieval with: {elapsed_time:.2f} seconds"

    # Generate the prompt using the template and input values
    prompt = prompt_template.format(history=history, context=some_context, query=query, answer="")
    output_box2.object = prompt

    start_time = time.time()
    completion = client.chat.completions.create(
        model="LLMA/Meta-Llama-3-8B",
        messages=history + [{"role": "user", "content": prompt}],
        temperature=0.7,
        stream=True,
    )

    # Collect the streamed pieces; chunks without content are skipped.
    pieces = []
    for chunk in completion:
        piece = chunk.choices[0].delta.content
        if piece:
            pieces.append(piece)
    full_response = "".join(pieces)

    formatted_text = f"<pre style='white-space: pre-wrap; width: {wrap_width}ch; font-family: Arial, sans-serif; font-size: 12px;'>**Query:**\n{query}\n\n**Final Answer:**\n{full_response}</pre>"
    output_box.object = formatted_text

    history.append({"role": "assistant", "content": full_response})

    # Resetting the value re-triggers the watcher; the empty-query guard
    # above turns that nested call into a no-op.
    input_box.value = ""

    elapsed_time = time.time() - start_time
    output_box_responseTime.object = f"Time taken for Model: {elapsed_time:.2f} seconds"


# Define a function to handle user input changes
def handle_input_change(event):
    """Watcher callback for the input box: run `handle_input2`.

    `event` is supplied by `param.watch` and is not used.
    """
    handle_input2()

# Load the Panel extension
pn.extension()

# Styles are named first so the widget constructors below stay short.
_input_styles = {'overflow-y': 'scroll', 'text-align': 'center'}

_answer_styles = {
    'overflow-y': 'scroll',
    'color': 'black',                   # Text color
    'background-color': '#f0f0f0',      # Light gray background
    'font-size': '14px',
    'font-family': 'Courier New',
    'padding': '5px',                   # Padding inside the pane
    'border': '1px solid red',
    'text-align': 'center',             # Center the text
}

_prompt_styles = {
    'overflow-y': 'scroll',
    'overflow-x': 'scroll',
    'color': 'black',                   # Text color
    'background-color': '#f0f0f4',
    'font-size': '14px',
    'font-family': 'Courier New',
    'padding': '5px',                   # Padding inside the pane
    'border': '1px solid blue',
}

# Query input
input_box = pn.widgets.TextAreaInput(width=400, height=100, styles=_input_styles)

# Pane showing the query and the model's answer
output_box = pn.pane.Markdown(width=800, height=400, styles=_answer_styles)

# Pane showing the complete prompt sent to the model
output_box2 = pn.pane.Markdown(height=400, styles=_prompt_styles)

# Unstyled panes for the retrieval and model timings
output_box_retrievalTime = pn.pane.Markdown()
output_box_responseTime = pn.pane.Markdown()
# Re-run the chat handler whenever the input box value changes.
input_box.param.watch(handle_input_change, 'value')

# Start-up greeting: handle_input2 reads the module-level flag, so raise it,
# call the handler once, and make sure it is lowered afterwards.
is_first_run = True
handle_input2()
is_first_run = False

         
# Children of the dashboard column, listed top to bottom.
_dashboard_children = [
    pn.pane.Markdown("# ChatGPT-like Conversation: RAG"),
    pn.Spacer(height=20),
    pn.Row(input_box, output_box),                              # query | answer
    pn.Row(pn.Spacer(height=20)),
    pn.Row(output_box_retrievalTime, output_box_responseTime),  # timings
    pn.Row(output_box2),                                        # full prompt
]

# Create the dashboard layout using Panel
dashboard_layout = pn.Column(*_dashboard_children, name="ChatGPT Dashboard")

# Create a dashboard object using the layout
dashboard = pn.panel(dashboard_layout)

# Display the dashboard in a new browser tab
dashboard.show()

# Alternative Version: based on QA chain with browser tab
 * Output is still inconsistent and short

In [None]:
import panel as pn
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
import textwrap
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate



# Create a RAG model: a LangChain LLM wrapper pointed at the local
# OpenAI-compatible server, plus the default retriever of the vector store.
# NOTE(review): `base_url` / `api_key` keyword names depend on the installed
# LangChain version - confirm they are accepted by `langchain.llms.OpenAI`.
llm = OpenAI(base_url="http://localhost:1238/v1", api_key="lm-studio")
retriever = vectordb.as_retriever()

# Define a custom prompt template with three placeholders:
# {history}, {context} and {query}.
template = """
Use the following context from the vector database and the previous conversation to answer the query. Incorporate your own knowledge and reasoning as an AI assistant:

Previous Conversation: {history}

Context: {context}

Query: {query}

Answer:
"""

prompt_template = PromptTemplate(input_variables=["history", "context", "query"], template=template)

# Create a custom chain that formats the prompt and sends it to the LLM
qa_chain = LLMChain(llm=llm, prompt=prompt_template)


# Load the Panel extension
pn.extension()

# NOTE(review): both modules are already imported at the top of this cell;
# these repeated imports are harmless but redundant.
import textwrap
import panel as pn
# Create an input box (default size and styling)
input_box = pn.widgets.TextAreaInput()
# Create the output box (default size and styling)
output_box = pn.pane.Markdown()
# Create a history list to store the conversation as
# {"role": ..., "content": ...} dicts
history = []
# Define a function to handle the user's input
def handle_input():
    """Answer the query in the input box through the QA chain.

    Reads the stripped text of ``input_box``; an empty query is ignored.
    Otherwise the retriever's documents are concatenated into the prompt
    context, the conversation ``history`` is rendered as ``role: content``
    lines, and ``qa_chain`` is run. If the result contains the marker
    ``"Final Answer:"`` only the text after its first occurrence is kept.
    The answer is re-wrapped to 80 columns, shown in ``output_box``, and
    the turn is recorded in ``history`` before the input box is cleared.

    Works on module-level state only; returns ``None``.
    """
    query = input_box.value.strip()
    if not query:
        # Also covers the watcher firing again when the box is cleared below.
        return

    docs = retriever.get_relevant_documents(query)
    result_str = qa_chain.run(
        history="\n".join(f"{msg['role']}: {msg['content']}" for msg in history),
        query=query,
        #max_tokens=8096,
        # NOTE(review): the prompt has no {max_tokens} placeholder, so this
        # value presumably has no effect on generation - confirm.
        max_tokens=-1,
        context="".join(doc.page_content for doc in docs),
    )

    # Keep only what follows the first "Final Answer:" marker, if present.
    _, marker, tail = result_str.partition("Final Answer:")
    if marker:
        label, final_answer = "Final Answer", tail.strip()
    else:
        label, final_answer = "Answer", result_str

    output_box.object = f"**Query:**\n{query}\n\n**{label}:**\n{textwrap.fill(final_answer, width=80)}"

    # Record the turn in chronological order (question, then answer) so the
    # "Previous Conversation" text of the next prompt reads correctly; the
    # two entries used to be appended the other way round.
    history.append({"role": "user", "content": query})
    history.append({"role": "assistant", "content": final_answer})

    # Clear the input box
    input_box.value = ""
        
        
def _on_input_value(event):
    """Watcher callback: the event itself is ignored, the handler reads the widget."""
    handle_input()


# Watch for changes in the input box value
input_box.param.watch(_on_input_value, 'value')

# Dashboard: title, spacer, then the input box next to the output pane.
_dashboard_children = [
    pn.pane.Markdown("# ChatGPT-like Conversation"),
    pn.Spacer(height=20),
    pn.Row(input_box, output_box),
]
dashboard_layout = pn.Column(*_dashboard_children, name="ChatGPT Dashboard")

# Create a dashboard object using the layout
dashboard = pn.panel(dashboard_layout)
# Display the dashboard in a new browser tab
dashboard.show()