Scientific Publications as PDFs, Text Extraction, Storage SQL database, Chunks, Embedding and Vectorization, Retrieval, GPT

In [None]:
# Optional: not required, but can be useful if a cell needs a local proxy connection to Docker or LM Studio.
# The statements below are wrapped in a string literal, so they do not run; remove the quotes to enable them.
'''
import os
os.environ['http_proxy']="http://localhost:1238"
os.environ['https_proxy']="http://localhost:1238"
'''

## 1<sup>st</sup> Setting Up PDF Collection and DATABASE Paths

In [None]:
import json
import os
%run assets/func_inputoutput.py
# Setup paths
# Everything below is derived from the current working directory of the notebook.
cwd = os.getcwd()
print("Current working directory:", cwd)
# Define pdf file directory (sub-folder of <cwd>/docs that holds the PDF collection)
pdf_collection_path = 'eGRASP'
pdf_path = os.path.join(cwd, "docs", pdf_collection_path)
# Define database directory: the SQLite file is stored next to the PDFs
base_name = 'eGRASP.db'#'LigaseE3_7.db'
database_name = os.path.join(cwd, "docs", pdf_collection_path, base_name)
# Create settings dictionary
settings = {
    'working_directory': cwd,
    'pdf_collection_path': pdf_collection_path,
    'pdf_path': pdf_path,
    'base_name': base_name,
    'database_name': database_name,
    'additional': 'value1'  # extra key; presumably surfaces in remaining_settings - verify against load_settings
}
# Save settings to a file (save_settings comes from assets/func_inputoutput.py, loaded via %run)
save_settings(settings)
# Later on, load settings and unpack directly into variables
# load_settings() is unpacked into six values here: five named paths plus the remaining settings.
working_directory, pdf_collection_path, pdf_path, base_name, database_name, remaining_settings = load_settings()
# Print the loaded settings
print(f'\nworking_directory {working_directory}')
print(f"\nPDF Collection Path: {pdf_collection_path}")
print(f"\npdf_path: {pdf_path}")  
print(f"\nbase_name: {base_name}")  
print(f"\ndatabase_name: {database_name}")  
print(f"\nRemaining Settings: {remaining_settings}")

## Read PDF and Extract TEXT, Stores in a SQLite3 database (db): GROBID based approach
 * If you use a text file with references and their abstracts exported from EndNote, GROBID is not required.
 * The path to the EndNote export file is, for now, hard-coded in the script (line 295).

# Instructions for Running GROBID Docker
1. Download the small GROBID docker image and copy `config.json` into the Jupyter notebook working directory.
2. Run local Docker:
    - Start Docker for Windows.
    - Use CMD.
3. Execute the following command to run the docker container:
    ```bash
    docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.0
   OR
    docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:latest-crf
    ```
4. Check the running docker with [http://localhost:8070](http://localhost:8070) and ensure this matches the one in your `config.json`.
5. The paths to the PDF collection and the database are set above.
6. Choose to recreate the SQL (y/n) or just reload the current one.
7. Note that processing 70 PDFs can take over 10 minutes.
8. Wait for the console message: "Finished".
9. If you encounter an XML error or some "localhost" mix up, ensure you stay connected with the docker by avoiding proxy configurations elsewhere.

### Importing Literature Lists from EndNote (with Abstracts)

Here's how to import your EndNote library, including abstracts, into this application:

1. **Insert References into Word:** In EndNote, select the references you want to import and insert them into a Microsoft Word document.
2. **Create a "Show All Fields" Bibliography:**  In Word, generate a bibliography using the "Show All Fields" style. This will include the abstracts and clearly labeled field names (e.g., "Title:", "Abstract:", etc.).
3. **Save as UTF-8 Text File:** Save the Word document as a plain text file (`.txt`) with UTF-8 encoding.  This ensures proper character encoding.
4. **Link the Script:**  <span style="color:red;">Specify the path to the saved text file in your script (see line 272).</span>
5. **Create from Text File:** Run the script.  Select the option to create a database from the text file.  This method bypasses GROBID; however, GROBID initialization might still run in the background.



In [None]:
import os
import sqlite3
import pandas as pd
from grobid_client.grobid_client import GrobidClient
import json
import grobid_tei_xml
import urllib3
import requests
import re

%run assets/func_inputoutput.py
# Disable warnings and proxy settings
# Exempt localhost and 127.0.0.1 from any configured proxy.
os.environ['NO_PROXY'] = 'localhost,127.0.0.1'
# Silence urllib3 warnings.
urllib3.disable_warnings()
# Replace urllib3's dropped-connection check with a stub that always answers False.
urllib3.util.connection.is_connection_dropped = lambda conn: False

# Configure requests to ignore proxies
# NOTE(review): this sets trust_env on a newly created Session that is not bound to
# any name, so it presumably does not affect other sessions - confirm whether this
# line has any effect.
requests.Session().trust_env = False


# Reload the paths stored by the settings cell.
working_directory, pdf_collection_path, pdf_path, base_name, database_name, remaining_settings = load_settings()

class GrobidAuthor:
    """Minimal author record that carries only the author's full name."""

    def __init__(self, full_name):
        # Stored exactly as given; no parsing or normalisation happens here.
        self.full_name = full_name

class GrobidBiblio:
    """Plain container for the fields of one bibliographic reference.

    All constructor arguments are stored unchanged under attributes of the
    same name.
    """

    def __init__(self, index, authors, title, date, volume, pages, issue, journal, doi):
        # Grouped tuple assignments; attributes are created in parameter order.
        self.index, self.authors, self.title = index, authors, title
        self.date, self.volume, self.pages = date, volume, pages
        self.issue, self.journal, self.doi = issue, journal, doi

def extract_bibliographic_details(biblios, print_choice=False):
    """Flatten parsed references into one ``*``-separated string.

    Args:
        biblios: Iterable of reference objects exposing ``index``, ``title``,
            ``authors`` (objects with ``full_name``), ``date``, ``volume``,
            ``pages``, ``journal`` and ``doi``. It is consumed in a single
            pass, so a generator works as well as a list.
        print_choice: When True, also print every reference field by field.

    Returns:
        The per-reference entries joined with ``'*'``; an empty string when
        *biblios* is empty. Every entry ends with a literal backslash followed
        by ``|``.
    """
    entries = []
    for biblio in biblios:
        # Tolerate a missing author list instead of failing on iteration.
        author_names = [author.full_name for author in (biblio.authors or [])]

        if print_choice:
            print("Index", biblio.index, "| Title:", biblio.title)
            print("Authors:")
            for name in author_names:
                print("-", name)

            print("Date:", biblio.date)
            print("Volume:", biblio.volume)
            print("Pages:", biblio.pages)
            print("Journal:", biblio.journal)
            print("Doi:", biblio.doi)
            print()

        # Authors without a name are left out: str.join would raise on None
        # and discard the whole reference list.
        authors_text = ', '.join(name for name in author_names if name)
        # The trailing "\\|" yields the same backslash + pipe as before, but
        # is spelled as a valid escape sequence.
        entries.append(
            f" * Index: {biblio.index} | Title: {biblio.title} | Authors: {authors_text} | Date: {biblio.date} | Volume: {biblio.volume} | Pages: {biblio.pages} | Journal: {biblio.journal} | Doi: {biblio.doi}\\|"
        )

    return '*'.join(entries)


   
    
def parse_structured_data(text_path):
    """Parse a field-labelled reference export into a list of record dicts.

    The file is read as UTF-8 and split into chunks on blank lines
    (``"\\n\\n"``). Within each chunk the first occurrence of every known
    label (``Record Number:``, ``Title:``, ``Author:``, ``DOI:``,
    ``Abstract:``, ``Year:``, ``Journal:``) is captured up to the end of its
    line.

    Args:
        text_path: Path of the text file to read.

    Returns:
        One dict per chunk that contained at least one known label. Captured
        values are stored under ``'Record Number'``, ``'Title'``, ``'Author'``,
        ``'DOI'``, ``'Abstract'``, ``'Date'`` (from ``Year:``) and
        ``'Journal'``; ``'Citations'``, ``'Body'`` and ``'Refs'`` are always
        present as empty strings. Chunks with no known label (for example the
        empty chunks produced by extra blank lines) are skipped so they do not
        become empty records.

    Raises:
        OSError: If *text_path* cannot be opened.
    """
    # (record key, pattern) pairs in the order the keys are added to a record.
    field_patterns = (
        ('Record Number', re.compile(r"Record Number:\s*(\d+)")),
        ('Title', re.compile(r"Title:\s*(.*)")),
        ('Author', re.compile(r"Author:\s*(.*)")),
        ('DOI', re.compile(r"DOI:\s*(.*)")),
        ('Abstract', re.compile(r"Abstract:\s*(.*)")),
        ('Date', re.compile(r"Year:\s*(.*)")),
        ('Journal', re.compile(r"Journal:\s*(.*)")),
    )

    with open(text_path, 'r', encoding='utf-8') as file:
        text = file.read()

    records = []
    for record_text in text.split("\n\n"):
        record = {}
        for key, pattern in field_patterns:
            match = pattern.search(record_text)
            if match:
                record[key] = match.group(1).strip()

        if not record:
            # Nothing recognisable in this chunk: do not emit an empty record.
            continue

        # Set empty values for fields not present in the text file
        record['Citations'] = ""
        record['Body'] = ""
        record['Refs'] = ""
        records.append(record)

    return records

def handle_database_existence(database_name):
    """Decide what to do with the database file at *database_name*.

    Returns ``'1'`` straight away when the file does not exist (a new database
    has to be created). Otherwise the user is prompted on stdin until the
    answer is ``'1'`` (create new), ``'2'`` (append) or ``'3'`` (keep as is),
    and that answer is returned.
    """
    if not os.path.exists(database_name):
        return '1'  # Nothing on disk yet, so creating is the only option

    menu = ("\nThe database file already exists. What do you want to do? \n"
            "1. Create a new database\n"
            "2. Append to the existing database\n"
            "3. Do nothing (use the existing database)\n"
            "Enter your choice (1-3): ")
    answer = input(menu)
    while answer not in ('1', '2', '3'):
        print("Invalid choice! Please enter 1, 2, or 3.")
        answer = input(menu)
    return answer

# Default location of the EndNote text export; pass ``text_path`` to main() to override it.
_DEFAULT_ENDNOTE_TEXT_PATH = r'\users\na\Python\envs\GPT\openai-quickstart-node-master\PDF_screen\Langchain\LangChain-Chat-with-Your-Data-main\docs\eGRASP\egrasp.txt'


def _ask_choice(prompt, valid_choices, retry_message):
    """Prompt on stdin until the answer is one of *valid_choices*; return it."""
    while True:
        answer = input(prompt)
        if answer in valid_choices:
            return answer
        print(retry_message)


def _ensure_document_table(cursor):
    """Return the first table name in the database, creating ``document_table`` if there is none."""
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    row = cursor.fetchone()
    if row is not None:
        print("datbase exists:", row[0])
        return row[0]

    table_name = "document_table"
    cursor.execute(f'''CREATE TABLE {table_name}
                    (ID INTEGER PRIMARY KEY,
                    Title TEXT,
                    Authors TEXT,
                    DOI TEXT,
                    Citations INTEGER,
                    Abstract TEXT,
                    Body TEXT,
                    Date TEXT,
                    Record_Number INTEGER,
                    Refs TEXT,
                    Journal TEXT)''')
    print("Created new table:", table_name)
    return table_name


def _next_record_id(cursor, table_name):
    """Return ``MAX(ID) + 1`` for *table_name*, or 1 when the table has no rows."""
    cursor.execute(f"SELECT MAX(ID) FROM {table_name}")
    max_id = cursor.fetchone()[0]
    return 1 if max_id is None else max_id + 1


def _import_pdfs_with_grobid(client, conn, cursor, table_name, id_key):
    """Send every ``.pdf`` in ``pdf_path`` through GROBID, insert one row per file, return the next free ID."""
    for filename in os.listdir(pdf_path):
        if not filename.endswith(".pdf"):
            continue
        try:
            print(f"Processing filename: {filename}")
            file_path = os.path.join(pdf_path, filename)

            # Process PDF
            pdf_file, status, text = client.process_pdf(
                "processFulltextDocument",
                file_path,
                generateIDs=True,
                consolidate_header=True,
                consolidate_citations=True,
                include_raw_citations=True,
                include_raw_affiliations=True,
                tei_coordinates=True,
                segment_sentences=True
            )

            # Verify text extraction
            if not text or text.strip() == '':
                raise ValueError("No text extracted from PDF")

            # Parse citations; a failure here must not discard the document itself.
            try:
                grobid_biblios = grobid_tei_xml.parse_citation_list_xml(text)
                ref_list = extract_bibliographic_details(grobid_biblios, print_choice=False)
            except Exception as cite_error:
                print(f"Citation parsing error: {cite_error}")
                ref_list = "No references parsed"

            # Parse document
            doc = grobid_tei_xml.parse_document_xml(text)
            header = doc.header
            title = getattr(header, 'title', "No Title")
            authors = ';'.join([a.full_name for a in header.authors]) if hasattr(header, 'authors') else "No Authors"
            doi = str(header.doi) if hasattr(header, 'doi') else "No DOI"
            citations = str(len(doc.citations)) if hasattr(doc, 'citations') else "0"
            abstract = getattr(doc, 'abstract', "No Abstract")
            body = getattr(doc, 'body', "No Body")
            date = getattr(header, 'date', "No Date")
            # Guard on the attribute that is actually read (was checking 'date').
            journal = getattr(header, 'journal', "No Journal")

            print(f'date {date}\n ')

            # Insert into database
            cursor.execute(f'''
                INSERT INTO {table_name} (ID, Title, Authors, DOI, Citations, Abstract, Body, Date, Refs, Journal)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (id_key, title, authors, doi, citations, abstract, body, date, ref_list, journal))

            # Commit per document so earlier rows survive a later failure.
            conn.commit()
            id_key += 1

        except Exception as e:
            # Best effort: report the file and carry on with the next one.
            print(f"Error processing {filename}: {e}")
            continue
    return id_key


def _import_endnote_records(conn, cursor, table_name, id_key, text_path):
    """Insert every record parsed from the text file at *text_path*; return the next free ID."""
    records = parse_structured_data(text_path)
    for record in records:
        cursor.execute(f'''
            INSERT INTO {table_name} (ID, Title, Authors, DOI, Citations, Abstract, Date, Record_Number, Journal)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (
            id_key,
            record.get('Title', None),
            record.get('Author', None),
            record.get('DOI', None),
            record.get('Citations', None),
            record.get('Abstract', None),
            record.get('Date', None),
            record.get('Record Number', None),
            record.get('Journal', None)
        ))

        # Commit and increment
        conn.commit()

        print(f'id {id_key}')
        id_key += 1
    return id_key


def main(text_path=None):
    """Interactively build or extend the SQLite document database.

    Reads the module-level ``database_name`` and ``pdf_path``. If the database
    file exists the user chooses between overwriting it and appending to it.
    The user then picks a processing mode: ``1`` runs every PDF in
    ``pdf_path`` through GROBID, ``2`` imports the records of a text export,
    ``3`` does both. New rows receive consecutive IDs starting after the
    highest ID already stored.

    Args:
        text_path: Path of the text export used by modes 2 and 3. Defaults to
            the previously hard-wired location.

    The database connection is closed on every exit path, including errors.
    """
    if text_path is None:
        text_path = _DEFAULT_ENDNOTE_TEXT_PATH

    print(f'database_name {database_name}')
    # Overwrite or append choice (only if the database exists)
    if os.path.exists(database_name):
        choice = _ask_choice(
            "\nDatabase already exists. Do you want to:\n"
            "1. Overwrite the existing database\n"
            "2. Append to the existing database\n"
            "Enter your choice (1/2): ",
            ('1', '2'),
            "Invalid choice! Please enter 1 or 2.",
        )
        if choice == '1':
            os.remove(database_name)
            print("Overwriting existing database...")
    else:
        print("Create Database ")

    # One connection for the whole run; the starting ID is read from the table
    # itself, which covers the new, overwritten and append cases alike.
    conn = sqlite3.connect(database_name)
    try:
        cursor = conn.cursor()
        table_name = _ensure_document_table(cursor)
        id_key = _next_record_id(cursor, table_name)

        # Add processing mode selection
        processing_mode = _ask_choice(
            "\nSelect processing mode:\n1. Grobid extraction\n2. Plain text PDF extraction\n3. Both\nEnter choice (1/2/3): ",
            ('1', '2', '3'),
            "Invalid choice! Please enter 1, 2, or 3.",
        )

        if processing_mode in ('1', '3'):
            client = GrobidClient(config_path="config.json")
            if not os.path.exists(pdf_path):
                print(f"Error: Directory {pdf_path} does not exist!")
                return
            id_key = _import_pdfs_with_grobid(client, conn, cursor, table_name, id_key)

        if processing_mode in ('2', '3'):
            print(f'table_name {table_name}')
            print(f'Note: reading text export from {text_path}\n')
            id_key = _import_endnote_records(conn, cursor, table_name, id_key, text_path)

        # Final record count
        cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
        count = cursor.fetchone()[0]
        print("Total records:", count)
    finally:
        # Close connection
        conn.close()
    print("Finished processing")

# Run the main function
if __name__ == "__main__":
    # Entry point when the cell/file is executed directly.
    # main() opens and closes its own database connection.

    #conn.close()
    main()
    

# Check records of SQL database: 
### Can also be used alone without Docker, once SQL database was created
 * If you start from here, first run the settings cell (second cell of the notebook).
 * database
    * base_name='sample.db'
    * database_name=os.path.join(cwd, "docs", pdf_collection_path, base_name) #database
 * Enter the record number or 'q' to quit. Afterwards, enter 'n' for the next choice or 'q' to quit.

In [None]:
#database structure
# Optional cell: it is skipped while `skip` is True; set it to False to call
# func_database.database_structure on `database_name`.
skip=True
if not skip:
    import importlib
    from assets import func_database  # Import the entire module from assets func_database.py
    importlib.reload(func_database) # in case of editing the functions
    # Call the function initially
    func_database.database_structure(database_name)


# Dashboard to see contents of database
 * requires that database_name was set before

In [None]:
import panel as pn
import sqlite3
import pandas as pd
def on_record_selected(change):
    """Show the record number that belongs to the newly selected option.

    Args:
        change: Mapping whose ``'new'`` entry is the selected option label.

    Reads the module-level ``record_options`` and writes to
    ``record_number_display``. When the option maps to an
    ``(record_id, record_number)`` pair the record number is displayed. For an
    unknown option, or when the mapping only holds a bare value (so no record
    number is available), the display is cleared instead of raising on the
    tuple unpacking.
    """
    selected_option = change['new']
    entry = record_options.get(selected_option)

    if isinstance(entry, (tuple, list)) and len(entry) == 2:
        _record_id, record_number = entry
        record_number_display.value = f"Record Number: {record_number}"
    else:
        record_number_display.value = ""

#js_close_tab = """<script>window.close();</script>"""

def close_app(event):
    """Button callback: close the module-level ``conn`` and report the outcome.

    The result, success or error text, is written to ``message_area``;
    *event* is not used.
    """
    try:
        conn.close()
    except Exception as e:
        # Surface the problem in the UI rather than failing inside the callback.
        message_area.value = f"Error closing database: {e}"
    else:
        message_area.value = "Database closed successfully. You may close this tab/window."

pn.extension()
# --- Database Connection ---
# Builds a small Panel dashboard for browsing the records in `database_name`.
# Any failure while reading the database or building the widgets is reported
# as "Database Error: ..." by the handler at the end of the block.
try:
    # A single connection serves the whole dashboard. sqlite3's context manager
    # only commits or rolls back on exit and does not close the connection, so
    # the widget callbacks and close_app() keep working with `conn`/`cursor`.
    with sqlite3.connect(database_name) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()
        if not tables:
            raise ValueError("No tables found in the database.")
        # The first table listed in sqlite_master is the one that is browsed.
        table_name = tables[0][0]

        cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
        row_count = cursor.fetchone()[0]

        if row_count == 0:
            raise ValueError(f"Table '{table_name}' is empty.")

        user_input = input("dashboard: 'n' : 'y'")
        if user_input.lower() == 'y':

            record_ids_select = pn.widgets.Select(
                name='Record ID',
                sizing_mode='stretch_width',
                styles={
                    'background-color': 'white',
                    'color': 'black',
                    'border-color': '#555'
                }
            )
            record_number_display = pn.widgets.StaticText(
                name='Record Number',
                value="",
                sizing_mode='stretch_width',
                styles={
                    'background-color': 'white',
                    'color': 'black',
                    'border-color': '#555',
                    'padding': '5px'
                }
            )
            output_area = pn.widgets.TextAreaInput(
                value="",
                sizing_mode='stretch_width',
                height=400,
                styles={
                    'background-color': 'white',
                    'color': 'black',
                    'font-family': 'monospace',
                    'font-size': '14px',
                    'border-color': '#555',
                    'padding': '10px'
                },
                disabled=True
            )

            def populate_record_ids():
                """Return a ``{"ID <id>: <title>": id}`` mapping for every row, or ``{}`` on error / no rows."""
                try:
                    cursor.execute(f"SELECT ID, Title FROM {table_name}")
                    records = cursor.fetchall()
                    if not records:
                        output_area.value = "No records found."
                        record_ids_select.options = []
                        return {}
                    record_options = {}
                    for row in records:
                        record_id = row[0]
                        # Fallback label names the field that is actually missing.
                        title = row[1] if row[1] is not None else "No title available"
                        record_options[f"ID {record_id}: {title}"] = record_id
                    return record_options
                except Exception as e:
                    print(f"Error populating record IDs: {e}")
                    output_area.value = f"Error: {e}"
                    return {}

            record_options = populate_record_ids()
            if record_options:
                record_ids_select.options = record_options
                record_ids_select.value = next(iter(record_options.values()))  # Set initial value
            else:
                record_ids_select.options = []
                output_area.value = "No records found or error in fetching records."

            @pn.depends(record_ids_select.param.value, watch=True)
            def update_output(record_id):
                """Load the record with ID *record_id* and show it in the output widgets."""
                if record_id is None:
                    output_area.value = "No record selected or table is empty."
                    record_number_display.value = ""
                    return
                try:
                    cursor.execute(f"SELECT Abstract, Authors, Body, Date, Record_Number, Journal FROM {table_name} WHERE ID = ?", (record_id,))
                    record_data = cursor.fetchone()
                    if record_data:
                        abstract_text = str(record_data[0])
                        body_text = str(record_data[2])
                        date_text = str(record_data[3])
                        record_number_text = str(record_data[4])
                        journal_text = str(record_data[5])
                        output_area.value = f'Journal: {journal_text}\nAbstract: {abstract_text}\n\nBody: {body_text}'
                        record_number_display.value = f"Record Number: {record_number_text} | Author: {record_data[1]} | Date: {date_text}"
                    else:
                        output_area.value = f"Record with ID {record_id} not found."
                        record_number_display.value = ""
                except Exception as e:
                    output_area.value = f"Error retrieving record: {e}"
                    record_number_display.value = ""

            # Render the initially selected record once; later changes arrive
            # through the watcher registered above.
            if record_options:
                update_output(record_ids_select.value)

            # Create a close button
            close_button = pn.widgets.Button(name='Close', button_type='danger')  # Set initial label
            close_button.on_click(close_app)
            message_area = pn.widgets.StaticText(value="")  # Or pn.pane.Markdown("")

            dashboard_layout = pn.Column(
                pn.pane.Markdown(
                    "## Record Viewer",
                    styles={'color': 'black', 'background-color': 'white'}
                ),
                record_ids_select,
                record_number_display,
                output_area,
                close_button,
                message_area,
                sizing_mode='stretch_width'
            )
            dashboard = pn.panel(dashboard_layout)
            dashboard.show()
        else:
            print("Dashboard creation cancelled.")

except Exception as e: # Catches errors during database operations
    print(f"Database Error: {e}")
# Ensure connection is closed if not already closed
# conn.close()  # Uncomment this line if you want to ensure it closes when the script ends

# This document explains how to run LM Studio and configure your vector database, whether you're using a pre-existing SQLite database or creating a new one.

## Using an Existing SQLite Database

If you have an existing SQLite database, GROBID is not required for this step. You can close or quit any applications related to the SQLite database creation process to free up system resources (RAM).

## Creating a New Vector Database

If you don't have a pre-existing SQLite database, follow these steps to create a new vector database using Chroma (version 0.5.0) and LangChain (version 0.2.3):
based on: Chroma version: 0.5.0 | Langchain version: 0.2.3

1. Start LM Studio 0.2.27.
2. Activate the Local Server Tab (" <-> ").
3. Select a model, e.g., "Meta-Llama-3-8B-Instruct-Q8_0.gguf".
    - Choose the preset "Llama3".
4. Choose a server port that does not conflict with the docker port.
5. Load the "Embedding Model", e.g., "nomic embed text v1 5 Q8_0.gguf".
    - Models were previously downloaded manually from Huggingface/Models/.
6. Press "Start Server".
7. Define if required the following paths in line 80-81:
    ```python
    
    user_path = os.path.join(os.getcwd(), "docs", pdf_path, "sample")
    temp_dir = os.path.join(os.getcwd(), "docs", "temp")
    
    ```
8. Run the cell:
    - If there is an error indicating the base is in use, ensure the path is correct and start from the cell "Check SQL Database" after restarting the kernel.
9. If there is a connection error to LM Studio, remove any other open proxy or browser proxy settings (under "Settings" in the browser), restart the kernel, and start again from "Check SQL Database"; alternatively, disable warnings and proxy settings in line 130.
10. After `embed_and_chunk` has added the metadata and the vector database has been created, set the mode in line 186 to `load` or `append`.


In [None]:
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from openai import OpenAI
from typing import List
import os
import shutil
import tempfile
import time
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Import here
import sqlite3

%run assets/func_inputoutput.py

def load_and_process_docs_from_sqlite(database_name, table_name="abstracts", id_column="ID", text_column="Abstract"): # Customize column names
    """Load one ``Document`` per non-empty text row of an SQLite table.

    Args:
        database_name: Path of the SQLite database file.
        table_name: Table to read from.
        id_column: Column whose value is stored as ``metadata["ID"]``.
        text_column: Column whose value becomes ``page_content``.

    Returns:
        A list of ``Document`` objects. Rows whose text is ``None`` or only
        whitespace are skipped with a printed notice. On any SQLite or other
        error the problem is printed and an empty list is returned.
    """
    try:
        conn = sqlite3.connect(database_name)
        try:
            cursor = conn.cursor()
            cursor.execute(f"SELECT {id_column}, {text_column} FROM {table_name}")
            results = cursor.fetchall()
        finally:
            # Release the handle even when the query fails (e.g. missing table).
            conn.close()

        documents = []
        for doc_id, content in results:
            if content is not None and content.strip(): # Skip if text content is empty
                metadata = {"ID": doc_id} # Add id to metadata, can add others as needed
                documents.append(Document(page_content=content, metadata=metadata))
            else:
                print(f"Skipping empty or None content for ID: {doc_id}")

        return documents

    except sqlite3.Error as e:
        print(f"SQLite error: {e}")
        return []  # Return empty list in case of error
    except Exception as e:  # Catch other potential errors
        print(f"An unexpected error occurred: {e}")
        return []


class CustomEmbedding2:
    """Embedding adapter around a client exposing ``embeddings.create``.

    Provides ``embed_documents`` / ``embed_query`` and keeps the vectors of
    the most recent call in ``self.embeddings``.
    """

    def __init__(self, client):
        # The client is injected by the caller and used for every request.
        self.client = client
        self.embeddings = []

    def get_embedding(self, text, model="TheBloke/nomic-embed-text"):
        """Return the embedding of *text* with newlines replaced by spaces."""
        flattened = text.replace("\n", " ")
        response = self.client.embeddings.create(input=[flattened], model=model)
        return response.data[0].embedding

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed each text, one request per text, and remember the resulting list."""
        vectors = []
        for item in texts:
            vectors.append(self.get_embedding(item))
        self.embeddings = vectors
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query and remember it as a one-element list."""
        vector = self.get_embedding(text)
        self.embeddings = [vector]
        return vector



def embed_and_chunk2(docs, embedding, chunk_size=2000, chunk_overlap=200):
    """Split *docs* into chunks and embed every chunk.

    Returns a ``(chunks, embedding_dict)`` tuple, where ``embedding_dict``
    maps each chunk's position in ``chunks`` to its embedding.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(docs)

    chunk_contents = [chunk.page_content for chunk in chunks]
    vectors = embedding.embed_documents(chunk_contents)

    # Keyed by the chunk's index in `chunks`.
    return chunks, dict(enumerate(vectors))

def embed_and_chunk(docs, embedding, chunk_size=2000, chunk_overlap=200):
    """Split *docs* into chunks, embed them and tag each chunk's metadata.

    Every chunk receives ``metadata['doc_id']`` (copied from its
    ``metadata['ID']``) and ``metadata['chunk_id']`` (its position in the
    returned list).

    Returns a ``(chunks, embedding_dict)`` tuple, where ``embedding_dict``
    maps chunk position to embedding.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(docs)

    vectors = embedding.embed_documents([chunk.page_content for chunk in chunks])
    embedding_dict = dict(enumerate(vectors))  # position -> embedding

    for position, chunk in enumerate(chunks):
        meta = chunk.metadata
        meta['doc_id'] = meta['ID']    # ID of the document the chunk came from
        meta['chunk_id'] = position    # position of the chunk in `chunks`

    return chunks, embedding_dict

def _resolve_persist_directory(persist_directory, user_path, action):
    """Return the Chroma directory to use, deriving it from ``user_path`` if needed.

    ``action`` is only used to word the error ("creating", "loading", ...).
    """
    if persist_directory is not None:
        return persist_directory
    if user_path:
        return os.path.join(os.getcwd(), user_path, 'chroma_db')
    raise ValueError(f"persist_directory or user_path must be provided when {action}")


def create_vectordb(texts, embedding, user_path=None, persist_directory=None, mode="create", force_overwrite=True):
    """Create, load, or append to a Chroma vector database.

    Parameters:
        texts: Document chunks to store (used by "create" and "append").
        embedding: Embedding object handed to Chroma.
        user_path: Base directory; ``<cwd>/<user_path>/chroma_db`` is used when
            ``persist_directory`` is not given.
        persist_directory: Directory of the database; wins over ``user_path``.
        mode: "create", "load" or "append".
        force_overwrite: In "create" mode, allow deleting a non-empty
            existing directory.

    Returns:
        tuple: ``(vectordb, persist_directory)``.

    Raises:
        ValueError: Unknown mode, or neither directory argument was given.
        FileExistsError: "create" on a non-empty directory without
            ``force_overwrite``.
        FileNotFoundError: "load"/"append" on a directory that does not exist.
    """
    if mode not in ("create", "load", "append"):
        raise ValueError(f"Invalid mode: {mode}. Choose 'create', 'load', or 'append'.")

    if mode == "create":
        persist_directory = _resolve_persist_directory(persist_directory, user_path, "creating")

        if os.path.exists(persist_directory):
            if os.listdir(persist_directory) and not force_overwrite:
                raise FileExistsError(f"Vector database directory '{persist_directory}' is not empty. Use mode='append' or set force_overwrite=True to overwrite.")
            # Empty directories are removed too so Chroma starts from a clean path.
            shutil.rmtree(persist_directory)

        print("Creating a new vector database...")
        start_time = time.time()
        vectordb = Chroma.from_documents(texts, embedding, persist_directory=persist_directory)
        end_time = time.time()
        print(f"New vector database created in {end_time - start_time:.2f} seconds. Directory: {persist_directory}")
        return vectordb, persist_directory

    action = "loading" if mode == "load" else "appending"
    persist_directory = _resolve_persist_directory(persist_directory, user_path, action)
    if not os.path.exists(persist_directory):
        raise FileNotFoundError(f"Vector database not found: {persist_directory}")

    if mode == "load":
        print(f"Loading vector database from {persist_directory}...")
        start_time = time.time()
        vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
        end_time = time.time()
        print(f"Vector database loaded in {end_time - start_time:.2f} seconds.")
    else:
        print(f"Appending to vector database at {persist_directory}...")
        start_time = time.time()
        vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
        vectordb.add_documents(texts)
        end_time = time.time()
        print(f"Documents appended in {end_time - start_time:.2f} seconds.")

    return vectordb, persist_directory

import os
import urllib3

# Local services are reached directly: exclude localhost from any configured proxy
# and silence urllib3 warnings.
os.environ['NO_PROXY'] = 'localhost,127.0.0.1'
urllib3.disable_warnings()
# Replace urllib3's dropped-connection check with a stub that always answers False.
urllib3.util.connection.is_connection_dropped = lambda conn: False

# Configure requests to ignore proxies
# NOTE(review): the Session created here is not bound to a name, so setting
# trust_env on it presumably does not affect later requests - confirm intent.
import requests
requests.Session().trust_env = False

working_directory, pdf_collection_path, pdf_path, base_name, database_name, _ = load_settings()

user_path = os.path.join(os.getcwd(), "docs", pdf_collection_path, "sample")
temp_dir = os.path.join(os.getcwd(), "docs", "temp")

desired_chunk_size = 2000
chunk_overlap = 450

persist_directory = user_path  # alternative location: temp_dir

# The client must exist before the embedding wrapper that uses it.
client = OpenAI(base_url="http://localhost:1238/v1", api_key="lm-studio")
embedding = CustomEmbedding2(client=client)

# Read every record of the first table in the database. The connection is
# closed in `finally` so it is released even when a query or the table check fails.
conn = sqlite3.connect(database_name)
try:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    if not tables:
        raise ValueError("No tables found in the database.")
    table_name = tables[0][0]

    # Table names cannot be bound as SQL parameters; quote the identifier instead.
    quoted_table_name = '"' + table_name.replace('"', '""') + '"'
    cursor.execute(f"SELECT ID, Title, Abstract, Body FROM {quoted_table_name}")
    results = cursor.fetchall()
finally:
    conn.close()

docs = []  # One Document per database record, *before* chunking
for record_id, title, abstract, body in results:
    # Missing abstract/body columns are treated as empty text.
    abstract = "" if abstract is None else abstract
    body = "" if body is None else body
    combined_text = abstract + " " + body
    docs.append(Document(page_content=combined_text, metadata={'ID': record_id, 'Title': title}))

num_original_docs = len(docs)
print(f"\n Finished reading and adding Metadata to database {database_name} records ")
#---------------------------------------------------------------------------------

#mode="create"
#mode="load"
#force_overwrite=False #when load

# Keep asking until one of the three supported modes is entered.
mode = input("Enter mode ('create', 'load', or 'append'): ").lower()
while mode not in ("create", "load", "append"):
    print("Invalid mode. Please enter 'create', 'load', or 'append'.")
    mode = input("Enter mode ('create', 'load', or 'append'): ").lower()


# Overwriting is only asked about when a database is going to be created.
force_overwrite = False
if mode == "create":
    overwrite_choice = input("Overwrite existing database? (y/n): ").lower()
    while overwrite_choice not in ("y", "n"):
        print("Invalid choice. Please enter 'y' or 'n'.")
        overwrite_choice = input("Overwrite existing database? (y/n): ").lower()
    force_overwrite = overwrite_choice == "y"



if mode != 'load':
    # "create" and "append" both need freshly chunked and embedded documents.
    print(f'\ncreates texts, embedding_dict and vectordb because mode is not "load": Current Mode: {mode}')
    texts, embedding_dict = embed_and_chunk(docs, embedding, chunk_size=desired_chunk_size, chunk_overlap=chunk_overlap)

    print(f'mode {mode} | force_overwrite {force_overwrite}')
    vectordb, _ = create_vectordb(texts, embedding, user_path=user_path, persist_directory=persist_directory, mode=mode, force_overwrite=force_overwrite)
else:
    # Loading reads the persisted collection, so no chunks are passed in.
    texts = []
    vectordb, _ = create_vectordb(texts, embedding=embedding, user_path=user_path, persist_directory=persist_directory, mode=mode, force_overwrite=force_overwrite)

# Count once and reuse it for both summary lines.
chunk_count = vectordb._collection.count()
print(f"Number of original documents: {num_original_docs} | chunks {chunk_count}")
print("\nfinished", persist_directory, "\n chunks: ", chunk_count)

# Check the structure of vectordb: metadata etc.

In [None]:
        
skip = False
if not skip:
    # Inspect the collection that `vectordb` itself reads and writes, rather
    # than whichever collection the client happens to list first.
    collection = vectordb._collection
    collection_name = collection.name

    print(f"Collection name: {collection_name}")
    total_chunks = collection.count()
    print(total_chunks)

    if total_chunks:
        # Page through the collection so large databases are not fetched
        # in a single call.
        batch_size = 100
        for offset in range(0, total_chunks, batch_size):
            batch = collection.get(
                include=["documents", "metadatas"],
                limit=batch_size,
                offset=offset,
            )
            for metadata, content in zip(batch['metadatas'], batch['documents']):
                print(f"Metadata: {metadata}")
                print(f"Content: {content}\n\n")
    else:
        print("No documents found in the collection.")

In [None]:
def search_vectordb(vectordb, search_field, search_value, k=3):
    """Print the chunks whose metadata ``search_field`` equals ``search_value``.

    When nothing matches and the field is 'ID' or 'chunk_id', a similarity
    search on the stringified value is printed instead (up to ``k`` hits).
    Nothing is returned; all output goes to stdout.
    """
    matches = vectordb._collection.get(where={search_field: search_value})

    if matches and matches['documents']:
        contents = matches['documents']
        print(f"Found {len(contents)} document(s) matching {search_field}: {search_value}")
        for number, content in enumerate(contents, start=1):
            print(f"Document {number}:")
            print(f"Content: {content}")
            print(f"Metadata: {matches['metadatas'][number - 1]}")
            print("-" * 20)
        return

    if search_field not in ['ID', 'chunk_id']:
        print(f"No documents found matching {search_field}: {search_value}")
        return

    # No exact metadata hit: fall back to a similarity search on the value.
    print(f"No exact match found for {search_field}: {search_value}. Trying a similarity search...")
    try:
        neighbours = vectordb.similarity_search(str(search_value), k=k)
        print(f"Found {len(neighbours)} similar document(s):")
        for neighbour in neighbours:
            print(neighbour.page_content)
            print(neighbour.metadata)
            print("-" * 20)
    except Exception as e:
        print(f"Error during similarity search: {e}")
        
# Interactive lookup: keep prompting until the user enters 'q'.
while True:
    search_choice = input("Search by (1) ID or (2) chunk_id? (Enter 1 or 2, or q to quit): ")
    if search_choice.lower() == 'q':
        break

    if search_choice not in ('1', '2'):
        print("Invalid choice. Please enter 1 or 2.")
        continue

    search_field = {'1': "ID", '2': "chunk_id"}[search_choice]
    try:
        search_value = int(input(f"Enter the {search_field} to search for: "))
        search_vectordb(vectordb, search_field, search_value)
    except ValueError:
        print("Invalid input. Please enter an integer.")
        

# Next: functions for retrieval and chat
 * -----

In [None]:
import re 
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

class DocumentRetriever:
    """Turn a chat query into retrieved text from the vector database.

    A query can skip retrieval (``{do not use retrieval}``), ask for all chunks
    of one record (``doc_id: <n>``), or be refined into a search string from
    ``{keywords}`` in curly braces, from an LLM, or from both.
    """

    # Marker fragments produced by this class's own fallbacks or echoed from
    # the refinement prompt; a refined query containing one is not searched.
    _SENTINEL_PHRASES = (
        'no content found',
        'no keywords',
        'not specified',
        'empty query',
        'no llm',
        'no retrieval',
        'refined query',
    )
    # Throwaway inputs. They are rejected only when they are the *whole* query,
    # so real terms that merely contain them ("testosterone", "latest") pass.
    _PLACEHOLDER_QUERIES = frozenset({'test', 'tests'})
    _NOT_FOUND = "No documents found."

    def __init__(self, vectordb):
        """
        Initializes the DocumentRetriever with a vector database.

        Parameters:
            vectordb: The vector database used for similarity search.
        """
        self.vectordb = vectordb

    def retrieve_documents(self, query, is_first_run, k=10, method='combined'):
        """
        Retrieves text for a query.

        Parameters:
            query (str): The user's query, optionally carrying control labels.
            is_first_run (bool): When True no search is performed.
            k (int): Number of chunks requested from the similarity search.
            method (str): 'keywords', 'llm' or 'combined' query refinement.

        Returns:
            tuple: (retrieved_text, refined_query, method). ``method`` becomes
            'direct_doc_id_search' for ``doc_id:`` queries.
        """
        if '{do not use retrieval}' in query:
            return " ", "Retrieval skipped as requested.", method

        if query.startswith("doc_id:"):
            doc_id = query.split(":", 1)[1].strip().split(",")[0]
            method = "direct_doc_id_search"
            similar_docs = self.search_vectordb_chat('doc_id', doc_id, k)
            # The lookup returns text; the not-found sentinel is non-empty, so
            # it has to be compared explicitly rather than tested for truthiness.
            found = bool(similar_docs) and similar_docs != self._NOT_FOUND
            status = "documents found." if found else "No documents found."
            refined_query = f"Searching for document with ID: {doc_id} | {status}"
            return similar_docs, refined_query, method

        refined_query = ""
        if is_first_run:
            # The greeting turn does not search the database.
            refined_query = "No retrieval"
            similar_docs = []
        else:
            if method == 'keywords':
                refined_query = self.extract_keywords(query)
            elif method == 'llm':
                refined_query = self.generate_useful_query(query)
            elif method == 'combined':
                keywords = self.extract_keywords(query)
                refined_query = self.generate_useful_query(query)
                refined_query += " " + keywords

            if not self.is_query_meaningful(refined_query):
                return " ", "No meaningful query could be generated. No retrieval", method
            similar_docs = self.vectordb.similarity_search(refined_query, k=k)

        if not similar_docs:
            return self._NOT_FOUND, refined_query, method

        retrieved_text = "".join(
            # Fall back to 'ID' for chunks stored without a 'doc_id' entry.
            f"**Document {doc.metadata.get('doc_id', doc.metadata.get('ID', 'N/A'))}, "
            f"Chunk {doc.metadata.get('chunk_id', 'N/A')}**:\n{doc.page_content}\n\n"
            for doc in similar_docs
        )
        return retrieved_text, refined_query, method

    def is_query_meaningful(self, query):
        """
        Determines if the query is meaningful enough for retrieval.

        Parameters:
            query (str): The input query.

        Returns:
            bool: True if the query is meaningful, False otherwise.
        """
        if not query:
            return False

        normalized = query.strip().lower()
        if len(normalized) < 3:
            return False

        # Placeholder words only disqualify a query made of nothing else.
        if normalized.strip(" .,;:!?") in self._PLACEHOLDER_QUERIES:
            return False

        return not any(phrase in normalized for phrase in self._SENTINEL_PHRASES)

    def extract_keywords(self, query):
        """
        Extracts keywords from the query based on curly braces.

        Parameters:
            query (str): The input query.

        Returns:
            str: A string of extracted keywords or empty string.
        """
        keywords = re.findall(r'\{(.*?)\}', query)
        return ', '.join(keywords) if keywords else ''

    def generate_useful_query(self, query):
        """
        Generates a useful query using an LLM based on the given input.

        The request is sent through the module-level ``client``.

        Parameters:
            query (str): The input query.

        Returns:
            str: A refined query generated by the LLM, or the marker 'no llm'
            when the request fails (which makes the query non-meaningful).
        """
        try:
            instruction = "Extract named entities and specific technical terms \
            from the following query that are most relevant for search. \
            PROVIDE ONLY the entities/terms, separated by commas. \
            Focus on proper nouns, specific technologies, or key concepts. \
            Do not use format instructions like Document Number for keywords."

            prompt_template = """
                                {instruction}
                                Original Query: "{query}"
                                Refined Query: 
                                """
            prompt = prompt_template.format(
                query=query,
                instruction=instruction
            )

            completion = client.chat.completions.create(
                model="lmstudio/Meta-Llama-3.1",
                messages=[
                    {"role": "system", "content": "You are an expert at extracting precise semantic phrase groups from complex queries"},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.9,
                stream=True,
            )

            # Concatenate the streamed pieces, skipping empty deltas.
            full_response = ''.join(
                chunk.choices[0].delta.content
                for chunk in completion
                if chunk.choices[0].delta.content
            )

            return full_response.strip()

        except Exception as e:
            print(f"Error in query generation: {e}")
            return 'no llm'

    def should_use_original(self, query):
        """
        Determines if the original query should be used for similarity retrieval.

        Parameters:
            query (str): The input query.

        Returns:
            bool: True if the stripped query is longer than 3 characters.
        """
        return len(query.strip()) > 3

    def search_vectordb_chat(self, search_field, search_value, k=3):
        """
        Returns all chunks of one record as formatted text, ordered by chunk_id.

        Parameters:
            search_field: Accepted for interface compatibility; the lookup is
                always made on the 'ID' metadata field.
            search_value: Record ID; converted to int.
            k: Unused.

        Returns:
            str: The formatted chunks, or "No documents found." when the value
            is not an integer or no chunk matches.
        """
        collection = self.vectordb._collection
        search_field = "ID"
        try:
            search_value = int(search_value)
        except (TypeError, ValueError):
            # A non-numeric id cannot match any record.
            return self._NOT_FOUND

        results = collection.get(where={search_field: search_value})
        if not results['documents']:
            return self._NOT_FOUND

        ordered_chunks = sorted(
            (
                (content, metadata)
                for content, metadata in zip(results['documents'], results['metadatas'])
                if metadata.get('doc_id') == search_value
            ),
            key=lambda pair: pair[1].get('chunk_id'),
        )

        formatted_texts = []
        for doc_number, (content, metadata) in enumerate(ordered_chunks, start=1):
            chunk_id = metadata.get('chunk_id')
            doc_text = (
                f"**Document {doc_number}: Chunk ID: {chunk_id}, Metadata: {metadata}**\n"
                f"Content: {content}\n"
            )
            formatted_texts.append(doc_text.strip())

        return "\n\n".join(formatted_texts).strip()
        

def on_input_change(event):
    """Run the chat handler when the new input value ends with a newline.

    ``event.new`` is the changed value of the watched input box. Empty values
    and values not terminated by a newline (ENTER) are ignored.
    """
    typed = event.new
    if not (typed and typed.endswith('\n')):
        return

    handle_input2(
        vectordb,
        embedding,
        llm,
        retriever,
        typed[:-1],  # the text without its trailing newline
        selected_method_value=selected_method.value,
        k_value=k_value_widget.value,
    )
        
        
def load_css_file(file_path):
    """
    Read a CSS file and return its text.

    Returns an empty string, after printing a message, when the file does not
    exist or cannot be read.
    """
    css_text = ""
    try:
        with open(file_path, 'r') as handle:
            css_text = handle.read()
    except FileNotFoundError:
        print(f"CSS file not found: {file_path}")
    except Exception as e:
        print(f"Error reading CSS file: {e}")
    return css_text
        
        


# Required to Run: Alternative Retrieval

In [None]:
#from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever
# NOTE: this rebinds the name `OpenAI` to the LangChain LLM wrapper for the rest of this cell.
from langchain_openai import OpenAI

# LangChain LLM pointed at the local LM Studio endpoint
llm = OpenAI(
    base_url="http://localhost:1238/v1", 
    api_key="lm-studio"
)

# Alternatives that were tried:
#llm  = OpenAI(base_url="http://localhost:1238/v1", api_key="lm-studio")
#llm = ChatOpenAI(temperature=0)

# Wrap the vector store's retriever (requires `vectordb` from an earlier cell)
# in a MultiQueryRetriever driven by `llm`.
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectordb.as_retriever(), 
    llm=llm
)
# Set logging for the queries: INFO level on the multi_query logger
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)


# PDF retrieval that keeps metadata as citations alongside the retrieved text
# PANEL DASHBOARD
## 1st Based on  client.chat.completions.create  : 
    * Browser TAB
    * Prompt Input
    * Query
    * Response
    * Whole message sent to Model
## Not tested: Choose Retrieval: Standard or Multi (LangChain): line 76
    * retriever_from_llm

In [None]:
#own class
#from assets.class_retrieval import DocumentRetriever
#end own class
import panel as pn
from openai import OpenAI
#from langchain_openai import OpenAI
from langchain_community.vectorstores import Chroma
#from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
import re
import time



# Execute the helper script in this notebook's namespace.
%run assets/func_inputoutput.py

# Local services are reached directly: exclude localhost from any configured proxy
# and silence urllib3 warnings.
# NOTE(review): `os` and `urllib3` are not imported in this cell; they are
# presumably still in scope from an earlier cell - confirm.
os.environ['NO_PROXY'] = 'localhost,127.0.0.1'
urllib3.disable_warnings()
# Replace urllib3's dropped-connection check with a stub that always answers False.
urllib3.util.connection.is_connection_dropped = lambda conn: False

# Configure requests to ignore proxies
# NOTE(review): the Session created here is not bound to a name, so setting
# trust_env on it presumably does not affect later requests - confirm intent.
import requests
requests.Session().trust_env = False








# Prompt variant t1: plain instructions, no citation rules.
# Not referenced by the code visible in this cell.
template1 = """
Use the following retrieved documents and the previous conversation to answer the query. Incorporate your own knowledge and reasoning as an AI assistant:
Previous Conversation: {history}
Query: {query}
Retrieved Documents: {retrieved_docs}
Answer:
{answer}
"""

# Prompt variant t2: asks for document and chunk numbers in brackets.
# This is the variant passed to PromptTemplate below.
template = """
Use the following Retrieved Documents and the previous conversation to answer the query. Incorporate your own knowledge and reasoning as an AI assistant.  Keep the content together with document and chunk numbers at the end of sentences in brackets. If content from different documents is combined into a new sentence, place the document and chunk numbers at the end of the sentence, separated by semicolons.
Previous Conversation: {history}
Query: {query}
Retrieved Documents: {retrieved_docs}
Answer:
{answer}
"""

# Prompt variant with strict citation rules. It has no {answer} placeholder.
# Not referenced by the code visible in this cell.
template2 = """
CRITICAL CITATION INSTRUCTIONS:
1. MANDATORY: Cite the EXACT Document and Chunk number for EVERY piece of information retrieved from Retrieved Documents
2. Format citations EXACTLY like this: 
   - Single source: "(Document Number, Chunk Number)"
   - Multiple sources: "(Document Number, Chunk Number; Document Number, Chunk Number)"
3. Place citations IMMEDIATELY after the relevant information, inside parentheses
4. NEVER omit citations
5. If rephrasing content, still include original source citation

Use the following Retrieved Documents and the previous conversation to answer the query:

Previous Conversation: {history}
Query: {query}
Retrieved Documents: {retrieved_docs}

RESPONSE GUIDELINES:
- Create a cohesive, well-structured paragraph
- Ensure EVERY fact is explicitly cited
- Maintain scientific rigor and precision
- Include citations WITHOUT FAIL

Answer:
"""
# Create the prompt template from `template`; the four input variables are the
# placeholders that string contains.
prompt_template = PromptTemplate(
    input_variables=["history", "query", "retrieved_docs", "answer"],  # Removed 'context'
    template=template
)


# Conversation turns collected while the dashboard runs.
history = []

# Chat client for the local OpenAI-compatible server.
client = OpenAI(base_url="http://localhost:1238/v1", api_key="lm-studio")

# 'Multi' selects the multi-query retriever; any other value selects the
# vector store's own retriever.
which_retriever = 'Standard'
retriever = retriever_from_llm if which_retriever == 'Multi' else vectordb.as_retriever()




    

# Define a function to handle the user's input
def handle_input2(vectordb, embedding, llm, retriever, input_value=None, selected_method_value=None, k_value=None):
    """Answer one chat turn: retrieve context, build the prompt, query the model.

    On the first run a fixed greeting request is sent and no retrieval is done.
    Afterwards the query is ``input_value`` when given, otherwise the current
    text of ``input_box``. Results and timings are written to the module-level
    output panes, the assistant reply is appended to ``history`` and the input
    box is cleared.

    Parameters:
        vectordb: Vector database handed to ``DocumentRetriever``.
        embedding, llm, retriever: Accepted for interface compatibility; not
            used by this function.
        input_value (str | None): Query text; ``None`` means "read input_box".
        selected_method_value (str | None): Query refinement method; ``None``
            selects 'combined'.
        k_value (int | None): Number of chunks to retrieve; ``None`` selects 10.
    """
    global is_first_run
    start_time = time.time()

    if is_first_run:
        query = "Hello, introduce yourself to someone opening this program for the first time. Be concise. Do not add citations to your first response."
        output_box_retrievalTime.object = f"Processing Retrieval {is_first_run}"
    else:
        output_box_retrievalTime.object = "Processing Retrieval:"
        # Show the waiting state in the response-time pane.
        output_box_responseTime.object = "Waiting for GPT"
        raw_query = input_box.value if input_value is None else input_value
        query = raw_query.strip()

    if query:
        managed_history = manage_conversation_history(history)

        # Fall back to the retriever's own defaults when no widget value was passed.
        selected_method = selected_method_value or 'combined'
        k = 10 if k_value is None else k_value

        doc_retriever = DocumentRetriever(vectordb)
        retrieved_docs, used_query, method_retquery = doc_retriever.retrieve_documents(
            query, is_first_run, k=k, method=selected_method
        )

        output_box3.object = f"**Used Retrieval Query:**  \n {used_query}\n"

        # '{no history}' in the query drops the previous conversation from the prompt.
        prompt = prompt_template.format(
            history="" if '{no history}' in query else managed_history,
            query=query,
            retrieved_docs=retrieved_docs,
            answer=""
        )
        output_box2.object = prompt

        end_timere = time.time()
        elapsed_timere = end_timere - start_time

        retrieved_tokens = word_count(retrieved_docs)
        output_box_retrievalTime.object = f"Time for Retrieval with: {elapsed_timere:.2f} \
        seconds | retrieved_docs_length {retrieved_tokens} | selected_method {selected_method}\n | k {k}"

        messages = [
            {"role": "system", "content": """You are a scientific document analysis AI. """ },
            {"role": "user", "content": prompt}
        ]

        completion = client.chat.completions.create(
            model="lmstudio/Meta-Llama-3.1",
            messages=messages,
            temperature=0.7,
            stream=True,
        )

        # Concatenate the streamed pieces; chunks without choices or content are skipped.
        full_response = "".join(
            piece.choices[0].delta.content
            for piece in completion
            if piece.choices and piece.choices[0].delta.content
        )
        new_message = {"role": "assistant", "content": full_response}

        final_output = f"Final Answer:\n{full_response}"
        formatted_text = f"**Query:**  \n{query}\n\n**Final Answer (with Summaries):**  \n{final_output}"
        output_box.object = formatted_text

        message_tokens = word_count(messages)
        history_tokens = word_count(managed_history)

        history.append(new_message)

        input_box.value = ""
        end_time = time.time()
        # Measured from the start of the call, so this includes retrieval time.
        elapsed_time = end_time - start_time
        output_box_responseTime.object = f"Time taken for Model: {elapsed_time:.2f} seconds | message_length: {message_tokens} | history {history_tokens}"

    is_first_run = False


# Define a function to handle user input changes
def handle_input_change(event):
    """Forward the current widget values to ``handle_input2``; ``event`` is unused."""
    widget_state = {
        'input_value': input_box.value,
        'selected_method_value': selected_method.value,
        'k_value': k_value_widget.value,
    }
    handle_input2(vectordb, embedding, llm, retriever, **widget_state)

# Apply debounce to input handling
# NOTE(review): an earlier comment here said "500 ms delay" while wait is 0.1;
# confirm which unit `debounce` expects.
@debounce(wait=0.1)
def handle_input_debounced(event):
    """Debounced variant: forward the current widget values to ``handle_input2``."""
    current_text = input_box.value
    handle_input2(
        vectordb,
        embedding,
        llm,
        retriever,
        input_value=current_text,
        selected_method_value=selected_method.value,
        k_value=k_value_widget.value,
    )

    



# Path of the dashboard stylesheet
css_path = os.path.join('assets', 'stylesdeep.css')

# Read it through the file's safe loader: a missing or unreadable stylesheet
# prints a message and leaves the dashboard unstyled instead of stopping the cell.
custom_css = load_css_file(css_path)

# Hand the CSS text (if any) to Panel's extension
pn.extension(raw_css=[custom_css] if custom_css else [])



# Drop-down choosing how the retrieval query is derived from the prompt.
selected_method = pn.widgets.Select(name='Select Method of Keyword Generation', options=['combined', 'keywords', 'llm'], value='combined', css_classes=["outer-style"])

# Slider for the number of chunks to retrieve (1 to 100, starting at 10).
k_value_widget = pn.widgets.IntSlider(name='Select K Number of Retrievals', start=1, end=100, step=1, value=10)

# Query input; same height as the answer pane.
input_box = pn.widgets.TextAreaInput(placeholder="Enter your query here...Execute with ENTER", width=None, height=300, css_classes=["input-box"])

# Spinner, initially idle.
loading_indicator = pn.indicators.LoadingSpinner(value=False, width=20, height=20, css_classes=["loading-indicator"])

# Answer pane.
output_box = pn.pane.Markdown(width=None, height=300, css_classes=["output-box"])

# Pane for the retrieval query that was used.
output_box3 = pn.pane.HTML(width=None, height=100, css_classes=["output-box3"])

# Pane for the full prompt sent to the model.
output_box2 = pn.pane.Markdown(width=None, height=300, css_classes=["output-box2"])

# Timing read-outs.
output_box_retrievalTime = pn.pane.Markdown(css_classes=["output-box-retrieval-time"])
output_box_responseTime = pn.pane.Markdown(css_classes=["output-box-response-time"])

def read_values(event):
    """Watcher callback for the method selector and the K slider.

    Reads the current value of both widgets into locals. Nothing is returned
    or stored, so the callback has no effect beyond those two reads.
    """
    chosen_method, top_k = selected_method.value, k_value_widget.value
# Wire up the widget watchers.
# Earlier wiring experiment, kept for reference:
#input_box.param.watch(lambda event: handle_input2(vectordb, embedding, llm, retriever), 'value')  # Pass arguments here
# Every change of the input box's 'value' invokes on_input_change (defined earlier in this cell).
input_box.param.watch(on_input_change, 'value')
# Changes to the method selector or the K slider invoke read_values, which only reads the widget values.
selected_method.param.watch(read_values, 'value')
k_value_widget.param.watch(read_values, 'value')
# Other wiring experiments, kept for reference:
#input_box.param.watch(lambda event: handle_input2(vectordb, embedding, llm, retriever) if event.new.endswith('\n') else None, 'value')
#input_box.param.watch(handle_input_debounced, 'value')
# First-run flag: set to True immediately before the initial call and cleared right after it.
# NOTE(review): handle_input2 for this cell is defined above the visible section and presumably
# reads this global flag - confirm before restructuring.
is_first_run = True
if is_first_run:
        # Run the handler once at start-up with the current widget values
        handle_input2(vectordb, embedding, llm, retriever, input_value=input_box.value,
                   selected_method_value=selected_method.value,
                   k_value=k_value_widget.value)
        is_first_run = False


       
        
        
# Dashboard layout, assembled row by row; every row stretches to the available width.

# Header: title, method selector and K slider in a fixed 50px-high strip
# (the loading_indicator widget is intentionally left out of the row).
header_row = pn.Row(
    pn.pane.Markdown(
        "<h1 style='font-size: 14px; color: #2c3e50;'>ChatGPT-like Conversation: RAG</h1>",
        css_classes=["markdown-header"],
    ),
    selected_method,
    k_value_widget,
    sizing_mode='stretch_width',
    align='start',
    height=50,
)

# Query (flex 1) next to the answer (flex 2), aligned at the top
query_column = pn.Column(
    input_box,
    width_policy='max',
    sizing_mode='stretch_width',
    styles={'flex': '1'},
)
answer_column = pn.Column(
    output_box,
    width_policy='max',
    sizing_mode='stretch_width',
    styles={'flex': '2'},
)
query_answer_row = pn.Row(
    query_column,
    answer_column,
    align='start',
    sizing_mode='stretch_width',
    css_classes=["dashboard-layout"],
)

# Retrieval time and response time side by side
timing_row = pn.Row(
    pn.Column(output_box_retrievalTime, width_policy='max', sizing_mode='stretch_width'),
    pn.Column(output_box_responseTime, width_policy='max', sizing_mode='stretch_width'),
    sizing_mode='stretch_width',
    css_classes=["dashboard-layout"],
)

# The two additional output panes, one per row
html_row = pn.Row(output_box3, sizing_mode='stretch_width')
extra_row = pn.Row(output_box2, sizing_mode='stretch_width')

dashboard_layout = pn.Column(
    header_row,
    query_answer_row,
    timing_row,
    html_row,
    extra_row,
    sizing_mode='stretch_width',
    css_classes=["dashboard-layout"],
)

# Wrap the layout as a dashboard object and open it in a new browser tab
dashboard = pn.panel(dashboard_layout, sizing_mode='stretch_both')
dashboard.show()

# Stop: Next similar but more general chat

## 2nd Based on  client.chat.completions.create  : 
    * Browser TAB
    * Prompt Input
    * Query
    * Response
    * Whole message sent to Model
## Choose Retrieval: Standard or Multi (LangChain): line 32
 * Overall, the responses are quite slow because of the retrieval of useful text snippets

In [None]:
import panel as pn
# Print the installed Panel version
print(pn.__version__)

In [None]:
#2nd
import panel as pn
from openai import OpenAI
from langchain_community.vectorstores import Chroma
#from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
import re
import time


# Prompt skeleton with four placeholders: prior turns, retrieved context,
# the user's query, and an answer slot.
template = """
Use the following context from the vector database and the previous conversation to answer the query. Incorporate your own knowledge and reasoning as an AI assistant:
Previous Conversation: {history}
Context: {context}
Query: {query}
Answer:
{answer}
"""

# Wrap the skeleton in a LangChain PromptTemplate
prompt_template = PromptTemplate(input_variables=["history", "context", "query", "answer"], template=template)

# Conversation messages accumulated across calls
history = []

# OpenAI-compatible client pointed at the local server
client = OpenAI(base_url="http://localhost:1238/v1", api_key="lm-studio")

# Retrieval strategy switch: 'Multi' uses retriever_from_llm (defined earlier in
# the notebook); any other value (e.g. 'Standard') uses the plain vector-store retriever.
which_retriever = 'Multi'
retriever = retriever_from_llm if which_retriever == 'Multi' else vectordb.as_retriever()

    
# Define a function to handle the user's input
def handle_input2():
    """Answer the current query with retrieved context and show the result.

    On the first call (global ``is_first_run`` is True) a fixed greeting
    request is sent without any retrieval and the flag is cleared. On later
    calls the text of ``input_box`` is used: documents are fetched through
    the module-level ``retriever``, joined into a context string, and the
    formatted prompt is sent to the chat-completions endpoint via ``client``.

    Side effects: updates ``output_box``, ``output_box2`` and the two timing
    panes, appends the user and assistant turns to ``history``, and clears
    ``input_box``. Returns None.
    """
    global is_first_run
    wrap_width = 100

    if is_first_run:
        query = "Hello, introduce yourself to someone opening this program for the first time. Be concise"
        is_first_run = False
        some_context = ""
    else:
        query = input_box.value.strip()
        if not query:
            # Clearing the input box at the end of a run re-fires the 'value'
            # watcher with an empty string; return before doing any retrieval
            # so the previous timings and answer stay on screen.
            return

        output_box_retrievalTime.object = f"Processing Retrieval {which_retriever}"
        # Show the waiting status in the response-time pane (previously the
        # text was assigned to an unused local and never displayed).
        output_box_responseTime.object = "Waiting for GPT"

        start_time = time.time()
        search_results = retriever.get_relevant_documents(query)
        # Tag each chunk with its title; a missing 'Title' key yields an
        # empty title instead of raising KeyError.
        some_context = "".join(
            result.page_content + "'Title-- '" + result.metadata.get('Title', '') + "--EndTitle | \n\n"
            for result in search_results
        )
        elapsed_time = time.time() - start_time
        output_box_retrievalTime.object = f"Time taken for Retrieval with: {elapsed_time:.2f} seconds"

    # Build the prompt and show the exact text sent to the model
    prompt = prompt_template.format(history=history, context=some_context, query=query, answer="")
    output_box2.object = prompt

    start_time = time.time()
    completion = client.chat.completions.create(
        model="LLMA/Meta-Llama-3-8B",
        messages=history + [{"role": "user", "content": prompt}],
        temperature=0.7,
        stream=True,
    )

    # Collect the streamed deltas; skip chunks with no choices or no content
    response_parts = []
    for chunk in completion:
        if chunk.choices and chunk.choices[0].delta.content:
            response_parts.append(chunk.choices[0].delta.content)
    full_response = "".join(response_parts)

    formatted_text = f"<pre style='white-space: pre-wrap; width: {wrap_width}ch; font-family: Arial, sans-serif; font-size: 12px;'>**Query:**\n{query}\n\n**Final Answer:**\n{full_response}</pre>"
    output_box.object = formatted_text

    # Record both sides of the exchange so later prompts see the full
    # conversation; only the raw query is stored, not the retrieved context.
    history.append({"role": "user", "content": query})
    history.append({"role": "assistant", "content": full_response})

    elapsed_time = time.time() - start_time
    output_box_responseTime.object = f"Time taken for Model: {elapsed_time:.2f} seconds"

    # Clear the input last; this re-triggers the watcher, which now returns early
    input_box.value = ""


# Define a function to handle user input changes
def handle_input_change(event):
    """Watcher callback: ignore the event payload and run handle_input2()."""
    handle_input2()

# Initialise Panel with default settings
pn.extension()

# Query entry: 400x100, scrollable, centred text
input_box = pn.widgets.TextAreaInput(
    width=400,
    height=100,
    styles={
        'overflow-y': 'scroll',
        'text-align': 'center',
    },
)

# Answer pane: 800x400, light-gray background, red border, centred text
output_box = pn.pane.Markdown(width=800, height=400, styles={
    'overflow-y': 'scroll',
    'color': 'black',
    'background-color': '#f0f0f0',
    'font-size': '14px',
    'font-family': 'Courier New',
    'padding': '5px',
    'border': '1px solid red',
    'text-align': 'center',
})

# Second pane: 400 high, scrollable in both directions, blue border
output_box2 = pn.pane.Markdown(height=400, styles={
    'overflow-y': 'scroll',
    'overflow-x': 'scroll',
    'color': 'black',
    'background-color': '#f0f0f4',
    'font-size': '14px',
    'font-family': 'Courier New',
    'padding': '5px',
    'border': '1px solid blue',
})

# Unstyled panes for the retrieval and response timings
output_box_retrievalTime = pn.pane.Markdown()
output_box_responseTime = pn.pane.Markdown()
# Run handle_input_change (and through it handle_input2) whenever the input box's 'value' changes
input_box.param.watch(handle_input_change, 'value')

# First-run flag: handle_input2 declares it global, sends the greeting query while it is True,
# and resets it to False itself; the assignment after the call is therefore a repeat of that reset.
is_first_run = True
if is_first_run:
        # Initial call made once when this cell is executed
        handle_input2()
        is_first_run = False

         
# Children of the dashboard column, top to bottom
dashboard_rows = [
    pn.pane.Markdown("# ChatGPT-like Conversation: RAG"),
    pn.Spacer(height=20),
    pn.Row(input_box, output_box),
    pn.Row(pn.Spacer(height=20)),
    pn.Row(output_box_retrievalTime, output_box_responseTime),
    pn.Row(output_box2),
]
dashboard_layout = pn.Column(*dashboard_rows, name="ChatGPT Dashboard")

# Wrap the layout as a dashboard object and open it in a new browser tab
dashboard = pn.panel(dashboard_layout)
dashboard.show()

# Alternative Version: based on QA CHAIN with BROWSER TAB - 
 * output is still inconsistent and short

In [None]:
import panel as pn
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
import textwrap
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate



# LLM wrapper pointed at the local OpenAI-compatible server.
# NOTE: `OpenAI` here is the class imported from langchain.llms at the top of
# this cell; it rebinds the name that the previous cell imported from `openai`.
llm = OpenAI(base_url="http://localhost:1238/v1", api_key="lm-studio")

# Plain vector-store retriever
retriever = vectordb.as_retriever()

# Prompt skeleton with three placeholders: prior turns, retrieved context, query
template = """
Use the following context from the vector database and the previous conversation to answer the query. Incorporate your own knowledge and reasoning as an AI assistant:

Previous Conversation: {history}

Context: {context}

Query: {query}

Answer:
"""

prompt_template = PromptTemplate(
    input_variables=["history", "context", "query"],
    template=template,
)

# Chain that formats the prompt and sends it to the LLM
qa_chain = LLMChain(
    llm=llm,
    prompt=prompt_template,
)

# Initialise Panel with default settings
pn.extension()

import textwrap
import panel as pn

# Query entry and answer pane, both with default styling
input_box = pn.widgets.TextAreaInput()
output_box = pn.pane.Markdown()

# Conversation turns accumulated across calls
history = []
# Define a function to handle the user's input
def handle_input():
    """Answer the text currently in ``input_box`` through the QA chain.

    Does nothing when the stripped input is empty (which is also what happens
    when the box is cleared at the end of a run). Otherwise it retrieves
    documents for the query, runs ``qa_chain`` with the flattened history and
    the concatenated document text, and writes the result to ``output_box``.
    If the model output contains the marker ``"Final Answer:"``, only the text
    after its first occurrence is shown.

    Side effects: updates ``output_box``, appends the user turn followed by
    the assistant turn to ``history``, and clears ``input_box``. Returns None.
    """
    query = input_box.value.strip()
    if not query:
        return

    # Fetch supporting documents and run the chain
    context = retriever.get_relevant_documents(query)
    result_str = qa_chain.run(
        history="\n".join(f"{msg['role']}: {msg['content']}" for msg in history),
        query=query,
        # NOTE(review): `max_tokens` is not one of the prompt's input variables;
        # it is presumably ignored by the chain - confirm and remove if so.
        max_tokens=-1,
        context="".join(doc.page_content for doc in context),
    )

    # Split on the first "Final Answer:" marker, if the model produced one
    _, marker, tail = result_str.partition("Final Answer:")
    if marker:
        final_answer = tail.strip()
        heading = "Final Answer"
    else:
        final_answer = result_str
        heading = "Answer"
    output_box.object = f"**Query:**\n{query}\n\n**{heading}:**\n{textwrap.fill(final_answer, width=80)}"

    # Record the turns in chronological order (user first, then assistant) so
    # the flattened history fed into the next prompt reads as a conversation.
    history.append({"role": "user", "content": query})
    history.append({"role": "assistant", "content": final_answer})

    # Clear the input box; the resulting watcher call hits the empty-query guard
    input_box.value = ""
        
        
# Run handle_input on every change of the input box's 'value'
def _on_query_change(event):
    handle_input()


input_box.param.watch(_on_query_change, 'value')

# Layout: title, spacer, then query entry next to the answer pane
title_pane = pn.pane.Markdown("# ChatGPT-like Conversation")
query_answer_row = pn.Row(input_box, output_box)
dashboard_layout = pn.Column(
    title_pane,
    pn.Spacer(height=20),
    query_answer_row,
    name="ChatGPT Dashboard",
)

# Wrap the layout as a dashboard object and open it in a new browser tab
dashboard = pn.panel(dashboard_layout)
dashboard.show()