# In [None]:
import gradio as gr
from openai import OpenAI as OpenAIClient
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import sys
import re
import time
import os
import urllib3
import requests
import sqlite3
import shutil
import json # For simple settings
from typing import List, Tuple, Dict, Any, Optional

# --- Path Setup for 'assets' ---
# The notebook's working directory is treated as the project root.
project_root_path = Path(os.path.abspath(os.getcwd()))
assets_dir = project_root_path / 'assets'

# Register ./assets on the import path only when the folder exists and has
# not been registered before (e.g. when this cell is executed again).
if assets_dir.exists() and str(assets_dir) not in sys.path:
    sys.path.append(str(assets_dir))
    print(f"Added to sys.path: {assets_dir}")

# --- Import from assets ---
try:
    # Core helpers; without them the app cannot run (see the outer except).
    from func_inputoutput import manage_conversation_history, word_count

    # DocumentRetriever is optional in the assets module. When it is missing,
    # AssetDocumentRetriever is set to None so the inline class is used instead.
    try:
        from func_inputoutput import DocumentRetriever as AssetDocumentRetriever
        print("Using DocumentRetriever from assets.func_inputoutput.py")
    except ImportError:
        AssetDocumentRetriever = None
        print("DocumentRetriever not found in assets.func_inputoutput.py, will use inline definition.")

    # Settings persistence is optional as well. Both wrappers are ALWAYS
    # defined after this block: either backed by func_inputoutput or, when
    # that import fails, by no-op fallbacks (previously the names were simply
    # left undefined, which surfaced later as a NameError).
    try:
        from func_inputoutput import save_settings as fio_save_settings
        from func_inputoutput import load_settings as fio_load_settings
        print("Using save_settings/load_settings from func_inputoutput.py")

        def save_ingestion_settings(settings_data, filename="pdf_ingestion_settings.json"):
            """Persist the ingestion-tab settings via func_inputoutput.save_settings."""
            return fio_save_settings(settings_data, filename=filename)

        def load_ingestion_settings(filename="pdf_ingestion_settings.json"):
            """Load ingestion-tab settings and overlay them on the UI defaults.

            Returns the merged dict (stored values win over defaults), or an
            empty dict when the loader did not return a dict.
            """
            loaded_dict = fio_load_settings(filename=filename)

            # Safeguard: the loader is expected to return a dict.
            if not isinstance(loaded_dict, dict):
                print(f"Warning: fio_load_settings from '{filename}' did not return a dict as expected. Using empty settings.")
                return {}

            default_ui_settings = {
                # 'ingest_input_path' is intentionally absent: gr.File does not
                # take a default string path.
                'ingest_output_dir': str(project_root_path / "processed_databases"),
                'ingest_db_name_stem': 'processed_docs',
                'ingest_processing_mode': "grobid",
                'ingest_overwrite_db': False,
                'ingest_grobid_config': 'config.json',
                'ingest_server_directory_path': '',
            }
            final_settings = {**default_ui_settings, **loaded_dict}
            print(f"Loaded UI settings dict from '{filename}': {final_settings}")
            return final_settings

    except ImportError:
        print("Could not load Settings")

        def save_ingestion_settings(settings_data, filename="pdf_ingestion_settings.json"):
            """Fallback used when func_inputoutput has no save_settings."""
            print("Error: save_settings N/A")

        def load_ingestion_settings(filename="pdf_ingestion_settings.json"):
            """Fallback used when func_inputoutput has no load_settings."""
            return {}
except ImportError as e:
    print(f"ERROR: Could not import core functions from func_inputoutput.py in {assets_dir}: {e}")
    # Define basic settings functions if import fails for some reason and they were expected
    if 'fio_save_settings' not in globals():
        def save_ingestion_settings(settings_data, filename="pdf_ingestion_settings.json"): print("Error: save_settings N/A")
        def load_ingestion_settings(filename="pdf_ingestion_settings.json"): return {}
    sys.exit(1) # Or handle more gracefully

# --- Import PDF Processor ---
try:
    from pdftosqlite_processor import process_documents_to_sqlite
except ImportError as import_error:
    print(f"ERROR: Could not import from pdftosqlite_processor.py: {import_error}")
    print("Please ensure pdftosqlite_processor.py is in the same directory or Python path.")

    # Stand-in with the same name so the UI can still launch; every call just
    # reports that the real processor is unavailable.
    def process_documents_to_sqlite(*args, **kwargs):
        """Return an (error message, None) pair instead of processing anything."""
        unavailable_message = "Error: pdftosqlite_processor.py not found or failed to import."
        return unavailable_message, None
else:
    print("pdftosqlite_processor.py loaded successfully.")

# --- Proxy Setup ---
# Exclude the local hosts (including the :8070 entry added for Grobid) from
# any proxy configured through the environment.
os.environ['NO_PROXY'] = 'localhost,127.0.0.1,127.0.0.1:8070' # Added Grobid port
urllib3.disable_warnings()
# Monkeypatch: if urllib3 exposes is_connection_dropped, replace it with a
# stub that always answers False.
if hasattr(urllib3.util.connection, 'is_connection_dropped'): # type: ignore
    urllib3.util.connection.is_connection_dropped = lambda conn: False # type: ignore
# NOTE(review): both requests.Session() calls below build a fresh, temporary
# Session object; trust_env is set on that temporary, which is not kept, so
# this has no effect on sessions created elsewhere. If the intent is to ignore
# proxy environment variables, set trust_env = False on the session actually
# used for requests — confirm against the call sites.
if hasattr(requests.Session(), 'trust_env'):
    requests.Session().trust_env = False # type: ignore
print(f"Using NO_PROXY: {os.environ.get('NO_PROXY')}")

# --- Initialize OpenAI Client ---
# Build the client for the local OpenAI-compatible endpoint. On any failure
# the module keeps loading with oai_client = None, and code that needs the
# client has to check for that.
try:
    oai_client = OpenAIClient(base_url="http://localhost:1238/v1", api_key="lm-studio")
except Exception as init_error:
    print(f"CRITICAL ERROR initializing OpenAI client: {init_error}. Ensure LM Studio is running on http://localhost:1238/v1.")
    oai_client = None # type: ignore
else:
    print("OpenAI client initialized successfully for LM Studio.")


# "docs" folder directly under the project root (usage not visible in this
# part of the file — presumably the base location for document data; confirm).
BASE_DOCS_PATH = project_root_path / "docs"
# File name for the persisted ingestion settings; same literal as the default
# `filename` argument of save_ingestion_settings / load_ingestion_settings.
INGESTION_SETTINGS_FILE = "pdf_ingestion_settings.json" # For the new tab
# --- Inline DocumentRetriever Definition (if not imported from assets) ---
if AssetDocumentRetriever is None:
    print("Defining DocumentRetriever inline.")

    class DocumentRetriever:
        """Fetch context passages from a Chroma vector store for a chat query.

        Supports three entry paths: an explicit opt-out marker in the query, a
        direct ``doc_id:<int>`` lookup, and a similarity search on a query that
        is first refined via ``{keyword}`` extraction and/or an LLM call.
        """

        # Every hit formatted by search_vectordb_by_id_chat starts with this
        # prefix; retrieve_documents uses it to tell hits from miss/error text.
        _DOC_ID_HIT_PREFIX = "**Document ID "

        # Substrings that mark a refined query as not worth searching for.
        # NOTE(review): matching is by substring, so 'test' also rejects
        # queries such as "latest" — kept as in the original behaviour.
        _MEANINGLESS_PHRASES = (
            'no content found', 'no keywords', 'not specified', 'empty query',
            'no llm', 'no retrieval', 'refined query', 'test', 'tests', 'search for',
        )

        def __init__(self, vectordb: Chroma, openai_client: Optional[OpenAIClient] = oai_client):
            # The default client is bound once, when the class is defined.
            self.vectordb = vectordb
            self.openai_client = openai_client

        def retrieve_documents(self, query: str, is_first_run: bool, k: int = 10, method: str = 'combined') -> Tuple[str, str, str]:
            """Return ``(retrieved_text, refined_query_for_display, method)``.

            Args:
                query: Raw user input; may contain ``{do not use retrieval}``,
                    ``{no history}``, ``{keyword}`` markers or a ``doc_id:`` prefix.
                is_first_run: When true no retrieval is performed.
                k: Number of results requested from the similarity search.
                method: ``'keywords'``, ``'llm'``, ``'combined'``; any other
                    value searches with the query as typed.

            Errors are reported inside the returned strings, never raised.
            """
            if '{do not use retrieval}' in query:
                return " ", "Retrieval skipped as per '{do not use retrieval}' instruction.", method

            if query.lower().startswith("doc_id:"):
                try:
                    doc_id_str = query.split(":", 1)[1].strip().split(",")[0]
                    doc_id_val = int(doc_id_str)
                    method = "direct_doc_id_search"
                    similar_docs_content = self.search_vectordb_by_id_chat('ID', doc_id_val, k)

                    # Only text that starts with the hit prefix is a real
                    # result; everything else is a miss or an error message.
                    if similar_docs_content.startswith(self._DOC_ID_HIT_PREFIX):
                        refined_query_for_display = f"Found documents for ID: {doc_id_val}"
                    else:
                        refined_query_for_display = f"No documents found for ID: {doc_id_val}"
                    return similar_docs_content, refined_query_for_display, method
                except ValueError:
                    return "Error: Invalid Doc ID format. Must be an integer after 'doc_id:'.", "Invalid Doc ID", method
                except Exception as e:
                    return f"Error during doc_id search: {e}", "Doc ID Search Error", method

            if is_first_run:
                return "", "Initial greeting, no retrieval performed.", method

            actual_query_for_llm_refinement = query.replace('{no history}', '').strip()
            if method == 'keywords':
                refined_query_for_retrieval = self.extract_keywords(actual_query_for_llm_refinement)
            elif method == 'llm':
                refined_query_for_retrieval = self.generate_useful_query(actual_query_for_llm_refinement)
            elif method == 'combined':
                keywords = self.extract_keywords(actual_query_for_llm_refinement)
                llm_refined = self.generate_useful_query(actual_query_for_llm_refinement)
                refined_query_for_retrieval = f"{llm_refined} {keywords}".strip()
            else:
                refined_query_for_retrieval = actual_query_for_llm_refinement

            if not self.is_query_meaningful(refined_query_for_retrieval):
                return " ", "No meaningful query generated. No retrieval performed.", method

            try:
                similar_docs = self.vectordb.similarity_search(refined_query_for_retrieval, k=k)
            except Exception as e:
                print(f"Error during similarity search: {e}")
                return f"Error during similarity search: {e}", refined_query_for_retrieval, method

            if not similar_docs:
                return "No relevant documents found for your query.", refined_query_for_retrieval, method

            retrieved_text = "".join(
                f"**Document {doc.metadata.get('doc_id', 'N/A')}, "
                f"Chunk {doc.metadata.get('chunk_id', 'N/A')}**:\n{doc.page_content}\n\n"
                for doc in similar_docs
            )
            return retrieved_text.strip(), refined_query_for_retrieval, method

        def is_query_meaningful(self, query: str) -> bool:
            """Return False for empty/very short queries or ones containing a filler phrase."""
            if not query or len(query.strip()) < 3:
                return False
            query_lower = query.lower()
            return not any(phrase in query_lower for phrase in self._MEANINGLESS_PHRASES)

        def extract_keywords(self, query: str) -> str:
            """Return the contents of all ``{...}`` groups in *query*, joined by ', '."""
            keywords = re.findall(r'\{(.*?)\}', query)
            return ', '.join(keywords) if keywords else ''

        def generate_useful_query(self, query: str) -> str:
            """Ask the chat model for search terms; fall back to *query* on any problem."""
            if not self.openai_client:
                print("Warning: OpenAI client not available for generate_useful_query. Returning original query.")
                return query
            instruction = ("Extract named entities, specific technical terms, and key concepts "
                           "from the following query that are most relevant for a semantic vector search. "
                           "Provide ONLY these entities/terms/concepts, separated by spaces or commas. "
                           "Focus on proper nouns, specific technologies, or multi-word key phrases. "
                           "Do not include conversational filler or instructions like 'search for'.")
            prompt_template_str = "{instruction}\nOriginal Query: \"{query}\"\nRefined Search Terms:"
            prompt = prompt_template_str.format(query=query, instruction=instruction)
            try:
                completion = self.openai_client.chat.completions.create(
                    model="lmstudio/Meta-Llama-3.1",
                    messages=[
                        {"role": "system", "content": "You are an expert at extracting precise semantic search terms from user queries."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.2, stream=False,
                )
                # content may be None; treat that like an empty answer.
                full_response = (completion.choices[0].message.content or "").strip()
                return full_response if full_response else query
            except Exception as e:
                print(f"Error in LLM query generation: {e}")
                return query

        @staticmethod
        def _chunk_sort_key(item: Dict[str, Any]) -> Tuple[int, Any, str]:
            """Order numeric chunk ids first (ascending), then everything else by text.

            Keeps ints and placeholder strings such as "unknown" from being
            compared with each other, which would raise TypeError.
            """
            chunk_id = item["metadata"].get('chunk_id')
            if isinstance(chunk_id, (int, float)) and not isinstance(chunk_id, bool):
                return (0, chunk_id, "")
            return (1, 0, "" if chunk_id is None else str(chunk_id))

        def search_vectordb_by_id_chat(self, search_field: str, search_value: Any, k: int = 3) -> str:
            """Return every stored chunk whose ``doc_id`` metadata equals *search_value*.

            ``search_field`` and ``k`` are accepted for call compatibility but
            are not used: the filter is always on ``doc_id`` and all matching
            chunks are returned. The result is formatted text, or a message
            describing a miss or an error.
            """
            # Explicit None check: a store object that evaluates as falsy
            # (for instance one reporting length 0) must still be searched.
            if self.vectordb is None or not hasattr(self.vectordb, '_collection'):
                return "Error: VectorDB not properly initialized for ID search."
            collection = self.vectordb._collection
            try:
                results = collection.get(where={"doc_id": search_value})
                if not (results and results['documents']):
                    return "No documents found for this Document ID."

                metadatas = results['metadatas'] or []
                doc_meta_pairs = []
                for index, stored_id in enumerate(results['ids']):
                    metadata = metadatas[index] if index < len(metadatas) else None
                    if metadata is None:
                        # No metadata stored for this entry: use a placeholder.
                        metadata = {"doc_id": search_value, "chunk_id": "unknown"}
                    doc_meta_pairs.append({
                        "content": results['documents'][index],
                        "metadata": metadata,
                        "id": stored_id,
                    })

                formatted_texts = []
                for item in sorted(doc_meta_pairs, key=self._chunk_sort_key):
                    chunk_id = item["metadata"].get('chunk_id', 'N/A')
                    title = item["metadata"].get('Title', 'N/A')
                    doc_text = (
                        f"**Document ID {search_value} (Title: {title}), Chunk {chunk_id}**:\n"
                        f"{item['content']}\n"
                    )
                    formatted_texts.append(doc_text.strip())
                return "\n\n".join(formatted_texts) if formatted_texts else "No content found for this Document ID after filtering/sorting."
            except Exception as e:
                print(f"Error searching vectordb by ID: {e}")
                return f"Error occurred during Document ID search: {str(e)}"

    DocumentRetrieverClass = DocumentRetriever
else:
    DocumentRetrieverClass = AssetDocumentRetriever # type: ignore
    print("Using DocumentRetriever from assets file.")

# --- Embedding Class ---
class CustomEmbeddingForGradio:
    """Embedding adapter exposing ``embed_documents`` / ``embed_query``.

    All vectors come from ``client.embeddings.create``; the client may be
    None, in which case every embedding call raises.
    """

    def __init__(self, openai_client: Optional[OpenAIClient]):
        self.client = openai_client

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed each text in order; raises ValueError when no client is set."""
        if not self.client:
            raise ValueError("OpenAI client not initialized for embeddings.")
        vectors = []
        for single_text in texts:
            vectors.append(self.get_embedding(single_text))
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Embed one query string; raises ValueError when no client is set."""
        if not self.client:
            raise ValueError("OpenAI client not initialized for embeddings.")
        return self.get_embedding(text)

    def get_embedding(self, text: str, model: str = "nomic-embed-text") -> List[float]:
        """Request one embedding for *text* with newlines replaced by spaces.

        Raises ConnectionError when no client is set; any error from the
        request is printed and re-raised unchanged.
        """
        text = " ".join(text.split("\n"))
        if not self.client:
            raise ConnectionError("OpenAI client is not available for embeddings.")
        try:
            reply = self.client.embeddings.create(input=[text], model=model)
            first_item = reply.data[0]
            return first_item.embedding
        except Exception as e:
            print(f"ERROR during embedding generation for text '{text[:50]}...': {e}")
            raise
# Module-level embedding adapter built around oai_client (which is None when
# client initialisation failed; embedding calls then raise). Presumably this
# is what gets passed as the embedding function when building the Chroma
# store — confirm at the call sites.
embedding_function = CustomEmbeddingForGradio(openai_client=oai_client)

# --- Database Processing Functions (for RAG ChromaDB) ---
def load_docs_from_sqlite2(sqlite_db_path: str, table_name: str = "document_table", # Defaulted to new table name
                          id_column: str = "ID", text_column: str = "Abstract", 
                          body_column: str = "Body", title_column: str = "Title") \
                          -> Tuple[List[Document], str, int]:
    """Load rows of an SQLite table as Documents (abstract + body as content).

    Args:
        sqlite_db_path: Path of the SQLite database file.
        table_name: Table to read; if it does not exist the first table listed
            in ``sqlite_master`` is used instead.
        id_column: Column stored as ``metadata['ID']`` (required).
        text_column: Column holding the abstract text.
        body_column: Column holding the body text.
        title_column: Column stored as ``metadata['Title']``.

    Returns:
        ``(documents, status_message, unique_id_count)``. Every failure is
        reported as ``([], message, 0)``; nothing is raised.
    """
    def _quote(identifier: str) -> str:
        # Double-quote an SQL identifier so unusual table/column names work.
        return '"' + identifier.replace('"', '""') + '"'

    def _text(value) -> str:
        # SQL NULL must become empty text, not the literal string "None".
        return "" if value is None else str(value)

    documents: List[Document] = []
    processed_ids = set()  # Track unique document IDs
    conn = None
    try:
        conn = sqlite3.connect(sqlite_db_path)
        cursor = conn.cursor()

        # Resolve the table BEFORE touching it; counting rows of a missing
        # table would raise and make the fallback below unreachable.
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?;", (table_name,))
        if not cursor.fetchone():
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            tables = cursor.fetchall()
            if not tables:
                return [], f"No tables found in SQLite DB: {sqlite_db_path}", 0
            requested_table = table_name
            table_name = tables[0][0]
            print(f"Table '{requested_table}' not found, using first available table: '{table_name}'")

        quoted_table = _quote(table_name)
        cursor.execute(f"SELECT COUNT(*) FROM {quoted_table}")
        total_records = cursor.fetchone()[0]
        print(f"Total records in databasetb: {total_records}")

        # Map lower-cased column names to the spelling used in the table.
        cursor.execute(f"PRAGMA table_info({quoted_table})")
        available_columns = {row[1].lower(): row[1] for row in cursor.fetchall()}

        select_cols_map = {"id": id_column, "title": title_column, "abstract": text_column, "body": body_column}
        actual_select_cols = []
        final_col_names = []
        for key, preferred_name in select_cols_map.items():
            actual_name = available_columns.get(preferred_name.lower())
            if actual_name is None:
                print(f"Warning: Column '{preferred_name}' for '{key}' not found in table '{table_name}'.")
                continue
            actual_select_cols.append(_quote(actual_name))
            final_col_names.append(key)

        if not actual_select_cols or ("abstract" not in final_col_names and "body" not in final_col_names):
            return [], f"Neither abstract nor body columns (or any specified content columns) found. Cannot process.", 0
        if "id" not in final_col_names:
            return [], f"ID column '{id_column}' not found. Cannot process.", 0

        cursor.execute(f"SELECT {', '.join(actual_select_cols)} FROM {quoted_table}")

        for row_data in cursor.fetchall():
            row = dict(zip(final_col_names, row_data))
            record_id = row.get("id")
            # The "Untitled" default applies only when the title column is absent.
            title_content = row.get("title", f"Untitled Doc {record_id}")
            combined_text = (_text(row.get("abstract")) + " " + _text(row.get("body"))).strip()

            if not combined_text:
                continue  # rows without any text are skipped
            processed_ids.add(record_id)
            metadata = {
                'ID': record_id if record_id is not None else "",
                'Title': title_content if title_content is not None else "",
            }
            documents.append(Document(page_content=combined_text, metadata=metadata))

        if not documents:
            return [], f"No processable documents found in {sqlite_db_path} (table: {table_name}).", 0

        return documents, f"Loaded {len(processed_ids)} unique documents from {sqlite_db_path} (table: {table_name}).", len(processed_ids)

    except sqlite3.Error as e:
        return [], f"SQLite error processing {sqlite_db_path}: {e}", 0
    except Exception as e:
        return [], f"Unexpected error processing {sqlite_db_path}: {e}", 0
    finally:
        # Close on every path, including the error returns above.
        if conn is not None:
            conn.close()



def load_docs_from_sqlite(
    sqlite_db_path: str,
    table_name: str = "document_table",
    id_column: str = "ID",  # Column for the primary ID of the document in metadata['ID']
    title_column: str = "Title",
    abstract_column: str = "Abstract",
    body_column: str = "Body",
    # For EndNote Record Number or other specific IDs you want in metadata:
    # Maps desired metadata key to actual DB column name
    additional_metadata_cols: Optional[Dict[str, str]] = None
) -> Tuple[List[Document], str, int]:
    """
    Loads documents from an SQLite database, creating Langchain Document objects.

    Column names are matched case-insensitively. A row becomes a Document only
    when its abstract and/or body contain text; SQL NULL counts as empty text.

    Args:
        sqlite_db_path: Path to the SQLite database file.
        table_name: Name of the table containing the documents. If it does not
                    exist, the first table listed in sqlite_master is used.
        id_column: Name of the DB column to use for metadata['ID']. This ID is used
                   to count unique documents.
        title_column: Name of the DB column for the title.
        abstract_column: Name of the DB column for the abstract.
        body_column: Name of the DB column for the body text.
        additional_metadata_cols: Dictionary mapping desired metadata keys to actual
                                   DB column names for other fields to include.
                                   Example: {"endnote_id": "RecordNumberFromEndnote"}

    Returns:
        A tuple: (list of Documents, status message, count of unique documents loaded).
        Failures are reported as ([], message, 0); no exception is raised.
    """
    def _quote(identifier: str) -> str:
        # Double-quote an SQL identifier so unusual table/column names work.
        return '"' + identifier.replace('"', '""') + '"'

    def _clean(value) -> str:
        # SQL NULL must become empty text, not the literal string "None".
        return "" if value is None else str(value).strip()

    documents: List[Document] = []
    processed_sqlite_ids = set() # Tracks unique IDs from the id_column
    conn = None

    try:
        conn = sqlite3.connect(sqlite_db_path)
        cursor = conn.cursor()

        # 1. Determine actual table name (fallback if specified one not found)
        actual_table_name = table_name
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?;", (actual_table_name,))
        if not cursor.fetchone():
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            tables = cursor.fetchall()
            if not tables:
                return [], f"No tables found in SQLite DB: {sqlite_db_path}", 0
            actual_table_name = tables[0][0]
            print(f"Warning: Table '{table_name}' not found. Using first available table: '{actual_table_name}'")
        quoted_table = _quote(actual_table_name)

        # 2. Diagnostic: count total records in the identified table
        cursor.execute(f"SELECT COUNT(*) FROM {quoted_table}")
        total_records_in_db_table = cursor.fetchone()[0]
        print(f"Total records found in SQLite table '{actual_table_name}': {total_records_in_db_table}")

        # 3. Available columns: {lowercase_name: OriginalCaseName}
        cursor.execute(f"PRAGMA table_info({quoted_table})")
        db_columns_info = {row[1].lower(): row[1] for row in cursor.fetchall()}

        # 4. Internal key -> preferred DB column name
        column_selection_map = {
            "doc_id_val": id_column,       # For metadata['ID'] and uniqueness check
            "title_val": title_column,
            "abstract_val": abstract_column,
            "body_val": body_column,
        }
        for meta_key, db_col_name in (additional_metadata_cols or {}).items():
            if meta_key in column_selection_map: # Avoid overwriting core keys
                print(f"Warning: Additional metadata key '{meta_key}' conflicts with a core processing key. Ignoring.")
            else:
                column_selection_map[meta_key] = db_col_name

        select_query_parts = []  # quoted DB column names for the SELECT statement
        active_processing_keys = [] # internal keys, parallel to select_query_parts

        for key, preferred_db_col_name in column_selection_map.items():
            original_case_name = db_columns_info.get(preferred_db_col_name.lower())
            if original_case_name is not None:
                select_query_parts.append(_quote(original_case_name))
                active_processing_keys.append(key)
            elif key in ("doc_id_val", "abstract_val", "body_val"):
                # Warn only for essential columns; title and extras are optional.
                print(f"Warning: Column '{preferred_db_col_name}' (for internal key '{key}') not found in table '{actual_table_name}'.")

        if "doc_id_val" not in active_processing_keys:
            return [], f"Required ID column '{id_column}' (for metadata['ID']) not found in table '{actual_table_name}'. Cannot process.", 0
        if "abstract_val" not in active_processing_keys and "body_val" not in active_processing_keys:
            return [], f"Neither abstract column ('{abstract_column}') nor body column ('{body_column}') found. No content to process.", 0
        if not select_query_parts:
             return [], f"No columns to select based on specified parameters from table '{actual_table_name}'.", 0

        # 5. Execute query
        cursor.execute(f"SELECT {', '.join(select_query_parts)} FROM {quoted_table}")

        # 6. Process fetched rows
        fetched_rows = cursor.fetchall()

        for row_tuple in fetched_rows:
            row_dict = dict(zip(active_processing_keys, row_tuple))

            # Primary ID for the document object (will go into metadata['ID'])
            current_doc_id = row_dict.get("doc_id_val")

            title_content = _clean(row_dict.get("title_val"))
            if not title_content:
                title_content = (
                    f"Untitled Document {current_doc_id}"
                    if current_doc_id is not None
                    else "Untitled Document"
                )

            abstract_content = _clean(row_dict.get("abstract_val"))
            body_content = _clean(row_dict.get("body_val"))
            combined_text = (abstract_content + " " + body_content).strip()

            if not combined_text:
                continue  # Only create a Document if there's actual text content

            if current_doc_id is not None: # None is not counted as a unique ID
                processed_sqlite_ids.add(current_doc_id)

            metadata = {
                'ID': current_doc_id, # This 'ID' is what chunk_texts_with_metadata expects
                'Title': title_content,
                'source_db_table': actual_table_name
            }
            # Copy the additional columns that were actually fetched.
            for key in (additional_metadata_cols or {}):
                if key in row_dict:
                    metadata[key] = row_dict[key]

            documents.append(Document(page_content=combined_text, metadata=metadata))

        num_unique_docs_loaded = len(processed_sqlite_ids)

        if not documents:
            msg = (f"No processable documents (with content) found in table '{actual_table_name}'. "
                   f"Total records in DB table was: {total_records_in_db_table}. "
                   f"Rows fetched by query: {len(fetched_rows)}.")
            return [], msg, 0

        status_msg = (f"Loaded {num_unique_docs_loaded} unique documents (based on '{id_column}' values) "
                      f"from table '{actual_table_name}'. "
                      f"Total Langchain Documents created: {len(documents)}. "
                      f"(SQLite table initially had {total_records_in_db_table} records).")
        return documents, status_msg, num_unique_docs_loaded

    except sqlite3.Error as e:
        return [], f"SQLite error processing '{sqlite_db_path}' (table: '{table_name}'): {e}", 0
    except Exception as e:
        return [], f"Unexpected error processing '{sqlite_db_path}' (table: '{table_name}'): {e}", 0
    finally:
        if conn:
            conn.close()
    

def chunk_texts_with_metadata2(docs: List[Document], chunk_size: int = 2000, chunk_overlap: int = 200) -> List[Document]:
    """Split *docs* into chunks and tag each chunk with 'doc_id' and 'chunk_id'.

    All documents are split in a single call. 'doc_id' is copied from the
    chunk's 'ID' metadata (or 'doc_unknown_<n>' when that key is absent) and
    'chunk_id' is the chunk's position in the returned list.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunked_texts = text_splitter.split_documents(docs)

    for position, text_chunk in enumerate(chunked_texts):
        chunk_metadata = text_chunk.metadata
        chunk_metadata['doc_id'] = chunk_metadata.get('ID', f'doc_unknown_{position}')
        chunk_metadata['chunk_id'] = position

    return chunked_texts


def chunk_texts_with_metadata(
    docs: List[Document], 
    chunk_size: int = 2000, 
    chunk_overlap: int = 200,
    original_id_key: str = 'ID' # The metadata key for the original document's ID
) -> List[Document]:
    """
    Chunks documents and assigns 'doc_id' and 'chunk_id' metadata.

    Each document is split on its own so that all of its chunks share one
    'doc_id': the value of metadata[original_id_key], or the generated
    'original_doc_idx_<index>' when that value is missing or None.
    'chunk_id' is a sequential number that is unique across the whole result.

    Args:
        docs: Documents to split.
        chunk_size: Passed to RecursiveCharacterTextSplitter.
        chunk_overlap: Passed to RecursiveCharacterTextSplitter.
        original_id_key: Metadata key holding the original document's ID.

    Returns:
        The chunks of all documents, in input order.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, 
        chunk_overlap=chunk_overlap
    )

    all_processed_chunks: List[Document] = []
    # Identifiers assigned as 'doc_id'; only used for the summary printed below.
    processed_original_doc_ids: set = set()

    for original_doc_index, original_doc in enumerate(docs):
        source_metadata = original_doc.metadata or {}
        current_original_doc_id = source_metadata.get(original_id_key)
        if current_original_doc_id is None:
            # Fallback keeps all chunks of this document grouped under one id.
            current_original_doc_id = f"original_doc_idx_{original_doc_index}"
        processed_original_doc_ids.add(current_original_doc_id)

        # split_documents expects a list; the original document's metadata is
        # carried over to each chunk by the splitter.
        for text_chunk in text_splitter.split_documents([original_doc]):
            if text_chunk.metadata is None:
                text_chunk.metadata = {}
            text_chunk.metadata['doc_id'] = current_original_doc_id
            # The number of chunks collected so far is the next sequential id.
            text_chunk.metadata['chunk_id'] = len(all_processed_chunks)
            all_processed_chunks.append(text_chunk)

    print(f"Generated {len(all_processed_chunks)} chunks.")
    print(f"These chunks represent {len(processed_original_doc_ids)} unique original documents (based on '{original_id_key}' or fallback index).")

    return all_processed_chunks

def create_or_load_chromadb(texts_to_add: Optional[List[Document]], embedding_fn: Any, 
                            persist_dir: str, mode: str = "create", force_overwrite: bool = True) \
                            -> Tuple[Optional[Chroma], str, int]:
    """Create a new persistent Chroma store or open an existing one.

    Args:
        texts_to_add: Chunks to index. Required (non-empty) in "create" mode,
            ignored in "load" mode.
        embedding_fn: Embedding function handed to Chroma.
        persist_dir: Directory of the persistent store.
        mode: "create" or "load".
        force_overwrite: In "create" mode, delete a non-empty ``persist_dir``
            first; when False a non-empty directory is reported as an error.

    Returns:
        ``(db, status_message, num_chunks)``. On any failure ``db`` is None,
        ``status_message`` describes the problem and ``num_chunks`` is 0.
    """
    status_message = ""
    db = None
    num_chunks = 0
    persist_path = Path(persist_dir)

    if mode == "create":
        # Validate the input BEFORE touching the directory, so a call without
        # texts can never wipe an existing database for nothing.
        if not texts_to_add:
            return None, "No texts provided to create new ChromaDB.", 0

        try:
            # iterdir() also lists hidden files, so "empty" really means empty.
            if persist_path.exists() and any(persist_path.iterdir()):
                if not force_overwrite:
                    return None, f"DB directory '{persist_path}' is not empty. Use 'Load Existing' or enable overwrite.", 0
                shutil.rmtree(persist_path)
                status_message += f"Overwriting existing DB at {persist_path}. "
            # Covers the missing, the just-removed and the already-empty case.
            persist_path.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            # E.g. permission problems, or persist_dir pointing at a file.
            err_msg = f"Error preparing ChromaDB directory {persist_path}: {e}"
            print(err_msg)
            return None, err_msg, 0

        try:
            print(f"Attempting to create ChromaDB with {len(texts_to_add)} text chunks in {persist_path}...")
            start_time = time.time()
            db = Chroma.from_documents(texts_to_add, embedding_fn, persist_directory=str(persist_path))
            end_time = time.time()
            num_chunks = db._collection.count() if db and hasattr(db, '_collection') else 0
            status_message += f"New ChromaDB created in {end_time - start_time:.2f}s at {persist_path}. Chunks: {num_chunks}."
            print(status_message)
        except Exception as e:
            err_msg = f"Error creating ChromaDB: {e}"
            print(err_msg)
            return None, err_msg, 0

    elif mode == "load":
        # A persisted store is recognised by its chroma.sqlite3 file.
        if not persist_path.exists() or not (persist_path / "chroma.sqlite3").exists():
            return None, f"ChromaDB not found at {persist_path} (or missing chroma.sqlite3). Cannot load.", 0
        try:
            print(f"Attempting to load ChromaDB from {persist_path}...")
            start_time = time.time()
            db = Chroma(persist_directory=str(persist_path), embedding_function=embedding_fn)
            end_time = time.time()
            num_chunks = db._collection.count() if db and hasattr(db, '_collection') else 0
            status_message += f"ChromaDB loaded in {end_time - start_time:.2f}s from {persist_path}. Chunks: {num_chunks}."
            if num_chunks == 0:
                status_message += " Warning: Loaded DB is empty."
            print(status_message)
        except Exception as e:
            err_msg = f"Error loading ChromaDB from {persist_path}: {e}"
            print(err_msg)
            return None, err_msg, 0
    else:
        return None, f"Invalid mode: {mode}.", 0
    return db, status_message, num_chunks


def list_sqlite_db_files(base_path: Path = BASE_DOCS_PATH) -> List[Tuple[str, str]]:
    """Recursively collect SQLite files below ``base_path`` as dropdown choices.

    Args:
        base_path: Directory to walk.

    Returns:
        A list of ``(display, value)`` tuples sorted by display text.
        ``display`` is the file path relative to ``base_path`` and ``value``
        is the full path string. When the base path is not a directory, or no
        file matches, a single placeholder tuple with an empty value is
        returned so every result has the same shape.
    """
    if not base_path.is_dir():
        return [("Error: Base document path not found or not a directory.", "")]

    db_files: List[Tuple[str, str]] = []
    for root, _, files in os.walk(base_path):
        for file_name in files:
            if not file_name.lower().endswith(('.db', '.sqlite', '.sqlite3')):
                continue
            full_path = Path(root) / file_name
            try:
                # Short label for the UI; the full path stays available as value.
                display_path = str(full_path.relative_to(base_path))
            except ValueError:
                # Not expressible relative to base_path: fall back to the bare name.
                display_path = full_path.name
            db_files.append((display_path, str(full_path)))  # (Display, Value)

    if not db_files:
        return [("No SQLite DBs found in 'docs' or its subdirectories.", "")]  # (Display, Value)

    return sorted(db_files, key=lambda choice: choice[0])



def list_potential_db_sources(base_path: Path = BASE_DOCS_PATH) -> List[str]:
    """List the immediate subfolders of ``base_path`` usable as a RAG source.

    A subfolder qualifies when it directly contains a file ending in
    .db/.sqlite/.sqlite3, or ``chroma.sqlite3``, or ``chroma_db/chroma.sqlite3``.
    Returns the sorted folder names, or a one-element list holding an
    explanatory message when the base path is invalid or nothing qualifies.
    """
    if not base_path.is_dir():
        return ["Error: Base document path not found or not a directory."]

    sqlite_suffixes = ('.db', '.sqlite', '.sqlite3')

    def _qualifies(candidate: Path) -> bool:
        # Only directories are considered collection folders.
        if not candidate.is_dir():
            return False
        if any(entry.lower().endswith(sqlite_suffixes) for entry in os.listdir(candidate)):
            return True
        if (candidate / "chroma.sqlite3").exists():
            return True
        return (candidate / "chroma_db" / "chroma.sqlite3").exists()

    found = sorted(name for name in os.listdir(base_path) if _qualifies(base_path / name))
    if found:
        return found
    return ["No DB sources found. Check 'docs' subdirs for .db/.sqlite files or ChromaDB structures."]

def list_sqlite_files_in_folder(folder_path_str: Optional[str]) -> List[str]:
    """Return the sorted names of SQLite files sitting directly in a folder.

    An empty/None path or a path that is not a directory yields an empty list.
    Only regular files whose lower-cased name ends in .db, .sqlite or .sqlite3
    are reported; subdirectories are not searched.
    """
    folder = Path(folder_path_str) if folder_path_str else None
    if folder is None or not folder.is_dir():
        return []

    wanted_suffixes = ('.db', '.sqlite', '.sqlite3')
    names = []
    for entry in folder.iterdir():
        if entry.is_file() and entry.name.lower().endswith(wanted_suffixes):
            names.append(entry.name)
    names.sort()
    return names


def update_rag_sqlite_selector(selected_source_folder_name: str, db_mode: str):
    """Build the Gradio update for the "specific SQLite file" dropdown.

    The dropdown is shown only in "Create New ChromaDB (from SQLite)" mode with
    a real source folder selected (not one of the placeholder messages). It is
    then filled with that folder's SQLite files, or shown empty with an
    explanatory ``info`` text when the folder has none. Otherwise it is hidden.
    """
    creating = db_mode == "Create New ChromaDB (from SQLite)"
    if not creating or not selected_source_folder_name:
        return gr.update(choices=[], value=None, visible=False)

    # Placeholder entries produced by the source listing are not real folders.
    if selected_source_folder_name.startswith(("Error", "No DB sources found")):
        return gr.update(choices=[], value=None, visible=False)

    folder = BASE_DOCS_PATH / selected_source_folder_name
    candidates = list_sqlite_files_in_folder(str(folder))
    if not candidates:
        # Keep the dropdown visible so the user sees why nothing can be picked.
        return gr.update(choices=[], value=None, visible=True,
                         info="No SQLite files found in the selected folder.")
    return gr.update(choices=candidates, value=candidates[0], visible=True)
    
# --- Prompt Template & History Conversion ---
# Text of the single user message sent to the LLM. The placeholders are filled
# by prompt_template.format(...); {answer} is formatted with an empty string so
# the prompt ends right after "Answer:".
template_str = """
Use the following Retrieved Documents and the previous conversation to answer the query. Incorporate your own knowledge and reasoning as an AI assistant. Keep the content together with document and chunk numbers at the end of sentences in brackets. If content from different documents is combined into a new sentence, place the document and chunk numbers at the end of the sentence, separated by semicolons.
Previous Conversation: {history}
Query: {query}
Retrieved Documents: {retrieved_docs}
Answer:
{answer}
"""
prompt_template = PromptTemplate(
    input_variables=["history", "query", "retrieved_docs", "answer"], template=template_str
)
def convert_from_gradio_chat(gradio_chat_history: List[Tuple[Optional[str], Optional[str]]]) -> List[Dict[str, str]]:
    """Flatten Gradio ``(user, assistant)`` pairs into role/content messages.

    Each pair contributes a "user" message and then an "assistant" message;
    a side that is None or empty is skipped.
    """
    messages = []
    for user_side, assistant_side in gradio_chat_history:
        for role, content in (("user", user_side), ("assistant", assistant_side)):
            if content:
                messages.append({"role": role, "content": content})
    return messages

# --- Main Chat Interaction Function ---
def handle_chat_interaction_gradio(query_text: str, chat_history_tuples: List[Tuple[Optional[str], Optional[str]]],
                                   selected_method_value: str, k_value: int, vectordb_state: Optional[Chroma]):
    """Run one RAG chat turn, yielding UI updates as it progresses.

    Every yielded value is a 6-tuple:
    ``(chat_history, query_box_text, prompt_text, used_query_text,
    retrieval_stats, llm_status)``.

    Steps: retrieve documents for the query, build the prompt (a query
    containing the literal ``{no history}`` is sent without conversation
    history), yield the retrieval details, then stream the LLM answer into
    the last chat entry. On success the final yield clears the query box;
    on any error the query text is kept and the error is appended to the
    chat history.

    Args:
        query_text: Text typed by the user.
        chat_history_tuples: Current chatbot history as [user, assistant] pairs.
        selected_method_value: Retrieval method passed to the document retriever.
        k_value: Number of chunks to retrieve.
        vectordb_state: Loaded vector store; a falsy value is reported as
            "not loaded".
    """
    start_time = time.time()
    # Tolerate a missing history instead of failing on list concatenation.
    chat_history_tuples = chat_history_tuples or []
    if not vectordb_state:
        err_msg = "VectorDB not loaded. Please load or create a DB first using the 'Database Management' section."
        updated_history = chat_history_tuples + [[query_text, err_msg]]
        yield (updated_history, query_text, "Error: No DB", "Error: No DB", "Error: No DB", err_msg)
        return

    app_conv_history = convert_from_gradio_chat(chat_history_tuples)
    managed_history_str = manage_conversation_history(app_conv_history)
    
    doc_retriever = DocumentRetrieverClass(vectordb_state, openai_client=oai_client)
    retrieved_docs_str, used_query_for_retrieval, _ = doc_retriever.retrieve_documents(
        query_text, is_first_run=(not app_conv_history), k=k_value, method=selected_method_value
    )

    used_query_display = f"**Used Retrieval Query:**  \n{used_query_for_retrieval}\n"
    query_to_format = query_text
    if '{no history}' in query_text:
        # User opt-out marker: drop the history and strip the marker itself.
        history_for_prompt = ""
        query_to_format = query_text.replace('{no history}', '').strip()
    else:
        history_for_prompt = managed_history_str
    
    current_prompt = prompt_template.format(
        history=history_for_prompt, query=query_to_format, retrieved_docs=retrieved_docs_str, answer=""
    )
    prompt_display_text = current_prompt 

    retrieval_end_time = time.time()
    retrieval_duration = retrieval_end_time - start_time
    retrieved_tokens_count = word_count(retrieved_docs_str)
    retrieval_time_msg = (f"Retrieval: {retrieval_duration:.2f}s | Tokens: {retrieved_tokens_count} | Method: {selected_method_value} | k: {k_value}")

    # Show the retrieval result immediately, before the LLM call starts.
    yield (chat_history_tuples, query_text, prompt_display_text, used_query_display, retrieval_time_msg, "Waiting for LLM...")

    messages_for_llm = [
        {"role": "system", "content": "You are a scientific document analysis AI."},
        {"role": "user", "content": current_prompt}
    ]
    
    if not oai_client:
        err_msg = "OpenAI client not available. Cannot contact LLM."
        updated_history = chat_history_tuples + [[query_text, err_msg]]
        yield (updated_history, query_text, prompt_display_text, used_query_display, retrieval_time_msg, err_msg)
        return

    try:
        completion = oai_client.chat.completions.create(
            model="lmstudio/Meta-Llama-3.1", messages=messages_for_llm, temperature=0.7, stream=True,
        )
    except Exception as e:
        err_msg = f"LLM API Error: {e}"
        updated_history = chat_history_tuples + [[query_text, err_msg]]
        yield (updated_history, query_text, prompt_display_text, used_query_display, retrieval_time_msg, err_msg)
        return

    full_response = ""
    current_chat_history_for_display = chat_history_tuples + [[query_text, ""]]
    llm_start_time = time.time()
    try:
        for chunk in completion:
            # Some servers emit chunks without choices (e.g. usage-only chunks).
            if not chunk.choices:
                continue
            delta_text = chunk.choices[0].delta.content
            if not delta_text:
                continue
            full_response += delta_text
            current_chat_history_for_display[-1][1] = full_response
            yield (current_chat_history_for_display, query_text, prompt_display_text, used_query_display, retrieval_time_msg, "Streaming LLM response...")
    except Exception as e:
        # The connection can fail mid-stream; keep whatever text already arrived.
        err_msg = f"LLM API Error: {e}"
        current_chat_history_for_display[-1][1] = f"{full_response}\n\n{err_msg}" if full_response else err_msg
        yield (current_chat_history_for_display, query_text, prompt_display_text, used_query_display, retrieval_time_msg, err_msg)
        return
    
    llm_end_time = time.time()
    message_tokens_llm = word_count(str(messages_for_llm))
    history_tokens_llm = word_count(managed_history_str)
    total_interaction_time = llm_end_time - start_time
    gpt_response_time_msg = (f"Total Interaction: {total_interaction_time:.2f}s (LLM: {llm_end_time - llm_start_time:.2f}s) | "
                             f"LLM In Tokens (approx): {message_tokens_llm} | Hist Tokens (approx): {history_tokens_llm}")
    # Final update: empty string clears the query box.
    yield (current_chat_history_for_display, "", prompt_display_text, used_query_display, retrieval_time_msg, gpt_response_time_msg)


# --- UI Definition ---
with gr.Blocks(theme=gr.themes.Soft(), title="Scientific Document Assistant") as demo:
    # State holders: the RAG Chroma store, and an (optional) SQLite viewer
    # connection that could also simply be reopened on each use.
    vectordb_state = gr.State(None)
    sqlite_viewer_conn_state = gr.State(None)

    # Previously saved ingestion-form values; anything that is not a dict is
    # replaced by an empty dict so the .get(...) lookups below are always valid.
    initial_ingestion_settings = load_ingestion_settings(INGESTION_SETTINGS_FILE)
    if not isinstance(initial_ingestion_settings, dict):
        initial_ingestion_settings = {}


    # Tab 1: RAG chat. Left column holds database management and retrieval
    # controls, right column holds the conversation; a collapsed accordion
    # below shows timings and the prompt for debugging.
    with gr.Tab("🔬 Scientific RAG Conversation"):
        gr.Markdown("# 🔬 Scientific RAG Conversation")
        with gr.Row():
            with gr.Column(scale=1, min_width=300):
                gr.Markdown("### 📂 RAG Database Management")
                # Choices are computed once, when the UI is built.
                db_source_dropdown = gr.Dropdown(
                    label="Select RAG Document Collection Source Folder",
                    choices=list_potential_db_sources(),
                    info="Select a subfolder from your 'docs' directory."
                )
                db_mode_radio = gr.Radio(
                    choices=["Load Existing ChromaDB", "Create New ChromaDB (from SQLite)"],
                    value="Load Existing ChromaDB", label="Action for RAG DB"
                )
                # Dropdown for the specific SQLite file; starts hidden and empty.
                rag_sqlite_file_dropdown = gr.Dropdown(
                    label="Select Specific SQLite File for New RAG DB",
                    choices=[],
                    interactive=True,
                    visible=False, # Initially hidden
                    info="Visible when 'Create New ChromaDB' is selected and source folder has SQLite files."
                )
                force_overwrite_checkbox = gr.Checkbox(
                    label="Force Overwrite (if creating RAG DB and target ChromaDB dir exists)", value=False
                )
                process_db_button = gr.Button("🔄 Process Selected RAG Database", variant="primary")
                
                
                
                # Status lines written by the RAG DB processing callback.
                db_status_message = gr.Markdown("RAG DB Status: No Database Loaded.")
                num_docs_loaded_info = gr.Markdown("Original Docs in RAG Source: 0 | Chunks in RAG DB: 0")

                gr.Markdown("---")
                gr.Markdown("### ⚙️ Chat Controls")
                selected_method_dd = gr.Dropdown(label='Retrieval Keyword Generation Method',
                                                 choices=['combined', 'keywords', 'llm', 'original_query'], value='combined',
                                                 info="How to refine query for retrieval. 'original_query' uses input as is.")
                k_value_slider = gr.Slider(minimum=1, maximum=50, value=10, step=1, label='Number of Chunks to Retrieve (K)')

            with gr.Column(scale=3):
                chatbot_display = gr.Chatbot(label="Conversation", height=600, bubble_full_width=False, show_label=False)
                query_input_box = gr.Textbox(label="Enter your query:", placeholder="Type your message (e.g., 'doc_id:123' or natural language) and press Enter...",
                                             lines=3, show_label=False, elem_id="query-input-box")

        # Debug read-outs for the chat; collapsed by default.
        with gr.Accordion("🔍 Timings and Debug Information (RAG Chat)", open=False):
            retrieval_time_md = gr.Markdown("Retrieval Time: N/A")
            response_time_md = gr.Markdown("LLM Response Time: N/A")
            used_query_md = gr.Markdown("Used Retrieval Query: N/A")
            prompt_display_md = gr.Markdown("Full Prompt to LLM: N/A")


    # Tab 2: data ingestion (left column) and SQLite record viewer (right column).
    with gr.Tab("📄 Data Ingestion & DB Management"):
        gr.Markdown("# 📄 Data Ingestion & SQLite Database Management")

        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown("### 1. Process PDF/Text to SQLite Database")

                # Upload widget for the files to ingest.
                ingest_input_files = gr.File(
                    label="Upload PDF Files or a single TXT File",
                    file_count="multiple",  # Allows multiple PDFs or one TXT
                    file_types=[".pdf", ".txt"],
                    # value=initial_ingestion_settings.get('ingest_input_path', None) # .File doesn't easily take a default path string value
                    # We'll handle "default" or remembered paths differently if needed, perhaps by showing the last processed path.
                )
                ingest_is_directory_mode = gr.Checkbox(
                    label="Alternatively, process files from a server directory path (instead of uploads)", 
                    value=False # Default to upload mode
                )
                ingest_server_directory_path = gr.Textbox(
                    label="Server Directory Path (if checkbox above is ticked)",
                    placeholder="e.g., /path/to/your/pdfs_on_server",
                    value=initial_ingestion_settings.get('ingest_server_directory_path', ''), # Restored from saved settings
                    visible=False # Initially hidden
                )

                # Show/hide server directory path based on checkbox
                def toggle_server_path_visibility(is_directory_mode_checked):
                    """Return an update that shows the server-path textbox only when the checkbox is ticked."""
                    return gr.update(visible=is_directory_mode_checked)

                ingest_is_directory_mode.change(
                    fn=toggle_server_path_visibility,
                    inputs=[ingest_is_directory_mode],
                    outputs=[ingest_server_directory_path]
                )

                # The remaining form fields default to the saved settings, with
                # hard-coded fallbacks when a key is missing.
                ingest_output_dir = gr.Textbox(
                    label="Output Directory for SQLite DB",
                    placeholder="e.g., /path/to/output_dbs (DB will be saved here)",
                    value=initial_ingestion_settings.get('ingest_output_dir', str(project_root_path / "processed_databases"))
                )
                ingest_db_name_stem = gr.Textbox(
                    label="SQLite Database Name (without .db)",
                    placeholder="e.g., my_collection",
                    value=initial_ingestion_settings.get('ingest_db_name_stem', 'processed_docs')
                )
                ingest_processing_mode = gr.Radio(
                    choices=["grobid", "text", "both"], value=initial_ingestion_settings.get('ingest_processing_mode', "grobid"),
                    label="Processing Mode", info="Grobid for PDFs, Text for structured TXT, Both to try based on file type."
                )
                ingest_overwrite_db = gr.Checkbox(label="Overwrite SQLite DB if it exists", value=initial_ingestion_settings.get('ingest_overwrite_db', False))
                ingest_grobid_config = gr.Textbox(
                    label="GROBID Config Path (optional)", placeholder="config.json",
                    value=initial_ingestion_settings.get('ingest_grobid_config', 'config.json'),
                    info="Path to GrobidClient's config.json, if not in CWD."
                )

                ingest_process_button = gr.Button("⚙️ Process Files to SQLite", variant="primary")
                # Outputs of the ingestion callback.
                ingest_status_md = gr.Markdown("Ingestion Status: Ready")
                ingest_output_db_path_md = gr.Markdown("Created DB Path: N/A")

            # Right-hand column: the SQLite viewer.
            with gr.Column(scale=3):
                gr.Markdown("### 2. View SQLite Database Records")
                
                # A dropdown of discovered DB files plus a textbox for manual path entry.
                view_sqlite_db_path_dropdown = gr.Dropdown(
                    label="Select an existing SQLite DB to view (from 'docs' folder)",
                    choices=list_sqlite_db_files(), # List of (display, value) tuples, computed when the UI is built
                    interactive=True,
                    # info="Or paste path below and click 'Load SQLite DB'."
                )
                view_sqlite_db_path_textbox = gr.Textbox(
                    label="Path to SQLite DB file (manual entry or auto-filled from selection/creation)", 
                    placeholder="Select from dropdown, or paste path, or will use DB from Ingestion tab"
                )
                view_load_db_button = gr.Button("📂 Load SQLite DB for Viewing")
                
                view_table_name_info = gr.Markdown("Table: N/A")
                
                view_record_dropdown = gr.Dropdown(label="Select Record (ID: Title)", choices=[], interactive=True)
                view_record_details_md = gr.Markdown("Record Number: N/A | Author: N/A | Date: N/A")
                view_record_content_text = gr.Textbox(label="Record Content (Abstract / Body)", lines=15, interactive=False, autoscroll=False)
                view_status_md = gr.Markdown("Viewer Status: Ready")

    # --- RAG DB Processing Logic ---
    def process_database_selection_ui(
        selected_source_folder_name: str, # From db_source_dropdown
        db_mode: str,                     # From db_mode_radio
        selected_sqlite_file_name: Optional[str], # From rag_sqlite_file_dropdown
        overwrite_flag: bool              # From force_overwrite_checkbox
    ) -> Tuple[Optional[Chroma], str, str]:
        """Load or create the RAG ChromaDB for the selected source folder.

        "Load Existing ChromaDB" looks for ``chroma.sqlite3`` in the folder
        itself, then in its ``chroma_db`` subfolder, and loads the first one
        found. "Create New ChromaDB (from SQLite)" reads the selected SQLite
        file, chunks its documents and builds a store in
        ``chroma_<sanitized stem>_db`` inside the source folder.

        Returns:
            ``(vectordb_or_None, status_message, counts_text)`` matching the
            outputs ``[vectordb_state, db_status_message, num_docs_loaded_info]``.
        """

        # Reject an empty selection and the placeholder entries of the source dropdown.
        if not selected_source_folder_name or \
           selected_source_folder_name.startswith("Error") or \
           selected_source_folder_name.startswith("No DB sources found"):
            return None, "Error: No valid RAG DB source folder selected.", "Original Docs: 0 | Chunks in DB: 0"

        source_collection_path = BASE_DOCS_PATH / selected_source_folder_name
        new_vectordb = None
        status_msg = ""
        total_original_docs = 0
        num_db_chunks = 0
        # determined_chroma_persist_dir_str = "" # Will be set specifically

        if db_mode == "Load Existing ChromaDB":
            # Only two locations are probed: the source folder itself and its
            # 'chroma_db' subfolder. Stores named after a SQLite file
            # (chroma_<stem>_db, as produced by the create branch below) are
            # NOT discovered here.

            chroma_path_default_subdir = source_collection_path / "chroma_db" # Default location
            chroma_path_in_source_dir = source_collection_path # Check if source_collection_path itself is a ChromaDB dir

            determined_chroma_persist_dir_str = None

            if (chroma_path_in_source_dir / "chroma.sqlite3").exists():
                determined_chroma_persist_dir_str = str(chroma_path_in_source_dir)
                status_msg += f"Found ChromaDB directly in '{selected_source_folder_name}'. "
            elif (chroma_path_default_subdir / "chroma.sqlite3").exists():
                determined_chroma_persist_dir_str = str(chroma_path_default_subdir)
                status_msg += f"Found ChromaDB in default '{selected_source_folder_name}/chroma_db'. "
            else:
                # Neither standard location holds a store: report and stop.
                # A more advanced version would list the ChromaDBs available
                # in the folder and let the user pick one.
                msg = f"Error: No standard ChromaDB found in '{selected_source_folder_name}' (checked . and ./chroma_db/). "
                status_msg += msg
                return None, status_msg, "Original Docs: 0 | Chunks in DB: 0"


            new_vectordb, load_status_msg, num_db_chunks = create_or_load_chromadb(
                None, embedding_function, determined_chroma_persist_dir_str, mode="load"
            )
            status_msg += load_status_msg
            
            # Find a SQLite file in the folder to report the original document count.
            # NOTE(review): list_sqlite_files_in_folder matches any *.sqlite3 name, so
            # chroma.sqlite3 itself can be picked when the store lives directly in the
            # source folder - confirm this is acceptable.
            sqlite_files_in_folder = list_sqlite_files_in_folder(str(source_collection_path))
            if new_vectordb and sqlite_files_in_folder:
                # Heuristic: if the store directory is named 'chroma_X_db', prefer a
                # SQLite file whose stem is 'X'; otherwise use the first file.
                sqlite_to_count_from = sqlite_files_in_folder[0] # Default to first
                if determined_chroma_persist_dir_str:
                    chroma_dir_name = Path(determined_chroma_persist_dir_str).name
                    if chroma_dir_name.startswith("chroma_") and chroma_dir_name.endswith("_db"):
                        potential_sqlite_stem = chroma_dir_name[len("chroma_"):-len("_db")]
                        for s_file in sqlite_files_in_folder:
                            if Path(s_file).stem == potential_sqlite_stem:
                                sqlite_to_count_from = s_file
                                break
                try:
                    _, _, total_original_docs = load_docs_from_sqlite(str(source_collection_path / sqlite_to_count_from))
                except Exception:
                    # -1 is rendered as 'N/A' in the counts text below.
                    total_original_docs = -1 
        
        elif db_mode == "Create New ChromaDB (from SQLite)":
            if not selected_sqlite_file_name:
                msg = "Error: No specific SQLite file selected for new RAG DB creation."
                return None, msg, "Original Docs: 0 | Chunks in DB: 0"

            sqlite_db_path = source_collection_path / selected_sqlite_file_name
            if not sqlite_db_path.exists():
                msg = f"Error: Selected SQLite file '{selected_sqlite_file_name}' not found in '{selected_source_folder_name}'."
                return None, msg, "Original Docs: 0 | Chunks in DB: 0"

            # The store directory is named after the SQLite file.
            sqlite_stem = Path(selected_sqlite_file_name).stem
            # Sanitize: every character that is not alphanumeric, '_' or '-' becomes '_'.
            safe_stem = "".join(c if c.isalnum() or c in ['_', '-'] else '_' for c in sqlite_stem)
            chroma_db_dir_name = f"chroma_{safe_stem}_db"
            determined_chroma_persist_dir = source_collection_path / chroma_db_dir_name

            print(f"Loading docs from SQLite: {sqlite_db_path} for new ChromaDB creation.")
            docs_from_sqlite, load_msg, total_original_docs = load_docs_from_sqlite(str(sqlite_db_path))
            status_msg += load_msg + " "
            
            if not docs_from_sqlite:
                return None, status_msg, f"Original Docs: {total_original_docs} | Chunks in DB: 0"

            print(f"Chunking {len(docs_from_sqlite)} documents...")
            chunked_texts = chunk_texts_with_metadata(docs_from_sqlite)
            # unique_doc_ids = len(set(chunk.metadata.get('doc_id') for chunk in chunked_texts)) # Already printed in chunk_texts_with_metadata
            status_msg += f"Chunked into {len(chunked_texts)} pieces. " # from {unique_doc_ids} documents. "
            
            print(f"Creating new ChromaDB in: {determined_chroma_persist_dir}")
            new_vectordb, create_load_msg, num_db_chunks = create_or_load_chromadb(
                chunked_texts, embedding_function, str(determined_chroma_persist_dir), 
                mode="create", force_overwrite=overwrite_flag
            )
            status_msg += create_load_msg
        
        # NOTE(review): any other db_mode value falls through to here with
        # new_vectordb None and an empty status_msg.
        num_docs_info_str = f"Original Docs (SQLite source): {total_original_docs if total_original_docs != -1 else 'N/A'} | Chunks in RAG DB: {num_db_chunks}"
        # NOTE(review): `not new_vectordb` is true for None; it would also be true for
        # an empty store if the Chroma class defines __len__ - presumably that is the
        # case the first branch is meant for; confirm against the installed version.
        if not new_vectordb and db_mode == "Load Existing ChromaDB" and num_db_chunks == 0 and "Error" not in status_msg:
            status_msg += " Loaded an empty Chroma DB." # Clarify if empty but no error
        elif not new_vectordb:
             num_docs_info_str = f"Original Docs (SQLite source): {total_original_docs if total_original_docs != -1 else 'N/A'} | Chunks in RAG DB: 0 (Failed or not processed)"
        
        return new_vectordb, status_msg, num_docs_info_str

    # Wire the "Process Selected RAG Database" button: the callback receives the
    # four selection widgets and writes the new store into vectordb_state plus
    # the two status lines.
    process_db_button.click(
        fn=process_database_selection_ui,
        inputs=[db_source_dropdown, db_mode_radio, rag_sqlite_file_dropdown, force_overwrite_checkbox],
        outputs=[vectordb_state, db_status_message, num_docs_loaded_info],
    )


    # --- Initial Greeting for Chatbot ---
    def simple_initial_greeting_ui() -> List[Tuple[Optional[str], Optional[str]]]:
        """Return the chatbot's opening history: a single turn with no user text and the welcome message."""
        welcome_turn = [
            None,
            "Hello! I am a scientific RAG assistant. Please load a RAG database from the 'RAG Database Management' panel to begin chatting, or go to the 'Data Ingestion' tab to process new documents.",
        ]
        return [welcome_turn]
    
    # --- Data Ingestion Tab Callbacks ---
    def handle_ingest_process_button_click(
            uploaded_files_list: Optional[List[Any]], # Items from gr.File: objects with a .name path, or plain path strings
            is_directory_mode: bool,
            server_dir_path: str,
            output_dir: str, 
            db_name_stem: str, 
            mode: str, 
            overwrite: bool, 
            grobid_cfg: str, 
            progress=gr.Progress(track_tqdm=True)):
        """Ingest uploaded files (or a server directory) into a SQLite database.

        The form values are saved as the new ingestion settings, the input is
        validated, uploads are copied into a private temporary directory, and
        ``process_documents_to_sqlite`` is run on that directory (or on
        ``server_dir_path`` in directory mode).

        Returns:
            Always four values, one per output of the click handler:
            ``(status_text, created_db_text, viewer_db_path, file_input_update)``.
            Validation/copy errors leave the viewer path and the file input
            untouched; after processing the file input is cleared.
        """

        def _failure(message: str):
            # One value per declared output; the last two are no-op updates.
            return message, "N/A", gr.update(), gr.update()

        # Remember the form values; keep the previously saved server path when
        # the current run used uploads instead of a server directory.
        current_ingestion_settings = {
            'ingest_server_directory_path': server_dir_path if is_directory_mode else initial_ingestion_settings.get('ingest_server_directory_path', ''),
            'ingest_output_dir': output_dir,
            'ingest_db_name_stem': db_name_stem,
            'ingest_processing_mode': mode,
            'ingest_overwrite_db': overwrite,
            'ingest_grobid_config': grobid_cfg
        }
        save_ingestion_settings(current_ingestion_settings, INGESTION_SETTINGS_FILE)

        input_target_path_for_processor: Optional[str] = None
        temp_upload_dir: Optional[Path] = None  # Set only in upload mode

        try:
            if is_directory_mode:
                if not server_dir_path:
                    return _failure("Error: Server Directory Path is required when 'process from server directory' is checked.")
                input_target_path_for_processor = server_dir_path
            else:  # Upload mode
                if not uploaded_files_list:
                    return _failure("Error: No files uploaded. Please upload PDF(s) or a TXT file.")

                # The processor is handed a directory, so gather the uploads
                # in a fresh one (nanosecond timestamp keeps runs apart).
                temp_upload_dir = project_root_path / "temp_uploads" / str(time.time_ns())
                temp_upload_dir.mkdir(parents=True, exist_ok=True)

                for uploaded_file_obj in uploaded_files_list:
                    # Accept both file-wrapper objects (.name) and plain path strings.
                    source_path = getattr(uploaded_file_obj, "name", uploaded_file_obj)
                    original_filename = Path(source_path).name
                    temp_save_path = temp_upload_dir / original_filename
                    try:
                        shutil.copy(source_path, temp_save_path)
                    except Exception as e:
                        return _failure(f"Error copying uploaded file {original_filename}: {e}")
                    print(f"Copied uploaded file {original_filename} to {temp_save_path}")

                input_target_path_for_processor = str(temp_upload_dir)

            if not output_dir or not db_name_stem:
                return _failure("Error: Output Directory and DB Name Stem are required.")

            status_msg, created_db_path = process_documents_to_sqlite(
                input_target_path_for_processor,  # Either server_dir_path or the temp upload dir
                output_dir, db_name_stem, mode, overwrite, grobid_cfg,
                progress_callback=lambda p, desc: progress(p, desc=desc)
            )
        finally:
            # Runs on every exit path, including exceptions from the processor,
            # so the temporary copies never pile up.
            if temp_upload_dir is not None:
                shutil.rmtree(temp_upload_dir, ignore_errors=True)
                print(f"Cleaned up temporary upload directory: {temp_upload_dir}")

        if created_db_path:
            # Hand the new DB path to the viewer textbox and clear the file input.
            return status_msg, f"Created DB: {created_db_path}", created_db_path, gr.update(value=None)
        return status_msg, "Failed to create DB or no DB path returned.", gr.update(), gr.update(value=None)

    # Wire the "process" button of the ingestion tab to its handler.
    # The handler receives the eight input component values positionally, in the
    # order listed under `inputs`, and must return one value per entry in
    # `outputs` (four: status markdown, output-DB-path markdown, the SQLite
    # viewer path textbox, and the file-upload component).
    # NOTE(review): the early error returns in the function body directly above
    # (presumably this handler) yield only three values -- confirm they match
    # the four outputs declared here.
    ingest_process_button.click(
        fn=handle_ingest_process_button_click,
        inputs=[
            ingest_input_files, ingest_is_directory_mode, ingest_server_directory_path,
            ingest_output_dir, ingest_db_name_stem, ingest_processing_mode, 
            ingest_overwrite_db, ingest_grobid_config
        ],
        outputs=[ingest_status_md, ingest_output_db_path_md, view_sqlite_db_path_textbox, ingest_input_files] # ingest_input_files is an output so the handler can clear the upload widget
    )

    # --- SQLite Viewer Callbacks ---
    # Placeholder for a shared viewer connection. The viewer callbacks below only
    # mention it in commented-out code; each of them opens and closes its own
    # connection instead.
    sqlite_viewer_db_connection = None # Module-level variable to hold connection if we want to keep it open

    def load_sqlite_for_viewing(db_path_str: str):
        """Open a SQLite database read-only and list its records for the viewer.

        The table shown is ``document_table`` when it exists, otherwise the
        first table listed in ``sqlite_master``. Every row of that table is
        offered in the record dropdown as a ``(label, ID)`` pair ordered by
        ``ID``; the label includes a shortened ``Title`` (or ``Abstract`` when
        there is no ``Title`` column).

        Args:
            db_path_str: Filesystem path of the SQLite database file.

        Returns:
            A 6-tuple, one value per output of ``view_load_db_button.click``:
            status text, table summary text, a ``gr.update`` for the record
            dropdown, record details (always ``""`` here), record content
            (always ``""`` here) and a final status text.
        """
        # Local import: only needed to build the connection URI.
        from urllib.parse import quote

        def _failure(status: str, final_status: str = ""):
            # Uniform error result: empty dropdown, blank record panes.
            return status, "Table: N/A", gr.update(choices=[], value=None), "", "", final_status

        def _label(record_id, title) -> str:
            # Dropdown label: the ID plus a title shortened to 70 characters.
            title_str = str(title) if title else "No Title"
            if len(title_str) > 70:
                title_str = title_str[:70] + "..."
            return f"ID {record_id}: {title_str}"

        if not db_path_str or not Path(db_path_str).is_file():
            return _failure("Error: SQLite DB path is invalid or file does not exist.")

        # Percent-encode the path so characters such as '#', '?' or '%' are not
        # parsed as URI fragment/query syntax by SQLite.
        db_uri = f"file:{quote(db_path_str)}?mode=ro"

        conn = None
        try:
            conn = sqlite3.connect(db_uri, uri=True)
            cursor = conn.cursor()

            # Prefer the conventional table name, else fall back to any table.
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='document_table';")
            table_info = cursor.fetchone()
            if not table_info:
                cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
                table_info = cursor.fetchone()
            if not table_info:
                return _failure("Error: No tables found in the database.")

            table_name = table_info[0]
            # The table name comes from the opened file, so quote it as an
            # identifier (spaces, keywords and embedded quotes stay valid SQL).
            quoted_table = '"' + table_name.replace('"', '""') + '"'

            # Total record count, shown to the user for information.
            cursor.execute(f"SELECT COUNT(*) FROM {quoted_table}")
            total_db_records = cursor.fetchone()[0]

            cursor.execute(f"PRAGMA table_info({quoted_table})")
            columns = [row[1] for row in cursor.fetchall()]

            id_col_name = "ID"  # Assumed primary-key column used for selection
            if "Title" in columns:
                title_col_name = "Title"
            elif "Abstract" in columns:
                title_col_name = "Abstract"
            else:
                title_col_name = None

            # Dropdown choices are (display_str, actual_id_value) pairs.
            if title_col_name is None:
                cursor.execute(f"SELECT {id_col_name} FROM {quoted_table} ORDER BY {id_col_name}")
                record_options_for_gradio = [(f"ID {row[0]}", row[0]) for row in cursor.fetchall()]
            else:
                cursor.execute(
                    f"SELECT {id_col_name}, {title_col_name} FROM {quoted_table} ORDER BY {id_col_name}"
                )
                record_options_for_gradio = [
                    (_label(r_id, r_title), r_id) for r_id, r_title in cursor.fetchall()
                ]

            first_val_actual_id = record_options_for_gradio[0][1] if record_options_for_gradio else None
            status_text = f"Loaded DB: {Path(db_path_str).name}. Table: {table_name} ({total_db_records} records)."

            return (status_text,
                    f"Table: {table_name} ({total_db_records} records)",
                    gr.update(choices=record_options_for_gradio, value=first_val_actual_id),
                    "",  # for view_record_details_md
                    "",  # for view_record_content_text
                    "DB Loaded. Select a record."  # for view_status_md (final one)
                   )

        except Exception as e:
            # UI boundary: report any failure in the status fields instead of
            # letting the callback raise.
            error_msg = f"Error loading SQLite DB: {e}"
            return _failure(error_msg, error_msg)
        finally:
            # Single close point for every exit path above.
            if conn:
                conn.close()

    def display_selected_sqlite_record(selected_record_actual_id: Optional[int], db_path_str: str):
        """Fetch one record by ``ID`` and format it for the viewer panes.

        The table is ``document_table`` when it exists, otherwise the first
        table listed in ``sqlite_master``. Only the known document columns that
        the table actually has are selected; if it has none of them, all
        columns are fetched. SQL NULL values are shown as missing (``N/A`` or
        omitted) rather than as the literal text ``None``.

        Args:
            selected_record_actual_id: Value of the ``ID`` column for the chosen
                record, or ``None`` when nothing is selected.
            db_path_str: Filesystem path of the SQLite database file.

        Returns:
            A 3-tuple ``(details, content, status)``. On any failure the first
            element carries the error message and ``content`` is ``""``.
        """
        # Local import: only needed to build the connection URI.
        from urllib.parse import quote

        if selected_record_actual_id is None or not db_path_str:  # None checked explicitly: ID 0 is valid
            return "No record selected or DB path missing.", "", ""

        record_id = selected_record_actual_id

        def _text(value, default=""):
            # Map SQL NULL (None) to *default*; str(None) would print "None".
            return default if value is None else str(value)

        conn = None
        try:
            # Percent-encode the path so '#', '?' or '%' are not parsed as URI
            # fragment/query syntax by SQLite.
            conn = sqlite3.connect(f"file:{quote(db_path_str)}?mode=ro", uri=True)
            cursor = conn.cursor()

            # Determine the table the same way the loader does.
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='document_table';")
            table_info = cursor.fetchone()
            if not table_info:
                cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
                table_info = cursor.fetchone()
            if not table_info:
                return "Error: Table not found.", "", ""
            table_name = table_info[0]
            # The table name comes from the opened file, so quote it as an
            # identifier (spaces, keywords and embedded quotes stay valid SQL).
            quoted_table = '"' + table_name.replace('"', '""') + '"'

            cursor.execute(f"PRAGMA table_info({quoted_table})")
            columns = [row[1] for row in cursor.fetchall()]

            cols_to_fetch = ["Abstract", "Body", "Authors", "Date", "Record_Number", "Journal", "Title", "DOI", "Refs", "Citations"]
            present_cols = [col for col in cols_to_fetch if col in columns]
            select_cols_str = ", ".join(present_cols) or "*"

            query = f"SELECT {select_cols_str} FROM {quoted_table} WHERE ID = ?"  # Assumes the key column is named 'ID'
            cursor.execute(query, (record_id,))
            record_data_tuple = cursor.fetchone()

            if record_data_tuple is None:
                return f"Record ID {record_id} not found.", "", ""

            # With "*" the column names are only known from the cursor metadata.
            fetched_cols_list = present_cols or [desc[0] for desc in cursor.description]
            record_data = dict(zip(fetched_cols_list, record_data_tuple))

            title_text = _text(record_data.get('Title'), 'N/A')
            abstract_text = _text(record_data.get('Abstract'))
            body_text = _text(record_data.get('Body'))
            has_main_text = bool(abstract_text or body_text)

            if not has_main_text and title_text == 'N/A':
                # Truly empty record.
                display_content = "No content found for this record."
            else:
                display_content = f"Title: {title_text}\n\n"
                if abstract_text and body_text and abstract_text.strip() == body_text.strip():
                    # Identical abstract and body: show the text once.
                    display_content += f"Abstract/Body:\n{abstract_text}"
                else:
                    if abstract_text:
                        display_content += f"Abstract:\n{abstract_text}\n\n"
                    if body_text:
                        display_content += f"Body:\n{body_text}"
                if not has_main_text:
                    # Title exists but no main content: show the raw row.
                    display_content += "\nNo Abstract or Body found. Raw data:\n" + "\n".join(
                        [f"{k}: {v}" for k, v in record_data.items()]
                    )

            authors_text = _text(record_data.get('Authors'), 'N/A')
            if len(authors_text) > 100:
                # Ellipsis only when the author list was actually cut.
                authors_text = authors_text[:100] + "..."

            details_str = (
                f"Record Number: {_text(record_data.get('Record_Number'), 'N/A')} | "
                f"Authors: {authors_text} | "
                f"Date: {_text(record_data.get('Date'), 'N/A')} | "
                f"Journal: {_text(record_data.get('Journal'), 'N/A')} | "
                f"DOI: {_text(record_data.get('DOI'), 'N/A')}"
            )
            return details_str, display_content, "Record displayed."

        except Exception as e:
            # UI boundary: report any failure in the status fields instead of
            # letting the callback raise.
            err_msg = f"Error displaying record ID {record_id}: {e}"
            return err_msg, "", "Error"

        finally:
            # Single close point for every exit path above.
            if conn:
                conn.close()
                
                
    # New callback to update textbox when dropdown selection changes
    def update_viewer_path_from_dropdown(selected_db_full_path: str):
        """Mirror the database chosen in the dropdown into the path textbox.

        Args:
            selected_db_full_path: The dropdown's selected value (the value
                half of its ``(display, value)`` choice), possibly empty.

        Returns:
            A ``gr.update`` that sets the textbox to the selected path, or a
            no-op ``gr.update()`` when the selection is empty.
        """
        # Guard clause: an empty/None selection leaves the textbox untouched.
        if not selected_db_full_path:
            return gr.update()
        return gr.update(value=selected_db_full_path)

    # Selecting a database in the dropdown copies its path into the textbox.
    view_sqlite_db_path_dropdown.change(
        fn=update_viewer_path_from_dropdown,
        inputs=[view_sqlite_db_path_dropdown],
        outputs=[view_sqlite_db_path_textbox]
    )
    
    # Load the database named in the textbox. load_sqlite_for_viewing returns six
    # values; view_status_md is listed both first and last in `outputs`, so it
    # receives the first and the sixth returned value.
    view_load_db_button.click(
        fn=load_sqlite_for_viewing, # Populates the record dropdown and status fields
        inputs=[view_sqlite_db_path_textbox], # Use the textbox as the source of truth for the path
        outputs=[view_status_md, view_table_name_info, view_record_dropdown, view_record_details_md, view_record_content_text, view_status_md]
    )
    
    # Show the record whenever the dropdown selection changes.
    view_record_dropdown.change(
        fn=display_selected_sqlite_record,
        inputs=[view_record_dropdown, view_sqlite_db_path_textbox], # The callback reopens the DB, so it needs the path again
        outputs=[view_record_details_md, view_record_content_text, view_status_md]
    )
    
    # Both the DB source dropdown and the DB mode radio refresh the RAG SQLite
    # file dropdown through the same callback, with the same inputs.
    db_source_dropdown.change(
        fn=update_rag_sqlite_selector,
        inputs=[db_source_dropdown, db_mode_radio],
        outputs=[rag_sqlite_file_dropdown]
    )
    
    db_mode_radio.change(
        fn=update_rag_sqlite_selector,
        inputs=[db_source_dropdown, db_mode_radio],
        outputs=[rag_sqlite_file_dropdown]
    )

    # --- RAG Chat Input Submission ---
    # On page load, fill the chatbot with the output of simple_initial_greeting_ui.
    demo.load(fn=simple_initial_greeting_ui, inputs=None, outputs=[chatbot_display]) # inputs=None: the callback takes no component values
    
    # Submitting the query box runs the chat handler; its six return values map,
    # in order, onto the six components listed in `outputs`.
    query_input_box.submit(
        fn=handle_chat_interaction_gradio,
        inputs=[query_input_box, chatbot_display, selected_method_dd, k_value_slider, vectordb_state],
        outputs=[chatbot_display, query_input_box, prompt_display_md, used_query_md, retrieval_time_md, response_time_md],
        show_progress="full"
    )

# --- Main Execution ---
# Script entry point: print startup diagnostics, make sure the default output
# directory exists, then launch the Gradio app.
if __name__ == "__main__":
    print(f"Script CWD: {os.getcwd()}")
    print(f"Looking for RAG document collections in: {BASE_DOCS_PATH}")
    # A missing RAG documents directory is reported but does not stop startup.
    if not BASE_DOCS_PATH.exists():
        print(f"WARNING: Base RAG documents path {BASE_DOCS_PATH} does not exist! This is for the RAG tab.")
    
    # Default location for processed SQLite databases; created on first run.
    processed_db_default_dir = project_root_path / "processed_databases"
    if not processed_db_default_dir.exists():
        try:
            processed_db_default_dir.mkdir(parents=True, exist_ok=True)
            print(f"Created default directory for processed SQLite databases: {processed_db_default_dir}")
        except Exception as e:
            # Best effort: a failure here is only reported; startup continues.
            print(f"Warning: Could not create default directory for processed databases {processed_db_default_dir}: {e}")

    # The app still launches without an OpenAI client; this only warns.
    if not oai_client:
        print("CRITICAL WARNING: OpenAI client (oai_client) is NOT initialized. LLM and Embedding features will FAIL.")
    
    print("Launching Gradio App...")
    # Enable the request queue, then serve on 127.0.0.1 with debug on and no share link.
    demo.queue().launch(debug=True, server_name="127.0.0.1", share=False)