In [None]:
# Not required
import os
# Point both proxy variables at the local endpoint (this cell is optional).
os.environ.update({
    'http_proxy': "http://localhost:1238",
    'https_proxy': "http://localhost:1238",
})

In [None]:
import os
# Record the directory the notebook is running from and echo it.
cwd = os.getcwd()
print("cwd", cwd)

## Read PDFs, extract the text, and store it in a SQLite3 database (db): GROBID-based approach

In [None]:
import os
import sqlite3
import pandas as pd
from grobid_client.grobid_client import GrobidClient
import json
import grobid_tei_xml



class GrobidAuthor:
    """Minimal author record that carries only a full name."""

    def __init__(self, full_name):
        # Stored exactly as passed in; nothing is validated or normalised.
        self.full_name = full_name

class GrobidBiblio:
    """Plain container for the fields of one bibliography entry.

    Every constructor argument is stored unchanged on the attribute of the
    same name.
    """

    def __init__(self, index, authors, title, date, volume, pages, issue, journal, doi):
        # Identification of the entry.
        (self.index, self.authors, self.title, self.date) = (index, authors, title, date)
        # Where the entry was published.
        (self.volume, self.pages, self.issue, self.journal, self.doi) = (
            volume, pages, issue, journal, doi,
        )

def extract_bibliographic_details(biblios, print_choice=False):
    """Format bibliography entries into one string, optionally printing them.

    Args:
        biblios: Iterable of entries exposing ``index``, ``title``,
            ``authors`` (objects with a ``full_name`` attribute), ``date``,
            ``volume``, ``pages``, ``journal`` and ``doi``. It is copied into
            a list first, so a one-shot iterator is also accepted.
        print_choice: When true, every entry is also printed to stdout.

    Returns:
        One segment per entry, joined with ``*``; an empty string when
        ``biblios`` is empty. The ``issue`` field is not part of the output.
    """
    # Materialise once: the entries are walked for printing and for joining.
    entries = list(biblios)

    if print_choice:
        for biblio in entries:
            print("Index", biblio.index, "| Title:", biblio.title)
            print("Authors:")
            for author in biblio.authors:
                print("-", author.full_name)

            print("Date:", biblio.date)
            print("Volume:", biblio.volume)
            print("Pages:", biblio.pages)
            print("Journal:", biblio.journal)
            print("Doi:", biblio.doi)
            print()

    def _segment(biblio):
        # An author without a name contributes an empty string rather than
        # making str.join fail on None.
        names = ', '.join(author.full_name or "" for author in biblio.authors)
        # The segment ends with a literal backslash followed by a pipe; the
        # backslash is escaped so the literal is valid on every Python version.
        return (
            f" * Index: {biblio.index} | Title: {biblio.title} | Authors: {names}"
            f" | Date: {biblio.date} | Volume: {biblio.volume} | Pages: {biblio.pages}"
            f" | Journal: {biblio.journal} | Doi: {biblio.doi}\\|"
        )

    return '*'.join(_segment(biblio) for biblio in entries)
    
    
# GROBID client; its settings are read from config.json (relative path).
client = GrobidClient(config_path="config.json")

cwd = os.getcwd()
print("cwd",cwd)

# Folder that holds the PDFs; the SQLite file is kept in the same folder.
path = os.path.join(cwd, "docs", "sample")
base_name = 'sample.db'
database_name = os.path.join(path, base_name)

print("\n\ndatabase_pathname",database_name)

# GROBID service requested for every PDF.
service_name = "processFulltextDocument"

# If the database file already exists, ask whether to start from scratch.
if os.path.exists(database_name):
    while True:
        # Surrounding whitespace is ignored so "y " is accepted like "y".
        choice = input("\nThe database file already exists. Do you want to create a new one? (y/n): ").strip().lower()
        if choice == 'y':
            # A connection may be left over from an earlier run of this cell.
            # The name can also be bound to None (this branch sets it to None
            # before deleting the file, and the delete can fail), so check the
            # value rather than just whether the name exists.
            stale_conn = globals().get('conn')
            if stale_conn is not None:
                stale_conn.close()

            # Delete the existing database file
            conn = None  # Reset conn variable
            os.remove(database_name)
            break
        elif choice == 'n':
            break
        else:
            print("Invalid choice! Please enter 'y' or 'n'.")

# Open (or create) the database file and get a cursor on it.
conn = sqlite3.connect(database_name)
cursor = conn.cursor()

# Reuse the first table listed in sqlite_master; create one only when the
# database has no table yet.
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
result = cursor.fetchone()

if result is not None:
    table_name = result[0]
else:
    table_name = "your_table_name"  # Provide a table name of your choice
    cursor.execute(f'''CREATE TABLE {table_name}
                    (ID INTEGER PRIMARY KEY,
                    Title TEXT,
                    Authors TEXT,
                    DOI TEXT,
                    Citations INTEGER,
                    Abstract TEXT,
                    Body TEXT,
                    Refs TEXT)''')

# New rows continue after the highest existing ID (or start at 1).
cursor.execute(f"SELECT MAX(ID) FROM {table_name}")
max_id = cursor.fetchone()[0]
if max_id is None:
    id_key = 1
else:
    id_key = max_id + 1



# Send every PDF in `path` to GROBID and store one row per document.
for filename in os.listdir(path):
    if not filename.endswith(".pdf"):
        continue

    print(f"filename: {filename}")
    file_path = os.path.join(path, filename)

    pdf_file, status, text = client.process_pdf(
        service_name,
        file_path,
        generateIDs=True,
        consolidate_header=True,
        consolidate_citations=True,
        include_raw_citations=True,
        include_raw_affiliations=True,
        tei_coordinates=True,
        segment_sentences=True,
    )

    # A failed request may leave `text` empty or holding an error message
    # instead of TEI XML. Skip that file rather than crash in the XML parser
    # and abort the remaining PDFs.
    if status != 200 or not text:
        print(f"Skipping {filename}: GROBID returned status {status}")
        continue

    # Reference list, flattened into a single string for the Refs column.
    grobid_biblios = grobid_tei_xml.parse_citation_list_xml(text)
    ref_list = extract_bibliographic_details(grobid_biblios, print_choice=False)

    # Header metadata, abstract and body text of the document.
    doc = grobid_tei_xml.parse_document_xml(text)
    title = doc.header.title
    authors = ';'.join([a.full_name for a in doc.header.authors])
    doi = str(doc.header.doi)
    citations = str(len(doc.citations))
    abstract = doc.abstract
    body = doc.body

    #----------------------------------------
    print(f"id_key: {id_key}")
    # Values are bound as parameters; only the table name is interpolated.
    cursor.execute('''
        INSERT INTO {table_name} (ID, Title, Authors, DOI, Citations, Abstract, Body, Refs)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    '''.format(table_name=table_name), (id_key, title, authors, doi, citations, abstract, body, ref_list))

    # Commit after each document so earlier rows survive a later failure.
    conn.commit()
    # Increment the ID key for the next iteration
    id_key += 1

    # Progress report: number of rows currently in the table.
    cursor.execute("SELECT COUNT(*) FROM {table_name}".format(table_name=table_name))
    count = cursor.fetchone()[0]
    print("record loop:", count)


# Report the final row count, then release the connection.
cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
(count,) = cursor.fetchone()
print("Total records:", count)
conn.close()
print("finished")

# Simple check of the SQLite database:

In [None]:
import os
from IPython.display import clear_output
# Function to clear the console screen
def clear_console():
    """Clear the terminal by running ``cls`` on Windows or ``clear`` elsewhere."""
    command = 'cls' if os.name == 'nt' else 'clear'
    os.system(command)
# Interactive viewer: print the "body" column of a record chosen by position.
conn = sqlite3.connect(database_name)
try:
    # The first table listed in sqlite_master is the one that gets browsed.
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    table_name = cursor.fetchone()[0]

    while True:
        # Prompt the user to enter the record number or 'q' to quit
        user_input = input("Enter the record number or 'q' to quit: After: Enter 'n' for next choice or 'q' for quit")

        # Check if the user wants to quit
        if user_input.strip().lower() == 'q':
            break

        # Validate the user input as an integer
        try:
            chosen_record = int(user_input)
        except ValueError:
            print("Invalid input. Please enter a valid record number or 'q' to quit.")
            continue

        # Record numbers are 1-based. SQLite treats a negative OFFSET as 0,
        # so 0 or a negative number would silently show the first record.
        if chosen_record < 1:
            print("Record does not exist. Please enter a valid record number.")
            continue

        # The offset is bound as a parameter; only the table name is formatted in.
        cursor.execute(
            "SELECT body FROM {table_name} LIMIT 1 OFFSET ?".format(table_name=table_name),
            (chosen_record - 1,),
        )
        record_body = cursor.fetchone()

        # Check if the record exists
        if record_body is None:
            print("Record does not exist. Please enter a valid record number.")
            continue

        # Print the body information of the chosen record
        print(record_body[0])
        user_input = input("Next: 'n' Quit: 'q'")

        if user_input.strip().lower() == 'n':
            # Wipe the cell output before the next record is shown.
            clear_output()
        else:
            break
finally:
    # Runs even when the prompt is interrupted, so the file is never left open.
    conn.close()

# Adding metadata to each chunk does not seem to help the embedding
 * Metadata might not be useful here

In [None]:
import sqlite3
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters.base import Document

# Read every stored document back from the SQLite database.
conn = sqlite3.connect(database_name)
cursor = conn.cursor()
cursor.execute(f"SELECT ID, Title, Authors, DOI, Abstract, Body FROM {table_name}")
results = cursor.fetchall()

# Splitter settings: target chunk length and overlap between neighbours.
desired_chunk_size = 2000
chunk_overlap = 450

r_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=desired_chunk_size,
    chunk_overlap=chunk_overlap,
)

# One Document per chunk, each tagged only with the ID of its source record.
texts_db = []
for result in results:
    record_id, title, authors, doi, abstract, body = result
    print(f"Record_id {record_id} | title {title}")

    # NULL columns come back as None; treat them as empty text.
    abstract = "" if abstract is None else abstract
    body = "" if body is None else body

    # Abstract and body are split together as one continuous text.
    combined_text = abstract + " " + body
    document = Document(page_content=combined_text, metadata={'ID': record_id})
    r_splits = r_splitter.split_documents([document])

    # Re-wrap each chunk so that its metadata holds nothing but the ID.
    texts_db.extend(
        Document(page_content=chunk.page_content, metadata={'ID': chunk.metadata['ID']})
        for chunk in r_splits
    )

# Close the connection
conn.close()

# Create the custom embedding object
#embedding = CustomEmbedding2(texts)

In [None]:
texts_db

In [None]:
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from openai import OpenAI
from typing import List
import os
import shutil
import tempfile
import time

class CustomEmbedding2:
    def __init__(self):
        self.embeddings = []  # List to store the embeddings

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        embeddings = [get_embedding(text) for text in texts]
        self.embeddings = embeddings  # Store the embeddings in the `embeddings` attribute
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        embedding = get_embedding(text)
        self.embeddings = [embedding]
        return embedding

    def get_embeddings(self) -> List[List[float]]:
        return self.embeddings

def get_embedding(text, model="TheBloke/nomic-embed-text"):
    """Return the embedding vector of ``text`` from the module-level ``client``.

    Newlines are replaced by spaces before the text is sent.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding


def create_vectordb(embedding, r_splits, new_vectordb=True, user_path=None, temp_dir=None):
    """Build a new Chroma store from ``r_splits`` or open a persisted one.

    Args:
        embedding: Embedding object handed to Chroma.
        r_splits: Documents to index; only used when building a new store.
        new_vectordb: Build a new store when true, load an existing one otherwise.
        user_path: Folder that contains (or will contain) ``chroma_db``.
            Required for loading; takes precedence over ``temp_dir`` when building.
        temp_dir: Fallback folder when building without ``user_path``; a
            temporary directory is created when this is ``None`` as well.

    Returns:
        ``(vectordb, persist_directory)`` on success. When loading is
        requested without ``user_path``, or the directory does not exist, a
        message is printed and a bare ``None`` is returned instead of a tuple.
    """
    cwd = os.getcwd()  # Base for relative user_path / temp_dir values

    if not new_vectordb:
        if user_path is None:
            print("Please provide a valid user path to load the existing vector database.")
            return None

        persist_directory = os.path.join(cwd, user_path, 'chroma_db')
        if not os.path.exists(persist_directory):
            print(f"Vector database not found in the specified path: {persist_directory}")
            return None

        print("Loading existing vector database...")
        start_time = time.time()
        vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
        elapsed = time.time() - start_time
        print(f"Existing vector database loaded in {elapsed:.2f} seconds. Directory: {persist_directory}")
        return vectordb, persist_directory

    # Building: choose the parent folder in order of precedence.
    if user_path is not None:
        parent_dir = user_path
    elif temp_dir is not None:
        parent_dir = temp_dir
    else:
        parent_dir = tempfile.mkdtemp()
    persist_directory = os.path.join(cwd, parent_dir, 'chroma_db')

    # Start from a clean directory so old data is not mixed into the new store.
    if os.path.exists(persist_directory):
        shutil.rmtree(persist_directory)

    print("Creating a new vector database...")
    start_time = time.time()
    vectordb = Chroma.from_documents(
        documents=r_splits,
        embedding=embedding,
        persist_directory=persist_directory
    )
    elapsed = time.time() - start_time
    print(f"New vector database created in {elapsed:.2f} seconds. Directory: {persist_directory}")

    return vectordb, persist_directory


# Default values
new_vectordb = True

# When an existing store is loaded there are no new documents to embed.
if not new_vectordb:
    texts_db = []

user_path = os.path.join(os.getcwd(), "docs", "sample")
temp_dir = os.path.join(os.getcwd(), "docs", "temp")

# OpenAI-compatible client for the local server; get_embedding() uses it.
client = OpenAI(base_url="http://localhost:1238/v1", api_key="lm-studio")

# Create the custom embedding object
embedding = CustomEmbedding2()

# The second returned value (the persist directory) is stored in temp_dir.
vectordb, temp_dir = create_vectordb(
    embedding,
    texts_db,
    new_vectordb=new_vectordb,
    user_path=user_path,
    temp_dir=temp_dir,
)
print("\nfinished", temp_dir, len(vectordb))
print(vectordb._collection.count())

# Based on client.chat.completions.create: Browser TAB

In [None]:
import panel as pn
from openai import OpenAI
from langchain_community.vectorstores import Chroma
#from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
import re

# Define the template for the prompt
# The {answer} slot is filled with an empty string when handle_input2 formats
# the prompt, so the rendered text ends right after "Answer:".
template = """
Use the following context from the vector database and the previous conversation to answer the query. Incorporate your own knowledge and reasoning as an AI assistant:
Previous Conversation: {history}
Context: {context}
Query: {query}
Answer:
{answer}
"""

# Create the prompt template
prompt_template = PromptTemplate(
    input_variables=["history", "context", "query", "answer"],
    template=template
)

# Initialize the chat history
# Entry 1 is the system prompt; entry 2 is a user message asking the model to
# introduce itself. handle_input2 appends further messages to this list.
history = [
    {"role": "system", "content": "You are an intelligent assistant. You always provide well-reasoned answers that are both correct and helpful. Use the following context from the vector database and the previous conversation to answer the query. Incorporate your own knowledge and reasoning as an AI assistant"},
    {"role": "user", "content": "Hello, introduce yourself to someone opening this program for the first time. Be concise."},
]

# Set up the Chroma vector store and HuggingFace embeddings
#embeddings = HuggingFaceEmbeddings()
#vectordb = Chroma(collection_name="my_collection", embedding_function=embeddings)
# `vectordb` is not created in this cell; it must already exist (it is
# assigned by the create_vectordb call earlier in the notebook).
retriever = vectordb.as_retriever()

# Point to the local OpenAI server
# NOTE: this rebinds the module-level name `client`, which earlier cells also
# assign (GrobidClient, then OpenAI).
client = OpenAI(base_url="http://localhost:1238/v1", api_key="lm-studio")

# Define a function to handle the user's input
def handle_input2():
    """Answer the text in ``input_box`` and show the reply in ``output_box``.

    On the first call (``is_first_run`` is true) the fixed query ``"Hello"``
    is sent without retrieval. On later calls the stripped input text is used
    to fetch documents from ``retriever``; their contents are appended to
    ``history`` together with the query and are also placed in the prompt.
    The streamed completion is collected, rendered, and stored in ``history``
    as the assistant message. Finally the input box is cleared.

    An empty input is ignored. Clearing the input box at the end of a turn
    fires the value watcher again; without this guard that empty value would
    trigger a retrieval and add a junk user message to ``history``.
    """
    global is_first_run
    wrap_width = 80

    if is_first_run:
        query = "Hello"
        is_first_run = False
        some_context = ""
    else:
        query = input_box.value.strip()
        if not query:
            return

        search_results = retriever.get_relevant_documents(query)
        some_context = "".join(result.page_content + "\n\n" for result in search_results)
        history.append({"role": "user", "content": query + " Database content: [" + some_context + "]"})

    # Generate the prompt using the template and input values
    prompt = prompt_template.format(history=history, context=some_context, query=query, answer="")

    completion = client.chat.completions.create(
        model="LLMA/Meta-Llama-3-8B",
        messages=history + [{"role": "user", "content": prompt}],
        temperature=0.7,
        stream=True,
    )

    # Collect the streamed pieces; chunks without choices or text are skipped.
    pieces = []
    for chunk in completion:
        if chunk.choices and chunk.choices[0].delta.content:
            pieces.append(chunk.choices[0].delta.content)
    full_response = "".join(pieces)

    formatted_text = f"<pre style='white-space: pre-wrap; width: {wrap_width}ch; font-family: Arial, sans-serif; font-size: 12px;'>**Query:**\n{query}\n\n**Final Answer:**\n{full_response}</pre>"
    output_box.object = formatted_text

    history.append({"role": "assistant", "content": full_response})

    # Re-fires the watcher with an empty value; the guard above ignores it.
    input_box.value = ""



# Define a function to handle user input changes
def handle_input_change(event):
    """Watcher callback: run ``handle_input2``; the event itself is not used."""
    handle_input2()

# Load the Panel extension
pn.extension()

# Input and output widgets; every change of the input value calls the handler.
input_box = pn.widgets.TextAreaInput()
output_box = pn.pane.Markdown()
input_box.param.watch(handle_input_change, 'value')

# First run: call the handler once before the dashboard is shown.
is_first_run = True
handle_input2()
is_first_run = False

# Page layout: title, spacer, then input and output side by side.
dashboard_layout = pn.Column(
    pn.pane.Markdown("# ChatGPT-like Conversation"),
    pn.Spacer(height=20),
    pn.Row(input_box, output_box),
    name="ChatGPT Dashboard"
)

# Wrap the layout and open it in a new browser tab.
dashboard = pn.panel(dashboard_layout)
dashboard.show()

# Alternative Version: based on QA CHAIN with BROWSER TAB - output still short

In [None]:
import panel as pn
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
import textwrap
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate



# Create a RAG model
# The completion cap has to be set on the LLM itself. LangChain's OpenAI
# wrapper defaults to 256 tokens, which truncates answers; a keyword passed to
# qa_chain.run() is treated as a chain input and never reaches the model.
# 2048 is a starting value; tune it to the context size of the served model.
llm = OpenAI(base_url="http://localhost:1238/v1", api_key="lm-studio", max_tokens=2048)
retriever = vectordb.as_retriever()

# Define a custom prompt template
template = """
Use the following context from the vector database and the previous conversation to answer the query. Incorporate your own knowledge and reasoning as an AI assistant:

Previous Conversation: {history}

Context: {context}

Query: {query}

Answer:
"""

prompt_template = PromptTemplate(input_variables=["history", "context", "query"], template=template)

# Create a custom chain
qa_chain = LLMChain(llm=llm, prompt=prompt_template)


# Load the Panel extension
pn.extension()

import textwrap
import panel as pn

# Conversation turns recorded by handle_input, as {"role", "content"} dicts.
history = []

# Text area for the question and Markdown pane for the rendered answer.
input_box = pn.widgets.TextAreaInput()
output_box = pn.pane.Markdown()
# Define a function to handle the user's input
def handle_input():
    """Answer the text in ``input_box`` through ``qa_chain`` and display it.

    The stripped input is used to fetch documents from ``retriever``. Their
    concatenated contents and the conversation so far are passed to the chain.
    If the reply contains ``"Final Answer:"`` only the text after that marker
    is shown. The turn is then recorded in ``history`` and the input box is
    cleared. An empty input does nothing.
    """
    query = input_box.value.strip()
    if not query:
        return

    context = retriever.get_relevant_documents(query)
    # Only the prompt variables are passed. A max_tokens keyword here would be
    # treated as an extra chain input and would not change the model's output
    # length; that limit belongs on the LLM object.
    result_str = qa_chain.run(
        history="\n".join(f"{msg['role']}: {msg['content']}" for msg in history),
        query=query,
        context="".join(doc.page_content for doc in context),
    )

    marker = "Final Answer:"
    final_answer_start = result_str.find(marker)
    if final_answer_start != -1:
        final_answer = result_str[final_answer_start + len(marker):].strip()
        output_box.object = f"**Query:**\n{query}\n\n**Final Answer:**\n{textwrap.fill(final_answer, width=80)}"
    else:
        final_answer = result_str
        output_box.object = f"**Query:**\n{query}\n\n**Answer:**\n{textwrap.fill(result_str, width=80)}"

    # Record the turn in chronological order (question, then answer) so the
    # transcript handed to the chain on the next turn reads correctly.
    history.append({"role": "user", "content": query})
    history.append({"role": "assistant", "content": final_answer})

    # Clear the input box; the watcher fires again and returns at the guard above.
    input_box.value = ""
        
        
# Run the handler whenever the input value changes; the event is not needed.
input_box.param.watch(lambda _event: handle_input(), 'value')

# Page layout: title, spacer, then input and output side by side.
dashboard_layout = pn.Column(
    pn.pane.Markdown("# ChatGPT-like Conversation"),
    pn.Spacer(height=20),
    pn.Row(input_box, output_box),
    name="ChatGPT Dashboard"
)

# Wrap the layout and open it in a new browser tab.
dashboard = pn.panel(dashboard_layout)
dashboard.show()