In [None]:
!pip3 install langchain transformers
!pip3 install langchain_community
!pip3 install PyPDF2
!pip3 install pypdf
!pip3 install sentence-transformers
!pip3 install chromadb
!pip3 install langchain_together
!pip3 install streamlit
!pip3 install einops
!pip3 install faiss-gpu
!pip3 install faiss-cpu
!pip3 install pymupdf

In [None]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate

from langchain_together import Together
import os
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains import ConversationalRetrievalChain
import streamlit as st
import time

In [None]:
# Read the source PDF from the data folder.
# To ingest several PDFs at once, widen the glob pattern, e.g.:
#   loader = DirectoryLoader('sample_data', glob="./*.pdf", loader_cls=PyMuPDFLoader)
# MEDICAL.pdf has more than 5000 pages and takes very long to embed, so CMDT-2023.pdf is used here.
loader = DirectoryLoader(
    'sample_data',
    glob="CMDT-2023.pdf",
    loader_cls=PyMuPDFLoader,
)
documents = loader.load()

In [None]:
### USE ONLY WHEN UR DATA CONTAINS LOTS OF TABLES , COLORED BOXES , IMAGES ETC
import pdfplumber

# Function to extract text and tables from a PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF as one string, page text first, then that page's tables.

    For every page the plain text is collected when it is non-empty. Each table
    found on the page is then flattened: cells are joined with tabs, rows with
    newlines, and ``None`` cells become empty strings. All collected pieces are
    finally joined with newlines.
    """

    def _table_to_text(table):
        # One output line per row; a missing cell renders as an empty field.
        rendered_rows = []
        for row in table:
            rendered_rows.append('\t'.join('' if cell is None else str(cell) for cell in row))
        return '\n'.join(rendered_rows)

    pieces = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                pieces.append(page_text)
            pieces.extend(_table_to_text(table) for table in page.extract_tables())

    return '\n'.join(pieces)

# Load the PDF file and extract text
from langchain.schema import Document  # in-cell import, like pdfplumber above

pdf_path = "sample_data/CMDT-2023.pdf"
pdf_text = extract_text_from_pdf(pdf_path)

# Wrap the extracted text in a LangChain Document. The text splitter used later
# reads the `.page_content` and `.metadata` attributes of each item, which a
# plain dict does not have (the previous dict-based list failed at split time).
documents = [Document(page_content=pdf_text, metadata={"source": pdf_path})]

In [None]:
# SEE YOUR LOADED DATA
# Show the count and only the first entries; echoing the whole list would dump
# the entire book into the notebook output.
print(f"Loaded {len(documents)} document(s)")
documents[:2]

In [None]:
# Chunking: unstructured text is split with the "Recursive Character Splitting" strategy,
# chosen here to keep the fragments coherent across different kinds of documents
# without losing relevant information. Chunks are 1024 characters long and
# overlap by 200 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1024,
)
texts = text_splitter.split_documents(documents)

In [None]:
texts[0]  # display the first chunk to sanity-check the split

In [None]:
# Embedding model for the vector database: a free, open-source Hugging Face model.
# Alternatives: BAAI/bge-small-en-v1.5 or sentence-transformers/all-MiniLM-L6-v2
# The model needs trust_remote_code, so the revision is pinned to a fixed commit.
embedings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    model_kwargs={
        "trust_remote_code": True,
        "revision": "289f532e14dbbbd5a04753fa58739e9ba766f3c7",
    },
)

In [None]:
# Sanity check: embed the text of the first chunk and display the vector as a NumPy array.
import numpy as np
np.array(embedings.embed_query(texts[0].page_content))  # embedding of the first chunk's text

In [None]:
## Vector store creation: run this cell ONLY when a new vector database has to be built.
# Builds a FAISS index from the chunks in `texts`, embedded with the `embedings` model.
faiss_db = FAISS.from_documents(texts, embedings)

In [None]:
# Save (export) the vector database to a local folder.
# NOTE(review): the index is written to "ipc_vector_db", but the loading cell further
# down reads "ipc_vector_db_med" — confirm whether the folder is renamed by hand.
faiss_db.save_local("ipc_vector_db")

In [None]:
# Same helper and state setup as in model.py.
# NOTE(review): Streamlit warns that session state does not function when code is
# not launched with `streamlit run` — confirm these values persist in a notebook kernel.
def reset_conversation():
    """Empty the stored chat messages and clear the conversation memory."""
    state = st.session_state
    state.messages = []
    state.memory.clear()


# Create the message list only if it does not exist yet.
if "messages" not in st.session_state:
    st.session_state["messages"] = []

# Create the windowed conversation memory (window size k=2) only if it does not exist yet.
if "memory" not in st.session_state:
    st.session_state["memory"] = ConversationBufferWindowMemory(
        k=2,
        memory_key="chat_history",
        return_messages=True,
    )

In [None]:
# Embedding model used to query the saved index: same model name and pinned
# revision as the `embedings` object created for building the index above.
embeddings = HuggingFaceEmbeddings(model_name="nomic-ai/nomic-embed-text-v1",
                                   model_kwargs={"trust_remote_code":True,"revision":"289f532e14dbbbd5a04753fa58739e9ba766f3c7"})

In [None]:
#db = FAISS.load_local("ipc_vector_db_cmdt", embeddings, allow_dangerous_deserialization=True)
# Load a previously saved FAISS index from the local folder "ipc_vector_db_med".
# NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based loading;
# only point this at index folders you created yourself.
db = FAISS.load_local("ipc_vector_db_med", embeddings, allow_dangerous_deserialization=True)

In [None]:
# Retriever over the loaded index: similarity search with k=4 results per query.
db_retriever = db.as_retriever(search_type="similarity",search_kwargs={"k": 4})

In [None]:
# THIS IS ACTUALLY TELLING CHATBOT WHAT U ARE AND CHAIN WHAT U HAVE TO DO
# FOR CMDT-2023 DATA
# prompt_template = """<s>[INST]You are a medical chatbot trained on the latest data in diagnosis and treatment, designed to provide accurate and concise information in response to users' medical queries. Your primary focus is to offer evidence-based answers related to symptoms, infections, disorders, diseases, and their respective treatments. Refrain from generating hypothetical diagnoses or questions, and stick strictly to the context provided. Ensure your responses are professional, concise, and relevant. If the question falls outside the given context, do not rely on chat history; instead, generate an appropriate response based on your medical knowledge. Prioritize the user's query, avoid unnecessary details, and ensure compliance with medical standards and guidelines.
# CONTEXT: {context}
# CHAT HISTORY: {chat_history}
# QUESTION: {question}
# ANSWER:
# </s>[INST]
# """

## FOR MEDICAL DATA
# Mistral-Instruct prompts wrap the instruction in [INST] ... [/INST]. The template
# previously ended with "</s>[INST]", which emitted an end-of-sequence token and
# opened a second instruction instead of closing the first one.
# Placeholders filled in by the chain: {context}, {chat_history}, {question}.
prompt_template = """<s>[INST]You are a medical chatbot trained on the latest data in diagnosis and treatment from HARRISON'S PRINCIPLES OF INTERNAL MEDICINE. Your primary focus is to provide accurate, evidence-based answers related to symptoms, infections, disorders, diseases, and their respective treatments, including medications, cautionary advice, and necessary evaluations. Refrain from generating hypothetical diagnoses or questions, and strictly adhere to the context provided by the user’s query. Ensure your responses are professional, concise, and aligned with established medical standards and guidelines. If the question falls outside the provided context, do not rely on chat history; instead, generate an appropriate response based on your medical knowledge. Always prioritize the user's query, avoid unnecessary details, and maintain clarity in your explanations.
CONTEXT: {context}
CHAT HISTORY: {chat_history}
QUESTION: {question}
ANSWER:
[/INST]
"""

In [None]:
# Bind the template text to the three placeholders it contains.
prompt = PromptTemplate(template=prompt_template,
                        input_variables=['context', 'question', 'chat_history'])

In [None]:
# You can also use other LLM options from https://python.langchain.com/docs/integrations/llms. Here the TogetherAI API is used.
from getpass import getpass  # stdlib; prompts for the key without echoing it

# Never keep API keys in the notebook source: read the key from the environment
# and fall back to an interactive prompt. The key that used to be hardcoded here
# was exposed in source and should be revoked.
together_api_key = os.getenv("TOGETHER_API_KEY") or getpass("Together API key: ")

llm = Together(
    model="mistralai/Mistral-7B-Instruct-v0.2",
    temperature=0.5,
    max_tokens=1024,
    together_api_key=together_api_key,
)

In [None]:
# st.session_state.memory = ConversationBufferWindowMemory(k=2, memory_key="chat_history",return_messages=True)
# Build the conversational retrieval chain from the LLM, the FAISS retriever and
# the custom prompt. The chain gets its own windowed memory (k=2) stored under
# the "chat_history" key, which is the name the prompt template expects.
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    memory=ConversationBufferWindowMemory(k=2, memory_key="chat_history",return_messages=True),
    retriever=db_retriever,
    combine_docs_chain_kwargs={'prompt': prompt}
)

In [None]:
# Prompt the user for a query (blocks until a line is entered; may be an empty string)
user_query = input("Ask your question: ")

In [None]:
# If the user enters a query, process it
if user_query:
    # Get the response from the Conversational Retrieval Chain
    # (.invoke replaces the deprecated direct call `qa({...})`).
    response = qa.invoke({"question": user_query})

    # Display the chatbot's response
    print("Chatbot:", response['answer'])

    # Keep a transcript in a plain notebook-level list. Streamlit's session state
    # does not function when the code is not launched with `streamlit run`, so
    # `st.session_state.messages.append(...)` failed here after the answer was printed.
    chat_transcript = globals().setdefault("chat_transcript", [])
    chat_transcript.append({"role": "user", "content": user_query})
    chat_transcript.append({"role": "assistant", "content": response['answer']})

# To reset the conversation in the notebook, clear the chain's own memory and the
# transcript: qa.memory.clear(); chat_transcript.clear()

In [None]:
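## TO RUN THE STREAMLIT APP: save the code in the next cell as a .py file (e.g. app.py) and launch it with `streamlit run app.py`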

In [None]:
# Import necessary libraries
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain_together import Together
import os
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains import ConversationalRetrievalChain
import streamlit as st
import time
from dotenv import load_dotenv # load specific environment that been created
 
load_dotenv()
## LangSmith project tracking
# Enable tracing only when an API key is actually configured. Assigning
# os.getenv(...) straight back into os.environ raises TypeError when the
# variable is unset, because os.getenv returns None and os.environ accepts
# only str values.
langsmith_api_key = os.getenv("LANGCHAIN_API_KEY")
if langsmith_api_key:
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_API_KEY"] = langsmith_api_key

# Set up the Streamlit page configuration: browser-tab title and wide layout
st.set_page_config(page_title="MedGPT", layout="wide")

# Custom CSS for styling the app, injected as raw HTML (hence unsafe_allow_html=True).
# NOTE(review): selectors such as .ea3mdgi6 look like auto-generated Streamlit class
# names — confirm they still match the installed Streamlit version.
st.markdown(
    """
    <style>
    /* Main container for flexbox layout */
    .main {
        display: flex;
    }
    
    /* Sidebar styling */
    .sidebar {
        width: 300px;
        padding: 20px;
        height: 100vh;
        position: fixed;
        background-color: #000000;
        left: 0;
        top: 0;
        display: flex;
        flex-direction: column;
        align-items: center;
    }
    
    /* Main chat container styling */
    .chat-container {
        flex: 1;
        padding: 20px;
        margin-left: 300px;
    }
    
    .stApp, .ea3mdgi6 {
        background-color: #000000; /* right side bg color */
    }
    
    div.stButton > button:first-child {
        background-color: #ffd0d0;
    }
    div.stButton > button:active {
        background-color: #ff6262;
    }
    
    div[data-testid="stStatusWidget"] div button {
        display: none;
    }
    
    /* Adjust top margin of the report view container */
    .reportview-container {
        margin-top: -2em;
    }
    
    /* Hide various Streamlit elements */
    #MainMenu {visibility: hidden;}
    .stDeployButton {display:none;}
    footer {visibility: hidden;}
    #stDecoration {display:none;}
    button[title="View fullscreen"]{
        visibility: hidden;
    }
    
    /* Ensure the placeholder text is also visible */
    .stTextInput > div > div > input::placeholder {
        color: #666666 !important;
    }
    
    .stChatMessage {
        background-color: #28282B; /* chat message background color set to black */
        color : #000000 !important;
    }


    </style>
    """,
    unsafe_allow_html=True,
)

# Create the sidebar: logo, app title and tagline
with st.sidebar:
    # Add logo to the sidebar (image file path is relative to the working directory)
    st.image("Black Bold Initial AI Business Logo.jpg", width=200)
    # Add title to the sidebar
    st.title("MedGPT")
    # Add description to the sidebar
    st.markdown("Your AI MEDICAL ASSISTANT")

# Main chat interface container: emits an opening <div>; the matching </div> is
# emitted at the very end of the script.
# NOTE(review): each st.markdown call is presumably rendered as its own element, so
# this unclosed <div> may not actually wrap the widgets below — confirm in the browser.
st.markdown('<div class="chat-container">', unsafe_allow_html=True)

# Callback that wipes the chat (wired to the reset button)
def reset_conversation():
    """Replace the stored message list with an empty one and clear the conversation memory."""
    session = st.session_state
    session.messages = []
    session.memory.clear()

# One-time creation of the per-session chat state; both branches are skipped
# when the key already exists in st.session_state.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Windowed conversation memory (window size k=2) stored under "chat_history".
if "memory" not in st.session_state:
    st.session_state.memory = ConversationBufferWindowMemory(
        memory_key="chat_history",
        k=2,
        return_messages=True,
    )

# Set up embeddings for vector search and load the FAISS vector database.
# Streamlit re-executes this script on every interaction, so the expensive
# model and index loading is wrapped in st.cache_resource and performed once
# instead of on every rerun.
@st.cache_resource(show_spinner=False)
def _load_vector_store():
    """Return ``(embedding_model, faiss_index)`` loaded from local disk."""
    embedding_model = HuggingFaceEmbeddings(
        model_name="nomic-ai/nomic-embed-text-v1",
        model_kwargs={"trust_remote_code": True, "revision": "289f532e14dbbbd5a04753fa58739e9ba766f3c7"},
    )
    # NOTE: allow_dangerous_deserialization=True opts in to pickle-based loading;
    # only point this at index folders you created yourself.
    #faiss_index = FAISS.load_local("./ipc_vector_db_cmdt", embedding_model, allow_dangerous_deserialization=True)
    faiss_index = FAISS.load_local("./ipc_vector_db_med", embedding_model, allow_dangerous_deserialization=True)
    return embedding_model, faiss_index


# Keep the original module-level names so the rest of the script is unaffected.
embedings, db = _load_vector_store()

db_retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

# Define the prompt template for the AI
# THIS IS ACTUALLY TELLING CHATBOT WHAT U ARE AND CHAIN WHAT U HAVE TO DO
# FOR CMDT-2023 DATA
# prompt_template = """<s>[INST]You are a medical chatbot trained on the latest data in diagnosis and treatment, designed to provide accurate and concise information in response to users' medical queries. Your primary focus is to offer evidence-based answers related to symptoms, infections, disorders, diseases, and their respective treatments. Refrain from generating hypothetical diagnoses or questions, and stick strictly to the context provided. Ensure your responses are professional, concise, and relevant. If the question falls outside the given context, do not rely on chat history; instead, generate an appropriate response based on your medical knowledge. Prioritize the user's query, avoid unnecessary details, and ensure compliance with medical standards and guidelines.
# CONTEXT: {context}
# CHAT HISTORY: {chat_history}
# QUESTION: {question}
# ANSWER:
# </s>[INST]
# """

## FOR MEDICAL DATA
# Mistral-Instruct prompts wrap the instruction in [INST] ... [/INST]. The template
# previously ended with "</s>[INST]", which emitted an end-of-sequence token and
# opened a second instruction instead of closing the first one.
# Placeholders filled in by the chain: {context}, {chat_history}, {question}.
prompt_template = """<s>[INST]You are a medical chatbot trained on the latest data in diagnosis and treatment from HARRISON'S PRINCIPLES OF INTERNAL MEDICINE. Your primary focus is to provide accurate, evidence-based answers related to symptoms, infections, disorders, diseases, and their respective treatments, including medications, cautionary advice, and necessary evaluations. Refrain from generating hypothetical diagnoses or questions, and strictly adhere to the context provided by the user’s query. Ensure your responses are professional, concise, and aligned with established medical standards and guidelines. If the question falls outside the provided context, do not rely on chat history; instead, generate an appropriate response based on your medical knowledge. Always prioritize the user's query, avoid unnecessary details, and maintain clarity in your explanations.
CONTEXT: {context}
CHAT HISTORY: {chat_history}
QUESTION: {question}
ANSWER:
[/INST]
"""

# Create a PromptTemplate object that binds the template text to its three placeholders
prompt = PromptTemplate(template=prompt_template,
                        input_variables=['context', 'question', 'chat_history'])

# Set up the language model (LLM)
# The Together API key is read from the environment (for example from the .env
# file loaded by load_dotenv() above) instead of being hardcoded in source. The
# key that used to be embedded here was exposed and should be revoked.
together_api_key = os.getenv("TOGETHER_API_KEY")
if not together_api_key:
    st.error("TOGETHER_API_KEY is not set. Add it to your environment or .env file.")
    st.stop()

llm = Together(
    model="mistralai/Mistral-7B-Instruct-v0.2",
    temperature=0.5,
    max_tokens=1024,
    together_api_key=together_api_key,
)

# Create the conversational retrieval chain
# The chain must use the memory kept in st.session_state: this script is re-run on
# every interaction, so a memory object constructed inline here would start empty
# each time, and reset_conversation() (which clears st.session_state.memory) would
# have no effect on the chain.
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    memory=st.session_state["memory"],
    retriever=db_retriever,
    combine_docs_chain_kwargs={'prompt': prompt}
)

# Replay the stored conversation so earlier turns stay visible on every rerun
for past_message in st.session_state.get("messages", []):
    role = past_message.get("role")
    content = past_message.get("content")
    with st.chat_message(role):
        st.write(content)

# Create the chat input (text box in which the user types a question)
input_prompt = st.chat_input("Write your Queries here.....")

# Handle user input
if input_prompt:
    # Display user message
    with st.chat_message("user"):
        st.write(input_prompt)

    # Add user message to session state
    st.session_state.messages.append({"role":"user","content":input_prompt})

    # Generate and display AI response
    with st.chat_message("assistant"):
        with st.status("Introspecting 💡...",expanded=True):
            # Invoke the QA chain to get the response
            result = qa.invoke(input=input_prompt)

            # Placeholder that is overwritten while the answer is "typed" out
            message_placeholder = st.empty()

            full_response = "⚠️ **_Note: Information provided is accordance to current medical diagnosis & treatment 2023._** \n\n\n"
        # Simulate streaming: reveal the answer one character at a time with a cursor glyph
        for chunk in result["answer"]:
            full_response+=chunk
            time.sleep(0.02)

            message_placeholder.markdown(full_response+" ▌")
        # Final render without the typing cursor. This also shows the note when the
        # answer is empty, in which case the loop above never writes anything.
        message_placeholder.markdown(full_response)
        # Add a button to reset the conversation
        st.button('Reset All Chat 🗑️', on_click=reset_conversation)

    # Add AI response to session state (the raw answer, without the note prefix)
    st.session_state.messages.append({"role":"assistant","content":result["answer"]})

# Close the chat container div
st.markdown('</div>', unsafe_allow_html=True)