In [1]:
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")
import logging

# Get the logger that produces the warning message
logger = logging.getLogger('langchain_text_splitters.base')

# Set the logging level to a higher level such as ERROR or CRITICAL
logger.setLevel(logging.ERROR)

### Install necessary packages

In [9]:
#install necessray packages
!pip install -q -U torch tensorflow transformers langchain  faiss-cpu sentence_transformers
!pip install -q peft==0.4.0 trl==0.4.7 accelerate==0.21.0 bitsandbytes==0.41.3
!pip install pypdf PyPDF2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m


### Import packages

In [3]:
#import packages
import os
import torch
from transformers import (
    BitsAndBytesConfig,
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
)

from peft import LoraConfig, PeftModel
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import AsyncChromiumLoader
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from PyPDF2 import PdfReader

### 1. Read the content of a provided PDF document.

In [6]:
# Read the content of pdf
# pdf file is converted to text files

# pdf file name
uploaded_file = "Harrison's Principles of Internal Medicine, 19E (2015) [True PDF] [UnitedVRG]-compressed.pdf"

#error handling for cases like missing or inaccessible PDF files
try:
  # Read the PDF file
  doc_reader = PdfReader(uploaded_file)
  pgno = len(doc_reader.pages)

  #solving for model context limit for longer pdfs.
  if pgno > 1200:
    print('pdf is too longer')
  else:
    # Extract text from the PDF
    raw_text = ""
    for i, page in enumerate(doc_reader.pages):
        text = page.extract_text()
        if text:
            raw_text += text

except:
  print('incorrect pdf path')

### 2. Preprocess the content for efficient question-answering.

In [7]:
# The converted amount of text is well above the context limit of LLM, therefore text is
# Split the text into smaller chunks(Chunking technique)
# solving for model context limit for longer pdfs
# model's maximum context length is 4097 tokens, so context is split as 1000 size
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)



### 3. Develop a Q&A bot using open source LLM to answer questions based on the PDF's content.

In [10]:
#Tokenizer is defined here. Tokenizer model is loaded from pretrained Mistral 7B model
#LLM model is loaed from pretrained Mistral 7B model

# tokenizer
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2",
                                          trust_remote_code=True,
                                          use_auth_token='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')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# define quantization config file
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type= "nf4", #fp4 or nf4,
    bnb_4bit_compute_dtype=  "float16",
    bnb_4bit_use_double_quant=False,
)

# Load pre-trained model
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    quantization_config=bnb_config,
    use_auth_token='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'
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
# Define text pipeline
# This pipline defines processes of LLM for Q&A
text_pipeline = pipeline(
    temperature=0.2,
    tokenizer=tokenizer,
    task="text-generation",
    model=model,
    repetition_penalty=1.12,
    return_full_text=True,
    max_new_tokens=290,
)

In [12]:
# define hugging face pipeline
# mistal model is run through the HuggingFacePipeline class
mistral_llm = HuggingFacePipeline(pipeline=text_pipeline)

In [13]:
# The splitted chunk texts are vectorized.
# These vectorized texts are used in context
# Embedding model
db = FAISS.from_texts(texts,
                          HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'))

retriever = db.as_retriever()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
# This cell defines language model for context-based Q&A

# define prompt template
prompt_template = """
### [INST] Instruction: Use the following pieces of context to answer the question.

{context}

### QUESTION:
{question} [/INST]
 """

# Create prompt from prompt template
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Create llm chain
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

# Build RAG chain
rag_chain = (
 {"context": retriever, "question": RunnablePassthrough()}
    | llm_chain
)

# define function for quesion and answer
def Q_A(question):
  result = rag_chain.invoke(question)
  answer = result['text']
  print('Answer:')
  print(answer.split('[/INST]')[-1])


#### Running Q&A bot

In [15]:
question = "please explain about skin disorder"
Q_A(question)


Answer:

  Skin disorders refer to various conditions that affect the largest organ in the human body. These disorders can range from inflammatory conditions, infections, neoplastic processes (skin cancer), immunologically mediated diseases, and manifestations of underlying internal diseases.

During a physical examination of a patient with a suspected skin disorder, it's essential to evaluate the entire cutaneous surface before taking an extensive history. The assessment should include noting the distribution of the eruption, the types of primary and secondary lesions, and the shape of individual lesions.

Four basic features of a skin lesion must be considered during a physical examination:

1. Distribution: The location of the skin lesion on the body can provide valuable information regarding the underlying cause. For instance, pityriasis rosea typically presents with multiple round or oval erythematous patches along the skin tension lines on the trunk.

2. Types of primary and seco

In [16]:
question = "please explain about internal disease"
Q_A(question)

Answer:

  Internal diseases refer to medical conditions that affect the body's internal organs and systems beyond the skin. These conditions can result from various causes such as infections, trauma, inflammation, metabolic imbalances, neurological issues, or psychiatric disorders. Some examples of internal diseases mentioned in the provided context include abdominal wall distortions, mesenteric inflammation, appendicitis, cardiac conditions like acute myocardial infarction, and various metabolic and neurological disorders.

The context also discusses how certain cutaneous manifestations can indicate the presence of internal diseases. For instance, eroded bullae in pemphigus vulgaris, palpable purpuric papules in cutaneous small-vessel vasculitis, and various other skin lesions can be indicative of underlying systemic conditions. However, identifying these conditions based on cutaneous signs alone can be challenging for non-dermatologists due to the vast array of possible presentation

### 4. Implement a mechanism to determine if a question is in or out of context and deny out-of-context questions.

In [19]:
# If a question is in out context, then deny out-of-context questions
# This is done by defining of prempt_template.
# define prompt
prompt_template2 = """
### [INST] Instruction: Use the following pieces of context to answer the question.
           If there is no mention of question in the provided context,
           please deny answer

{context}

### QUESTION:
{question} [/INST]
 """

# Create prompt from prompt template
prompt2 = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template2,
)

# Create llm chain
llm_chain2 = LLMChain(llm=mistral_llm, prompt=prompt2)

# Build RAG chain
rag_chain2 = (
 {"context": retriever, "question": RunnablePassthrough()}
    | llm_chain2
)

# define function for question and answer
def Q_A2(question):
  result = rag_chain.invoke(question)
  answer = result['text']
  print('Answer:')
  print(answer.split('[/INST]')[-1])

####Running Q&A bot for in_or_out of context

In [20]:
# Run Q&A
question = "please explain about Gibbs free anergy"
Q_A2(question)


Answer:

  I'm sorry for any confusion, but the context provided does not contain any information related to Gibbs free energy. Therefore, I cannot provide an explanation about it based on the given text.


Q_A bot refuses to answer because question is out of context

In [21]:
question = "please explain about Management of the patient with ischemic heart disease"
Q_A2(question)


Answer:

  Based on the provided context, the document discusses various aspects of ischemic heart disease, its causes, diagnosis, and management. Here's a summary of what the text says about managing a patient with ischemic heart disease:

1. Ischemic heart disease is a condition where there is an imbalance between myocardial oxygen requirements and myocardial oxygen supply, leading to insufficient oxygen delivery to the heart. This results in myocardial ischemia, which if severe and prolonged, can result in myocardial infarction (MI).
2. The management of a patient with ischemic heart disease involves addressing both the underlying cause and the symptoms.
3. The first step in managing a patient with ischemic heart disease is to evaluate their risk factors, such as atherosclerosis, endothelial dysfunction, microvascular disease, vasospasm, congenital abnormalities, myocardial bridging, coronary arteritis, radiation-induced coronary disease, extreme myocardial oxygen demand, and impair

Q_A bot answers because question is in of context

## Context-Aware Responses with Advanced ML

### 1. Reading and Preprocessing PDF Content

In [None]:
# Import necessary libraries for reading and preprocessing PDF content
import PyPDF2
import os
from langchain.text_splitter import CharacterTextSplitter

# Function to read content from a PDF file
def read_pdf(file_path):
    if not os.path.exists(file_path):  # Check if the file exists
        raise FileNotFoundError(f"The file {file_path} does not exist.")
    
    content = ""
    with open(file_path, 'rb') as file:  # Open the PDF file in binary mode
        reader = PyPDF2.PdfFileReader(file)  # Initialize PDF reader
        for page_num in range(reader.numPages):  # Iterate through each page
            page = reader.getPage(page_num)  # Get a page
            content += page.extract_text() + "\n"  # Extract text and add newline
    return content

# Function to preprocess content using Langchain's text splitter
def preprocess_content(content):
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)  # Initialize text splitter
    return splitter.split_text(content)  # Split content into chunks

# Example usage
pdf_content = read_pdf('/mnt/data/Contextual_Q&A_Bot.pdf')  # Read PDF content
processed_content = preprocess_content(pdf_content)  # Preprocess the content


### 2. Developing the Q&A Bot with RAG

In [None]:

# Import necessary libraries for the Q&A bot
from langchain.llms import OpenSourceLLM
from langchain.chains import QAChain
from langchain.retrievers import EmbeddingRetriever
from sentence_transformers import SentenceTransformer

# Load the Language Model (Mistral 7b) with 32-bit configuration for efficiency
llm = OpenSourceLLM(model_name="mistral-7b", use_32bit=True)

# Load a pre-trained embedding model (MiniLM) for embeddings
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Create an Embedding Retriever for RAG (Retrieval-Augmented Generation)
retriever = EmbeddingRetriever(embedding_model=embedding_model, documents=processed_content)

# Create a Q&A Chain using the Language Model and Retriever
qa_chain = QAChain(llm=llm, retriever=retriever)

# Function to answer questions using the Q&A Chain
def answer_question(question):
    return qa_chain.run(question=question)

# Example usage
response = answer_question("What is the main topic of the document?")  # Ask a question
print(response)  # Print the response


### 3. Context-Aware Responses with Advanced ML

In [None]:

# Import necessary libraries for context-aware responses
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Placeholder for context documents and labels for training the context relevance model
context_documents = processed_content[:100]  # Use the first 100 chunks for training
labels = [1 if "relevant" in doc else 0 for doc in context_documents]  # Example labels (adjust as needed)

# Train a logistic regression model to determine context relevance
X_train, X_test, y_train, y_test = train_test_split(context_documents, labels, test_size=0.2, random_state=42)
context_model = make_pipeline(TfidfVectorizer(), LogisticRegression())  # Create a pipeline with TF-IDF and Logistic Regression
context_model.fit(X_train, y_train)  # Train the model

# Evaluate the model
y_pred = context_model.predict(X_test)  # Predict on test set
print(f"Context relevance model accuracy: {accuracy_score(y_test, y_pred)}")  # Print accuracy

# Function to check if a question is in context using the trained model
def is_question_in_context(question, context_documents):
    context_scores = context_model.predict_proba([question] * len(context_documents))[:, 1]  # Get probabilities
    max_score = np.max(context_scores)  # Get maximum score
    threshold = 0.5  # Define a threshold for context relevance
    return max_score > threshold

# Function to handle questions by checking context and answering
def handle_question(question):
    context = retriever.retrieve(question)  # Use retriever to get the most relevant context
    if is_question_in_context(question, context):  # Check if question is in context
        return answer_question(question)  # Answer the question if in context
    else:
        return "The question is out-of-context."  # Return out-of-context message

# Example usage
response = handle_question("What is the main topic of the document?")  # Ask a question
print(response)  # Print the response


### 4. Scalability Considerations

In [None]:

# Import necessary libraries for scalable preprocessing
from pyspark.sql import SparkSession

# Function to preprocess large PDFs using Spark for scalability
def scalable_preprocess(file_path):
    spark = SparkSession.builder.appName("PDFPreprocessing").getOrCreate()  # Initialize Spark session
    pdf_rdd = spark.sparkContext.binaryFiles(file_path).flatMap(lambda x: read_pdf(x[1]).split('\n'))  # Read and split PDF content
    processed_content = pdf_rdd.collect()  # Collect the results
    spark.stop()  # Stop the Spark session
    return processed_content

# Example usage
processed_content = scalable_preprocess('/mnt/data/Contextual_Q&A_Bot.pdf')  # Preprocess PDF content
