# Import Required Libraries


In [1]:
# Import Required Libraries

import os
import logging
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from typing_extensions import List, Optional
from datetime import date
import gradio as gr
from pydantic import BaseModel, Field, ValidationInfo, field_validator
from dotenv import load_dotenv
import pandas as pd

# Define Helper Functions for Semantic Search


In [2]:
# Define Helper Functions

def process_markdown_for_embeddings(
    file_path="./data/ocr.md",
    chunk_size=100,
    chunk_overlap=50,
):
    """Read a Markdown file and split it into overlapping chunks for embedding.

    Args:
        file_path: Path of the UTF-8 Markdown file to read. Defaults to the
            OCR output file used by this notebook.
        chunk_size: Target maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters neighbouring chunks may share.

    Returns:
        The list produced by ``RecursiveCharacterTextSplitter.create_documents``
        for the file's text, or ``None`` when reading or splitting fails
        (the error is logged rather than raised).
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            full_text = f.read()

        # Plain-string separators (not regexes); length is counted in characters.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )

        return text_splitter.create_documents([full_text])
    except Exception as e:
        # Best-effort by design: callers check for None instead of catching.
        logging.error(f"Error processing Markdown for embedding: {e}")
        return None

def setup_rag(document_splits=None, index_path="./data/faiss_index"):
    """Build or load the FAISS vector store and publish it as ``vector_store``.

    Args:
        document_splits: Documents to embed. When truthy, a new index is built
            from them and saved to ``index_path``. When falsy, an existing
            index is loaded from ``index_path`` instead.
        index_path: Directory the FAISS index is saved to / loaded from.

    Returns:
        The FAISS vector store, or ``None`` when no index could be loaded
        (the reason is logged). On that path the global ``vector_store``
        keeps whatever value it had before the call.
    """
    global vector_store
    # Make sure the global name is bound even if everything below fails, so
    # later "is it initialized?" checks see None instead of hitting NameError.
    vector_store = globals().get("vector_store")

    # The credentials may live in a .env file; load it here because this
    # function can run before any other cell has called load_dotenv().
    # Variables already present in the environment are left untouched.
    load_dotenv()
    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    azure_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")

    # Initialize embeddings
    embeddings = AzureOpenAIEmbeddings(
        azure_deployment="text-embedding-ada-002",
        openai_api_version="2023-05-15",
        azure_endpoint=azure_endpoint,
        api_key=azure_openai_api_key,
    )

    if document_splits:
        vector_store = FAISS.from_documents(document_splits, embeddings)
        # Persist the index so a later call without documents can reload it.
        vector_store.save_local(index_path)
    else:
        try:
            # SECURITY: load_local unpickles the stored docstore, which can
            # execute arbitrary code. The flag is required by current
            # langchain_community releases; only point index_path at an index
            # written by save_local above, never at an untrusted file.
            vector_store = FAISS.load_local(
                index_path,
                embeddings,
                allow_dangerous_deserialization=True,
            )
        except Exception as e:
            # No usable index on disk: report why instead of failing silently.
            logging.error(f"Could not load FAISS index from {index_path}: {e}")
            return None
    return vector_store

def is_vector_store_initialized():
    """Report whether the module-level ``vector_store`` exists and is not ``None``.

    Returns ``False`` (instead of raising ``NameError``) when the global has
    never been assigned, e.g. before ``setup_rag`` has run.
    """
    return globals().get("vector_store") is not None

def semantic_search(query, k, filter=None):
    """Look up the ``k`` stored chunks most similar to ``query``.

    ``filter`` is handed through unchanged to the vector store's
    ``similarity_search``. If the vector store is not initialized, an error
    is logged and ``None`` is returned instead of a result list.
    """
    if is_vector_store_initialized():
        return vector_store.similarity_search(query, k=k, filter=filter)

    logging.error("Vector store is not initialized.")
    return None

# Process Markdown for Embeddings

In [3]:
# Process Markdown for Embeddings

document_splits = process_markdown_for_embeddings()

# process_markdown_for_embeddings() returns None when reading or splitting
# fails; stop with a clear message instead of a TypeError from slicing None.
if document_splits is None:
    raise RuntimeError("Markdown processing failed; see the logged error for details.")

# Display the first few document splits to verify
document_splits[:5]

[Document(metadata={}, page_content='<figure>\n</figure>\n\n\nMC-2202\n\n\n<figure>\n\nsterling\nACCURIS\nPathology lab that cares\n\n</figure>'),
 Document(metadata={}, page_content='</figure>\n\n\nScan QR code to check\nreport authenticity\n\nPassport No :\n\n\n# LABORATORY TEST REPORT'),
 Document(metadata={}, page_content='<table>\n<tr>\n<th>Patient Information</th>\n<th colspan="2">Sample Information</th>'),
 Document(metadata={}, page_content='<th colspan="2">Sample Information</th>\n<th>Client/Location Information</th>\n</tr>\n<tr>'),
 Document(metadata={}, page_content='<td rowspan="2">Name : Lyubochka Svetka Sex/Age Male / 41 Y 01-Feb-1982 Ref. Id : Ref. By :</td>')]

# Setup RAG (Retrieval-Augmented Generation)

In [4]:
# Setup RAG (Retrieval-Augmented Generation)

# Embed the document splits into a FAISS store; setup_rag hands back the
# store, or None when no index could be loaded.
rag_chain = setup_rag(document_splits)

# Report the outcome of the setup call
print("RAG setup successful." if rag_chain else "RAG setup failed.")

RAG setup successful.


# Check Vector Store Initialization

In [5]:
# Check Vector Store Initialization

# Ask the helper whether the module-level vector store has been set
# (it compares the global `vector_store` against None).
is_initialized = is_vector_store_initialized()

# Last expression of the cell, so the boolean is shown as the cell output
is_initialized

True

# Perform Semantic Search

In [6]:
# Perform Semantic Search

# Sample query string; request the 5 closest chunks from the vector store.
# semantic_search returns None (after logging) if the store is not initialized.
query = "Yash Shah"
results = semantic_search(query, k=5)

# Last expression of the cell, so the matched documents are shown as output
results

[Document(id='a575cb8b-5443-4e5d-a1a8-f06fcbe0e9ca', metadata={}, page_content='Dr.Yash Shah\nMD Path\n\n\\# Referred Test\n\n<!-- PageNumber="Page 6 of 19" -->'),
 Document(id='3047c552-8b9c-4ca4-a424-f10fe0c9357e', metadata={}, page_content='Dr.Yash Shah\nMD Path\n\n\\# Referred Test\n\n<!-- PageNumber="Page 8 of 19" -->'),
 Document(id='ff58991d-b43d-41b2-8d5b-43bdc577a4e7', metadata={}, page_content='Dr.Yash Shah\nMD Path\n\n\\# Referred Test\n\n<!-- PageNumber="Page 3 of 19" -->'),
 Document(id='2c3400be-c7b5-4690-8948-0e6e4b578443', metadata={}, page_content='Dr.Yash Shah\nMD Path\n\n\\# Referred Test\n\n<!-- PageNumber="Page 11 of 19" -->'),
 Document(id='e9b44ff8-5934-460e-9bcb-a4fe080423e8', metadata={}, page_content='Dr.Yash Shah\nMD Path\n\n\\# Referred Test\n\n<!-- PageNumber="Page 4 of 19" -->')]

# Define Demographics Model

In [7]:
class Demographics(BaseModel):
    """Information about a person."""

    # NOTE: the class docstring and the Field descriptions are part of the
    # schema this model exposes, so their wording is kept stable on purpose.
    # Every field is optional and defaults to None.
    patient_first_name: Optional[str] = Field(
        default=None, description="First Name of the patient"
    )
    patient_last_name: Optional[str] = Field(
        default=None, description="Last Name of the patient"
    )
    patient_dob: Optional[date] = Field(
        default=None, description="Date of birth of the patient in YYYY-MM-DD format"
    )
    patient_phone: Optional[str] = Field(
        default=None, description="Phone number of the patient"
    )
    patient_address: Optional[str] = Field(
        default=None, description="Address of the patient"
    )
    patient_sex: Optional[str] = Field(
        default=None, description="Sex of the patient"
    )

    @field_validator('patient_first_name', 'patient_last_name', mode='after')
    @classmethod
    def validate_name(cls, value: Optional[str], info: ValidationInfo) -> Optional[str]:
        """Best-effort check that an extracted name appears in the indexed text.

        The value is always returned unchanged: this validator never rejects
        or rewrites a name. It only logs a warning when the top semantic
        search hit does not contain the name verbatim (case-sensitive
        substring match), or when the lookup itself fails.

        Args:
            value: The first or last name being validated; may be empty/None.
            info: Pydantic validation context (currently unused).

        Returns:
            ``value`` as received.
        """
        if not value:
            return value
        try:
            if not is_vector_store_initialized():
                return value  # Skip validation if vector store isn't ready

            # semantic_search returns None when the store is unavailable;
            # treat that as "no matches" instead of failing inside any().
            matches = semantic_search(value, k=1) or []

            # Debug level: the hits are raw document text and should not be
            # echoed into normal notebook output.
            logging.debug(f"Validation result for {value}: {matches}")

            if not any(value in match.page_content for match in matches):
                logging.warning(f"Warning: Could not verify {value} in the knowledge base")
            return value
        except Exception as e:
            # Deliberately non-fatal: a lookup failure must not block extraction.
            logging.warning(f"Warning: Validation error for {value}: {str(e)}")
            return value

# Define Data Model

In [8]:
# Define Data Model

# NOTE(review): this class is passed as `schema=Data` to
# `llm.with_structured_output` further down; its docstring is presumably
# forwarded as part of that schema, so reword it with care — TODO confirm.
class Data(BaseModel):
    """Extracted data about patient"""

    # A list rather than a single Demographics, so one extraction call can
    # return several people found in the same input text.
    people: List[Demographics]

In [9]:
# Load environment variables
load_dotenv()

# Check required environment variables

azure_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")


if not azure_openai_api_key:
    raise ValueError("AZURE_OPENAI_API_KEY environment variable not set")
if not azure_endpoint:
    raise ValueError("AZURE_OPENAI_ENDPOINT environment variable not set")
# The deployment name was read but never checked. Warn rather than raise:
# the client is still constructed below, just without an explicit deployment.
if not azure_deployment_name:
    logging.warning(
        "AZURE_OPENAI_DEPLOYMENT_NAME environment variable not set; "
        "AzureChatOpenAI will be created without an explicit deployment name"
    )

# Initialize Azure OpenAI client
llm = AzureChatOpenAI(
    azure_endpoint=azure_endpoint,
    api_key=azure_openai_api_key,
    api_version="2024-02-15-preview",
    deployment_name=azure_deployment_name,  # taken from AZURE_OPENAI_DEPLOYMENT_NAME
    # Ask for token log-probabilities (top 1 alternative per token)
    logprobs=True,
    top_logprobs=1
)

# Process Text for Entity Extraction

In [10]:
# Process Text for Entity Extraction

# Initialize structured LLM with the defined schema:
# wrap the chat model with the Data schema; process_text invokes this
# wrapper and reads `.people` from what it returns.
structured_llm = llm.with_structured_output(schema=Data)

def _message_frame(message):
    """Wrap a status or error message in a one-cell DataFrame for display."""
    return pd.DataFrame({"Message": [message]})


def process_text(text_input):
    """Extract patient demographics from free text into a DataFrame.

    The text is sent unchanged (the prompt template is a plain ``{text}``
    passthrough) to the structured LLM, and every extracted person becomes
    one row.

    Args:
        text_input: The raw text to extract demographics from.

    Returns:
        A DataFrame with the columns ``First Name``, ``Last Name``,
        ``Date of Birth``, ``Phone``, ``Address`` and ``Sex``. Missing string
        values are shown as ``""``; a missing date of birth stays ``None``.
        When nothing is extracted or an error occurs, a one-cell DataFrame
        with a single ``Message`` column is returned instead, so the result
        can always be rendered by a dataframe output component.
    """
    try:
        # Create a prompt template
        prompt_template = PromptTemplate(input_variables=["text"], template="{text}")
        prompt = prompt_template.invoke({"text": text_input})

        # Invoke the structured LLM with the prompt
        result = structured_llm.invoke(prompt)

        # Defensive: treat a missing result the same as an empty people list.
        if result is None or not result.people:
            return _message_frame("No data was extracted from the text")

        rows = [
            {
                "First Name": person.patient_first_name or "",
                "Last Name": person.patient_last_name or "",
                "Date of Birth": person.patient_dob,
                "Phone": person.patient_phone or "",
                "Address": person.patient_address or "",
                "Sex": person.patient_sex or "",
            }
            for person in result.people
        ]
        return pd.DataFrame(rows)

    except Exception as e:
        # Best-effort by design: report the failure in the output table
        # rather than raising into the UI callback.
        logging.error(f"Error during processing: {str(e)}")
        return _message_frame(f"Error processing the text: {str(e)}")

# Create Gradio Interface

In [None]:
# Create Gradio Interface

# Set default text and OCR file path
# (the placeholder text is used when the OCR file is absent or unreadable)
default_text = "Please input text to extract demographics."
ocr_file_path = "./data/ocr.md"

# Try to read the OCR file and set the default text
# A missing file is skipped silently; any other read problem only prints a
# warning, so the interface is still created with the placeholder text.
try:
    if os.path.exists(ocr_file_path):
        with open(ocr_file_path, 'r', encoding='utf-8') as file:
            default_text = file.read()
except Exception as e:
    print(f"Warning: Could not read OCR file: {str(e)}")

# Create Gradio interface
# One text box in (pre-filled with default_text), one dataframe out;
# process_text is called with the text box content.
demo = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(value=default_text, lines=10, label="Input Text"),
    outputs=gr.Dataframe(),
    title="Demographics Extractor",
    description="Extract patient demographics from medical documents",
)

# Launch the Gradio interface with sharing enabled
# NOTE(review): share=True requests a publicly reachable link, and the text
# box is pre-filled with the full OCR document. If that file holds real
# patient data, confirm public sharing is intended or switch to share=False.
demo.launch(share=True)

2025-02-24 11:28:48.403 
  command:

    streamlit run /home/appuser/.local/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]
2025-02-24 11:28:48.408 Session state does not function when running a script without `streamlit run`


In [16]:
# NOTE(review): this points Streamlit at ipykernel_launcher.py, which by its
# path is the Jupyter kernel launcher rather than an app defined in this
# notebook, and it uses an absolute, machine-specific path. The UI in this
# notebook is built with Gradio, so this cell looks like a leftover —
# confirm and remove.
!streamlit run /home/appuser/.local/lib/python3.11/site-packages/ipykernel_launcher.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://10.0.2.215:8501[0m
[34m  External URL: [0m[1mhttp://4.240.39.197:8501[0m
[0m
[34m  Stopping...[0m
^C
