Step 0: Set up your .env file locally
Set your OPENAI_API_BASE and OPENAI_API_KEY values in a file named .env in this same folder.

# example .env contents (copy paste this into a .env file)
OPENAI_API_BASE=yourapibase
OPENAI_API_KEY=yourapikey
Install the required dependencies.

In [None]:
%pip install -q -r requirements.txt

: 

In [None]:
%pip install sklearn plotly chromadb openai

: 

In [None]:
# You don't need to change this, just run this cell
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env.
import openai

: 

Step 1: Create the Dataset

In [None]:
pip install pdfplumber

: 

In [None]:
import pdfplumber
import pandas as pd

: 

def extract_information_from_pdf(pdf_file, textbook_title="Case Files Internal Medicine"):
    """Extract the text of every page of one PDF into a DataFrame.

    Args:
        pdf_file: Path of the PDF, passed straight to ``pdfplumber.open``.
        textbook_title: Value stored in the "Textbook Title" column of every
            row. Defaults to the title that used to be hard-coded here, so
            existing single-argument calls behave as before.

    Returns:
        A DataFrame with one row per page and the columns
        "Textbook Title", "Page Number" and "Information" (the page text).
    """
    with pdfplumber.open(pdf_file) as pdf:
        dataset = []
        for page in pdf.pages:
            text = page.extract_text()
            dataset.append([textbook_title, page.page_number, text])

    return pd.DataFrame(dataset, columns=["Textbook Title", "Page Number", "Information"])

pdf_file = "data/Case files Internal Medicine.pdf"
dataset = extract_information_from_pdf(pdf_file)
dataset.to_csv("output.csv", index=False)

In [None]:
import os

def extract_information_from_pdf(pdf_files):
    """Collect the text of every page of several PDFs into one DataFrame.

    Args:
        pdf_files: Iterable of PDF paths. Each file's base name (without
            directory or extension) is used as its textbook title.

    Returns:
        A DataFrame with one row per page and the columns
        "Textbook Title", "Page Number" and "Information" (the page text).
    """
    rows = []
    for path in pdf_files:
        # File name without directory and extension serves as the title.
        title, _extension = os.path.splitext(os.path.basename(path))
        with pdfplumber.open(path) as document:
            for page in document.pages:
                page_text = page.extract_text()
                rows.append([title, page.page_number, page_text])

    column_names = ["Textbook Title", "Page Number", "Information"]
    return pd.DataFrame(rows, columns=column_names)

pdf_files = ["data/Case files Internal Medicine.pdf", "data/Harrison's Principles of Internal Medicine.pdf"]
dataset = extract_information_from_pdf(pdf_files)
dataset.to_csv("output.csv", index=False)

: 

In [None]:
datafile_path = "output.csv"
book_df = pd.read_csv(datafile_path)
display(book_df)

: 

In [None]:
# Keep only the "Textbook Title", "Page Number", and "Information" columns
book_df = book_df[["Textbook Title", "Page Number", "Information"]].copy()

# Drop rows that contain a NaN in any of the kept columns
book_df_clean = book_df.dropna()

# Display the cleaned DataFrame
display(book_df_clean)

: 

In [None]:
import pdfplumber
import os

def extract_information_from_pdf(pdf_file):
    """Print the title, page number and extracted text of every page of a PDF.

    Args:
        pdf_file: Path of the PDF; its base name (without directory or
            extension) is printed as the textbook title.
    """
    # The title depends only on the file name, so compute it once.
    base_name = os.path.basename(pdf_file)
    textbook_title = os.path.splitext(base_name)[0]

    with pdfplumber.open(pdf_file) as document:
        for page in document.pages:
            information = page.extract_text()
            page_number = page.page_number

            # One print call; the trailing "\n" item keeps the blank lines between pages.
            print(
                f"Textbook Title: {textbook_title}",
                f"Page Number: {page_number}",
                f"Information: {information}",
                "\n",
                sep="\n",
            )

pdf_files = ["data/Case files Internal Medicine.pdf", "data/Harrison's Principles of Internal Medicine.pdf"]
for pdf_file in pdf_files:
    extract_information_from_pdf(pdf_file)

: 

In [None]:
%pip install --quiet chromadb

: 

In [None]:
import chromadb

## Use this one to save to memory
# chroma_client = chromadb.Client() 

## Use this one to save to disk
chroma_client = chromadb.PersistentClient(path=".")

: 

In [None]:
# You don't need to change this, just run this cell
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env.
import os

from chromadb.utils import embedding_functions

openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key=os.getenv("OPENAI_API_KEY"),
                api_base=os.getenv("OPENAI_API_BASE"),
                model_name="text-embedding-ada-002"
            )

: 

In [None]:
collection = chroma_client.get_or_create_collection(name="book", embedding_function=openai_ef)

: 

In [None]:
datafile_path = "output.csv"
book_df = pd.read_csv(datafile_path)
display(book_df)

: 

In [None]:
# Keep only the "Textbook Title", "Page Number", and "Information" columns
book_df = book_df[["Textbook Title", "Page Number", "Information"]].copy()

# Drop rows that contain a NaN in any of the kept columns
book_df_clean = book_df.dropna()

# Display the cleaned DataFrame
display(book_df_clean)

: 

In [None]:
# Add a "text" column: the textbook title followed by the page text.
# assign() returns a new DataFrame instead of writing into book_df_clean in
# place, which avoids pandas' SettingWithCopyWarning on the dropna() result.
book_df_clean = book_df_clean.assign(
    text=book_df_clean['Textbook Title'] + " " + book_df_clean['Information']
)

# Display the updated DataFrame
display(book_df_clean)

: 

In [None]:
import uuid

# How many documents are sent to the collection per add() call
batch_size = 100

# Text of every row, in DataFrame order
documents = book_df_clean['text'].tolist()

# One metadata dict per row, in the same order as `documents`
metadatas = [
    {
        "source": "output.csv",
        "textbook_title": row["Textbook Title"],
        "Page Number": row["Page Number"],
        "Information": row["Information"],
    }
    for _, row in book_df_clean.iterrows()
]

# A fresh random ID for every document
ids = [str(uuid.uuid4()) for _ in documents]

# Ceiling division: a final partial batch still counts as a batch
num_batches = -(-len(documents) // batch_size)

# Step through the data one batch-sized slice at a time
for i, start_index in enumerate(range(0, len(documents), batch_size)):
    print(f"Adding batch {i+1}/{num_batches}")
    end_index = start_index + batch_size

    # Add the batch to the Chroma collection (triggers OpenAI embeddings for each document under the hood)
    batch = slice(start_index, end_index)
    collection.add(
        documents=documents[batch],
        metadatas=metadatas[batch],
        ids=ids[batch]
    )

: 

In [None]:
n_results = 10
results = collection.query(
    query_texts=["Cardiogenic shock"],
    n_results=n_results
)

: 

In [None]:
print(results)

: 

In [None]:
# Iterate over the hits that were actually returned for the (single) query
# instead of indexing with range(n_results): if fewer than n_results hits
# come back, indexing past the end would raise IndexError.
for i, metadata in enumerate(results["metadatas"][0][:n_results]):
    title = metadata["textbook_title"]
    information = metadata["Information"]
    page_number = metadata["Page Number"]

    print(f"\nResult {i+1}:")
    print(title)
    print(information)
    print(f"Page Number: {page_number}")

: 

In [None]:
import pandas as pd

# One [page number, title, text] row per hit that was actually returned for
# the (single) query. Slicing instead of indexing with range(n_results)
# avoids an IndexError when fewer than n_results hits come back.
results_list = [
    [metadata["Page Number"], metadata["textbook_title"], metadata["Information"]]
    for metadata in results["metadatas"][0][:n_results]
]

results_df = pd.DataFrame(results_list, columns=['page_number','title', 'information'])
display(results_df)

: 

In [None]:
def search_medical_knowledge(query, *, n_results=2):
    """Look up the pages most similar to ``query`` in the book collection.

    Args:
        query: Free-text search string.
        n_results: Maximum number of hits to return (keyword-only so that
            stray positional arguments still raise ``TypeError``). The
            default of 2 matches the previously hard-coded value.

    Returns:
        A list with the metadata dict of each hit, at most ``n_results`` long.
    """
    results = collection.query(query_texts=[query], n_results=n_results)

    # "metadatas" holds one list per query text; exactly one query was sent.
    return list(results["metadatas"][0][:n_results])

: 

In [None]:
results = search_medical_knowledge("Cardiogenic shock")

print(results)

: 

In [None]:
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env.
import gradio as gr
import openai
import re

# ---------------------------------------------------------------------------------------
def needs_tool(response):
    """Return True when the model reply contains the "Tool:" marker."""
    tool_marker = "Tool:"
    return tool_marker in response

def extract_call(string):
    """Parse a ``Tool: name(arg, ...)`` call out of a model reply.

    Quoted arguments are parsed as Python string literals, so a query that
    contains a comma or a closing parenthesis stays in one piece and both
    quote styles are stripped. Anything that cannot be parsed that way
    (e.g. unquoted arguments) falls back to stripping double quotes and
    splitting on ", ".

    Args:
        string: Text of the model reply.

    Returns:
        ``(tool_name, parameters)`` where ``parameters`` is a list of
        strings, or ``(None, None)`` when the reply contains no tool call.
    """
    import ast  # local import: only needed for literal argument parsing

    # regex pattern: tool name, then the arguments up to the first ")"
    pattern = r'Tool: (\w+)\((.*?)\)'
    match = re.search(pattern, string)
    if match is None:
        return None, None

    tool_name = match.group(1)
    args_start = match.start(2)

    # Never look for the closing parenthesis beyond the line of the call.
    line_end = string.find("\n", args_start)
    if line_end == -1:
        line_end = len(string)

    # Try each ")" on the line as the end of the argument list, nearest
    # first, until the text in between parses as a tuple of literals.
    close = match.end(2)
    while close != -1:
        try:
            values = ast.literal_eval(f"({string[args_start:close]},)")
        except (ValueError, SyntaxError, TypeError):
            close = string.find(")", close + 1, line_end)
            continue
        if all(isinstance(value, str) for value in values):
            return tool_name, list(values)
        # Non-string literals: keep the legacy string-splitting result.
        break

    # Legacy parsing for arguments that are not plain string literals.
    parameters = match.group(2).replace('"', '').split(', ')
    return tool_name, parameters
    
def invoke_tool(response):
    """Run the tool call contained in a model reply and return its result.

    Args:
        response: Text of the model reply, expected to contain a
            ``Tool: name(args)`` call.

    Returns:
        Whatever the invoked tool returns.

    Raises:
        ValueError: If no tool call can be parsed or the tool is unknown.
            Previously this path hit ``return tool_result`` with the name
            unbound and failed with an UnboundLocalError.
    """
    tool_name, parameters = extract_call(response)

    if tool_name == "search_medical_knowledge":
        return search_medical_knowledge(*parameters)

    raise ValueError(f"Unknown or malformed tool call: {response!r}")
# ---------------------------------------------------------------------------------------
# Define a function to get the AI's reply using the OpenAI API
def get_ai_reply(message, model="gpt-4", system_message=None, temperature=0, message_history=None):
    """Get the AI's reply through the OpenAI ChatCompletion API.

    Args:
        message: The user's message; when None, no user message is appended
            and the model replies to the history alone.
        model: Name of the chat model to call.
        system_message: Optional system prompt placed first in the messages.
        temperature: Sampling temperature passed to the API.
        message_history: Optional list of earlier ``{"role", "content"}``
            messages. Defaults to None rather than a shared mutable list.

    Returns:
        The content of the first choice, stripped of surrounding whitespace.
    """
    # Initialize the messages list
    messages = []

    # Add the system message to the messages list
    if system_message is not None:
        messages.append({"role": "system", "content": system_message})

    # Add the message history to the messages list
    if message_history is not None:
        messages.extend(message_history)

    # Add the user's message to the messages list
    if message is not None:
        messages.append({"role": "user", "content": message})

    # Make an API call to the OpenAI ChatCompletion endpoint with the model and messages
    completion = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature
    )

    # Extract and return the AI's response from the API response
    return completion.choices[0].message.content.strip()
# ---------------------------------------------------------------------------------------
# Define a function to handle the chat interaction with the AI model
def chat(message, chatbot_messages, history_state, dropdown_value):
    """Handle one chat turn: query the model, run any requested tools, update state.

    Args:
        message: The user's message from the textbox.
        chatbot_messages: List of ``(user, assistant)`` pairs shown in the UI,
            or None on the first turn.
        history_state: List of ``{"role", "content"}`` messages sent to the
            model, or None on the first turn.
        dropdown_value: Selected chatbot version. Any casing of
            "Interactive" (including the legacy "Interative" spelling)
            selects the interactive tutor prompt; everything else, including
            None, selects the non-interactive prompt.

    Returns:
        ``(None, chatbot_messages, history_state)``; the None clears the
        message textbox.

    Raises:
        gr.Error: If getting a reply or invoking a tool fails.
    """
    # Initialize chatbot_messages and history_state if they are not provided
    chatbot_messages = chatbot_messages or []
    history_state = history_state or []

    # The dropdown label and the value checked here used to differ
    # ("Interative" vs "INTERACTIVE"), so the interactive prompt was never
    # selected. Normalize the value and accept both spellings.
    mode = str(dropdown_value or "").strip().upper()
    interactive = mode in ("INTERACTIVE", "INTERATIVE")

    # Try to get the AI's reply using the get_ai_reply function
    try:

        if interactive:
            
            prompt = """
            You are a helpful expert medical topic tutor named LEO.
    
            ## Tools
    
            You have access to the following tools:
            - search_medical_knowledge(query): Tool to lookup medical knowledge based on their query. Use this tool when the user is looking for medical knowledge. You don't need to know all of the patient's information, the tool doesn't need it. Be careful, the tool will return the top results in the database but not all of them may be relevant. Use your judgement when answering the user's question. Example: search_medical_knowledge("most painful symptoms for Cardiogenic shock")
    
            ## Tool Rules
    
            When the user asks a question that can be answered by using a tool, you MUST do so. Do not answer from your training data.
    
            ## Using Tools
    
            To use a tool, reply with the following prefix "Tool: " then append the tool call (like a function call). 
    
            Behind the scenes, your software will pickup that you want to invoke a tool and invoke it for you and provide you the response.
    
            ## Using Tool Responses
            
            You are a helpful expert medical topic tutor named LEO.
            Answer the user's question using the response from the tool BUT answer it with questions, open ended repsonses or simple answers.

            Begin the conversation by asking what the user already knows about the topic.
            Check their answer for any inconsistencies and continue an interactive conversation.
            Provide an answer and another question the user can answer.
      
            """
        else:
            prompt = """
            You are a helpful expert medical topic tutor named Jarvis.
    
            ## Tools
    
            You have access to the following tools:
            - search_medical_knowledge(query): Tool to lookup medical knowledge based on their query. Use this tool when the user is looking for medical knowledge. You don't need to know all of the patient's information, the tool doesn't need it. Be careful, the tool will return the top results in the database but not all of them may be relevant. Use your judgement when answering the user's question. Example: search_medical_knowledge("most painful symptoms for Cardiogenic shock")
    
            ## Tool Rules
    
            When the user asks a question that can be answered by using a tool, you MUST do so. Do not answer from your training data.
    
            ## Using Tools
    
            To use a tool, reply with the following prefix "Tool: " then append the tool call (like a function call). 
    
            Behind the scenes, your software will pickup that you want to invoke a tool and invoke it for you and provide you the response.
    
            ## Using Tool Responses
            Answer the user's question using the response from the tool. Feel free to make it conversational
            """
        system_prompt = prompt.strip()
        ai_reply = get_ai_reply(message, model="gpt-3.5-turbo", system_message=system_prompt, message_history=history_state)
            
        # Append the user's message and the AI's reply to the history_state list
        history_state.append({"role": "user", "content": message})
        history_state.append({"role": "assistant", "content": ai_reply})

        # Keep invoking tools until the model answers without a "Tool:" call
        while needs_tool(ai_reply):
            print(ai_reply)
            tool_result = invoke_tool(ai_reply)
            history_state.append({"role": "assistant", "content": f"Tool Result: {tool_result}"})
            ai_reply = get_ai_reply(None, model="gpt-3.5-turbo", system_message=system_prompt, message_history=history_state)
            history_state.append({"role": "assistant", "content": ai_reply})
        
        # Append the user's message and the AI's final reply to the chatbot_messages list for the UI
        chatbot_messages.append((message, ai_reply))

    except Exception as e:
        # Surface the failure in the UI; gr.Error takes a message string, and
        # chaining keeps the original traceback for debugging
        raise gr.Error(str(e)) from e
        
    # Return None (empty out the user's message textbox), the updated chatbot_messages, and the updated history_state
    return None, chatbot_messages, history_state



# Define a function to launch the chatbot interface using Gradio
def get_chatbot_app():
    """Build the Gradio Blocks interface for the tutor chatbot.

    Returns:
        The ``gr.Blocks`` app, not yet launched.
    """
    # Create the Gradio interface using the Blocks layout
    with gr.Blocks() as app:
        # Create a read-only title above the conversation. interactive=False
        # is the supported way to make a Textbox read-only; the previous
        # `readonly` and `style` keyword arguments are not Textbox parameters.
        gr.Textbox(value="TUTOR", label=" ", interactive=False, placeholder="TUTOR")
        # Create a chatbot interface for the conversation
        chatbot = gr.Chatbot(label="Conversation")
        # Create a dropdown menu to pick the chatbot version. "INTERACTIVE"
        # is the exact value chat() checks for; the old "Interative" choice
        # never matched. gr.Dropdown replaces the deprecated gr.inputs.Dropdown.
        dropdown = gr.Dropdown(choices=["INTERACTIVE", "NON-INTERACTIVE"], label="Choose Chatbox Version")
        # Create a textbox for the user's message
        message = gr.Textbox(label="Message")
        # Create a state object to store the conversation history
        history_state = gr.State()
        # Create a button to send the user's message
        btn = gr.Button(value="Send")

        # Connect the send button to the chat function
        btn.click(chat, inputs=[message, chatbot, history_state, dropdown], outputs=[message, chatbot, history_state])

    # Return the app
    return app
# ---------------------------------------------------------------------------------------        
# Build the chatbot interface with get_chatbot_app and start it using Gradio
app = get_chatbot_app()
app.queue()  # this is to be able to queue multiple requests at once
# NOTE(review): share=True presumably requests a public Gradio share link — confirm that exposing this app is intended
app.launch(share=True)

: 

In [None]:
!git add hack-research-proj.ipynb

: 

: 

In [None]:
!git commit -m "Progress"

: 

In [None]:
!git push

: 

: 