# MIMIC-III Dataset

`patient_data.csv` is the cleaned patient data prepared by teammates Ananay and Shreyan.

In [None]:
import pandas as pd
import random

# Load the cleaned patient notes; the row count is reused later when a random
# note is sampled.
df = pd.read_csv("patient_data.csv")
n = df.shape[0]

# Preview the first rows (rendered as the cell output in a notebook).
df.head()

Cleaning the `notes` column of the dataframe.

In [None]:
# def clean_str(input):
#     input = input.split()
#     input = " ".join(input).strip()
#     return input

# df["notes"] = df["notes"].apply(clean_str)

# # Assuming df is your DataFrame
# with open(f"anonymized_patient_notes.txt", 'w', encoding='utf-8') as file:
#     for _, row in df.iterrows():
#         file.write(row['notes'])

## MIMIC-III QA Pair Construction

An attempt to construct question-answer pairs from the medical notes using local models. With the code below, the attempt was unsuccessful.

For a higher chance of success, consider first applying the ChatML format (i.e. constructing a conversation list).

Also, ollama is fairly slow. Consider using `llama-cpp-python`.

In [None]:
import requests
from pprint import pprint
import json


def ollama_request(prompt, model="hermes"):
    """Send one non-streaming generation request to the local Ollama server.

    The request is posted to ``http://localhost:11434/api/generate`` with
    ``stream=False`` and ``raw=True``, and the full JSON reply is
    pretty-printed for inspection.

    Args:
        prompt: Prompt text placed in the request payload.
        model: Name of the model to run; forwarded in the request payload.

    Returns:
        The value of the ``response`` field of the JSON reply.

    Raises:
        KeyError: If the reply contains no ``response`` field (the reply is
            printed before this happens, so the server's message is visible).
    """
    url = "http://localhost:11434/api/generate"
    # Forward the caller's model choice; it used to be hard-coded to "hermes",
    # which silently ignored the ``model`` argument.
    param = {"model": model, "prompt": prompt, "stream": False, "raw": True}
    res = requests.post(url, json=param).json()
    pprint(res)
    # The unused tokens-per-second computation was dropped: it was never
    # reported and could fail on a missing or zero ``eval_duration``.
    return res["response"]


def obtain_qa(note, model="hermes"):
    """Generate a question/answer pair for a medical note using two model calls.

    The first call asks for the question the patient most likely asked. The
    second call passes that question back together with the note and asks for
    a short doctor's response.

    Args:
        note: Text of the medical note.
        model: Model name forwarded to ``ollama_request`` for both calls.

    Returns:
        A ``(question, answer)`` tuple holding the two model responses.
    """
    q_prompt = f'MEDICAL NOTE: """\n{note}""" \nBased on the given medical note, what is be the single most probably inquiry or question the patient asked to the doctor? '
    q = ollama_request(q_prompt, model)

    ans_prompt = f'PATIENT QUESTION:  """\n{q}""" \n MEDICAL NOTE: """\n{note}""" \nBased on the given medical note and patient question, construct a concise and terse paragraph of a top professional doctors response in 3 to 4 sentences. '
    # Pass ``model`` here as well; this call previously omitted it, so the
    # answer was always generated with the default model.
    ans = ollama_request(ans_prompt, model)

    return q, ans


def obtain_qa_single_run(note, model="hermes"):
    """Ask the model for one JSON question/answer pair in a single call.

    Args:
        note: Text of the medical note embedded in the prompt.
        model: Model name forwarded to ``ollama_request``.

    Returns:
        Whatever ``ollama_request`` returns for the assembled prompt.
    """
    # The instruction text is fixed; only the note varies between calls.
    instruction = (
        "Based on the given medical note, construct one `Question` and `Answer` pair "
        "between the patient and the doctor in JSON format with exactly one pair of "
        "`Question` and `Answer`. "
        "The patients question includes clear and detailed description of the problem "
        "relevant in the medical note. "
        "The doctors answer is in first person perspective, and includes reasoning and "
        "details, such as sympotoms, diagnosis, inference, suggestions, medications. "
        "Both questions and answers should be concise, straight to the point and "
        "highly medically relevant. "
    )
    return ollama_request(f'MEDICAL NOTE: """\n{note}""" \n' + instruction, model)


# Pick one note at random. ``random.randint`` includes its upper bound, so the
# earlier ``randint(0, n)`` could return ``n`` and make ``df.iloc`` raise
# IndexError; ``randrange(n)`` only yields valid row positions 0..n-1.
r = random.randrange(n)

sample_note = df.iloc[r].notes

# q, ans = obtain_qa(sample_note)
# print(f"==== question: {q}")
# print(f"==== answer: {ans}")

print(f"\n\n {obtain_qa_single_run(sample_note, 'neural')}")

# PubMedQA Conversation Construction

Contains the transformation code for both the ChatML and ShareGPT formats.

Importantly, using chat datasets in the `sharegpt` format requires modifying the system prompt in the axolotl source file `src/axolotl/prompt_strategies/sharegpt.py` before training.

In [None]:
from datasets import load_dataset
from pprint import pprint
from transformers import AutoTokenizer

# Load the tokenizer for teknium/OpenHermes-2.5-Mistral-7B.
tokenizer = AutoTokenizer.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B")

# Take the "train" split of PubMedQA ("pqa_artificial"), flatten the nested
# fields into dotted column names, drop the columns that are not needed, and
# give the context passages a plain column name.
dataset = (
    load_dataset("pubmed_qa", "pqa_artificial")["train"]
    .flatten()
    .remove_columns(["pubid", "context.labels", "context.meshes"])
    .rename_column(
        original_column_name="context.contexts", new_column_name="contexts"
    )
)

# Axolotl configurations:
# === alpaca_w_system.load_open_orca_chatml: 
# {"system_prompt": "...", "question": "...", "response": "..."} - 
# === sharegpt: 
# {"conversations": [{"from": "...", "value": "..."}]} - sharegpt. 
# Using the sharegpt format requires manually modifying the system prompt in:
# src/axolotl/prompt_strategies/sharegpt.py

def concat_contexts(row, *, include_system=False):
    """Build a sharegpt-style ``conversations`` list for one PubMedQA row.

    The user turn joins the row's context passages with single spaces and
    appends the question. The assistant turn is the final decision followed
    by the long answer.

    Args:
        row: Mapping with ``contexts`` (iterable of str), ``question``,
            ``final_decision`` and ``long_answer`` (str) entries.
        include_system: Keyword-only. When true, a system turn carrying the
            task instruction is placed before the user turn. Defaults to
            ``False``, which produces only the user and assistant turns.

    Returns:
        The same ``row`` object, with a ``conversations`` entry added.
    """
    user = "Context: " + " ".join(row["contexts"]) + " Question: " + row["question"]
    assistant = row["final_decision"] + ". " + row["long_answer"]
    chat = [
        {"from": "user", "value": user},
        {"from": "assistant", "value": assistant},
    ]
    if include_system:
        # The instruction used to be assigned unconditionally but never used;
        # it is now built only when the caller opts in.
        system = "As an expert doctor in clinical science and medical knowledge, can you tell me if the following question is correct, given the accompanying context? Answer yes, no, or maybe. Then, follow up with some explanations."
        chat.insert(0, {"from": "system", "value": system})
    row["conversations"] = chat
    return row


# Add the `conversations` entry to every row using 12 worker processes, then
# drop the source columns it was built from.
dataset = dataset.map(concat_contexts, num_proc=12).remove_columns(
    ["contexts", "question", "final_decision", "long_answer"]
)
# dataset = dataset.add_column(name="context", column=contexts)

# dataset.push_to_hub("Medilora/PubMedQA-ShareGPT", private=True)

# Check chat template format and tokenization

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B")

# The conversation-building cell stores each chat under "conversations" as
# sharegpt-style {"from", "value"} turns; it never creates a "chat" column, so
# the earlier dataset["chat"] lookup could not succeed on that dataset.
sample_conversation = dataset[2]["conversations"]

# Chat templates read {"role", "content"} messages, so rename the keys before
# applying the template (see the transformers chat-templating documentation).
sample_chat = [
    {"role": turn["from"], "content": turn["value"]} for turn in sample_conversation
]

tokenized_chat = tokenizer.apply_chat_template(sample_chat, tokenize=True, return_tensors="pt")
# Decode the token ids again to inspect the exact text the template produced.
print(tokenizer.decode(tokenized_chat[0]))