### Next Steps
- Use the LLM to generate a message for the user

In [1]:
import json
import os
import re

import pandas as pd
from dotenv import load_dotenv

import google.genai as genai

Get API key from .env

In [2]:
# Read the .env file into the process environment; the return value is not needed.
_ = load_dotenv()
# Key handed to the Gemini client below; None when API_KEY is not defined.
api_key = os.environ.get("API_KEY")

### One-shot learning text example
This approach gives decent results.

In [3]:
# Column labels of the output table, in the same order as the 24 fields
# enumerated in the prompts further down.
expected_cols = (
    "age bp sg al su rbc pc pcc "
    "ba bgr bu sc sod pot hemo pcv "
    "wc rc htn dm cad appet pe ane"
).split()

In [4]:
def response_to_df(text, columns=None):
    """Turn an LLM key-value response into a one-row DataFrame.

    Two response shapes are handled:

    * JSON-like text, optionally wrapped in a Markdown code fence. Common
      slips (missing braces, unquoted keys, unquoted yes/no/true/false,
      missing commas between lines) are patched before parsing.
    * Plain ``key: value`` lines. Blank lines are skipped, keys and values
      are stripped of surrounding whitespace, and values stay strings.

    Args:
        text: Raw response text.
        columns: Column labels for the result. Defaults to the module-level
            ``expected_cols``.

    Returns:
        A one-row ``DataFrame`` reindexed to ``columns`` (keys that are absent
        from the response become NaN), or ``None`` -- after printing the
        offending content -- when the response cannot be parsed.
    """
    if columns is None:
        columns = expected_cols

    # Leading/trailing whitespace would otherwise defeat the fence removal below.
    cleaned = text.strip()

    # Route to the JSON parser on an explicit "json" tag, a code fence, or a
    # leading brace; everything else is treated as plain "key: value" lines.
    if 'json' in cleaned or cleaned.startswith(('```', '{')):
        fixed_str = (
            cleaned.removeprefix('```json')
            .removeprefix('```')
            .removesuffix('```')
            .strip()
        )
        if '{' not in fixed_str:
            fixed_str = '{' + fixed_str
        # Only close the object when it is actually left open; a single-line
        # object such as {"age": 45} must not get a second closing brace.
        if not fixed_str.endswith('}'):
            fixed_str += '\n}'

        fixed_str = re.sub(r'(\b(?:yes|no|true|false)\b)(?=\s*[\n,])', r'"\1"', fixed_str) # add double quotes around non-numeric values
        fixed_str = re.sub(r'([{,]?\s*)(\w+)(\s*:)', r'\1"\2"\3', fixed_str) # add double quotes around keys if necessary
        fixed_str = re.sub(r'(?<=[\d"])\s*\n(?=\s*"\w+":)', ',\n', fixed_str) # add commas to end of line (except last) if necessary

        try:
            data = json.loads(fixed_str)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            print(fixed_str)
            return None

        # A JSON array or scalar cannot be mapped onto named columns.
        if not isinstance(data, dict):
            print(data)
            return None
    else:
        data = {}
        for line in cleaned.splitlines():
            if not line.strip():
                continue
            # Split on the first colon only, so values may contain colons.
            key, sep, value = line.partition(':')
            if not sep:
                print(text)
                return None
            data[key.strip()] = value.strip()

    try:
        df = pd.DataFrame([data]).reindex(columns=columns)
    except (ValueError, TypeError):
        print(data)
        return None

    return df

In [5]:
def get_pairs_text(text, model="gemini-2.0-flash"):
    """Ask Gemini to extract key-value pairs from a free-text description.

    Builds a one-shot prompt (the 24 field definitions, one worked example,
    then ``text``) and sends it with ``client.models.generate_content``. The
    client is created from the module-level ``api_key``.

    Args:
        text: Free-text message to analyse; interpolated into the prompt as-is.
        model: Model name passed to ``generate_content``. Defaults to the
            name that used to be hard-coded here.

    Returns:
        ``response.text`` of the API response.
    """
    example = """
    Example 1:
    Text: "I'm 43 years old, and my recent blood tests showed a low specific gravity. I have been feeling quite fatigued, and my doctor mentioned I might have anemia. My blood pressure is high, and I have a poor appetite."

    Key-value pairs:
    age: 43
    appet: poor
    ane: yes
    htn: yes
    """

    # The opening sentence mirrors the prompt used by get_pairs_image, so the
    # model is told what the numbered list is before it sees it.
    prompt = f"""
    Here are the keys, what they represent, and their possible values.

 	1. Age (numerical): age in years
 	2. Blood Pressure (numerical): bp in mm/Hg
 	3. Specific Gravity (nominal): sg - (1.005,1.010,1.015,1.020,1.025)
 	4. Albumin (nominal): al - (0,1,2,3,4,5)
 	5. Sugar (nominal): su - (0,1,2,3,4,5)
 	6. Red Blood Cells (nominal): rbc - (normal,abnormal)
 	7. Pus Cell (nominal): pc - (normal,abnormal)
 	8. Pus Cell clumps (nominal): pcc - (present,notpresent)
 	9. Bacteria (nominal): ba - (present,notpresent)
 	10. Blood Glucose Random (numerical): bgr in mgs/dl
 	11. Blood Urea (numerical): bu in mgs/dl
 	12. Serum Creatinine (numerical): sc in mgs/dl
 	13. Sodium (numerical): sod in mEq/L
 	14. Potassium (numerical): pot in mEq/L
 	15. Hemoglobin (numerical): hemo in gms
 	16. Packed Cell Volume (numerical): pcv in %
 	17. White Blood Cell Count (numerical): wc in cells/cumm
 	18. Red Blood Cell Count (numerical): rc in millions/cmm
 	19. Hypertension (nominal): htn - (yes,no)
 	20. Diabetes Mellitus (nominal): dm - (yes,no)
 	21. Coronary Artery Disease (nominal): cad - (yes,no)
 	22. Appetite (nominal): appet - (good,poor)
 	23. Pedal Edema (nominal): pe - (yes,no)	
 	24. Anemia (nominal): ane - (yes,no)

    I'll provide you with an example of generating key-value pairs, then ask you to do the same for a new text.
    
    {example}

    Now, please analyze the following text and extract the key-value pairs:
    Text: {text}

    Key-value pairs:
    """

    client = genai.Client(api_key=api_key)
    response = client.models.generate_content(model=model, contents=prompt)
    return response.text


In [6]:
# Run the one-shot text prompt on a sample patient message.
text = get_pairs_text(
    text="I (45M) am tired recently and cannot eat as much. My doctor said my red blood cell count is low, and that I have anemia. My head hurts a lot and my blood pressure is high, what do I do?"
)

In [7]:
# Parse the raw reply for the text sample into a one-row frame.
text_df = response_to_df(text=text)

### One-shot learning image
Definitely getting somewhere...

In [8]:
def get_pairs_image(path, report_type, model="gemini-2.5-pro-exp-03-25"):
    """Ask Gemini to extract key-value pairs from an uploaded lab-report file.

    Uploads the file at ``path`` with ``client.files.upload`` and sends it
    together with a one-shot prompt (the 24 field definitions plus a textual
    worked example) through ``client.models.generate_content``. The client is
    created from the module-level ``api_key``.

    Args:
        path: Path of the file to upload.
        report_type: Human-readable report name; interpolated into the prompt.
        model: Model name passed to ``generate_content``. Defaults to the
            name that used to be hard-coded here, so a different model can be
            selected without editing the function.

    Returns:
        ``response.text`` of the API response.
    """
    example = """
    Example:
    Text: "I'm 43 years old, and my recent blood tests showed a specific gravity of 1.010. I have been feeling quite fatigued, and my doctor mentioned I might have anemia. My blood pressure is high, and I have a poor appetite."

    Key-value pairs:
    age: 43
    sg: 1.010
    appet: poor
    ane: yes
    htn: yes
    """

    # The final instruction says "attached file" rather than "pdf": the
    # function uploads whatever ``path`` points at, and the callers in this
    # notebook pass PNG/JPG images.
    prompt = f"""
    Here are the keys, what they represent, and their possible values.

 	1. Age (numerical): age in years
 	2. Blood Pressure (numerical): bp in mm/Hg
 	3. Specific Gravity (nominal): sg - (1.005,1.010,1.015,1.020,1.025)
 	4. Albumin (nominal): al - (0,1,2,3,4,5)
 	5. Sugar (nominal): su - (0,1,2,3,4,5)
 	6. Red Blood Cells (nominal): rbc - (normal,abnormal)
 	7. Pus Cell (nominal): pc - (normal,abnormal)
 	8. Pus Cell clumps (nominal): pcc - (present,notpresent)
 	9. Bacteria (nominal): ba - (present,notpresent)
 	10. Blood Glucose Random (numerical): bgr in mgs/dl
 	11. Blood Urea (numerical): bu in mgs/dl
 	12. Serum Creatinine (numerical): sc in mgs/dl
 	13. Sodium (numerical): sod in mEq/L
 	14. Potassium (numerical): pot in mEq/L
 	15. Hemoglobin (numerical): hemo in gms
 	16. Packed Cell Volume (numerical): pcv in %
 	17. White Blood Cell Count (numerical): wc in cells/cumm
 	18. Red Blood Cell Count (numerical): rc in millions/cmm
 	19. Hypertension (nominal): htn - (yes,no)
 	20. Diabetes Mellitus (nominal): dm - (yes,no)
 	21. Coronary Artery Disease (nominal): cad - (yes,no)
 	22. Appetite (nominal): appet - (good,poor)
 	23. Pedal Edema (nominal): pe - (yes,no)	
 	24. Anemia (nominal): ane - (yes,no)

    I'll provide you with an textual example of generating key-value pairs, then ask you to do the same for a {report_type} report.
    
    {example}

    Now, please extract the text from the attached file, then generate only the key-value pairs.

    Key-value pairs:
    """

    client = genai.Client(api_key=api_key)
    myfile = client.files.upload(file=path)
    response = client.models.generate_content(
        model=model, contents=[myfile, prompt]
    )
    return response.text

In [9]:
# Query the model with the sample CMP image.
cmp = get_pairs_image(
    path="llm_src/example_cmp.png",
    report_type="Comprehensive Metabolic Panel",
)

In [10]:
# Parse the raw reply for the CMP image into a one-row frame.
cmp_df = response_to_df(text=cmp)

In [11]:
# Query the model with the first sample CBC image.
cbc = get_pairs_image(
    path="llm_src/example_cbc.png",
    report_type="Complete Blood Count",
)

In [12]:
# Parse the raw reply for the first CBC image into a one-row frame.
cbc_df = response_to_df(text=cbc)

In [13]:
# Second CBC sample. The report type is interpolated into the prompt, so it
# has to name the kind of report in the file being uploaded.
cbc_2 = get_pairs_image("llm_src/example_cbc_2.png", "Complete Blood Count")

In [14]:
# Parse the raw reply for the second CBC image into a one-row frame.
cbc_2_df = response_to_df(text=cbc_2)

In [15]:
# Image containing several lab reports at once. Only `combined` is bound here,
# so the CBC response held in `cbc` is not overwritten.
combined = get_pairs_image("llm_src/example_combined.jpg", "Multiple Lab Reports")

In [16]:
# Parse the raw reply for the combined-report image into a one-row frame.
combined_df = response_to_df(text=combined)

In [17]:
# Empty target table carrying the expected column layout.
multi_test_df = pd.DataFrame(columns=expected_cols)

# Stack the parsed responses, one row per source, in priority order.
all_df = pd.concat(
    [text_df, cmp_df, cbc_df, cbc_2_df, combined_df],
    ignore_index=True,
)

# Back-filling along the rows lifts, for every column, the first non-missing
# value into row 0; that row becomes the merged record.
result_row = all_df.bfill(axis="index").iloc[0]

multi_test_df.loc[0] = result_row

In [18]:
# Write the merged single-row table to disk, leaving the row index out.
multi_test_df.to_csv(path_or_buf='llm_src/multi_test_values.csv', index=False)