In [1]:
import ast

import groq
import pdfplumber

In [6]:
from groq import Groq

In [99]:
import os
from dotenv import load_dotenv

In [100]:
# Load variables from a local .env file into the process environment so the
# API key lookup in a later cell can find them.
load_dotenv()

True

In [101]:
# Shared Groq API client used by every model call in this notebook.
# The key is read from the GROQ environment variable; a missing variable
# raises KeyError here rather than failing later on the first request.
client = Groq(
    api_key=os.environ["GROQ"],
)

In [92]:
# System prompts sent to the model, keyed by task.
# NOTE: the prompts ask for Python-style `None` for missing values because the
# notebook evaluates the model reply as a Python expression rather than
# parsing it as strict JSON (where the token would be `null`).
prompts={
    # Classifies a report as "blood_report" or "other".
    "type_prompt":'''
you are a medical report classifier.
you will be given a medical report in text and you have to classify it in blood_report or other.
if there are values of hemoglobin,rbc count wbc count,platelet count bilirubin etc then it is a blood report else other.
Give me the result in Json format with key as "type" with no preamble and no nested json.
''',
    # Extracts report metadata, individual blood measurements and a summary.
    'blood_prompt':'''
you are a medical report parser.
Give me the output in json format strictly with no preamble.
If you dont get any value assign it None
Extract the following fields
    name of the doctor with key as "doctor_name"
    date of report with key as "date_of_report" with dd/mm/yy format
    date of collection with key as "date_of_collection" with dd/mm/yy format
    hemoglobin with key as "hemoglobin"
    rbc count with key as "rbc_count"
    wbc count with key as "wbc_count"
    pcv with key as "pcv"
    iron with key as "iron"
    sodium with key "sodium"
    pottasium with key "potassium"
    phosphorus with key "phosphorus"
    chloride with key "chloride"
    platelet count with key as "platelet_count"
    bilirubin total with key as "bilirubin_total"
    bilirubin direct with key as "bilirubin_direct"
    bilirubin indirect with key as "bilirubin_indirect"
    proteins with key as "proteins"
    calcium with key as "calcium"
    albumin with key as "albumin"
    Globulin with key as "globulin"
    Blood Urea with the key "blood_urea"
    Blood Urea Nitrogen with the key "blood_urea_nitrogen"
    S. Creatinine with the key "s_creatinine"
    S. Uric Acid with the key "s_uric_acid"
    S. Phosphorus with the key "s_phosphorus"
    Neutrophils with the key "neutrophils".
    Lymphocytes with the key "lymphocytes".
    Sr. Cholesterol with the key "sr_cholesterol".
    HDL Cholesterol with the key "hdl_cholesterol".
    fasting sugar with key "fasting_sugar"
    after lunch sugar with key "after_lunch_sugar"
Summarize the medical report, focusing on any values that are abnormal or outside the normal range with key "summary"
Strictly give me in proper json format with no nested json and  with no preamble.
''' ,
    # Metadata + summary for non-blood reports. The collection-date key is
    # "date_of_collection" (singular) so both report types share one schema.
    "other_prompt":'''
you are a medical report summarizer and entity extractor.
Give me the following details in json format
    name of the doctor with key as "doctor_name"
    date of report with key as "date_of_report" with dd/mm/yy format
    date of collection with key as "date_of_collection" with dd/mm/yy format
    summary the report with key as "summary".
Only give me json with no preamble
'''
}

In [15]:
def get_model_response(prompt, text, model="llama3-70b-8192"):
    """Send one system/user exchange to the chat API and return the reply text.

    Args:
        prompt: Instruction sent as the ``system`` message.
        text: Report text sent as the ``user`` message.
        model: Identifier of the chat model to query. Defaults to the model
            this notebook was originally written against, so existing
            two-argument calls behave exactly as before.

    Returns:
        The ``content`` of the first choice's message, returned unparsed;
        callers are responsible for decoding it.
    """
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": text},
        ],
        model=model,
    )
    return chat_completion.choices[0].message.content

In [33]:
# Open the report to analyse and keep its page list for the extraction cells.
# NOTE(review): the PDF handle is never closed in the visible cells; consider
# calling pdf.close() once all page text has been extracted.
pdf=pdfplumber.open("test2.pdf")
pages=pdf.pages

In [34]:
# Text of the first two pages only; this is what the report-type classifier sees.
# `extract_text()` can return None for a page with no extractable text on older
# pdfplumber releases, so fall back to "" instead of raising TypeError.
text = "".join(page.extract_text() or "" for page in pages[:2])

In [35]:
# SECURITY: the model reply is derived from an untrusted PDF, so it must never
# be passed to eval(). ast.literal_eval accepts the same dict/str/number/None
# literals but cannot execute code; a malformed reply raises
# ValueError/SyntaxError instead.
type_of_report = ast.literal_eval(
    get_model_response(prompts['type_prompt'], text=text)
)['type']

In [37]:
# Full report text for field extraction. It is rebuilt from every page rather
# than appended to the two-page prefix, so re-running this cell cannot
# duplicate pages; the result equals "first two pages + the rest".
# `or ""` guards against extract_text() returning None for a text-less page.
text = "".join(page.extract_text() or "" for page in pages)

In [25]:
# Non-blood reports: extract only the metadata and a free-text summary.
if type_of_report == "other":
    # SECURITY: the reply originates from untrusted document text, so it is
    # decoded with ast.literal_eval (literals only) instead of eval(), which
    # would execute arbitrary code embedded in the reply.
    data = ast.literal_eval(
        get_model_response(prompts['other_prompt'], text)
    )
    print(data)

{'doctor_name': 'Dr. A.A. Chaudhary', 'date_of_report': '16/11/22', 'date_of_collections': '15/11/22', 'summary': 'The malaria parasites test result is not detected using the thick smear method. However, it does not rule out malaria, and smears may be required 2-3 times in few cases.'}


In [36]:
type_of_report

'blood_report'

In [38]:
len(text)

2761

In [102]:
# Blood reports: extract metadata, every measurement and the summary.
# SECURITY: the reply originates from untrusted document text, so it is decoded
# with ast.literal_eval (literals only, including None) instead of eval(),
# which would execute arbitrary code embedded in the reply.
data = ast.literal_eval(
    get_model_response(prompts['blood_prompt'], text)
)

In [94]:
# Reference table for every measurement the blood prompt extracts.
# Each entry holds the extracted "value" (-1 means "not found in the report"),
# the reference "min"/"max" and the "unit" those bounds are expressed in.
# NOTE(review): wbc_count bounds are in thousands per μL, but the sample output
# in this notebook shows a raw count (9700) stored as the value — confirm
# which scale is intended before comparing values against the bounds.
data_template = {
    analyte: {"value": -1, "min": lower, "max": upper, "unit": unit}
    for analyte, lower, upper, unit in (
        ("hemoglobin", 12.1, 15.5, "g/dL"),
        ("rbc_count", 4.2, 5.4, "million cells/μL"),
        ("wbc_count", 4.5, 11.0, "thousand cells/μL"),
        ("pcv", 36, 50, "%"),
        ("iron", 60, 170, "μg/dL"),
        ("sodium", 135, 145, "mmol/L"),
        ("potassium", 3.5, 5.0, "mmol/L"),
        ("phosphorus", 2.5, 4.5, "mg/dL"),
        ("chloride", 98, 107, "mmol/L"),
        ("platelet_count", 150000, 450000, "cells/μL"),
        ("bilirubin_total", 0.1, 1.2, "mg/dL"),
        ("bilirubin_direct", 0.0, 0.3, "mg/dL"),
        ("bilirubin_indirect", 0.1, 0.8, "mg/dL"),
        ("proteins", 6.0, 8.0, "g/dL"),
        ("calcium", 8.5, 10.2, "mg/dL"),
        ("albumin", 3.5, 5.0, "g/dL"),
        ("globulin", 2.0, 4.0, "g/dL"),
        ("blood_urea", 7, 20, "mg/dL"),
        ("blood_urea_nitrogen", 7, 20, "mg/dL"),
        ("s_creatinine", 0.6, 1.2, "mg/dL"),
        ("s_uric_acid", 3.5, 7.2, "mg/dL"),
        ("s_phosphorus", 2.5, 4.5, "mg/dL"),
        ("neutrophils", 40, 75, "%"),
        ("lymphocytes", 20, 45, "%"),
        ("sr_cholesterol", 0, 200, "mg/dL"),
        ("hdl_cholesterol", 40, 60, "mg/dL"),
        ("fasting_sugar", 70, 100, "mg/dL"),
        ("after_lunch_sugar", 70, 140, "mg/dL"),
    )
}



In [95]:
# Keys of the parsed report that are metadata rather than measurements;
# they have no entry in the reference-range table.
sett = set(
    ["doctor_name", "date_of_report", "date_of_collection", "summary"]
)

In [96]:
# Copy each extracted measurement into the reference-range table.
# Iterating over the table (not over the model reply) means:
#   * metadata or unexpected keys in the reply are ignored instead of
#     raising KeyError;
#   * every entry is rewritten on each run, so values from a previously
#     processed report cannot linger.
# Only a missing/None reading becomes the -1 sentinel; a genuine 0 or 0.0
# result is preserved (a plain truthiness test would discard it).
for analyte, entry in data_template.items():
    extracted = data.get(analyte)
    entry['value'] = extracted if extracted is not None else -1

In [97]:
data_template

{'hemoglobin': {'value': 12.1, 'min': 12.1, 'max': 15.5, 'unit': 'g/dL'},
 'rbc_count': {'value': 4.72,
  'min': 4.2,
  'max': 5.4,
  'unit': 'million cells/μL'},
 'wbc_count': {'value': 9700,
  'min': 4.5,
  'max': 11.0,
  'unit': 'thousand cells/μL'},
 'pcv': {'value': 38, 'min': 36, 'max': 50, 'unit': '%'},
 'iron': {'value': -1, 'min': 60, 'max': 170, 'unit': 'μg/dL'},
 'sodium': {'value': -1, 'min': 135, 'max': 145, 'unit': 'mmol/L'},
 'potassium': {'value': -1, 'min': 3.5, 'max': 5.0, 'unit': 'mmol/L'},
 'phosphorus': {'value': 3.84, 'min': 2.5, 'max': 4.5, 'unit': 'mg/dL'},
 'chloride': {'value': -1, 'min': 98, 'max': 107, 'unit': 'mmol/L'},
 'platelet_count': {'value': 219000,
  'min': 150000,
  'max': 450000,
  'unit': 'cells/μL'},
 'bilirubin_total': {'value': 0.57, 'min': 0.1, 'max': 1.2, 'unit': 'mg/dL'},
 'bilirubin_direct': {'value': 0.19, 'min': 0.0, 'max': 0.3, 'unit': 'mg/dL'},
 'bilirubin_indirect': {'value': 0.38,
  'min': 0.1,
  'max': 0.8,
  'unit': 'mg/dL'},
 'pro

In [103]:
data

{'doctor_name': 'Dr.Sayyed Manazir Hasan',
 'date_of_report': '03-Aug-2024',
 'date_of_collection': '02-Aug-2024',
 'hemoglobin': 12.1,
 'rbc_count': 4.72,
 'wbc_count': 9700,
 'pcv': 38,
 'iron': None,
 'sodium': None,
 'potassium': None,
 'phosphorus': 3.84,
 'chloride': None,
 'platelet_count': 219000,
 'bilirubin_total': 0.57,
 'bilirubin_direct': 0.19,
 'bilirubin_indirect': 0.38,
 'proteins': 6.18,
 'calcium': 8.19,
 'albumin': 3.16,
 'globulin': 3.02,
 'blood_urea': 21,
 'blood_urea_nitrogen': 9.81,
 's_creatinine': 0.98,
 's_uric_acid': 4.25,
 's_phosphorus': 3.84,
 'neutrophils': 91,
 'lymphocytes': 6,
 'sr_cholesterol': None,
 'hdl_cholesterol': None,
 'fasting_sugar': None,
 'after_lunch_sugar': None,
 'summary': 'Abnormal values include Hemoglobin which is below the normal range, RBC Count is lower than normal range, RBC Morphology shows Hypochromia and Mild Microcytosis. Platelet Count is normal. Neutrophils percentage is higher than normal range.'}