In [None]:
pip install torch pymupdf torch protobuf sentencepiece accelerate

In [None]:
import fitz  # PyMuPDF
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import login

In [None]:
# Authenticate with the Hugging Face Hub. No token literal is passed, so
# login() asks for it interactively and no credential is saved in the notebook.
login()

In [None]:
# Hugging Face checkpoint used for the extraction task.
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# Fast tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# Load the causal LM in float16 and let device_map="auto" decide placement.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)


In [None]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
)


In [None]:
def extract_data(path: str) -> list[str]:
    """Return the text of every page in the PDF at ``path``.

    Args:
        path: Location of the PDF file to open.

    Returns:
        One string per page, in document order.
    """
    # Open the document as a context manager so it is closed once the page
    # texts have been collected, even if extraction raises part-way through.
    with fitz.open(path) as doc:
        return [page.get_text() for page in doc]

In [None]:
# Location of the sample pathology report to parse.
pdf_path = "/content/sterling-accuris-pathology-sample-report-unlocked.pdf"

# One text string per page of the PDF.
pages = extract_data(pdf_path)

# Collects the parsed result for each page.
all_structured_data = dict()

In [None]:
# Ask the model to turn each page's raw text into a JSON object and collect
# the parsed results in `all_structured_data`, keyed by "Page N".
for i, page in enumerate(pages):
    prompt = f"""
You are a medical-report parser. I will provide you with raw text from one page of a pathology report.

Your task is to extract and return a single valid JSON object with the following structure:

1. Patient Information:
    - patient_name: string
    - hospital_name: string
    - doctor_names: list of strings
    - sex: string
    - lab_id: string
    - age: integer
    - dob: string (YYYY-MM-DD or null)

2. Sections of the report (e.g., "Complete Blood Count"):
    - Each section’s value must be a list of test records with:
        - name: string
        - method: string
        - value: number
        - unit: string
        - ref_low: number or null
        - ref_high: number or null

Only emit a valid JSON object. No markdown or extra commentary.

Here is the raw page text:
{page}
"""

    try:
        # return_full_text=False keeps the prompt (and with it the raw page
        # text) out of the output, so the brace search below can only match
        # text the model generated.
        result = generator(
            prompt,
            max_new_tokens=1024,
            do_sample=False,
            return_full_text=False,
        )
        output_text = result[0]['generated_text']

        # Extract the JSON part if there's extra text
        start = output_text.find('{')
        end = output_text.rfind('}') + 1
        if start == -1 or end <= start:
            # Without this check the slice is empty and json.loads reports an
            # unhelpful "Expecting value" error.
            raise ValueError("no JSON object found in model output")
        json_text = output_text[start:end]

        structured_data = json.loads(json_text)
        all_structured_data[f"Page {i+1}"] = structured_data
        print(f"Page {i+1} processed successfully.")

    except Exception as e:
        # Best-effort per page: report the failure and move on to the next one.
        print(f"Error processing Page {i+1}: {e}")

In [None]:
# Persist the collected per-page results as JSON indented by four spaces.
with open("structured_medical_data.json", "w") as out_file:
    out_file.write(json.dumps(all_structured_data, indent=4))

In [None]:
# Debug cell: run the extraction prompt on the first page only and print the
# JSON-looking slice of the model output without parsing it.
prompt = f"""
You are a medical-report parser. I will provide you with raw text from one page of a pathology report.

Your task is to extract and return a single valid JSON object with the following structure:

1. Patient Information:
    - patient_name: string
    - hospital_name: string
    - doctor_names: list of strings
    - sex: string
    - lab_id: string
    - age: integer
    - dob: string (YYYY-MM-DD or null)

2. Sections of the report (e.g., "Complete Blood Count"):
    - Each section’s value must be a list of test records with:
        - name: string
        - method: string
        - value: number
        - unit: string
        - ref_low: number or null
        - ref_high: number or null

Only emit a valid JSON object. No markdown or extra commentary.

Here is the raw page text:
{pages[0]}
"""
# return_full_text=False keeps the prompt out of the output, so the brace
# search below only sees text the model generated.
result = generator(
    prompt,
    max_new_tokens=1024,
    do_sample=False,
    return_full_text=False,
)
output_text = result[0]['generated_text']

# Slice from the first '{' to the last '}' of the output.
start = output_text.find('{')
end = output_text.rfind('}') + 1
json_text = output_text[start:end]
# The variable is `json_text`; the previous `json_texts` raised NameError.
print(json_text)


In [None]:
# Show the raw JSON slice from the single-page debug run as the cell output.
json_text