# Questions

1. About the file itself
- Can I have an example file?
- Need to extract the date & time from line ending in #888# -> Will there be more than one line that ends with #888#? Contains #888#? #888# evaluate? On that line, will there be more than one date? 
   - Communication date
   - Within the hour of report creation
- Will the date format always be the same?
- Col E, F and G are calculations for time elapses from time on Col C to time in Col D -> So E, F, G will be the same? What is the desired date format? What will the names of each column?

2. About the infrastructure
- What's the frequency of this task? How often do you want this same processing?
- What we are planning is to have a shared drive, where you put the files that need processing in there, and we will run the program on those files, and put the processed files into the same shared drive.
- What this means is that there needs to be a person who regularly uploads the files that need processing into the shared drive.
- Naming convention for the files that need processing might be necessary.

In [25]:
import pandas as pd
import re
import datetime
import math
from openpyxl import load_workbook, styles
from openpyxl.utils.dataframe import dataframe_to_rows
from dateutil.parser import parse, ParserError
from pathlib import Path

In [26]:
last_month = (datetime.date.today().replace(day=1) - datetime.timedelta(days=1)).strftime("%B %Y")
file_name = f'Critical Results - {last_month}.xlsx'
file_dir = Path('/Volumes/RadReportsAdmin/CR Folder')
file_path = file_dir / file_name

assert file_path.exists()

In [27]:
multiple_888 = set()
no_888 = set()
no_date = set()
weird_time_format = set()
pattern_888 = re.compile(".*#888.*")


In [28]:
def parse_helper(line: str):
    """
    Parse `line` for datetime. Handle errors:
    - Line with datetime, but raise error due to having numbers/month names 
    that precede the datetime substring.

    If no datetime substring is found, return None.

    Algorithm:
    -------
       Slice the start of `line` off until a datetime has been found, or 
       all sliced iterations have been processed.

    Inputs:
    --------
    line (str): 
        The line with (potential) datetime.
    """
    # 1. Slice line from start -> end
    # Substring needs at least 8 characters to represent a datetime
    n = len(line)
    print('-----Parsing')

    for i in range(n):
        substring = line[i:]
        try:
            critical_result_date = parse(substring, ignoretz=True, fuzzy=True)
            if (critical_result_date > datetime.datetime.now()) or (critical_result_date < datetime.datetime(2000, 1, 1)):
                continue
            print(substring)
            return critical_result_date
        
        except ParserError:
            continue

    return None


In [29]:
def check_valid_datetime(report_created: datetime.datetime, critical_result_datetime: datetime.datetime):
    """
    Check if `critical_result_datetime` is within reasonable bounds by
    comparing with `report_created`.

    Inputs:
    --------
    report_created (datetime): 
        Datetime of when Report was created.

    critical_result_datetime (datetime): 
        Potential datetime of Critical Result
    """
    irregular_text = []
      
    critical_result_time = f'{critical_result_datetime.hour}:{critical_result_datetime.minute}:{critical_result_datetime.second}'
    print('Before parsing datetime\t', critical_result_datetime)
    # Time likely not found
    if critical_result_time == '0:0:0':
        irregular_text.append('Time likely not parsed')

    critical_result_date = critical_result_datetime.date()

    # Date likely not found
    if critical_result_date == datetime.datetime.now().date():
        irregular_text.append('Date taken from Report Created')

        # Assign the same date as report created's date
        critical_result_datetime = critical_result_datetime.replace(month=report_created.month, day=report_created.day, year=report_created.year)

    # Date difference is more than 1
    days_diff = (critical_result_datetime - report_created).days
    
    if abs(days_diff) > 1:
        irregular_text.append('Big difference in days, date taken from Report Created')
        critical_result_datetime = critical_result_datetime.replace(month=report_created.month, day=report_created.day, year=report_created.year)

    # Minutes difference is big
    minutes_diff = (critical_result_datetime - report_created).total_seconds() / 60.0
    if abs(minutes_diff) > 200 and abs(days_diff) <= 1:
        print(f'Big diff in minutes\t {abs(minutes_diff)}')
        irregular_text.append("Big difference in minutes")

    return (irregular_text, critical_result_datetime)

In [30]:
def process_critical_results(excel_path: Path):
   df = pd.read_excel(excel_path)
   df['Report Text'] = df['Report Text'].str.replace('_x000D', '')
   df['Called <\n31 minutes'] = 0
   df['Called < \n61 minutes'] = 0
   columns = list(df.columns)
   columns[7] = 'Note'
   df.columns = columns

   # Find line with #888#
   for index, row in df.iterrows():
      print('===================')
      report_text = row['Report Text'].strip().splitlines() # Row B
      report_id = row['Sort#']
      report_created = row['Report Created Date/Time'] # Row C

      print('ID:', report_id)
      print(f'Report created: \t {report_created}')

      line_with_date = ''
      critical_result_datetime = None

      # Find all lines with #888#
      found = []
      for line in report_text:
         if pattern_888.match(line):
            found.append(line)

      if not found:
         no_888.add(report_id)
         continue

      possible_critical_result_datetimes = []
      # Get the first row with a valid datetime
      for line_with_date in found:
         critical_result_datetime = parse_helper(line_with_date)
         irregularities_critical_result_datetime = check_valid_datetime(report_created, critical_result_datetime)
         possible_critical_result_datetimes.append(irregularities_critical_result_datetime)

      # If not found line with date
      if not possible_critical_result_datetimes:
         no_date.add(report_id)
         continue
      
      smallest_diff = float('inf')
      irregular_text = []

      # Assign critical result datetime as one with smallest diff with report_created
      for t in possible_critical_result_datetimes:
         if t[0]:
            print('---')
            print('Candidate time\t', t)
         possible_critical_result_datetime = t[1]
         diff = abs((possible_critical_result_datetime - report_created).total_seconds())
         print('diff\t', diff)
         if diff < smallest_diff:
            print('\tsmallest\t', smallest_diff)
            print('\tcurrent\t', diff)
            smallest_diff = diff
            critical_result_datetime = possible_critical_result_datetime
            irregular_text = t[0]


      # Assign relevant columns
      minutes_diff = (critical_result_datetime - report_created).total_seconds() / 60.0

      df.at[index, 'Note'] = ', '.join(irregular_text)
      df.at[index, 'CR called Date/Time'] = critical_result_datetime
      df.at[index, 'Minutes From\nCreated to Called'] = round(minutes_diff, 2)

      print('Fin CR datetime:\t', critical_result_datetime)

      if minutes_diff < 31:
         df.at[index, 'Called <\n31 minutes'] = 1
      if minutes_diff < 61:
         df.at[index, 'Called < \n61 minutes'] = 1

      
   # %Under 31 mins
   df.rename(
      columns={
         df.columns[9]: len(df[df['Called <\n31 minutes'] == 1]) / df.shape[0]
      }, 
      inplace=True
   ) 

   # %Under 61 minutes
   df.rename(
      columns={
         df.columns[11]: len(df[df['Called < \n61 minutes'] == 1]) / df.shape[0]
      }, 
      inplace=True
   ) 
   
   return df


In [31]:
text = """rEXAM: CT HEAD WO CONTRAST _
_
INDICATION:     Headache, sudden, severe_
_
TECHNIQUE: Axial thin section CT images of the head were obtained without contrast. Sagittal and coronal 2-D multiplanar reconstructions were performed at the scanner. _
_
COMPARISON: 2/1/2023_
_
FINDINGS:_
_
Adequate diagnostic quality._
_
Brain parenchyma, ventricles and extra axial spaces: Large volume multicompartment extra-axial hemorrhage, with predominantly hyperdense left subdural hemorrhage along the convexity measuring up to 7 mm in maximal thickness and causes an underlying local mass effect, more loculated appearing right subdural and possible parenchymal hemorrhage along the anterior right temporal lobe measuring up to 11 mm in maximal thickness (series 2 image 16). There is moderate to large volume left hemispheric subarachnoid hemorrhage seen along the frontal and parietal lobes to the vertex and layering within the left sylvian fissure. Small volume right subarachnoid hemorrhage is also seen in the sylvian fissure and right suprasellar cistern. Subdural hemorrhage from the left convexity layers along the posterior falx and left tentorial leaflet without significant mass effect. There is intracranial pneumocephalus related to fracture/defects along the cribriform plate as well as likely small bilateral inferior frontal lobe contusions (series 602 image 20). Small volume gas is also seen in the sella again likely related to sinus fracturing._
_
Orbits, paranasal sinuses, mastoids:     There is multifocal hyperdense opacification of the posterior paranasal sinuses, though the discrete fracture planes are difficult to visualize on nondedicated exam, fractures at least involves the bilateral sphenoid and posterior ethmoidal sinuses. There is significant stranding adjacent the left optic nerve but see foci of intraorbital gas (series 2 image 16) suggestive of nerve injury. Orbital fat bilaterally is grossly preserved. Few locules of extraconal gas are seen along the right medial aspect, however without blowout fracture or fracture defect visualized. There is a small amount of subcutaneous gas in the right preseptal space. No significant hemorrhage or crowding at the bilateral orbital apices. Bilateral mastoid air cells are clear..    _
_
Extracranial soft tissues: Normal._
_
Calvarium and skull base: No fracture or suspicious osseous lesion._
_
Other: CTA findings are reported separately, no discrete vascular injury identified on preliminary review._
_
IMPRESSION:_
1.	Bilateral subdural hemorrhage with local mass effect-on the right along the anterior temporal lobe measuring up to 11 mm, on the left along the convexity measuring up to 7 mm and extending to the posterior falx and tentorial leaflet. No significant midline shift._
2.	Moderate to large volume left hemispheric subarachnoid hemorrhage and small volume right hemispheric subarachnoid hemorrhage._
3.	Possible parenchymal contusion in the anterior right temporal lobe._
4.	Intracranial pneumocephalus along the cribriform plate fractures and likely small bilateral inferior frontal lobe contusions._
5.	Multifocal paranasal sinus opacification, more conspicuous posteriorly, along with scattered intracranial gas reflects posterior sinus fracturing and hemorrhage. Discrete fracture planes are difficult to visualize on nondedicated CT._
6.	Gas within the left optic nerve sheath, with mild associated fat stranding, may reflect some venous plexus gas versus may reflect optic nerve injury. No significant left-sided intracerebral hemorrhage or evidence of globe injury._
7.	Right preseptal and extraconal orbital gas without definite bony defect. Possible extraconal hemorrhage along the posterior inferior right orbit (CTA series 304 image 361)._
8.	Overall distribution of hemorrhage (anterior temporal, inferior frontal), presence of intracranial gas and sinus hemorrhage suggest traumatic etiology. Likely tiny sinus septal fractures are not visualized on nondedicated CT. _
9.	Critical Value: Large volume multicompartment extra-axial hemorrhage, intracranial gas. #888#._
_
Critical Value: The findings were discussed with Dr. David Wilson by Dr. Khadija Ahmed via telephone on 3/8/2025 10:46 AM EST. They confirmed that they understood the findings. #888#_
_
    EXAM: CTA NECK W AND OR WO CONTRAST_
EXAM: CTA HEAD W AND OR WO CONTRAST W VIZ LVO AI_
_
INDICATION:     Headache, sudden, severe_
_
TECHNIQUE: Axial thin section CT angiography of the neck and head was performed with 100 mL of IOHEXOL 350 MG IODINE/ML INTRAVENOUS SOLUTION administered intravenously. 3-D reconstructions of the cervical and cranial vessels were performed on a separate 3-D workstation, including MIP, curved MPR, and VRT images utilizing a radiologist approved protocol and were interpreted along with source images. Image data from the CTA was sent to Viz.ai for LVO analysis. This analysis was sent to the stroke team for clinical use.  _
_
COMPARISON: None available.. _
 _
FINDINGS:_
_
The diagnostic quality of the examination is adequate._
_
Pulmonary arteries: No included emboli._
_
Arch anatomy and vessel origins: Standard three-vessel arch anatomy. Mild calcified and noncalcified plaque at the origins of the great vessels with no significant stenosis.      _
_
Right carotid system: Calcified and soft plaque at the bifurcation with no stenosis._
_
Left carotid system: Calcified and soft plaque at the bifurcation with no stenosis._
_
Right vertebral artery: Normal._
_
Left vertebral artery:  Normal._
_
Intracranial posterior circulation: Normal bilateral V4 segments.  Bilateral PICA, anterior inferior cerebellar, and superior cerebellar arteries enhance normally. Hypoplastic basilar artery and the setting of bilateral fetal origin PCAs. Bilateral P1 and P2 segments demonstrate mild multifocal irregularity without flow-limiting stenosis. Distally PCAs are patent. . _
_
Intracranial anterior circulation: Mild calcified plaques of the parasellar internal carotids with no stenosis. The bilateral middle cerebral arteries and branches enhance normally. There is mild irregularity of the bilateral M1 and proximal M2 segments which is better seen on coronal MIP (for example series 312 image 24). There is early MCA branching on the left, with a few areas of more severe focal narrowing involving a left M2 anterior temporal division (series 304 image 375) and proximal posterior division before entering the sylvian fissure (series 304 image 373). Distally, MCAs are grossly patent. The bilateral anterior cerebral arteries enhance normally._
_
Venous structures: Venous structures are not well opacified in the arterial phase and cannot be assessed._
_
Extravascular findings: Intracranial findings are reported separately. Mild cervical degenerative disc disease without high-grade spinal canal or bony neuroforaminal stenosis, degenerative anterolisthesis C3-4. Stairstepping artifacts seen through the T1 vertebral body._
_
3D reconstructions of the neck vessels including MIP and curved MPR images reconstructed on a separate 3D workstation show no evidence of dissection or pseudoaneurysm._
_
3D reconstructions of the cranial vessels including MIP and VRT images reconstructed on a separate 3D workstation show the above-described findings to better advantage_
_
IMPRESSION:_
_
Neck_
1.	No evidence of carotid or vertebral artery stenosis._
2.	No dissection or pseudoaneurysm._
_
Head_
1.	Mild multifocal irregularity of the bilateral MCA M1 and M2 divisions may reflect atherosclerotic disease or vasospasm in the setting of intracranial hemorrhage._
2.	Up to severe focal narrowing of the left M2 anterior division without large vessel occlusion._
3.	Multifocal irregularity of the bilateral PCAs without significant stenosis again is nonspecific, may be related to vasospasm or atherosclerosis._
4.	No evidence for intracranial vascular injury. No aneurysm._
             _
_
Report Verified by: Khadija Ahmed, DO at 3/8/2025 11:11 AM EST""".splitlines()

found = []
for line in text:
    if pattern_888.match(line):
        found.append(line)

for line_with_date in found:
    print(line_with_date)
    critical_result_datetime = parse_helper(line_with_date)


9.	Critical Value: Large volume multicompartment extra-axial hemorrhage, intracranial gas. #888#._
-----Parsing
9.	Critical Value: Large volume multicompartment extra-axial hemorrhage, intracranial gas. #888#._
Critical Value: The findings were discussed with Dr. David Wilson by Dr. Khadija Ahmed via telephone on 3/8/2025 10:46 AM EST. They confirmed that they understood the findings. #888#_
-----Parsing
Critical Value: The findings were discussed with Dr. David Wilson by Dr. Khadija Ahmed via telephone on 3/8/2025 10:46 AM EST. They confirmed that they understood the findings. #888#_


In [32]:
found

['9.\tCritical Value: Large volume multicompartment extra-axial hemorrhage, intracranial gas. #888#._',
 'Critical Value: The findings were discussed with Dr. David Wilson by Dr. Khadija Ahmed via telephone on 3/8/2025 10:46 AM EST. They confirmed that they understood the findings. #888#_']

In [33]:
df = process_critical_results(file_path)

ID: 1
Report created: 	 2025-01-31 15:58:30
-----Parsing
8# Critical value:  Increase SDH and mass effect.  PA Brendan Wilson messaged, and called by Dr. Tomsick 4:15PM, expressed understanding.  
Before parsing datetime	 2025-04-08 16:15:00
---
Candidate time	 (['Big difference in days, date taken from Report Created'], datetime.datetime(2025, 1, 31, 16, 15))
diff	 990.0
	smallest	 inf
	current	 990.0
Fin CR datetime:	 2025-01-31 16:15:00
ID: 2
Report created: 	 2025-03-13 15:35:00
-----Parsing
.	CRITICAL RESULT: Feeding tube in airway.  This finding was discussed with Lisa Seiler, RN on  3/13/2025 3:35 PM EDT by telephone. They confirmed that they understood the findings communicated to them.  #888#evaluate_
Before parsing datetime	 2025-03-13 15:35:00
diff	 0.0
	smallest	 inf
	current	 0.0
Fin CR datetime:	 2025-03-13 15:35:00
ID: 3
Report created: 	 2025-02-05 14:46:45
-----Parsing
.	CRITICAL RESULT: Feeding tube in airway.  This finding was discussed with John Joseph, RN. on  2/5/

  df.at[index, 'Note'] = ', '.join(irregular_text)


8#_
Before parsing datetime	 2025-04-08 00:00:00
---
Candidate time	 (['Time likely not parsed', 'Big difference in days, date taken from Report Created'], datetime.datetime(2025, 1, 18, 0, 0))
diff	 69614.0
	smallest	 inf
	current	 69614.0
Fin CR datetime:	 2025-01-18 00:00:00
ID: 271
Report created: 	 2025-01-18 13:35:04
-----Parsing
Critical Value: The above findings were discussed with Margaret L Smith, RN by Jared Vearrier, MD on 1/18/2025 at 1338.  They confirmed that they understood the findings communicated to them.  #888#_
Before parsing datetime	 2025-01-18 13:38:00
diff	 176.0
	smallest	 inf
	current	 176.0
Fin CR datetime:	 2025-01-18 13:38:00
ID: 272
Report created: 	 2025-01-18 13:27:43
-----Parsing
Critical Value: The above findings were discussed with Sarah Hunkler, RN by Jared Vearrier, MD on 1/18/2025 at 1330.  They confirmed that they understood the findings communicated to them.  #888#_
Before parsing datetime	 2025-01-18 13:30:00
diff	 137.0
	smallest	 inf
	current

In [34]:
def df_to_excel(df: pd.DataFrame):
    """
    Create a fully formatted Excel file from DataFrame.

    Inputs:
    --------
    df (pd.DataFrame): 
        DataFrame of Critical Findings data.
    """
    # Create Excel file
    processed_file_name = f'Processed - {file_name}'
    df.to_excel(processed_file_name, sheet_name='Report', index=False)

    # Get column numbers of relevant rows
    percentage_31_col = df.columns.get_loc('%Under 31 min')
    percentage_61_col = df.columns.get_loc('%Under 61 min')
    report_created_col = df.columns.get_loc('Report Created Date/Time')
    cr_col = df.columns.get_loc('CR called Date/Time')


    workbook = load_workbook(processed_file_name)
    worksheet = workbook['Report']

    header_names = df.columns.to_list()
    header_fill = styles.PatternFill(fill_type='solid', fgColor='1A1A33')
    header_font = styles.Font(color='EEEEEE', bold=True)

    summary_fill = styles.PatternFill(fill_type='solid', fgColor='A2A2A2')
    summary_font = styles.Font(color='000000')

    # Format the column headers
    for col_num, header_name in enumerate(header_names, 1):
        cell = worksheet.cell(row=1, column=col_num)
        cell.value = header_name
        if col_num < 9:
            cell.font = header_font
            cell.fill = header_fill
        else:
            if col_num in [percentage_31_col + 1, percentage_61_col + 1]:
                cell.fill = summary_fill
                cell.font = summary_font

    # Format datetime
    for index, row in df.iterrows():
        cell = worksheet.cell(row=index + 2, column=report_created_col + 1)
        cell.value = row['Report Created Date/Time']
        cell.number_format = 'M/D/YYYY h:mm AM/PM'

        cell = worksheet.cell(row=index + 2, column=cr_col + 1)
        cell.value = row['CR called Date/Time']
        cell.number_format = 'M/D/YYYY h:mm AM/PM'

    workbook.save(processed_file_name)


In [35]:
df_to_excel(df)

In [36]:
raise Exception('Stop')

Exception: Stop

In [None]:
multiple_888_text = df[df['Sort#'].isin(multiple_888)]['Report Text'].to_list()
for t in multiple_888_text:
    print('============================================')
    print(t)

In [None]:
no_date_text = df[df['Sort#'].isin(no_date)]['Report Text'].to_list()
for t in no_date_text:
    print('============================================')
    print(t)

In [None]:
weird_time_format_text = df[df['Sort#'].isin(weird_time_format)]['Report Text'].to_list()
for t in weird_time_format_text:
    print('============================================')
    print(t)