# Questions

1. About the file itself
- Can I have an example file?
- Need to extract the date & time from line ending in #888# -> Will there be more than one line that ends with #888#? Contains #888#? #888# evaluate? On that line, will there be more than one date? 
   - Communication date
   - Within the hour of report creation
- Will the date format always be the same?
- Col E, F and G are calculations of the time elapsed from the time in Col C to the time in Col D -> So will E, F and G be the same? What is the desired date format? What will the name of each column be?

2. About the infrastructure
- What's the frequency of this task? How often do you want this same processing?
- What we are planning is a shared drive: you put the files that need processing there, we run the program on those files, and we put the processed files back into the same shared drive.
- What this means is that there needs to be a person who regularly uploads the files that need processing into the shared drive.
- A naming convention for the files that need processing might be necessary.

In [4]:
import pandas as pd
import re
import datetime
import xlsxwriter
from dateutil.parser import parse, ParserError
from pathlib import Path

In [5]:
# Label of the previous calendar month: step back one day from the first of
# the current month, then format it with "%B %Y".
first_of_this_month = datetime.date.today().replace(day=1)
last_month = (first_of_this_month - datetime.timedelta(days=1)).strftime("%B %Y")

# The input workbook is expected under the shared CR folder, named after that month.
file_name = f'Critical Results - {last_month}.xlsx'
file_dir = Path('/Volumes/RadReportsAdmin/CR Folder')
file_path = file_dir / file_name

# Fail with an explicit, descriptive error instead of a bare `assert`, which
# carries no message and is skipped entirely when Python runs with -O.
if not file_path.exists():
    raise FileNotFoundError(f'Input workbook not found: {file_path}')

In [6]:
# Diagnostic buckets of report ids ('Sort#' values); the later cells use them
# to filter the workbook and to print summary counts.
multiple_888: set = set()
no_888: set = set()
no_date: set = set()
weird_time_format: set = set()

# Used with `.match()`: accepts any line that contains the "#888" marker.
pattern_888 = re.compile(".*#888.*")


In [7]:
def parse_helper(line: str) -> "datetime.datetime | None":
    """
    Parse `line` for a datetime, tolerating text that precedes the datetime.

    A single fuzzy parse of the whole line can fail when numbers or month
    names appear before the real datetime substring, so progressively
    shorter suffixes of `line` are tried instead.

    Algorithm:
    -------
       Slice the start of `line` off, one character at a time, until a
       suffix parses to a plausible datetime or every suffix has been tried.
       A parsed value is plausible when it lies between 2000-01-01 and the
       current time (inclusive).

    Inputs:
    --------
    line (str):
        The line with (potential) datetime.

    Returns:
    --------
    datetime.datetime or None:
        The first plausible datetime found, or None if no suffix yields one.
    """
    # NOTE(review): slices start at every character, so a suffix may begin in
    # the middle of a number (e.g. after the first digit of a month or day).
    # Confirm this cannot produce a wrong-but-plausible date on real reports.
    now = datetime.datetime.now()
    earliest = datetime.datetime(2000, 1, 1)

    for start in range(len(line)):
        try:
            candidate = parse(line[start:], ignoretz=True, fuzzy=True)
        except (ParserError, OverflowError):
            # ParserError: the suffix does not look like a datetime.
            # OverflowError: documented by dateutil for oversized numeric
            # fields; one such line should not abort the whole run.
            continue

        if earliest <= candidate <= now:
            return candidate

    return None


In [32]:
def _irregularity_notes(called_at, minutes_diff):
    """Return the irregularity labels for one parsed call datetime."""
    notes = []

    # An exact 00:00:00 is flagged: presumably the parser found a date but no
    # time and fell back to midnight -- TODO confirm against real reports.
    if called_at.hour == 0 and called_at.minute == 0 and called_at.second == 0:
        notes.append('Time likely not parsed')

    # A date equal to today is flagged: presumably the parser found a time but
    # no date and fell back to the current date -- TODO confirm.
    if called_at.date() == datetime.datetime.now().date():
        notes.append('Date likely not parsed')

    # More than 200 minutes between creation and call, in either direction.
    if abs(minutes_diff) > 200:
        notes.append("Big Value")

    return notes


def process_critical_results(excel_path: Path) -> pd.DataFrame:
    """
    Load a Critical Results workbook and fill in the call-time columns.

    For each row, the lines of 'Report Text' that contain "#888" are searched
    for a datetime (via `parse_helper`). When one is found the row receives:
    - 'CR called Date/Time': the parsed datetime,
    - 'Minutes From\\nCreated to Called': minutes between
      'Report Created Date/Time' and the parsed datetime, rounded to 2 places,
    - 'Called <\\n31 minutes' / 'Called < \\n61 minutes': 1 when under the limit,
    - 'Note': comma-separated irregularity labels (may be empty).

    Finally the headers of the columns at positions 9 and 11 are replaced by
    the fraction of rows called in under 31 and under 61 minutes respectively.

    Side effects: report ids ('Sort#') are added to the module-level sets
    `no_888` (no usable "#888" line), `multiple_888` (more than one "#888"
    line) and `no_date` (no datetime found on any "#888" line).

    Inputs:
    --------
    excel_path (Path):
        Path of the workbook to read.

    Returns:
    --------
    pd.DataFrame:
        The loaded sheet with the columns described above filled in.
    """
    df = pd.read_excel(excel_path)
    # Strip the complete "_x000D_" escape sequence; removing only "_x000D"
    # would leave a stray underscore at the end of every line.
    df['Report Text'] = df['Report Text'].str.replace('_x000D_', '', regex=False)
    df['Called <\n31 minutes'] = 0
    df['Called < \n61 minutes'] = 0
    columns = list(df.columns)
    columns[7] = 'Note'
    df.columns = columns

    for index, row in df.iterrows():
        report_id = row['Sort#']
        raw_text = row['Report Text']  # Row B

        # An empty cell is read as NaN (a float); it cannot contain the marker.
        if not isinstance(raw_text, str):
            no_888.add(report_id)
            continue

        # Find all lines with #888#
        found = [
            line for line in raw_text.strip().splitlines()
            if pattern_888.match(line)
        ]
        if not found:
            no_888.add(report_id)
            continue
        if len(found) > 1:
            # Record the ambiguity for later review; processing still uses
            # the first marker line that yields a datetime.
            multiple_888.add(report_id)

        # Get the first marker line with a valid datetime
        critical_result_datetime = next(
            (parsed for parsed in map(parse_helper, found) if parsed is not None),
            None,
        )
        if critical_result_datetime is None:
            no_date.add(report_id)
            continue

        # Calculate the derived columns
        report_created = row['Report Created Date/Time']  # Row C
        df.at[index, 'CR called Date/Time'] = critical_result_datetime
        minutes_diff = (critical_result_datetime - report_created).total_seconds() / 60.0
        df.at[index, 'Minutes From\nCreated to Called'] = round(minutes_diff, 2)

        if minutes_diff < 31:
            df.at[index, 'Called <\n31 minutes'] = 1
        if minutes_diff < 61:
            df.at[index, 'Called < \n61 minutes'] = 1

        # Note irregularities
        df.at[index, 'Note'] = ', '.join(
            _irregularity_notes(critical_result_datetime, minutes_diff)
        )

    # Guard the percentages against an empty sheet (division by zero).
    total_rows = df.shape[0]

    # %Under 31 mins
    under_31 = len(df[df['Called <\n31 minutes'] == 1])
    df.rename(
        columns={df.columns[9]: under_31 / total_rows if total_rows else 0.0},
        inplace=True,
    )

    # %Under 61 minutes
    under_61 = len(df[df['Called < \n61 minutes'] == 1])
    df.rename(
        columns={df.columns[11]: under_61 / total_rows if total_rows else 0.0},
        inplace=True,
    )

    return df


In [33]:
# Process the workbook at `file_path` and keep the result for export below.
df = process_critical_results(file_path)


In [10]:
def df_to_excel(df: pd.DataFrame) -> None:
    """
    Create a formatted Excel file from DataFrame.

    The workbook is written to the current working directory as
    'Processed_CR Findings_<timestamp>.xlsx' with a single sheet 'Report'.
    Data (including the index) starts on the second row; the first row holds
    the styled column headers, written for all but the last five columns.

    Inputs:
    --------
    df (pd.DataFrame):
        DataFrame of Critical Findings data.
    """
    # str(datetime) contains colons and a space, which are not valid in file
    # names on every filesystem; use a filename-safe timestamp instead.
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    output_name = f'Processed_CR Findings_{timestamp}.xlsx'

    # The context manager closes the writer, which is what actually saves the
    # workbook to disk; without it no file is produced.
    # (A custom `datetime_format` such as 'MM/dd/yyyy K:mm a' was considered
    # here but is not enabled.)
    with pd.ExcelWriter(output_name, engine='xlsxwriter') as writer:
        # Headers are written manually below so they can carry a format.
        df.to_excel(writer, startrow=1, header=False, sheet_name='Report')

        workbook = writer.book
        worksheet = writer.sheets['Report']

        header_format = workbook.add_format({
            'bold': True,
            'text_wrap': True,
            'valign': 'top',
            'fg_color': '#1A1A24',
            'font_color': '#FFFFFF',
        })

        # Write the column headers with the defined format; the +1 skips the
        # index column that to_excel writes first.
        for col_num, value in enumerate(df.columns.values[:-5]):
            worksheet.write(0, col_num + 1, value, header_format)

        # TODO: the summary headers '%Under 31 mins' (row 0, col 8) and
        # '%Under 61 mins' (row 0, col 10) with a grey bold format are not
        # written yet.


In [35]:
# Dump the processed frame, without its index, to a timestamped workbook
# in the current working directory.
export_name = f'Proccessed_{datetime.datetime.now()}.xlsx'
df.to_excel(export_name, index=False)

In [12]:
# df_to_excel(df)

In [13]:
# Report how many report ids ended up in each diagnostic bucket.
lengths_summary = f'''Lengths
    multiple 888:   {len(multiple_888)}
    no_888:         {len(no_888)}
    no_date:        {len(no_date)}
    weird_time:     {len(weird_time_format)}
    '''
print(lengths_summary)

Lengths
    multiple 888:   0
    no_888:         0
    no_date:        0
    weird_time:     0
    


In [14]:
# Reload the raw workbook and strip the "_x000D_" escape sequences from the
# report text.
df = pd.read_excel(file_path)
df['Report Text'] = df['Report Text'].str.replace('_x000D_', '')

# Export the rows whose report id was collected in `multiple_888`.
in_multiple_888 = df['Sort#'].isin(multiple_888)
df[in_multiple_888].to_excel('Multiple_888.xlsx')

# Inspect the label of the column at position 7.
df.columns[7]



'Unnamed: 7'

In [15]:
# Print the full report text of every report collected in `multiple_888`,
# each preceded by a separator line.
multiple_888_rows = df[df['Sort#'].isin(multiple_888)]
multiple_888_text = multiple_888_rows['Report Text'].to_list()
for t in multiple_888_text:
    print('============================================', t, sep='\n')

In [16]:
# Print the full report text of every report collected in `no_date`,
# each preceded by a separator line.
no_date_rows = df[df['Sort#'].isin(no_date)]
no_date_text = no_date_rows['Report Text'].to_list()
for t in no_date_text:
    print('============================================', t, sep='\n')

In [17]:
# Print the full report text of every report collected in
# `weird_time_format`, each preceded by a separator line.
weird_time_format_rows = df[df['Sort#'].isin(weird_time_format)]
weird_time_format_text = weird_time_format_rows['Report Text'].to_list()
for t in weird_time_format_text:
    print('============================================', t, sep='\n')

In [18]:
# Scratch check: run dateutil's fuzzy parser directly on a whole report
# sentence that contains both a date and a time.
# NOTE(review): the output recorded for this cell is a ParserError, i.e. a
# single fuzzy parse of the full sentence does not succeed.
test_string = """
Critical Value: Age-indeterminate right lower lobe pulmonary embolism, which may be acute on chronic.  This finding was discussed with Victoria Martel, M.D. on 1/23/2025 at 5:04 PM by telephone.  They confirmed that they understood the findings communicated to them.
"""
parse(test_string, fuzzy=True, ignoretz=True)

ParserError: Unknown string format: 
Critical Value: Age-indeterminate right lower lobe pulmonary embolism, which may be acute on chronic.  This finding was discussed with Victoria Martel, M.D. on 1/23/2025 at 5:04 PM by telephone.  They confirmed that they understood the findings communicated to them.
