In [660]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np


In [661]:



def excel_to_dataframes(file_path):
    """Load every sheet of an Excel workbook into a cleaned roster DataFrame.

    For each sheet the first nine columns are kept (narrower sheets are padded
    with all-NaN columns), rows containing any NaN are dropped, the columns are
    renamed to the roster schema, and only rows whose STATUS is one of
    NEW / CURRENT / VACANT / PENDING are retained.  A ``<COLUMN>-VALIDATE``
    flag column, initialised to True, is appended for every data column.

    Parameters
    ----------
    file_path : str or path-like
        Location of the workbook to read.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Mapping of whitespace-stripped sheet name to its cleaned frame.
    """
    column_names = ['STATUS', 'DATE', 'SHIFT', 'HOURS', 'RATE', 'COST', 'ON CALL', 'ROLE', 'UNIT']
    accepted_statuses = ['NEW', 'CURRENT', 'VACANT', 'PENDING']

    dfs_dict = {}

    # The context manager closes the workbook handle even if a sheet fails to parse.
    with pd.ExcelFile(file_path) as xl:
        for sheet_name in xl.sheet_names:
            df = xl.parse(sheet_name)

            # Pad sheets narrower than the schema with NaN columns.  Inserting by
            # position (duplicates allowed) always adds a column, so the padding
            # terminates even if a header already uses the same integer label.
            # Because the padding is NaN, the dropna() below removes every row of
            # such a sheet, leaving an empty frame that still has the full schema.
            for pad_label in range(len(df.columns), len(column_names)):
                df.insert(len(df.columns), pad_label, np.nan, allow_duplicates=True)

            # Keep only the first nine columns, then discard incomplete rows.
            df = df.iloc[:, :len(column_names)]
            df = df.dropna()

            df.columns = column_names
            # Non-roster rows (titles, repeated headers, ...) fail the STATUS filter.
            df = df[df['STATUS'].isin(accepted_statuses)]
            df = df.reset_index(drop=True)

            # One flag column per data column; the validators overwrite these.
            for col in column_names:
                df[f'{col}-VALIDATE'] = True

            dfs_dict[sheet_name.strip()] = df

    return dfs_dict

In [662]:
def validate_date(df):
    """Parse every DATE cell and rewrite it in the form 'Monday, 15 April 2024'.

    The frame is modified in place: parseable dates are replaced by their
    formatted string and DATE-VALIDATE is set to True; cells that cannot be
    parsed (including missing values) keep their original content and get
    DATE-VALIDATE set to False.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'DATE' and 'DATE-VALIDATE' columns.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        The same frame object and the index labels whose date failed to parse.
    """
    indices = []

    # The formatted dates are strings, so make the column object-typed up front
    # instead of relying on an implicit upcast when a datetime64 column is written.
    df['DATE'] = df['DATE'].astype(object)

    for index, row in df.iterrows():
        try:
            date = pd.to_datetime(row['DATE'])
            # to_datetime passes None/NaN through as None/NaT rather than raising.
            if pd.isna(date):
                raise ValueError("missing date")
            formatted = date.strftime('%A, %d %B %Y')
        except (ValueError, TypeError):
            # ValueError: unparseable or missing; TypeError: unsupported cell type.
            df.at[index, 'DATE-VALIDATE'] = False
            indices.append(index)
            print(index, "date validate failed")
        else:
            df.at[index, 'DATE'] = formatted
            df.at[index, 'DATE-VALIDATE'] = True

    return df, indices

In [663]:
def validate_shift(df):
    """Flag SHIFT cells that are not of the form 'HHMM-HHMM'.

    Spaces inside the value are ignored, so '0800 - 1630' is accepted.  A cell
    is valid only if it is a string that, once spaces are removed, consists of
    four digits, a dash, and four more digits.  The frame is modified in place
    (only the SHIFT-VALIDATE column is written).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'SHIFT' and 'SHIFT-VALIDATE' columns.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        The same frame object and the index labels of the invalid rows.
    """
    indices = []
    for index, shift_value in df['SHIFT'].items():
        is_valid = False
        # Type check first: non-string cells (numbers, times, ...) have no .strip().
        if isinstance(shift_value, str):
            compact = shift_value.strip().replace(" ", "")
            is_valid = (
                len(compact) == 9
                and compact[4] == '-'
                # Both halves must be digits so later int() conversions cannot fail.
                and compact[:4].isdecimal()
                and compact[5:].isdecimal()
            )

        df.at[index, 'SHIFT-VALIDATE'] = is_valid
        if not is_valid:
            indices.append(index)

    return df, indices

In [664]:
def validate_hours(df, tolerance=1e-9):
    """Check that HOURS equals the duration implied by the SHIFT value.

    SHIFT is read as 'HHMM-HHMM' (spaces ignored).  The duration is end minus
    start in hours; a negative result is treated as a shift crossing midnight
    and 24 is added.  A row is invalid when SHIFT is malformed, HOURS is not
    numeric, or the two differ by more than ``tolerance``.  The frame is
    modified in place (only the HOURS-VALIDATE column is written).

    NOTE(review): the sample output in this notebook shows shift '0800-1630'
    recorded with HOURS 8, which this check treats as a mismatch (8.5 computed).
    Confirm whether an unpaid break should be deducted before comparing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'SHIFT', 'HOURS' and 'HOURS-VALIDATE' columns.
    tolerance : float, optional
        Largest absolute difference (in hours) still counted as a match.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        The same frame object and the index labels of the invalid rows.
    """
    indices = []
    for index, row in df.iterrows():
        shift_value = row['SHIFT']
        hours_value = row['HOURS']

        is_valid = False
        if isinstance(shift_value, str):
            # Same normalisation as the shift-format check: drop spaces first.
            compact = shift_value.strip().replace(" ", "")
            well_formed = (
                len(compact) == 9
                and compact[4] == '-'
                and compact[:4].isdecimal()
                and compact[5:].isdecimal()
            )
            if well_formed:
                start_hour = int(compact[:2]) + int(compact[2:4]) / 60
                end_hour = int(compact[5:7]) + int(compact[7:]) / 60
                hours_worked = end_hour - start_hour
                if hours_worked < 0:
                    # End time is on the following day.
                    hours_worked += 24
                try:
                    # Minutes/60 is not exact in binary floating point, so compare
                    # with a tolerance instead of strict equality.
                    is_valid = bool(abs(hours_worked - hours_value) <= tolerance)
                except TypeError:
                    # HOURS is not a number (e.g. text).
                    is_valid = False

        df.at[index, 'HOURS-VALIDATE'] = is_valid
        if not is_valid:
            indices.append(index)

    return df, indices


In [665]:
def validate_rate(df):
    """Flag rows whose RATE looks wrong relative to COST.

    A row is invalid when RATE or COST is not a real number, or when RATE
    equals COST although HOURS is not 1 (a rate can only equal the total cost
    for a one-hour shift).  The frame is modified in place (only the
    RATE-VALIDATE column is written).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'RATE', 'COST', 'HOURS' and 'RATE-VALIDATE' columns.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        The same frame object and the index labels of the invalid rows.
    """
    def _is_real_number(value):
        # bool is a numeric type in Python but is never a meaningful rate/cost.
        if isinstance(value, (bool, np.bool_)):
            return False
        return pd.api.types.is_number(value) and not pd.isna(value)

    indices = []
    for index, row in df.iterrows():
        rate_value = row['RATE']
        cost_value = row['COST']
        hours_value = row['HOURS']

        # Compare by "is numeric", not by exact type: an int rate next to a
        # float cost is legitimate and must not be flagged.
        is_valid = _is_real_number(rate_value) and _is_real_number(cost_value)
        if is_valid and rate_value == cost_value and hours_value != 1:
            is_valid = False

        df.at[index, 'RATE-VALIDATE'] = is_valid
        if not is_valid:
            indices.append(index)

    return df, indices

In [666]:
def validate_cost(df, tolerance=1e-9):
    """Check that COST equals RATE multiplied by HOURS.

    A row is invalid when any of the three values is not a real number, or
    when RATE * HOURS differs from COST by more than ``tolerance``.  The frame
    is modified in place (only the COST-VALIDATE column is written).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'RATE', 'HOURS', 'COST' and 'COST-VALIDATE' columns.
    tolerance : float, optional
        Largest absolute difference still counted as a match.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        The same frame object and the index labels of the invalid rows.
    """
    def _is_real_number(value):
        # Reject bools explicitly; they would otherwise pass as numbers.
        if isinstance(value, (bool, np.bool_)):
            return False
        return pd.api.types.is_number(value) and not pd.isna(value)

    indices = []
    for index, row in df.iterrows():
        rate_value = row['RATE']
        hours_value = row['HOURS']
        cost_value = row['COST']

        is_valid = False
        # Guard first: multiplying a text rate by an int would silently repeat
        # the string, and by a float would raise.
        if all(_is_real_number(v) for v in (rate_value, hours_value, cost_value)):
            expected_cost = rate_value * hours_value
            # Float products are rarely exact (0.1 * 3 != 0.3), so allow a tolerance.
            is_valid = bool(abs(expected_cost - cost_value) <= tolerance)

        df.at[index, 'COST-VALIDATE'] = is_valid
        if not is_valid:
            indices.append(index)

    return df, indices

In [667]:
def validate_oncall(df):
    """Flag ON CALL cells that are not 'yes' or 'no' (case-insensitive).

    Non-string cells are invalid.  The frame is modified in place (only the
    ON CALL-VALIDATE column is written).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'ON CALL' and 'ON CALL-VALIDATE' columns.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        The same frame object and the index labels of the invalid rows.
    """
    indices = []
    for index, oncall_value in df['ON CALL'].items():
        # An explicit type check replaces the former catch-all except, so every
        # invalid row is recorded in `indices`, including non-string cells.
        is_valid = isinstance(oncall_value, str) and oncall_value.lower() in ('yes', 'no')

        df.at[index, 'ON CALL-VALIDATE'] = is_valid
        if not is_valid:
            indices.append(index)

    return df, indices

In [668]:
def validate_roles(df, valid_roles=None):
    """Flag ROLE cells that are not one of the accepted role names.

    Matching is exact (case- and whitespace-sensitive); non-string cells are
    invalid.  The frame is modified in place (only the ROLE-VALIDATE column
    is written).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'ROLE' and 'ROLE-VALIDATE' columns.
    valid_roles : iterable of str, optional
        Accepted role names.  Defaults to the built-in list below.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        The same frame object and the index labels of the invalid rows.
    """
    if valid_roles is None:
        valid_roles = ['CMO Senior', 'REGISTRAR', 'RMO', 'SRMO', 'CMO NON IC', 'REGISTRAR IC']
    accepted = list(valid_roles)

    indices = []
    for index, role_value in df['ROLE'].items():
        # The type check keeps values such as pd.NA (whose comparisons cannot be
        # coerced to bool) from raising, and ensures they are recorded as failures.
        is_valid = isinstance(role_value, str) and role_value in accepted

        df.at[index, 'ROLE-VALIDATE'] = is_valid
        if not is_valid:
            indices.append(index)

    return df, indices

In [669]:


def validate_units(df, valid_units=None):
    """Flag UNIT cells that are not one of the accepted unit names.

    Matching is exact (case- and whitespace-sensitive); non-string cells are
    invalid.  The frame is modified in place (only the UNIT-VALIDATE column
    is written).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'UNIT' and 'UNIT-VALIDATE' columns.
    valid_units : iterable of str, optional
        Accepted unit names.  Defaults to the built-in list below.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        The same frame object and the index labels of the invalid rows.
    """
    if valid_units is None:
        valid_units = ['ANAESTH', 'ED', 'FACILITY', 'ICU', 'MEDICAL', 'O & G', 'ONCOLOGY', 'ORTHO', 'PAEDS', 'PSYCH', 'SURGICAL', 'WARDS']
    accepted = list(valid_units)

    indices = []
    for index, unit_value in df['UNIT'].items():
        # The type check keeps values such as pd.NA (whose comparisons cannot be
        # coerced to bool) from raising, and ensures they are recorded as failures.
        is_valid = isinstance(unit_value, str) and unit_value in accepted

        df.at[index, 'UNIT-VALIDATE'] = is_valid
        if not is_valid:
            indices.append(index)

    return df, indices

In [670]:

# Load the workbook: one cleaned DataFrame per sheet, keyed by stripped sheet name.
file_path = 'data/HNELHD NON-SPECIALIST LOCUM VACANCY LIST (18) (1).xlsx'  # Change this to your Excel file path
dataframes = excel_to_dataframes(file_path)


In [671]:
# Preview one sheet, chosen by its position among the loaded sheet names.
INDEX = 6
selected_sheet = list(dataframes)[INDEX]
print(selected_sheet)
dataframes[selected_sheet]

ARMIDALE DENTIST


Unnamed: 0,STATUS,DATE,SHIFT,HOURS,RATE,COST,ON CALL,ROLE,UNIT,STATUS-VALIDATE,DATE-VALIDATE,SHIFT-VALIDATE,HOURS-VALIDATE,RATE-VALIDATE,COST-VALIDATE,ON CALL-VALIDATE,ROLE-VALIDATE,UNIT-VALIDATE
0,VACANT,2024-04-15 00:00:00,0800-1630,8,100,800,NO,DENTIST,ARMIDALE,True,True,True,True,True,True,True,True,True
1,VACANT,2024-04-16 00:00:00,0800-1630,8,100,800,NO,DENTIST,ARMIDALE,True,True,True,True,True,True,True,True,True
2,VACANT,2024-04-17 00:00:00,0800-1630,8,100,800,NO,DENTIST,ARMIDALE,True,True,True,True,True,True,True,True,True
3,VACANT,2024-04-18 00:00:00,0800-1630,8,100,800,NO,DENTIST,ARMIDALE,True,True,True,True,True,True,True,True,True
4,VACANT,2024-04-19 00:00:00,0800-1630,8,100,800,NO,DENTIST,ARMIDALE,True,True,True,True,True,True,True,True,True
5,VACANT,2024-04-22 00:00:00,0800-1630,8,100,800,NO,DENTIST,ARMIDALE,True,True,True,True,True,True,True,True,True
6,VACANT,2024-04-23 00:00:00,0800-1630,8,100,800,NO,DENTIST,ARMIDALE,True,True,True,True,True,True,True,True,True
7,VACANT,2024-04-24 00:00:00,0800-1630,8,100,800,NO,DENTIST,ARMIDALE,True,True,True,True,True,True,True,True,True
8,VACANT,2024-04-25 00:00:00,0800-1630,8,100,800,NO,DENTIST,ARMIDALE,True,True,True,True,True,True,True,True,True
9,VACANT,2024-04-26 00:00:00,0800-1630,8,100,800,NO,DENTIST,ARMIDALE,True,True,True,True,True,True,True,True,True


In [672]:
# Date validation for every sheet: keep the returned frame and the failing row labels.
datevalidated, date_val_failed = {}, {}
for sheet_name, sheet_df in dataframes.items():
    datevalidated[sheet_name], date_val_failed[sheet_name] = validate_date(sheet_df)

In [673]:
# Shift-format validation, applied to the output of the date step.
shiftvalidated, shift_val_failed = {}, {}
for sheet_name in dataframes:
    shiftvalidated[sheet_name], shift_val_failed[sheet_name] = validate_shift(datevalidated[sheet_name])

In [674]:
# Hours validation, applied to the output of the shift step.
hoursvalidated, hours_val_failed = {}, {}
for sheet_name in dataframes:
    hoursvalidated[sheet_name], hours_val_failed[sheet_name] = validate_hours(shiftvalidated[sheet_name])

In [675]:
# Rate validation, applied to the output of the hours step.
ratesvalidated, rates_val_failed = {}, {}
for sheet_name in dataframes:
    ratesvalidated[sheet_name], rates_val_failed[sheet_name] = validate_rate(hoursvalidated[sheet_name])

In [676]:
# Cost validation, applied to the output of the rate step.
costvalidated, cost_val_failed = {}, {}
for sheet_name in dataframes:
    costvalidated[sheet_name], cost_val_failed[sheet_name] = validate_cost(ratesvalidated[sheet_name])

In [677]:
# On-call validation, applied to the output of the cost step.
oncallvalidated, oncall_val_failed = {}, {}
for sheet_name in dataframes:
    oncallvalidated[sheet_name], oncall_val_failed[sheet_name] = validate_oncall(costvalidated[sheet_name])

In [678]:
# Role validation, applied to the output of the on-call step.
rolesvalidated, roles_val_failed = {}, {}
for sheet_name in dataframes:
    rolesvalidated[sheet_name], roles_val_failed[sheet_name] = validate_roles(oncallvalidated[sheet_name])

In [679]:
# Unit validation, applied to the output of the role step.
# This step must call validate_units: calling validate_roles here (as before)
# re-checked ROLE, left UNIT-VALIDATE untouched, and filled unit_val_failed
# with role failures.
unitvalidated = {}
unit_val_failed = {}
for x in dataframes.keys():
    unitvalidated[x], unit_val_failed[x] = validate_units(rolesvalidated[x])

In [680]:
# Display the result of the last validation step: a dict of frames keyed by sheet name.
unitvalidated

{'MAIN MENU': Empty DataFrame
 Columns: [STATUS, DATE, SHIFT, HOURS, RATE, COST, ON CALL, ROLE, UNIT, STATUS-VALIDATE, DATE-VALIDATE, SHIFT-VALIDATE, HOURS-VALIDATE, RATE-VALIDATE, COST-VALIDATE, ON CALL-VALIDATE, ROLE-VALIDATE, UNIT-VALIDATE]
 Index: [],
 'ARMIDALE CMO ED':    STATUS                    DATE      SHIFT  HOURS   RATE    COST ON CALL  \
 0  VACANT   Tuesday, 04 June 2024  0800-2030   12.0  250.0  3000.0     NO    
 1  VACANT  Saturday, 08 June 2024  0800-2030   12.0  250.0  3000.0     NO    
 2  VACANT    Monday, 17 June 2024  2000-0830   12.0  250.0  3000.0     NO    
 3  VACANT   Tuesday, 18 June 2024  2000-0830   12.0  250.0  3000.0     NO    
 
      ROLE UNIT  STATUS-VALIDATE  DATE-VALIDATE  SHIFT-VALIDATE  \
 0  CMO IC   ED             True           True            True   
 1  CMO IC   ED             True           True            True   
 2  CMO IC   ED             True           True            True   
 3  CMO IC   ED             True           True            T

In [681]:
# Every column validator exactly once.  The list previously contained
# validate_hours twice and omitted validate_shift, so SHIFT was never checked
# when iterating it.  The list length and the positions of the other entries
# are unchanged.
# NOTE(review): the first validate_hours entry was replaced so that shift is
# checked before hours; confirm that no consumer depends on the old positions.
validations = [validate_date, validate_shift, validate_rate, validate_cost, validate_hours, validate_roles, validate_oncall, validate_units]