In [7]:
import pandas as pd

# Load the dataset from the Excel workbook into a DataFrame. The path is
# relative, so it is resolved against the kernel's current working directory.
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Step 1: Function to recognize 'Staging-T Category' as 'DiagStaging_StagingTCategory' and flag incorrect names
def find_equivalent_column_staging_t(data, expected_column):
    """Locate the dataset column to validate as ``expected_column``.

    The exact expected name is preferred. When it is absent, the first dataset
    column matching a known alternative spelling is used and a naming warning
    is printed.

    Args:
        data: DataFrame whose ``columns`` are searched.
        expected_column: Canonical column name the dataset is expected to use.

    Returns:
        The name of the column to validate, or ``None`` when neither the
        expected name nor any alternative is present.
    """
    # A correctly named column must not be reported as a mismatch; otherwise a
    # compliant dataset would skip every downstream validation step.
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    alternative_names = ["Staging-T Category", "T Category"]  # Add more alternatives as needed
    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be '{expected_column}'")
            return col  # Return the recognized column

    # Nothing usable was found: list what is there to help the user fix it.
    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {list(data.columns)}")
    print(f"   The correct column name should be '{expected_column}'")
    return None

# Step 2: Validate 'Staging-T Category' values (must be one of the allowed values)
def validate_staging_t_values(data, staging_t_column):
    """Collect rows whose Staging-T Category is non-null and not permitted.

    Args:
        data: DataFrame containing ``staging_t_column`` and
            'Anonymized Patient Identifier'.
        staging_t_column: Name of the column holding the category values.

    Returns:
        DataFrame with one record per offending row (index label under 'Row',
        patient identifier, offending value, error text); empty when every
        non-null value is permitted.
    """
    permitted = ("Clinical", "Pathologic", "Radiographic")

    def is_offending(value):
        # Nulls are tolerated here; only populated, unrecognised values count.
        return pd.notna(value) and value not in permitted

    flagged = [
        {
            'Row': row_label,
            'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
            staging_t_column: record[staging_t_column],
            'Error': 'Invalid Staging-T Category',
        }
        for row_label, record in data.iterrows()
        if is_offending(record[staging_t_column])
    ]
    return pd.DataFrame(flagged)

# Step 3: Check for missing 'DiagStaging_StagingTCategory' values
def validate_missing_staging_t(data, staging_t_column):
    """Print a report of rows whose ``staging_t_column`` value is null.

    Args:
        data: DataFrame containing ``staging_t_column`` and
            'Anonymized Patient Identifier'.
        staging_t_column: Name of the column to inspect.
    """
    rows_without_value = data.loc[data[staging_t_column].isna()]

    # Nothing missing: emit the all-clear line and stop.
    if rows_without_value.empty:
        print(f"3. No missing '{staging_t_column}' values found (null values are allowed).")
        return

    print(f"3. Rows with missing '{staging_t_column}':")
    print(rows_without_value[['Anonymized Patient Identifier', staging_t_column]])

# Drive the three validation steps for 'Staging-T Category'
expected_column_staging_t = 'DiagStaging_StagingTCategory'

# Step 1: work out which dataset column stands in for the expected name
recognized_column_staging_t = find_equivalent_column_staging_t(dataset, expected_column_staging_t)

# Steps 2 and 3 only make sense when a column was identified
if recognized_column_staging_t:
    # Step 2: gather rows whose value falls outside the allowed list
    invalid_staging_t_values = validate_staging_t_values(dataset, recognized_column_staging_t)
    invalid_count_staging_t = len(invalid_staging_t_values)

    if invalid_count_staging_t == 0:
        print(f"2. No rows with invalid 'Staging-T Category' values found.\n")
    else:
        # Summarise the offending row labels as one comma-separated string
        invalid_row_numbers_staging_t = invalid_staging_t_values['Row'].tolist()
        invalid_row_numbers_str_staging_t = ','.join(str(label) for label in invalid_row_numbers_staging_t)

        print(f"2. {invalid_count_staging_t} rows have an invalid 'Staging-T Category':")
        print(f"   Row numbers: {invalid_row_numbers_str_staging_t}\n")

        # Full detail table, printed without the DataFrame index
        print(invalid_staging_t_values.to_string(index=False))

    # Step 3: report rows where the value is missing
    validate_missing_staging_t(dataset, recognized_column_staging_t)

    # More than one value allowed for this one, so no duplicate check necessary

1. Recognized 'Staging-T Category' as 'DiagStaging_StagingTCategory' for validation, but the column name is incorrect.
   The correct column name should be 'DiagStaging_StagingTCategory'
2. No rows with invalid 'Staging-T Category' values found.

3. No missing 'Staging-T Category' values found (null values are allowed).


In [8]:
import pandas as pd

# Load the dataset from the Excel workbook into a DataFrame. The path is
# relative, so it is resolved against the kernel's current working directory.
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Step 1: Function to recognize 'Staging-T' as 'DiagStaging_StagingT' and flag incorrect names
def find_staging_t_column(data, expected_column):
    """Locate the dataset column to validate as ``expected_column``.

    The exact expected name is preferred. When it is absent, the first dataset
    column matching a known alternative spelling is used and a naming warning
    is printed.

    Args:
        data: DataFrame whose ``columns`` are searched.
        expected_column: Canonical column name the dataset is expected to use.

    Returns:
        The name of the column to validate, or ``None`` when neither the
        expected name nor any alternative is present.
    """
    # A correctly named column must not be reported as a mismatch; otherwise a
    # compliant dataset would skip every downstream validation step.
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    alternative_names = ["Staging-T", "T Staging", "Tumor Staging"]  # Add more alternatives as needed
    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be '{expected_column}'")
            return col  # Return the recognized column

    # Nothing usable was found: list what is there to help the user fix it.
    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {list(data.columns)}")
    print(f"   The correct column name should be '{expected_column}'")
    return None

# Step 2: Function to validate if the values in the 'Staging-T' column are part of the standard list
def validate_staging_t_values(data, staging_t_column):
    """Collect rows whose Staging-T value is non-null and not in the standard list.

    Args:
        data: DataFrame containing ``staging_t_column`` and
            'Anonymized Patient Identifier'.
        staging_t_column: Name of the column holding the Staging-T values.

    Returns:
        DataFrame with one record per offending row (index label under 'Row',
        patient identifier, offending value, error text); empty when nothing
        is flagged.
    """
    recognised = (
        "T1a", "T1b", "T1c",
        "T2a", "T2b", "T2c",
        "T3a", "T3b",
        "T4",
        "Tx (only if prostate has been removed)",
        "Information Not Available",
    )

    flagged = []
    for row_label, record in data.iterrows():
        value = record[staging_t_column]

        # Skip nulls and anything already on the standard list.
        if not pd.notna(value) or value in recognised:
            continue

        flagged.append({
            'Row': row_label,
            'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
            staging_t_column: value,
            'Error': 'Invalid Staging-T Value',
        })

    return pd.DataFrame(flagged)

# Step 3: Check for missing 'DiagStaging_StagingT' values
def validate_missing_staging_t(data, staging_t_column):
    """Print a report of rows whose ``staging_t_column`` value is null.

    Args:
        data: DataFrame containing ``staging_t_column`` and
            'Anonymized Patient Identifier'.
        staging_t_column: Name of the column to inspect.
    """
    rows_without_value = data.loc[data[staging_t_column].isna()]

    # Nothing missing: emit the all-clear line and stop.
    if rows_without_value.empty:
        print(f"3. No missing '{staging_t_column}' values found.")
        return

    print(f"3. Rows with missing '{staging_t_column}':")
    print(rows_without_value[['Anonymized Patient Identifier', staging_t_column]])

# Drive the three validation steps for 'Staging-T'
expected_staging_t_column = 'DiagStaging_StagingT'

# Step 1: work out which dataset column stands in for the expected name
recognized_staging_t_column = find_staging_t_column(dataset, expected_staging_t_column)

# Steps 2 and 3 only make sense when a column was identified
if recognized_staging_t_column:
    # Step 2: gather rows whose value is not on the standard list
    invalid_staging_t = validate_staging_t_values(dataset, recognized_staging_t_column)
    invalid_count = len(invalid_staging_t)

    if invalid_count == 0:
        print(f"2. No rows with invalid Staging-T values found.\n")
    else:
        # Summarise the offending row labels as one comma-separated string
        invalid_row_numbers = invalid_staging_t['Row'].tolist()
        invalid_row_numbers_str = ','.join(str(label) for label in invalid_row_numbers)

        print(f"2. {invalid_count} rows have an invalid Staging-T value:")
        print(f"   Row numbers: {invalid_row_numbers_str}\n")

        # Full detail table, printed without the DataFrame index
        print(invalid_staging_t.to_string(index=False))

    # Step 3: report rows where the value is missing
    validate_missing_staging_t(dataset, recognized_staging_t_column)

1. Recognized 'Staging-T' as 'DiagStaging_StagingT' for validation, but the column name is incorrect.
   The correct column name should be 'DiagStaging_StagingT'
2. No rows with invalid Staging-T values found.

3. No missing 'Staging-T' values found.


In [9]:
import pandas as pd

# Load the dataset from the Excel workbook into a DataFrame. The path is
# relative, so it is resolved against the kernel's current working directory.
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Step 1: Function to recognize 'Staging-N' as 'DiagStaging_StagingN' and flag incorrect names
def find_equivalent_column(data, expected_column):
    """Locate the dataset column to validate as ``expected_column`` (Staging-N).

    The exact expected name is preferred. When it is absent, the first dataset
    column matching a known alternative spelling is used and a naming warning
    is printed.

    Args:
        data: DataFrame whose ``columns`` are searched.
        expected_column: Canonical column name the dataset is expected to use.

    Returns:
        The name of the column to validate, or ``None`` when neither the
        expected name nor any alternative is present.
    """
    # A correctly named column must not be reported as a mismatch; otherwise a
    # compliant dataset would skip every downstream validation step.
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    alternative_names = ["Staging-N", "Staging N"]  # Add more alternatives as needed
    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be: '{expected_column}'")
            return col  # Return the recognized column

    # Nothing usable was found: list what is there to help the user fix it.
    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {list(data.columns)}")
    print(f"   The correct column name should be: '{expected_column}'")
    return None

# Step 2: Convert correctly formatted staging values and flag invalid strings
def convert_or_flag_staging_n_values(data, staging_n_column):
    """Collect rows whose Staging-N value is non-null and not in the valid list.

    Despite the name, no conversion is performed: the input is only read.

    Args:
        data: DataFrame containing ``staging_n_column`` and
            'Anonymized Patient Identifier'.
        staging_n_column: Name of the column holding the Staging-N values.

    Returns:
        DataFrame with one record per offending row (index label under 'Row',
        patient identifier, offending value, error text); empty when nothing
        is flagged.
    """
    accepted = ("N0", "N1", "Nx", "Information Not Available")

    flagged = []
    for row_label, record in data.iterrows():
        value = record[staging_n_column]

        # Accepted values and nulls are both left alone.
        if value in accepted or not pd.notna(value):
            continue

        flagged.append({
            'Row': row_label,
            'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
            staging_n_column: value,
            'Error': 'Invalid Staging-N value',
        })

    return pd.DataFrame(flagged)

# Step 3: Check for missing 'Staging-N' values
def validate_missing_staging_n(data, staging_n_column):
    """Print a report of rows whose ``staging_n_column`` value is null.

    Args:
        data: DataFrame containing ``staging_n_column`` and
            'Anonymized Patient Identifier'.
        staging_n_column: Name of the column to inspect.
    """
    rows_without_value = data.loc[data[staging_n_column].isna()]

    # Nothing missing: emit the all-clear line and stop.
    if rows_without_value.empty:
        print(f"3. No missing '{staging_n_column}' values found.")
        return

    print(f"3. Rows with missing '{staging_n_column}':")
    print(rows_without_value[['Anonymized Patient Identifier', staging_n_column]])

# Drive the three validation steps for 'Staging-N'
expected_column = 'DiagStaging_StagingN'

# Step 1: work out which dataset column stands in for the expected name
recognized_column = find_equivalent_column(dataset, expected_column)

# Steps 2 and 3 only make sense when a column was identified
if recognized_column:
    # Step 2: gather rows whose value is not on the valid list
    invalid_staging_n = convert_or_flag_staging_n_values(dataset, recognized_column)
    invalid_count = len(invalid_staging_n)

    if invalid_count == 0:
        print(f"2. No rows with invalid Staging-N values found.\n")
    else:
        # Summarise the offending row labels as one comma-separated string
        invalid_row_numbers = invalid_staging_n['Row'].tolist()
        invalid_row_numbers_str = ','.join(str(label) for label in invalid_row_numbers)

        print(f"2. {invalid_count} rows have an invalid Staging-N value:")
        print(f"   Row numbers: {invalid_row_numbers_str}\n")
        print(invalid_staging_n.to_string(index=False))  # detail table without the index

    # Step 3: report rows where the value is missing
    validate_missing_staging_n(dataset, recognized_column)

1. Recognized 'Staging-N' as 'DiagStaging_StagingN' for validation, but the column name is incorrect.
   The correct column name should be: 'DiagStaging_StagingN'
2. No rows with invalid Staging-N values found.

3. No missing 'Staging-N' values found.


In [10]:
import pandas as pd

# Load the dataset from the Excel workbook into a DataFrame. The path is
# relative, so it is resolved against the kernel's current working directory.
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Step 1: Function to recognize 'Staging-M' as 'DiagStaging_StagingM' and flag incorrect names
def find_equivalent_column(data, expected_column):
    """Locate the dataset column to validate as ``expected_column`` (Staging-M).

    The exact expected name is preferred. When it is absent, the first dataset
    column matching a known alternative spelling is used and a naming warning
    is printed.

    Args:
        data: DataFrame whose ``columns`` are searched.
        expected_column: Canonical column name the dataset is expected to use.

    Returns:
        The name of the column to validate, or ``None`` when neither the
        expected name nor any alternative is present.
    """
    # A correctly named column must not be reported as a mismatch; otherwise a
    # compliant dataset would skip every downstream validation step.
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    alternative_names = ["Staging-M", "Staging M"]  # Add more alternatives as needed
    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be: '{expected_column}'")
            return col  # Return the recognized column

    # Nothing usable was found: list what is there to help the user fix it.
    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {list(data.columns)}")
    print(f"   The correct column name should be: '{expected_column}'")
    return None

# Step 2: Convert correctly formatted staging values and flag invalid strings
def convert_or_flag_staging_m_values(data, staging_m_column):
    """Collect rows whose Staging-M value is non-null and not in the valid list.

    Despite the name, no conversion is performed: the input is only read.

    Args:
        data: DataFrame containing ``staging_m_column`` and
            'Anonymized Patient Identifier'.
        staging_m_column: Name of the column holding the Staging-M values.

    Returns:
        DataFrame with one record per offending row (index label under 'Row',
        patient identifier, offending value, error text); empty when nothing
        is flagged.
    """
    accepted = ("M0", "M1a", "M1b", "M1c", "Mx", "Information Not Available")

    def is_offending(value):
        # Membership is tested first, then nullness, as two separate checks.
        return value not in accepted and pd.notna(value)

    flagged = [
        {
            'Row': row_label,
            'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
            staging_m_column: record[staging_m_column],
            'Error': 'Invalid Staging-M value',
        }
        for row_label, record in data.iterrows()
        if is_offending(record[staging_m_column])
    ]
    return pd.DataFrame(flagged)

# Step 3: Check for missing 'Staging-M' values
def validate_missing_staging_m(data, staging_m_column):
    """Print a report of rows whose ``staging_m_column`` value is null.

    Args:
        data: DataFrame containing ``staging_m_column`` and
            'Anonymized Patient Identifier'.
        staging_m_column: Name of the column to inspect.
    """
    rows_without_value = data.loc[data[staging_m_column].isna()]

    # Nothing missing: emit the all-clear line and stop.
    if rows_without_value.empty:
        print(f"3. No missing '{staging_m_column}' values found.")
        return

    print(f"3. Rows with missing '{staging_m_column}':")
    print(rows_without_value[['Anonymized Patient Identifier', staging_m_column]])

# Drive the three validation steps for 'Staging-M'
expected_column = 'DiagStaging_StagingM'

# Step 1: work out which dataset column stands in for the expected name
recognized_column = find_equivalent_column(dataset, expected_column)

# Steps 2 and 3 only make sense when a column was identified
if recognized_column:
    # Step 2: gather rows whose value is not on the valid list
    invalid_staging_m = convert_or_flag_staging_m_values(dataset, recognized_column)
    invalid_count = len(invalid_staging_m)

    if invalid_count == 0:
        print(f"2. No rows with invalid Staging-M values found.\n")
    else:
        # Summarise the offending row labels as one comma-separated string
        invalid_row_numbers = invalid_staging_m['Row'].tolist()
        invalid_row_numbers_str = ','.join(str(label) for label in invalid_row_numbers)

        print(f"2. {invalid_count} rows have an invalid Staging-M value:")
        print(f"   Row numbers: {invalid_row_numbers_str}\n")
        print(invalid_staging_m.to_string(index=False))  # detail table without the index

    # Step 3: report rows where the value is missing
    validate_missing_staging_m(dataset, recognized_column)

1. Recognized 'Staging-M' as 'DiagStaging_StagingM' for validation, but the column name is incorrect.
   The correct column name should be: 'DiagStaging_StagingM'
2. No rows with invalid Staging-M values found.

3. No missing 'Staging-M' values found.


In [11]:
import pandas as pd

# Load the dataset from the Excel workbook into a DataFrame. The path is
# relative, so it is resolved against the kernel's current working directory.
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Step 1: Function to recognize 'Imaging for Biopsy' as 'DiagStaging_ImagingBiopsy' and flag incorrect names
def find_equivalent_column_imaging_biopsy(data, expected_column):
    """Locate the dataset column to validate as ``expected_column``.

    The exact expected name is preferred. When it is absent, the first dataset
    column matching a known alternative spelling is used and a naming warning
    is printed.

    Args:
        data: DataFrame whose ``columns`` are searched.
        expected_column: Canonical column name the dataset is expected to use.

    Returns:
        The name of the column to validate, or ``None`` when neither the
        expected name nor any alternative is present.
    """
    # A correctly named column must not be reported as a mismatch; otherwise a
    # compliant dataset would skip every downstream validation step.
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    alternative_names = ["Imaging for Biopsy", "Biopsy Imaging"]  # Add more alternatives as needed
    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be: '{expected_column}'")
            return col  # Return the recognized column

    # Nothing usable was found: list what is there to help the user fix it.
    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {list(data.columns)}")
    print(f"   The correct column name should be: '{expected_column}'")
    return None

# Step 2: Convert correctly formatted imaging values and flag invalid strings
def convert_or_flag_imaging_biopsy_values(data, imaging_biopsy_column):
    """Collect rows whose imaging value is non-null and not in the valid list.

    Despite the name, no conversion is performed: the input is only read.

    Args:
        data: DataFrame containing ``imaging_biopsy_column`` and
            'Anonymized Patient Identifier'.
        imaging_biopsy_column: Name of the column holding the imaging values.

    Returns:
        DataFrame with one record per offending row (index label under 'Row',
        patient identifier, offending value, error text); empty when nothing
        is flagged.
    """
    accepted = ("Ultrasound", "Magnetic Resonance Imaging", "Computed Tomography")

    flagged = []
    for row_label, record in data.iterrows():
        value = record[imaging_biopsy_column]

        # Accepted values and nulls are both left alone.
        if value in accepted or not pd.notna(value):
            continue

        flagged.append({
            'Row': row_label,
            'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
            imaging_biopsy_column: value,
            'Error': 'Invalid Imaging for Biopsy value',
        })

    return pd.DataFrame(flagged)

# Step 3: Check for missing 'Imaging for Biopsy' values
def validate_missing_imaging_biopsy(data, imaging_biopsy_column):
    """Print a report of rows whose ``imaging_biopsy_column`` value is null.

    Args:
        data: DataFrame containing ``imaging_biopsy_column`` and
            'Anonymized Patient Identifier'.
        imaging_biopsy_column: Name of the column to inspect.
    """
    rows_without_value = data.loc[data[imaging_biopsy_column].isna()]

    # Nothing missing: emit the all-clear line and stop.
    if rows_without_value.empty:
        print(f"3. No missing '{imaging_biopsy_column}' values found (null values are allowed).")
        return

    print(f"3. Rows with missing '{imaging_biopsy_column}':")
    print(rows_without_value[['Anonymized Patient Identifier', imaging_biopsy_column]])

# Drive the three validation steps for 'Imaging for Biopsy'
expected_column_imaging_biopsy = 'DiagStaging_ImagingBiopsy'

# Step 1: work out which dataset column stands in for the expected name
recognized_column_imaging_biopsy = find_equivalent_column_imaging_biopsy(dataset, expected_column_imaging_biopsy)

# Steps 2 and 3 only make sense when a column was identified
if recognized_column_imaging_biopsy:
    # Step 2: gather rows whose value is not on the valid list
    invalid_imaging_biopsy = convert_or_flag_imaging_biopsy_values(dataset, recognized_column_imaging_biopsy)
    invalid_count_imaging_biopsy = len(invalid_imaging_biopsy)

    if invalid_count_imaging_biopsy == 0:
        print(f"2. No rows with invalid Imaging for Biopsy values found.\n")
    else:
        # Summarise the offending row labels as one comma-separated string
        invalid_row_numbers = invalid_imaging_biopsy['Row'].tolist()
        invalid_row_numbers_str = ','.join(str(label) for label in invalid_row_numbers)

        print(f"2. {invalid_count_imaging_biopsy} rows have an invalid Imaging for Biopsy value:")
        print(f"   Row numbers: {invalid_row_numbers_str}\n")
        print(invalid_imaging_biopsy.to_string(index=False))  # detail table without the index

    # Step 3: report rows where the value is missing
    validate_missing_imaging_biopsy(dataset, recognized_column_imaging_biopsy)

1. Recognized 'Imaging for Biopsy' as 'DiagStaging_ImagingBiopsy' for validation, but the column name is incorrect.
   The correct column name should be: 'DiagStaging_ImagingBiopsy'
2. No rows with invalid Imaging for Biopsy values found.

3. No missing 'Imaging for Biopsy' values found (null values are allowed).


In [12]:
import pandas as pd

# Load the dataset from the Excel workbook into a DataFrame. The path is
# relative, so it is resolved against the kernel's current working directory.
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Step 1: Function to recognize 'Pathology Molecular Test Name' as 'DiagStaging_PathologyMolecularTestName' and flag incorrect names
def find_equivalent_column_pathology_test(data, expected_column):
    """Locate the dataset column to validate as ``expected_column``.

    The exact expected name is preferred. When it is absent, the first dataset
    column matching a known alternative spelling is used and a naming warning
    is printed.

    Args:
        data: DataFrame whose ``columns`` are searched.
        expected_column: Canonical column name the dataset is expected to use.

    Returns:
        The name of the column to validate, or ``None`` when neither the
        expected name nor any alternative is present.
    """
    # A correctly named column must not be reported as a mismatch; otherwise a
    # compliant dataset would skip every downstream validation step.
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    alternative_names = ["Pathology Molecular Test Name", "Molecular Test Name"]  # Add more alternatives as needed
    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be: '{expected_column}'")
            return col  # Return the recognized column

    # Nothing usable was found: list what is there to help the user fix it.
    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {list(data.columns)}")
    print(f"   The correct column name should be: '{expected_column}'")
    return None

# Step 2: Convert correctly formatted pathology test names and flag invalid strings
def convert_or_flag_pathology_test_values(data, pathology_test_column):
    """Collect rows whose test name is non-null and not in the valid list.

    Despite the name, no conversion is performed: the input is only read.

    Args:
        data: DataFrame containing ``pathology_test_column`` and
            'Anonymized Patient Identifier'.
        pathology_test_column: Name of the column holding the test names.

    Returns:
        DataFrame with one record per offending row (index label under 'Row',
        patient identifier, offending value, error text); empty when nothing
        is flagged.
    """
    accepted = (
        "DECIPHER",
        "Prolaris",
        "Oncotype DX Genomic Prostate Score",
        "ProMark",
        "(+ Other)",
    )

    def is_offending(value):
        # Membership is tested first, then nullness, as two separate checks.
        return value not in accepted and pd.notna(value)

    flagged = [
        {
            'Row': row_label,
            'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
            pathology_test_column: record[pathology_test_column],
            'Error': 'Invalid Pathology Molecular Test Name',
        }
        for row_label, record in data.iterrows()
        if is_offending(record[pathology_test_column])
    ]
    return pd.DataFrame(flagged)

# Step 3: Check for missing 'Pathology Molecular Test Name' values
def validate_missing_pathology_test(data, pathology_test_column):
    """Print a report of rows whose ``pathology_test_column`` value is null.

    Args:
        data: DataFrame containing ``pathology_test_column`` and
            'Anonymized Patient Identifier'.
        pathology_test_column: Name of the column to inspect.
    """
    rows_without_value = data.loc[data[pathology_test_column].isna()]

    # Nothing missing: emit the all-clear line and stop.
    if rows_without_value.empty:
        print(f"3. No missing '{pathology_test_column}' values found (null values are allowed).")
        return

    print(f"3. Rows with missing '{pathology_test_column}':")
    print(rows_without_value[['Anonymized Patient Identifier', pathology_test_column]])

# Drive the three validation steps for 'Pathology Molecular Test Name'
expected_column_pathology_test = 'DiagStaging_PathologyMolecularTestName'

# Step 1: work out which dataset column stands in for the expected name
recognized_column_pathology_test = find_equivalent_column_pathology_test(dataset, expected_column_pathology_test)

# Steps 2 and 3 only make sense when a column was identified
if recognized_column_pathology_test:
    # Step 2: gather rows whose value is not on the valid list
    invalid_pathology_test = convert_or_flag_pathology_test_values(dataset, recognized_column_pathology_test)
    invalid_count_pathology_test = len(invalid_pathology_test)

    if invalid_count_pathology_test == 0:
        print(f"2. No rows with invalid Pathology Molecular Test Name values found.\n")
    else:
        # Summarise the offending row labels as one comma-separated string
        invalid_row_numbers = invalid_pathology_test['Row'].tolist()
        invalid_row_numbers_str = ','.join(str(label) for label in invalid_row_numbers)

        print(f"2. {invalid_count_pathology_test} rows have an invalid Pathology Molecular Test Name:")
        print(f"   Row numbers: {invalid_row_numbers_str}\n")
        print(invalid_pathology_test.to_string(index=False))  # detail table without the index

    # Step 3: report rows where the value is missing
    validate_missing_pathology_test(dataset, recognized_column_pathology_test)

1. Recognized 'Pathology Molecular Test Name' as 'DiagStaging_PathologyMolecularTestName' for validation, but the column name is incorrect.
   The correct column name should be: 'DiagStaging_PathologyMolecularTestName'
2. No rows with invalid Pathology Molecular Test Name values found.

3. No missing 'Pathology Molecular Test Name' values found (null values are allowed).


In [13]:
import pandas as pd

# Load the dataset from the Excel workbook into a DataFrame. The path is
# relative, so it is resolved against the kernel's current working directory.
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Step 1: Function to recognize 'Pathology Molecular Test Result' as 'DiagStaging_PathologyMolecularTestResult' and flag incorrect names
def find_equivalent_column_pathology_test_result(data, expected_column):
    """Locate the dataset column to validate as ``expected_column``.

    The exact expected name is preferred. When it is absent, the first dataset
    column matching a known alternative spelling is used and a naming warning
    is printed.

    Args:
        data: DataFrame whose ``columns`` are searched.
        expected_column: Canonical column name the dataset is expected to use.

    Returns:
        The name of the column to validate, or ``None`` when neither the
        expected name nor any alternative is present.
    """
    # A correctly named column must not be reported as a mismatch; otherwise a
    # compliant dataset would skip every downstream validation step.
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    alternative_names = ["Pathology Molecular Test Result", "Molecular Test Result"]  # Add more alternatives as needed
    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be: '{expected_column}'")
            return col  # Return the recognized column

    # Nothing usable was found: list what is there to help the user fix it.
    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {list(data.columns)}")
    print(f"   The correct column name should be: '{expected_column}'")
    return None

# Step 2: Flag invalid pathology molecular test results
def convert_or_flag_pathology_test_result_values(data, pathology_test_result_column):
    """Collect rows whose test result is non-null but not a string.

    There is no list of standard values for this column, so the only check is
    on the value's type. No conversion is performed: the input is only read.

    Args:
        data: DataFrame containing ``pathology_test_result_column`` and
            'Anonymized Patient Identifier'.
        pathology_test_result_column: Name of the column holding the results.

    Returns:
        DataFrame with one record per offending row (index label under 'Row',
        patient identifier, offending value, error text); empty when nothing
        is flagged.
    """
    flagged = []
    for row_label, record in data.iterrows():
        value = record[pathology_test_result_column]

        # Nulls and genuine strings are acceptable; custom checks could go here.
        if not pd.notna(value) or isinstance(value, str):
            continue

        flagged.append({
            'Row': row_label,
            'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
            pathology_test_result_column: value,
            'Error': 'Invalid format for Pathology Molecular Test Result',
        })

    return pd.DataFrame(flagged)

# Step 3: Check for missing 'Pathology Molecular Test Result' values
def validate_missing_pathology_test_result(data, pathology_test_result_column):
    """Print a report of rows whose ``pathology_test_result_column`` is null.

    Args:
        data: DataFrame containing ``pathology_test_result_column`` and
            'Anonymized Patient Identifier'.
        pathology_test_result_column: Name of the column to inspect.
    """
    rows_without_value = data.loc[data[pathology_test_result_column].isna()]

    # Nothing missing: emit the all-clear line and stop.
    if rows_without_value.empty:
        print(f"3. No missing '{pathology_test_result_column}' values found (null values are allowed).")
        return

    print(f"3. Rows with missing '{pathology_test_result_column}':")
    print(rows_without_value[['Anonymized Patient Identifier', pathology_test_result_column]])

# Drive the three validation steps for 'Pathology Molecular Test Result'
expected_column_pathology_test_result = 'DiagStaging_PathologyMolecularTestResult'

# Step 1: work out which dataset column stands in for the expected name
recognized_column_pathology_test_result = find_equivalent_column_pathology_test_result(dataset, expected_column_pathology_test_result)

# Steps 2 and 3 only make sense when a column was identified
if recognized_column_pathology_test_result:
    # Step 2: gather rows whose value has an unexpected format
    invalid_pathology_test_result = convert_or_flag_pathology_test_result_values(dataset, recognized_column_pathology_test_result)
    invalid_count_pathology_test_result = len(invalid_pathology_test_result)

    if invalid_count_pathology_test_result == 0:
        print(f"2. No rows with invalid Pathology Molecular Test Result values found.\n")
    else:
        # Summarise the offending row labels as one comma-separated string
        invalid_row_numbers = invalid_pathology_test_result['Row'].tolist()
        invalid_row_numbers_str = ','.join(str(label) for label in invalid_row_numbers)

        print(f"2. {invalid_count_pathology_test_result} rows have an invalid Pathology Molecular Test Result value:")
        print(f"   Row numbers: {invalid_row_numbers_str}\n")
        print(invalid_pathology_test_result.to_string(index=False))  # detail table without the index

    # Step 3: report rows where the value is missing
    validate_missing_pathology_test_result(dataset, recognized_column_pathology_test_result)

1. Recognized 'Pathology Molecular Test Result' as 'DiagStaging_PathologyMolecularTestResult' for validation, but the column name is incorrect.
   The correct column name should be: 'DiagStaging_PathologyMolecularTestResult'
2. No rows with invalid Pathology Molecular Test Result values found.

3. No missing 'Pathology Molecular Test Result' values found (null values are allowed).


In [14]:
import pandas as pd

# Load the dataset from the Excel workbook into a DataFrame. The path is
# relative, so it is resolved against the kernel's current working directory.
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Step 1: Recognize and validate the 'Treatment Outcome' column
def find_equivalent_column_treatment_outcome(data, expected_column):
    """Locate the dataset column to validate as ``expected_column``.

    The exact expected name is preferred. When it is absent, the first dataset
    column matching a known alternative spelling is used and a naming warning
    is printed.

    Args:
        data: DataFrame whose ``columns`` are searched.
        expected_column: Canonical column name the dataset is expected to use.

    Returns:
        The name of the column to validate, or ``None`` when neither the
        expected name nor any alternative is present.
    """
    # A correctly named column must not be reported as a mismatch; otherwise a
    # compliant dataset would skip every downstream validation step.
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    alternative_names = ["Treatment Outcome", "Outcome"]  # Add more alternatives if needed
    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be: '{expected_column}'")
            return col  # Return the recognized column

    # Nothing usable was found: list what is there to help the user fix it.
    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {list(data.columns)}")
    print(f"   The correct column name should be: '{expected_column}'")
    return None

# Step 2: Validate the values in 'Treatment Outcome' column
def validate_treatment_outcome_values(data, outcome_column):
    """Collect rows whose treatment outcome is non-null and not in the valid list.

    Args:
        data: DataFrame containing ``outcome_column`` and
            'Anonymized Patient Identifier'.
        outcome_column: Name of the column holding the outcome values.

    Returns:
        DataFrame with one record per offending row (index label under 'Row',
        patient identifier, offending value, error text); empty when nothing
        is flagged.
    """
    accepted = (
        "Under Treatment",
        "No evidence of disease (NED)",
        "Stable Disease",
        "Partial Response",
        "Progressive Disease",
        "Indeterminate (possible pseudo-progression)",
        "Complete Response",
        "Biochemical Recurrence",
        "Primary Recurrence",
        "Local Recurrence",
        "Nodal Recurrence",
        "Distant Recurrence",
        "(+Other)",
    )

    def is_offending(value):
        # Membership is tested first, then nullness, as two separate checks.
        return value not in accepted and pd.notna(value)

    flagged = [
        {
            'Row': row_label,
            'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
            outcome_column: record[outcome_column],
            'Error': 'Invalid Treatment Outcome value',
        }
        for row_label, record in data.iterrows()
        if is_offending(record[outcome_column])
    ]
    return pd.DataFrame(flagged)

# Step 3: Check for missing 'Treatment Outcome' values
def validate_missing_treatment_outcome(data, outcome_column):
    """Print a report of rows whose ``outcome_column`` value is null.

    Args:
        data: DataFrame containing ``outcome_column`` and
            'Anonymized Patient Identifier'.
        outcome_column: Name of the column to inspect.
    """
    rows_without_value = data.loc[data[outcome_column].isna()]

    # Nothing missing: emit the all-clear line and stop.
    if rows_without_value.empty:
        print(f"3. No missing '{outcome_column}' values found (null values are not allowed).")
        return

    print(f"3. Rows with missing '{outcome_column}':")
    print(rows_without_value[['Anonymized Patient Identifier', outcome_column]])

# Drive the three validation steps for 'Treatment Outcome'
expected_column_outcome = 'PatientTreatmentOutcome_DiseaseStatus'  # Keep original column name for tracking

# Step 1: work out which dataset column stands in for the expected name
recognized_column_outcome = find_equivalent_column_treatment_outcome(dataset, expected_column_outcome)

# Steps 2 and 3 only make sense when a column was identified
if recognized_column_outcome:
    # Step 2: gather rows whose value is not on the valid list
    invalid_outcomes = validate_treatment_outcome_values(dataset, recognized_column_outcome)
    invalid_count_outcomes = len(invalid_outcomes)

    if invalid_count_outcomes == 0:
        print(f"2. No rows with invalid Treatment Outcome values found.\n")
    else:
        # Summarise the offending row labels as one comma-separated string
        invalid_row_numbers = invalid_outcomes['Row'].tolist()
        invalid_row_numbers_str = ','.join(str(label) for label in invalid_row_numbers)

        print(f"2. {invalid_count_outcomes} rows have an invalid Treatment Outcome value:")
        print(f"   Row numbers: {invalid_row_numbers_str}\n")
        print(invalid_outcomes.to_string(index=False))  # detail table without the index

    # Step 3: report rows where the value is missing
    validate_missing_treatment_outcome(dataset, recognized_column_outcome)

1. Recognized 'Treatment Outcome' as 'PatientTreatmentOutcome_DiseaseStatus' for validation, but the column name is incorrect.
   The correct column name should be: 'PatientTreatmentOutcome_DiseaseStatus'
2. 6 rows have an invalid Treatment Outcome value:
   Row numbers: 48,124,141,202,237,262

 Row Anonymized Patient Identifier Treatment Outcome                           Error
  48                     Anon58131     Stble Disease Invalid Treatment Outcome value
 124                     Anon22499     Stble Disease Invalid Treatment Outcome value
 141                     Anon18169     Stble Disease Invalid Treatment Outcome value
 202                           NaN     Stble Disease Invalid Treatment Outcome value
 237                     Anon50361     Stble Disease Invalid Treatment Outcome value
 262                     Anon82003     Stble Disease Invalid Treatment Outcome value
3. No missing 'Treatment Outcome' values found (null values are not allowed).


In [15]:
import pandas as pd

# Load the dataset: read the Excel workbook into `dataset`, the frame the
# start-date validation code later in this cell is run against
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Step 1: Recognize and validate the 'Start Date of Treatment' column
def find_equivalent_column_start_date(data, expected_column):
    """Find the column to validate as the treatment start date.

    The expected name is checked first so that a dataset which already uses
    the correct column name is validated instead of being reported as a
    mismatch. Otherwise the first column (in dataset order) whose name is one
    of the known alternatives is used, and a naming warning is printed.

    Args:
        data: DataFrame whose column labels are inspected.
        expected_column: The column name the dataset is supposed to use.

    Returns:
        The name of the column to validate, or None when neither the expected
        name nor any alternative is present.
    """
    alternative_names = ["Start Date of Treatment", "Treatment Start Date"]  # Add more alternatives if needed

    # Exact match: the dataset is already named correctly, nothing to warn about
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be: '{expected_column}'")
            return col  # Return the recognized column

    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {list(data.columns)}")
    print(f"   The correct column name should be: '{expected_column}'")
    return None

# Step 2: Validate 'Start Date of Treatment' values and set time to midnight if missing
def convert_or_flag_start_dates(data, start_date_column):
    """Validate the start-date column and normalise date-only strings in place.

    Each cell is handled according to its type:
      * ``str``: must match ``YYYY-MM-DD``; on success the cell in ``data`` is
        replaced by the parsed Timestamp (midnight), otherwise it is flagged.
      * date / datetime / Timestamp: accepted unchanged.
      * missing (None / NaN / NaT): skipped here.
      * anything else (e.g. a bare number): flagged, since it is not a date.

    Args:
        data: DataFrame to validate; parsed strings are written back into it.
        start_date_column: Name of the column holding the start dates.

    Returns:
        DataFrame with one record per flagged cell ('Row', 'Anonymized Patient
        Identifier', the date column, 'Error'); empty when nothing is flagged.
    """
    from datetime import date  # datetime.datetime and pd.Timestamp are both subclasses of date

    invalid_start_dates = []
    parsed_by_row = {}  # row label -> parsed Timestamp, written back after the loop

    for index, row in data.iterrows():
        start_date = row[start_date_column]

        if isinstance(start_date, str):
            try:
                # The date-only format yields a Timestamp at 00:00:00, so no time reset is needed
                parsed_by_row[index] = pd.to_datetime(start_date, format='%Y-%m-%d', errors='raise')
                continue
            except (ValueError, TypeError):
                pass  # unparseable string: fall through and flag it
        elif isinstance(start_date, date):
            continue  # already a real date value
        elif pd.api.types.is_scalar(start_date) and pd.isna(start_date):
            continue  # missing value, not a format problem

        invalid_start_dates.append({
            'Row': index,
            'Anonymized Patient Identifier': row['Anonymized Patient Identifier'],
            start_date_column: start_date,
            'Error': 'Invalid date format'
        })

    # Write back only after iteration so the frame is never modified while being iterated
    if parsed_by_row:
        # Object storage accepts a mix of Timestamps and leftover strings regardless of the original dtype
        data[start_date_column] = data[start_date_column].astype(object)
        for index, parsed_date in parsed_by_row.items():
            data.at[index, start_date_column] = parsed_date

    return pd.DataFrame(invalid_start_dates)

# Step 3: Check for missing 'Start Date of Treatment' values
def validate_missing_start_dates(data, start_date_column):
    """Print a step-3 report on null cells in the start-date column.

    Args:
        data: DataFrame to inspect.
        start_date_column: Name of the column checked for nulls.

    Returns:
        None. Either the patient identifier and date column of every row with
        a null date are printed, or a single "no missing values" line.
    """
    null_mask = data[start_date_column].isna()
    rows_without_date = data[null_mask]

    # Guard clause: nothing missing, report that and stop
    if rows_without_date.empty:
        print(f"3. No missing '{start_date_column}' values found (null values are not allowed).")
        return

    print(f"3. Rows with missing '{start_date_column}':")
    print(rows_without_date[['Anonymized Patient Identifier', start_date_column]])

# Drive the three validation steps for 'Start Date of Treatment'
expected_column_start_date = 'DiagStagingTreatmentOverview_DateOfRecord'

# Step 1: Resolve which dataset column should be validated as the start date
recognized_column_start_date = find_equivalent_column_start_date(dataset, expected_column_start_date)

# Nothing below runs unless step 1 returned a usable (truthy) column name
if recognized_column_start_date:
    # Step 2: Convert parseable strings and collect the rows that were flagged
    invalid_start_dates = convert_or_flag_start_dates(dataset, recognized_column_start_date)
    invalid_count_start_dates = len(invalid_start_dates)

    if invalid_count_start_dates == 0:
        print(f"2. No rows with invalid Start Date of Treatment values found.\n")
    else:
        # Summarise the flagged row labels first, then dump the full records without the index
        invalid_row_numbers = invalid_start_dates['Row'].tolist()
        invalid_row_numbers_str = ','.join(str(number) for number in invalid_row_numbers)

        print(f"2. {invalid_count_start_dates} rows have an invalid Start Date of Treatment:")
        print(f"   Row numbers: {invalid_row_numbers_str}\n")
        print(invalid_start_dates.to_string(index=False))

    # Step 3: Report rows where the column is empty
    validate_missing_start_dates(dataset, recognized_column_start_date)

1. Recognized 'Start Date of Treatment' as 'DiagStagingTreatmentOverview_DateOfRecord' for validation, but the column name is incorrect.
   The correct column name should be: 'DiagStagingTreatmentOverview_DateOfRecord'
2. No rows with invalid Start Date of Treatment values found.

3. No missing 'Start Date of Treatment' values found (null values are not allowed).


In [16]:
import pandas as pd

# Load the dataset: read the Excel workbook into `dataset`, the frame the
# end-date validation code later in this cell is run against
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Step 1: Recognize and validate the 'End Date of Treatment' column
def find_equivalent_column_end_date(data, expected_column):
    """Find the column to validate as the treatment end date.

    The expected name is checked first so that a dataset which already uses
    the correct column name is validated instead of being reported as a
    mismatch. Otherwise the first column (in dataset order) whose name is one
    of the known alternatives is used, and a naming warning is printed.

    Args:
        data: DataFrame whose column labels are inspected.
        expected_column: The column name the dataset is supposed to use.

    Returns:
        The name of the column to validate, or None when neither the expected
        name nor any alternative is present.
    """
    alternative_names = ["End Date of Treatment", "Treatment End Date"]  # Add more alternatives if needed

    # Exact match: the dataset is already named correctly, nothing to warn about
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be: '{expected_column}'")
            return col  # Return the recognized column

    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {list(data.columns)}")
    print(f"   The correct column name should be: '{expected_column}'")
    return None

# Step 2: Validate 'End Date of Treatment' values and set time to midnight if missing
def convert_or_flag_end_dates(data, end_date_column):
    """Validate the end-date column and normalise date-only strings in place.

    Each cell is handled according to its type:
      * ``str``: must match ``YYYY-MM-DD``; on success the cell in ``data`` is
        replaced by the parsed Timestamp (midnight), otherwise it is flagged.
      * date / datetime / Timestamp: accepted unchanged.
      * missing (None / NaN / NaT): skipped here.
      * anything else (e.g. a bare number): flagged, since it is not a date.

    Args:
        data: DataFrame to validate; parsed strings are written back into it.
        end_date_column: Name of the column holding the end dates.

    Returns:
        DataFrame with one record per flagged cell ('Row', 'Anonymized Patient
        Identifier', the date column, 'Error'); empty when nothing is flagged.
    """
    from datetime import date  # datetime.datetime and pd.Timestamp are both subclasses of date

    invalid_end_dates = []
    parsed_by_row = {}  # row label -> parsed Timestamp, written back after the loop

    for index, row in data.iterrows():
        end_date = row[end_date_column]

        if isinstance(end_date, str):
            try:
                # The date-only format yields a Timestamp at 00:00:00, so no time reset is needed
                parsed_by_row[index] = pd.to_datetime(end_date, format='%Y-%m-%d', errors='raise')
                continue
            except (ValueError, TypeError):
                pass  # unparseable string: fall through and flag it
        elif isinstance(end_date, date):
            continue  # already a real date value
        elif pd.api.types.is_scalar(end_date) and pd.isna(end_date):
            continue  # missing value, not a format problem

        invalid_end_dates.append({
            'Row': index,
            'Anonymized Patient Identifier': row['Anonymized Patient Identifier'],
            end_date_column: end_date,
            'Error': 'Invalid date format'
        })

    # Write back only after iteration so the frame is never modified while being iterated
    if parsed_by_row:
        # Object storage accepts a mix of Timestamps and leftover strings regardless of the original dtype
        data[end_date_column] = data[end_date_column].astype(object)
        for index, parsed_date in parsed_by_row.items():
            data.at[index, end_date_column] = parsed_date

    return pd.DataFrame(invalid_end_dates)

# Step 3: Check for missing 'End Date of Treatment' values
def validate_missing_end_dates(data, end_date_column):
    """Print a step-3 report on null cells in the end-date column.

    Args:
        data: DataFrame to inspect.
        end_date_column: Name of the column checked for nulls.

    Returns:
        None. Either the patient identifier and date column of every row with
        a null date are printed, or a single "no missing values" line.
    """
    null_mask = data[end_date_column].isna()
    rows_without_date = data[null_mask]

    # Guard clause: nothing missing, report that and stop
    if rows_without_date.empty:
        print(f"3. No missing '{end_date_column}' values found (null values are not allowed).")
        return

    print(f"3. Rows with missing '{end_date_column}':")
    print(rows_without_date[['Anonymized Patient Identifier', end_date_column]])

# Drive the three validation steps for 'End Date of Treatment'
expected_column_end_date = 'PatientTreatmentOutcome_DateOfRecord'

# Step 1: Resolve which dataset column should be validated as the end date
recognized_column_end_date = find_equivalent_column_end_date(dataset, expected_column_end_date)

# Nothing below runs unless step 1 returned a usable (truthy) column name
if recognized_column_end_date:
    # Step 2: Convert parseable strings and collect the rows that were flagged
    invalid_end_dates = convert_or_flag_end_dates(dataset, recognized_column_end_date)
    invalid_count_end_dates = len(invalid_end_dates)

    if invalid_count_end_dates == 0:
        print(f"2. No rows with invalid End Date of Treatment values found.\n")
    else:
        # Summarise the flagged row labels first, then dump the full records without the index
        invalid_row_numbers = invalid_end_dates['Row'].tolist()
        invalid_row_numbers_str = ','.join(str(number) for number in invalid_row_numbers)

        print(f"2. {invalid_count_end_dates} rows have an invalid End Date of Treatment:")
        print(f"   Row numbers: {invalid_row_numbers_str}\n")
        print(invalid_end_dates.to_string(index=False))

    # Step 3: Report rows where the column is empty
    validate_missing_end_dates(dataset, recognized_column_end_date)

1. Recognized 'End Date of Treatment' as 'PatientTreatmentOutcome_DateOfRecord' for validation, but the column name is incorrect.
   The correct column name should be: 'PatientTreatmentOutcome_DateOfRecord'
2. No rows with invalid End Date of Treatment values found.

3. No missing 'End Date of Treatment' values found (null values are not allowed).


In [54]:
import pandas as pd

# Load the dataset: read the Excel workbook into the module-level `dataset`
# that main() in this cell passes to every validation helper
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Function to recognize and validate column names with alternatives
def find_equivalent_column(data, expected_column, alternative_names):
    """Find the dataset column to validate in place of ``expected_column``.

    The expected name is checked first so that a dataset which already uses
    the correct column name is validated instead of being reported as a
    mismatch. Otherwise the first column (in dataset order) whose name is one
    of ``alternative_names`` is used, and a naming warning is printed.

    Args:
        data: DataFrame whose column labels are inspected.
        expected_column: The column name the dataset is supposed to use.
        alternative_names: Collection of other names accepted for the column.

    Returns:
        The name of the column to validate, or None when neither the expected
        name nor any alternative is present.
    """
    # Exact match: the dataset is already named correctly, nothing to warn about
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be '{expected_column}'")
            return col

    print(f"1. Column name mismatch: Expected '{expected_column}', but not found.")
    print(f"   Found columns: {list(data.columns)}")
    return None

# Function to validate a column's values against allowed values
def validate_column_values(data, column, valid_values, error_message):
    """Print a step-2 report of cells in ``column`` that are not allowed values.

    Null cells are ignored; every other cell must be a member of
    ``valid_values``.

    Args:
        data: DataFrame to inspect; must contain 'Anonymized Patient Identifier'
            for any row that ends up being reported.
        column: Name of the column whose values are checked.
        valid_values: Collection of accepted values.
        error_message: Text stored in the 'Error' field of each reported row.

    Returns:
        None. The findings are printed, not returned.
    """
    invalid_values = []
    for row_label, record in data.iterrows():
        cell = record[column]
        # Skip empty cells and anything that is on the allow-list
        if pd.isna(cell) or cell in valid_values:
            continue
        invalid_values.append({
            'Row': row_label,
            'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
            column: cell,
            'Error': error_message
        })

    if not invalid_values:
        print(f"2. No rows with invalid '{column}' values found.\n")
        return

    print(f"2. {len(invalid_values)} rows have an invalid '{column}' value:")
    print(f"   Row numbers: {', '.join(str(val['Row']) for val in invalid_values)}\n")
    print(pd.DataFrame(invalid_values).to_string(index=False))

# Function to flag invalid Pathology Molecular Test Results (must be a string)
def convert_or_flag_pathology_test_result_values(data, column):
    """Collect the rows whose test-result cell is present but not a string.

    No conversion is performed; the frame is only read.

    Args:
        data: DataFrame to inspect; must contain 'Anonymized Patient Identifier'
            for any row that ends up being flagged.
        column: Name of the column holding the test results.

    Returns:
        DataFrame with one record per flagged cell ('Row', 'Anonymized Patient
        Identifier', the result column, 'Error'); empty when nothing is flagged.
    """
    flagged_rows = []
    for row_label, record in data.iterrows():
        value = record[column]
        # Empty cells and genuine strings are both acceptable
        if pd.isna(value) or isinstance(value, str):
            continue
        flagged_rows.append({
            'Row': row_label,
            'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
            column: value,
            'Error': 'Invalid format for Pathology Molecular Test Result'
        })

    return pd.DataFrame(flagged_rows)

# Function to validate 'Treatment Outcome'
def validate_treatment_outcome_values(data, outcome_column):
    """Collect the rows whose treatment outcome is not an accepted value.

    Null cells are ignored; every other cell must match one of the accepted
    outcome labels exactly.

    Args:
        data: DataFrame to inspect; must contain 'Anonymized Patient Identifier'
            for any row that ends up being flagged.
        outcome_column: Name of the column holding the treatment outcome.

    Returns:
        DataFrame with one record per flagged cell ('Row', 'Anonymized Patient
        Identifier', the outcome column, 'Error'); empty when nothing is flagged.
    """
    valid_values = (
        "Under Treatment",
        "No evidence of disease (NED)",
        "Stable Disease",
        "Partial Response",
        "Progressive Disease",
        "Indeterminate (possible pseudo-progression)",
        "Complete Response",
        "Biochemical Recurrence",
        "Primary Recurrence",
        "Local Recurrence",
        "Nodal Recurrence",
        "Distant Recurrence",
        "(+Other)",
    )

    flagged_rows = []
    for row_label, record in data.iterrows():
        outcome = record[outcome_column]
        # Skip empty cells and recognised outcome labels
        if pd.isna(outcome) or outcome in valid_values:
            continue
        flagged_rows.append({
            'Row': row_label,
            'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
            outcome_column: outcome,
            'Error': 'Invalid Treatment Outcome value'
        })

    return pd.DataFrame(flagged_rows)

# Function to validate 'Start Date of Treatment' values
def convert_or_flag_start_dates(data, start_date_column):
    """Validate the start-date column and normalise date-only strings in place.

    Each cell is handled according to its type:
      * ``str``: must match ``YYYY-MM-DD``; on success the cell in ``data`` is
        replaced by the parsed Timestamp (midnight), otherwise it is flagged.
      * date / datetime / Timestamp: accepted unchanged.
      * missing (None / NaN / NaT): skipped here.
      * anything else (e.g. a bare number): flagged, since it is not a date.

    Args:
        data: DataFrame to validate; parsed strings are written back into it.
        start_date_column: Name of the column holding the start dates.

    Returns:
        DataFrame with one record per flagged cell ('Row', 'Anonymized Patient
        Identifier', the date column, 'Error'); empty when nothing is flagged.
    """
    from datetime import date  # datetime.datetime and pd.Timestamp are both subclasses of date

    invalid_start_dates = []
    parsed_by_row = {}  # row label -> parsed Timestamp, written back after the loop

    for index, row in data.iterrows():
        start_date = row[start_date_column]

        if isinstance(start_date, str):
            try:
                # The date-only format yields a Timestamp at 00:00:00, so no time reset is needed
                parsed_by_row[index] = pd.to_datetime(start_date, format='%Y-%m-%d', errors='raise')
                continue
            except (ValueError, TypeError):
                pass  # unparseable string: fall through and flag it
        elif isinstance(start_date, date):
            continue  # already a real date value
        elif pd.api.types.is_scalar(start_date) and pd.isna(start_date):
            continue  # missing value, not a format problem

        invalid_start_dates.append({
            'Row': index,
            'Anonymized Patient Identifier': row['Anonymized Patient Identifier'],
            start_date_column: start_date,
            'Error': 'Invalid date format'
        })

    # Write back only after iteration so the frame is never modified while being iterated
    if parsed_by_row:
        # Object storage accepts a mix of Timestamps and leftover strings regardless of the original dtype
        data[start_date_column] = data[start_date_column].astype(object)
        for index, parsed_date in parsed_by_row.items():
            data.at[index, start_date_column] = parsed_date

    return pd.DataFrame(invalid_start_dates)

# Function to check for missing values in a specific column
def validate_missing_values(data, column, allow_null=True):
    """Print a step-3 report on null cells in ``column``.

    Args:
        data: DataFrame to inspect.
        column: Name of the column checked for nulls.
        allow_null: Only affects wording: when truthy, the "no missing values"
            line carries a "(null values are allowed)" note.

    Returns:
        None. Either the patient identifier and ``column`` of every row with a
        null cell are printed (without the index), or a single summary line.
    """
    null_mask = data[column].isna()
    rows_with_gaps = data[null_mask]

    # Guard clause: nothing missing, print the summary line and stop
    if rows_with_gaps.empty:
        if allow_null:
            null_msg = " (null values are allowed)"
        else:
            null_msg = ""
        print(f"3. No missing '{column}' values found{null_msg}.\n")
        return

    print(f"3. Rows with missing '{column}':")
    print(rows_with_gaps[['Anonymized Patient Identifier', column]].to_string(index=False))

# Step 1: Recognize and validate the 'End Date of Treatment' column
def find_equivalent_column_end_date(data, expected_column):
    """Find the column to validate as the treatment end date.

    The expected name is checked first so that a dataset which already uses
    the correct column name is validated instead of being reported as a
    mismatch. Otherwise the first column (in dataset order) whose name is one
    of the known alternatives is used, and a naming warning is printed.

    Args:
        data: DataFrame whose column labels are inspected.
        expected_column: The column name the dataset is supposed to use.

    Returns:
        The name of the column to validate, or None when neither the expected
        name nor any alternative is present.
    """
    alternative_names = ["End Date of Treatment", "Treatment End Date"]  # Add more alternatives if needed

    # Exact match: the dataset is already named correctly, nothing to warn about
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be: '{expected_column}'")
            return col  # Return the recognized column

    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {list(data.columns)}")
    print(f"   The correct column name should be: '{expected_column}'")
    return None

# Step 2: Validate 'End Date of Treatment' values and set time to midnight if missing
def convert_or_flag_end_dates(data, end_date_column):
    """Validate the end-date column and normalise date-only strings in place.

    Each cell is handled according to its type:
      * ``str``: must match ``YYYY-MM-DD``; on success the cell in ``data`` is
        replaced by the parsed Timestamp (midnight), otherwise it is flagged.
      * date / datetime / Timestamp: accepted unchanged.
      * missing (None / NaN / NaT): skipped here.
      * anything else (e.g. a bare number): flagged, since it is not a date.

    Args:
        data: DataFrame to validate; parsed strings are written back into it.
        end_date_column: Name of the column holding the end dates.

    Returns:
        DataFrame with one record per flagged cell ('Row', 'Anonymized Patient
        Identifier', the date column, 'Error'); empty when nothing is flagged.
    """
    from datetime import date  # datetime.datetime and pd.Timestamp are both subclasses of date

    invalid_end_dates = []
    parsed_by_row = {}  # row label -> parsed Timestamp, written back after the loop

    for index, row in data.iterrows():
        end_date = row[end_date_column]

        if isinstance(end_date, str):
            try:
                # The date-only format yields a Timestamp at 00:00:00, so no time reset is needed
                parsed_by_row[index] = pd.to_datetime(end_date, format='%Y-%m-%d', errors='raise')
                continue
            except (ValueError, TypeError):
                pass  # unparseable string: fall through and flag it
        elif isinstance(end_date, date):
            continue  # already a real date value
        elif pd.api.types.is_scalar(end_date) and pd.isna(end_date):
            continue  # missing value, not a format problem

        invalid_end_dates.append({
            'Row': index,
            'Anonymized Patient Identifier': row['Anonymized Patient Identifier'],
            end_date_column: end_date,
            'Error': 'Invalid date format'
        })

    # Write back only after iteration so the frame is never modified while being iterated
    if parsed_by_row:
        # Object storage accepts a mix of Timestamps and leftover strings regardless of the original dtype
        data[end_date_column] = data[end_date_column].astype(object)
        for index, parsed_date in parsed_by_row.items():
            data.at[index, end_date_column] = parsed_date

    return pd.DataFrame(invalid_end_dates)

# Step 3: Check for missing 'End Date of Treatment' values
def validate_missing_end_dates(data, end_date_column):
    """Print a step-3 report on null cells in the end-date column.

    Args:
        data: DataFrame to inspect.
        end_date_column: Name of the column checked for nulls.

    Returns:
        None. Either the patient identifier and date column of every row with
        a null date are printed, or a single "no missing values" line.
    """
    has_no_end_date = data[end_date_column].isna()
    incomplete_rows = data[has_no_end_date]

    if incomplete_rows.empty:
        # Nothing missing: one summary line is the whole report
        print(f"3. No missing '{end_date_column}' values found (null values are not allowed).")
        return

    print(f"3. Rows with missing '{end_date_column}':")
    print(incomplete_rows[['Anonymized Patient Identifier', end_date_column]])

# Main function to execute all validations
def _report_invalid_rows(invalid_rows, found_label, none_label):
    """Print the step-2 summary for a DataFrame of flagged rows.

    Args:
        invalid_rows: DataFrame of flagged records; needs a 'Row' column when
            it is not empty.
        found_label: Wording used after "rows have an invalid".
        none_label: Wording used after "No rows with invalid".
    """
    if invalid_rows.empty:
        print(f"2. No rows with invalid {none_label} found.\n")
        return

    print(f"2. {len(invalid_rows)} rows have an invalid {found_label}:")
    invalid_row_numbers = ', '.join(map(str, invalid_rows['Row'].tolist()))
    print(f"   Row numbers: {invalid_row_numbers}\n")
    print(invalid_rows.to_string(index=False))


def main(data=None):
    """Run every column validation and print a three-step report for each.

    For each expected column the report is: (1) column-name recognition,
    (2) invalid values, (3) missing values. Steps 2 and 3 are skipped when no
    matching column is found in step 1.

    Args:
        data: DataFrame to validate. Defaults to the module-level ``dataset``
            so that existing ``main()`` calls keep working.

    Returns:
        None. All findings are printed.
    """
    if data is None:
        data = dataset

    # Columns restricted to a fixed vocabulary:
    # (expected name, accepted alternative names, allowed values, error text)
    categorical_checks = [
        (
            'DiagStaging_StagingTCategory',
            ["Staging-T Category", "T Category"],
            ["Clinical", "Pathologic", "Radiographic"],
            'Invalid Staging-T Category',
        ),
        (
            'DiagStaging_StagingT',
            ["Staging-T", "T Staging", "Tumor Staging"],
            ["T1a", "T1b", "T1c", "T2a", "T2b", "T2c", "T3a", "T3b", "T4",
             "Tx (only if prostate has been removed)", "Information Not Available"],
            'Invalid Staging-T Value',
        ),
        (
            'DiagStaging_StagingN',
            ["Staging-N", "Staging N"],
            ["N0", "N1", "Nx", "Information Not Available"],
            'Invalid Staging-N Value',
        ),
        (
            'DiagStaging_StagingM',
            ["Staging-M", "Staging M"],
            ["M0", "M1a", "M1b", "M1c", "Mx", "Information Not Available"],
            'Invalid Staging-M Value',
        ),
        (
            'DiagStaging_ImagingBiopsy',
            ["Imaging for Biopsy", "Biopsy Imaging"],
            ["Ultrasound", "Magnetic Resonance Imaging", "Computed Tomography"],
            'Invalid Imaging for Biopsy value',
        ),
        (
            'DiagStaging_PathologyMolecularTestName',
            ["Pathology Molecular Test Name", "Molecular Test Name"],
            ["DECIPHER", "Prolaris", "Oncotype DX Genomic Prostate Score",
             "ProMark", "(+ Other)"],
            'Invalid Pathology Molecular Test Name',
        ),
    ]

    for expected_column, alternative_names, allowed_values, error_message in categorical_checks:
        print(f"Validating for {expected_column}")
        recognized_column = find_equivalent_column(data, expected_column, alternative_names)
        if recognized_column:
            validate_column_values(data, recognized_column, allowed_values, error_message)
            validate_missing_values(data, recognized_column)

    # Columns with a dedicated validator that returns the flagged rows:
    # (expected name, alternatives, validator, "found" wording, "none found"
    #  wording, missing-value checker, extra keyword arguments for that checker)
    custom_checks = [
        (
            'DiagStaging_PathologyMolecularTestResult',
            ["Pathology Molecular Test Result", "Molecular Test Result"],
            convert_or_flag_pathology_test_result_values,
            "Pathology Molecular Test Result value",
            "Pathology Molecular Test Result values",
            validate_missing_values, {},
        ),
        (
            'PatientTreatmentOutcome_DiseaseStatus',
            ["Treatment Outcome", "Outcome"],
            validate_treatment_outcome_values,
            "Treatment Outcome value",
            "Treatment Outcome values",
            validate_missing_values, {'allow_null': False},
        ),
        (
            'DiagStagingTreatmentOverview_DateOfRecord',
            ["Start Date of Treatment", "Treatment Start Date"],
            convert_or_flag_start_dates,
            "Start Date of Treatment",
            "Start Date of Treatment values",
            validate_missing_values, {'allow_null': False},
        ),
        (
            # NOTE(review): the end date is the only column still using its dedicated
            # missing-value checker; kept as-is so the printed report is unchanged.
            'PatientTreatmentOutcome_DateOfRecord',
            ["End Date of Treatment", "Treatment End Date"],
            convert_or_flag_end_dates,
            "End Date of Treatment",
            "End Date of Treatment values",
            validate_missing_end_dates, {},
        ),
    ]

    for (expected_column, alternative_names, find_invalid, found_label,
         none_label, check_missing, missing_kwargs) in custom_checks:
        print(f"Validating for {expected_column}")
        recognized_column = find_equivalent_column(data, expected_column, alternative_names)
        if recognized_column:
            _report_invalid_rows(find_invalid(data, recognized_column), found_label, none_label)
            check_missing(data, recognized_column, **missing_kwargs)

# Execute the main function
# (the guard passes only when this code runs as the top-level `__main__` module)
if __name__ == "__main__":
    main()

Validating for DiagStaging_StagingTCategory
1. Recognized 'Staging-T Category' as 'DiagStaging_StagingTCategory' for validation, but the column name is incorrect.
   The correct column name should be 'DiagStaging_StagingTCategory'
2. No rows with invalid 'Staging-T Category' values found.

3. No missing 'Staging-T Category' values found (null values are allowed).

Validating for DiagStaging_StagingT
1. Recognized 'Staging-T' as 'DiagStaging_StagingT' for validation, but the column name is incorrect.
   The correct column name should be 'DiagStaging_StagingT'
2. No rows with invalid 'Staging-T' values found.

3. No missing 'Staging-T' values found (null values are allowed).

Validating for DiagStaging_StagingN
1. Recognized 'Staging-N' as 'DiagStaging_StagingN' for validation, but the column name is incorrect.
   The correct column name should be 'DiagStaging_StagingN'
2. No rows with invalid 'Staging-N' values found.

3. No missing 'Staging-N' values found (null values are allowed).

V

In [4]:
import pandas as pd

# Load the dataset: read the Excel workbook into the module-level `dataset`
# that main() in this cell passes to the validation helpers
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Function to recognize and validate column names with alternatives
def find_equivalent_column(data, expected_column, alternative_names):
    """Find the dataset column to validate in place of ``expected_column``.

    The expected name is checked first so that a dataset which already uses
    the correct column name is validated instead of being reported as a
    mismatch. Otherwise the first column (in dataset order) whose name is one
    of ``alternative_names`` is used, and a naming warning is printed.

    Args:
        data: DataFrame whose column labels are inspected.
        expected_column: The column name the dataset is supposed to use.
        alternative_names: Collection of other names accepted for the column.

    Returns:
        The name of the column to validate, or None when neither the expected
        name nor any alternative is present.
    """
    # Exact match: the dataset is already named correctly, nothing to warn about
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be '{expected_column}'")
            return col

    print(f"1. Column name mismatch: Expected '{expected_column}', but not found.")
    print(f"   Found columns: {list(data.columns)}")
    return None

# Function to validate a column's values against allowed values
def validate_column_values(data, column, valid_values, error_message):
    """Print a step-2 report of cells in ``column`` that are not allowed values.

    Null cells are ignored; every other cell must be a member of
    ``valid_values``.

    Args:
        data: DataFrame to inspect; must contain 'Anonymized Patient Identifier'
            for any row that ends up being reported.
        column: Name of the column whose values are checked.
        valid_values: Collection of accepted values.
        error_message: Text stored in the 'Error' field of each reported row.

    Returns:
        None. The findings are printed, not returned.
    """
    def is_rejected(cell):
        # A cell is rejected only when it holds something and that something is not allowed
        return pd.notna(cell) and cell not in valid_values

    invalid_values = []
    for row_label, record in data.iterrows():
        if is_rejected(record[column]):
            invalid_values.append({
                'Row': row_label,
                'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
                column: record[column],
                'Error': error_message
            })

    if len(invalid_values) == 0:
        print(f"2. No rows with invalid '{column}' values found.\n")
    else:
        print(f"2. {len(invalid_values)} rows have an invalid '{column}' value:")
        print(f"   Row numbers: {', '.join(str(val['Row']) for val in invalid_values)}\n")
        print(pd.DataFrame(invalid_values).to_string(index=False))

# Function to validate 'Treatment Outcome'
def validate_treatment_outcome_values(data, outcome_column):
    """Collect the rows whose treatment outcome is not an accepted value.

    Null cells are ignored; every other cell must match one of the accepted
    outcome labels exactly. The 'Error' text of each flagged row spells out
    the accepted labels.

    Args:
        data: DataFrame to inspect; must contain 'Anonymized Patient Identifier'
            for any row that ends up being flagged.
        outcome_column: Name of the column holding the treatment outcome.

    Returns:
        DataFrame with one record per flagged cell ('Row', 'Anonymized Patient
        Identifier', the outcome column, 'Error'); empty when nothing is flagged.
    """
    valid_values = (
        "Under Treatment",
        "No evidence of disease (NED)",
        "Stable Disease",
        "Partial Response",
        "Progressive Disease",
        "Indeterminate (possible pseudo-progression)",
        "Complete Response",
        "Biochemical Recurrence",
        "Primary Recurrence",
        "Local Recurrence",
        "Nodal Recurrence",
        "Distant Recurrence",
        "(+Other)",
    )
    # Same message for every flagged row, so build it once
    error_text = (
        'Invalid Treatment Outcome value. Refer to the list of valid options: '
        'Under Treatment, No evidence of disease (NED), Stable Disease, Partial Response, '
        'Progressive Disease, Indeterminate (possible pseudo-progression), Complete Response, '
        'Biochemical Recurrence, Primary Recurrence, Local Recurrence, '
        'Nodal Recurrence, Distant Recurrence, (+Other).'
    )

    flagged_rows = []
    for row_label, record in data.iterrows():
        outcome = record[outcome_column]
        # Skip empty cells and recognised outcome labels
        if pd.isna(outcome) or outcome in valid_values:
            continue
        flagged_rows.append({
            'Row': row_label,
            'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
            outcome_column: outcome,
            'Error': error_text
        })

    return pd.DataFrame(flagged_rows)

# Function to check for missing values in a specific column
def validate_missing_values(data, column, allow_null=True):
    """Print a step-3 report on null cells in ``column``.

    Args:
        data: DataFrame to inspect.
        column: Name of the column checked for nulls.
        allow_null: Only affects wording: when truthy, the "no missing values"
            line carries a "(null values are allowed)" note.

    Returns:
        None. Either the patient identifier and ``column`` of every row with a
        null cell are printed (without the index), or a single summary line.
    """
    is_blank = data[column].isna()
    blank_rows = data[is_blank]

    if blank_rows.empty:
        # The note is appended only when nulls are permitted for this column
        null_msg = ""
        if allow_null:
            null_msg = " (null values are allowed)"
        print(f"3. No missing '{column}' values found{null_msg}.\n")
        return

    print(f"3. Rows with missing '{column}':")
    report_columns = ['Anonymized Patient Identifier', column]
    print(blank_rows[report_columns].to_string(index=False))

# Main function to execute all validations
def main():
    """Validate the treatment-outcome column of ``dataset`` and print the report.

    The report has three steps: column-name recognition, invalid values, and
    missing values. The last two are skipped when no matching column is found.
    Reads the module-level ``dataset``; returns None.
    """
    print("Validating for PatientTreatmentOutcome_DiseaseStatus")
    outcome_column = find_equivalent_column(
        dataset, 'PatientTreatmentOutcome_DiseaseStatus', ["Treatment Outcome", "Outcome"]
    )
    # Without a recognised column there is nothing further to validate
    if not outcome_column:
        return

    # Step 2: values that are not on the accepted-outcome list
    invalid_outcomes = validate_treatment_outcome_values(dataset, outcome_column)
    if invalid_outcomes.empty:
        print("2. No rows with invalid Treatment Outcome values found.\n")
    else:
        print(f"2. {len(invalid_outcomes)} rows have an invalid Treatment Outcome value:")
        invalid_row_numbers = ', '.join(str(number) for number in invalid_outcomes['Row'].tolist())
        print(f"   Row numbers: {invalid_row_numbers}\n")
        print(invalid_outcomes.to_string(index=False))

    # Step 3: empty cells in the outcome column
    validate_missing_values(dataset, outcome_column, allow_null=False)

# Execute the main function
# (the guard passes only when this code runs as the top-level `__main__` module)
if __name__ == "__main__":
    main()

Validating for PatientTreatmentOutcome_DiseaseStatus
1. Recognized 'Treatment Outcome' as 'PatientTreatmentOutcome_DiseaseStatus' for validation, but the column name is incorrect.
   The correct column name should be 'PatientTreatmentOutcome_DiseaseStatus'
2. 6 rows have an invalid Treatment Outcome value:
   Row numbers: 48, 124, 141, 202, 237, 262

 Row Anonymized Patient Identifier Treatment Outcome                                                                                                                                                                                                                                                                                                                                                    Error
  48                     Anon58131     Stble Disease Invalid Treatment Outcome value. Refer to the list of valid options: Under Treatment, No evidence of disease (NED), Stable Disease, Partial Response, Progressive Disease, Indeterminate (possible p