In [None]:
import pandas as pd

# Load the two Excel workbooks that main() validates.
# NOTE: this runs at module level, so both files are read as soon as the
# module is executed or imported; the paths are relative to the current
# working directory.
dataset_1 = pd.read_excel('Hospital Patient Dataset Validata.xlsx')
dataset_2 = pd.read_excel('Larger Dataset.xlsx')

# Function to validate a single dataset
def validate_dataset(dataset, dataset_name):
    """Run every column check against one dataset and print the report.

    Each expected column produces up to three numbered report parts:
    1. column-name recognition, 2. invalid values, 3. missing values.
    Parts 2 and 3 are skipped when the column cannot be located.

    Args:
        dataset: DataFrame to validate. The start/end date checks may
            overwrite string cells with parsed timestamps.
        dataset_name: Label printed in the report header.
    """
    print(f"\n=== Validating Dataset: {dataset_name} ===")

    # Columns checked against a fixed vocabulary, in report order:
    # (expected name, accepted alternative headers, allowed values, error label)
    vocabulary_checks = (
        (
            'DiagStaging_StagingTCategory',
            ["Staging-T Category", "T Category"],
            ["Clinical", "Pathologic", "Radiographic"],
            'Invalid Staging-T Category',
        ),
        (
            'DiagStaging_StagingT',
            ["Staging-T", "T Staging", "Tumor Staging"],
            ["T1a", "T1b", "T1c", "T2a", "T2b", "T2c", "T3a", "T3b", "T4",
             "Tx (only if prostate has been removed)", "Information Not Available"],
            'Invalid Staging-T Value',
        ),
        (
            'DiagStaging_StagingN',
            ["Staging-N", "Staging N"],
            ["N0", "N1", "Nx", "Information Not Available"],
            'Invalid Staging-N Value',
        ),
        (
            'DiagStaging_StagingM',
            ["Staging-M", "Staging M"],
            ["M0", "M1a", "M1b", "M1c", "Mx", "Information Not Available"],
            'Invalid Staging-M Value',
        ),
        (
            'DiagStaging_ImagingBiopsy',
            ["Imaging for Biopsy", "Biopsy Imaging"],
            ["Ultrasound", "Magnetic Resonance Imaging", "Computed Tomography"],
            'Invalid Imaging for Biopsy value',
        ),
        (
            'DiagStaging_PathologyMolecularTestName',
            ["Pathology Molecular Test Name", "Molecular Test Name"],
            ["DECIPHER", "Prolaris", "Oncotype DX Genomic Prostate Score",
             "ProMark", "(+ Other)"],
            'Invalid Pathology Molecular Test Name',
        ),
    )
    for expected, aliases, allowed, error_label in vocabulary_checks:
        print(f"Validating for {expected}")
        found = find_equivalent_column(dataset, expected, aliases)
        if found:
            validate_column_values(dataset, found, allowed, error_label)
            validate_missing_values(dataset, found)

    # Pathology Molecular Test Result: flagged by a type check, nulls allowed.
    print("Validating for DiagStaging_PathologyMolecularTestResult")
    result_col = find_equivalent_column(
        dataset,
        'DiagStaging_PathologyMolecularTestResult',
        ["Pathology Molecular Test Result", "Molecular Test Result"],
    )
    if result_col:
        _print_flagged_rows(
            convert_or_flag_pathology_test_result_values(dataset, result_col),
            "Pathology Molecular Test Result value",
            "Pathology Molecular Test Result values",
        )
        validate_missing_values(dataset, result_col)

    # Treatment Outcome: fixed vocabulary, reported as a required column.
    print("Validating for PatientTreatmentOutcome_DiseaseStatus")
    outcome_col = find_equivalent_column(
        dataset,
        'PatientTreatmentOutcome_DiseaseStatus',
        ["Treatment Outcome", "Outcome"],
    )
    if outcome_col:
        _print_flagged_rows(
            validate_treatment_outcome_values(dataset, outcome_col),
            "Treatment Outcome value",
            "Treatment Outcome values",
        )
        validate_missing_values(dataset, outcome_col, allow_null=False)

    # Start Date of Treatment: date parsing, reported as a required column.
    print("Validating for DiagStagingTreatmentOverview_DateOfRecord")
    start_col = find_equivalent_column(
        dataset,
        'DiagStagingTreatmentOverview_DateOfRecord',
        ["Start Date of Treatment", "Treatment Start Date"],
    )
    if start_col:
        _print_flagged_rows(
            convert_or_flag_start_dates(dataset, start_col),
            "Start Date of Treatment",
            "Start Date of Treatment values",
        )
        validate_missing_values(dataset, start_col, allow_null=False)

    # End Date of Treatment: date parsing, with its own missing-value report.
    print("Validating for PatientTreatmentOutcome_DateOfRecord")
    end_col = find_equivalent_column(
        dataset,
        'PatientTreatmentOutcome_DateOfRecord',
        ["End Date of Treatment", "Treatment End Date"],
    )
    if end_col:
        _print_flagged_rows(
            convert_or_flag_end_dates(dataset, end_col),
            "End Date of Treatment",
            "End Date of Treatment values",
        )
        validate_missing_end_dates(dataset, end_col)


def _print_flagged_rows(flagged, headline_subject, all_clear_subject):
    """Print part 2 of a column report from a frame of flagged rows.

    Args:
        flagged: DataFrame of offending rows; must have a 'Row' column
            when it is not empty.
        headline_subject: Text completing "rows have an invalid ...".
        all_clear_subject: Text completing "No rows with invalid ...".
    """
    if flagged.empty:
        print(f"2. No rows with invalid {all_clear_subject} found.\n")
        return
    print(f"2. {len(flagged)} rows have an invalid {headline_subject}:")
    row_numbers = ', '.join(str(number) for number in flagged['Row'].tolist())
    print(f"   Row numbers: {row_numbers}\n")
    print(flagged.to_string(index=False))

# Function to recognize and validate column names with alternatives
def find_equivalent_column(data, expected_column, alternative_names):
    """Locate the column that holds ``expected_column`` data and report on its name.

    The canonical name is looked for first, so a correctly named column is
    validated instead of being reported as a mismatch. Otherwise the first
    column (in frame order) whose header appears in ``alternative_names`` is
    accepted, with a notice that it should be renamed.

    Args:
        data: DataFrame whose column headers are searched.
        expected_column: The canonical column name.
        alternative_names: Other headers accepted as the same column.

    Returns:
        The matching column label, or None when neither the canonical name
        nor any alternative is present.
    """
    columns = list(data.columns)

    # A dataset that already uses the canonical header must still be validated.
    if expected_column in columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    for col in columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be '{expected_column}'")
            return col

    print(f"1. Column name mismatch: Expected '{expected_column}', but not found.")
    print(f"   Found columns: {columns}")
    return None

# Function to validate a column's values against allowed values
def validate_column_values(data, column, valid_values, error_message):
    """Print part 2 of a column report: cells outside the allowed vocabulary.

    Missing cells are ignored here. Every other cell whose value is not in
    ``valid_values`` is listed with its row label and patient identifier.

    Args:
        data: DataFrame being validated; must contain an
            'Anonymized Patient Identifier' column when any cell is invalid.
        column: Label of the column to check.
        valid_values: Collection of accepted values.
        error_message: Text placed in the 'Error' field of each listed row.
    """
    offenders = []
    for row_label, record in data.iterrows():
        cell = record[column]
        if pd.isna(cell) or cell in valid_values:
            continue
        offenders.append({
            'Row': row_label,
            'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
            column: cell,
            'Error': error_message,
        })

    if not offenders:
        print(f"2. No rows with invalid '{column}' values found.\n")
        return

    row_numbers = ', '.join(str(entry['Row']) for entry in offenders)
    print(f"2. {len(offenders)} rows have an invalid '{column}' value:")
    print(f"   Row numbers: {row_numbers}\n")
    print(pd.DataFrame(offenders).to_string(index=False))

# Function to flag invalid Pathology Molecular Test Results (must be a string)
def convert_or_flag_pathology_test_result_values(data, column):
    """Collect rows whose test-result cell is present but not a string.

    Despite the name, nothing is converted: the frame is only inspected.

    Args:
        data: DataFrame being validated; must contain an
            'Anonymized Patient Identifier' column when any cell is flagged.
        column: Label of the test-result column.

    Returns:
        DataFrame with one record per flagged row ('Row', patient
        identifier, the offending value and an 'Error' text); empty when
        nothing is flagged.
    """
    flagged = [
        {
            'Row': row_label,
            'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
            column: record[column],
            'Error': 'Invalid format for Pathology Molecular Test Result',
        }
        for row_label, record in data.iterrows()
        # Keep only cells that are neither missing nor text.
        if not (pd.isna(record[column]) or isinstance(record[column], str))
    ]
    return pd.DataFrame(flagged)

# Function to validate 'Treatment Outcome'
def validate_treatment_outcome_values(data, outcome_column):
    """Collect rows whose treatment outcome is not an accepted status.

    Missing cells are skipped; they are not treated as invalid here.

    Args:
        data: DataFrame being validated; must contain an
            'Anonymized Patient Identifier' column when any cell is flagged.
        outcome_column: Label of the treatment-outcome column.

    Returns:
        DataFrame with one record per flagged row ('Row', patient
        identifier, the offending value and an 'Error' text); empty when
        every present value is accepted.
    """
    accepted = (
        "Under Treatment",
        "No evidence of disease (NED)",
        "Stable Disease",
        "Partial Response",
        "Progressive Disease",
        "Indeterminate (possible pseudo-progression)",
        "Complete Response",
        "Biochemical Recurrence",
        "Primary Recurrence",
        "Local Recurrence",
        "Nodal Recurrence",
        "Distant Recurrence",
        "(+Other)",
    )

    flagged = []
    for row_label, record in data.iterrows():
        outcome = record[outcome_column]
        if pd.isna(outcome) or outcome in accepted:
            continue
        flagged.append({
            'Row': row_label,
            'Anonymized Patient Identifier': record['Anonymized Patient Identifier'],
            outcome_column: outcome,
            'Error': 'Invalid Treatment Outcome value',
        })

    return pd.DataFrame(flagged)

# Function to validate 'Start Date of Treatment' values
def convert_or_flag_start_dates(data, start_date_column):
    """Normalise string start dates in place and collect unusable values.

    Strings in ``YYYY-MM-DD`` form are replaced in ``data`` by the parsed
    timestamp at midnight. A row is flagged when its value is a string that
    cannot be parsed that way, or a non-missing value that is neither a
    string nor a date/datetime object (for example a bare number).
    Missing cells are left for the missing-value check.

    Args:
        data: DataFrame being validated; mutated in place for parsable
            strings. Must contain an 'Anonymized Patient Identifier' column
            when any row is flagged.
        start_date_column: Label of the start-date column.

    Returns:
        DataFrame with one record per flagged row ('Row', patient
        identifier, the offending value and an 'Error' text); empty when
        nothing is flagged.
    """
    import datetime  # function-scope import keeps this block self-contained

    invalid_start_dates = []

    for index, row in data.iterrows():
        start_date = row[start_date_column]

        if isinstance(start_date, str):
            try:
                parsed_date = pd.to_datetime(start_date, format='%Y-%m-%d', errors='raise')
                data.at[index, start_date_column] = parsed_date.replace(hour=0, minute=0, second=0)
            except Exception:
                # Any failure to parse or store the value marks the row invalid.
                is_invalid = True
            else:
                is_invalid = False
        else:
            # pd.Timestamp and datetime.datetime are both datetime.date
            # subclasses, so genuine date cells pass; other non-missing
            # scalars cannot be a date.
            is_invalid = bool(pd.notna(start_date)) and not isinstance(start_date, datetime.date)

        if is_invalid:
            invalid_start_dates.append({
                'Row': index,
                'Anonymized Patient Identifier': row['Anonymized Patient Identifier'],
                start_date_column: start_date,
                'Error': 'Invalid date format'
            })

    return pd.DataFrame(invalid_start_dates)

# Function to check for missing values in a specific column
def validate_missing_values(data, column, allow_null=True):
    """Print part 3 of a column report: rows where ``column`` is missing.

    Args:
        data: DataFrame being validated; must contain an
            'Anonymized Patient Identifier' column when any value is missing.
        column: Label of the column to inspect.
        allow_null: Only affects the wording of the "nothing missing"
            message; missing rows are listed either way.
    """
    rows_without_value = data.loc[data[column].isna()]

    if rows_without_value.empty:
        if allow_null:
            suffix = " (null values are allowed)"
        else:
            suffix = ""
        print(f"3. No missing '{column}' values found{suffix}.\n")
        return

    print(f"3. Rows with missing '{column}':")
    listing = rows_without_value[['Anonymized Patient Identifier', column]]
    print(listing.to_string(index=False))

# Step 1: Recognize and validate the 'End Date of Treatment' column
def find_equivalent_column_end_date(data, expected_column):
    """Locate the end-date column and report on its name.

    The canonical name is looked for first, so a correctly named column is
    accepted instead of being reported as a mismatch. Otherwise the first
    column (in frame order) with one of the known alternative headers is
    accepted, with a notice that it should be renamed.

    Args:
        data: DataFrame whose column headers are searched.
        expected_column: The canonical end-date column name.

    Returns:
        The matching column label, or None when neither the canonical name
        nor any alternative is present.
    """
    alternative_names = ["End Date of Treatment", "Treatment End Date"]  # Add more alternatives if needed
    columns = list(data.columns)

    # A dataset that already uses the canonical header must still be validated.
    if expected_column in columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    for col in columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be: '{expected_column}'")
            return col  # Return the recognized column

    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {columns}")
    print(f"   The correct column name should be: '{expected_column}'")
    return None

# Step 2: Validate 'End Date of Treatment' values and set time to midnight if missing
def convert_or_flag_end_dates(data, end_date_column):
    """Normalise string end dates in place and collect unusable values.

    Strings in ``YYYY-MM-DD`` form are replaced in ``data`` by the parsed
    timestamp at midnight. A row is flagged when its value is a string that
    cannot be parsed that way, or a non-missing value that is neither a
    string nor a date/datetime object (for example a bare number).
    Missing cells are left for the missing-value check.

    Args:
        data: DataFrame being validated; mutated in place for parsable
            strings. Must contain an 'Anonymized Patient Identifier' column
            when any row is flagged.
        end_date_column: Label of the end-date column.

    Returns:
        DataFrame with one record per flagged row ('Row', patient
        identifier, the offending value and an 'Error' text); empty when
        nothing is flagged.
    """
    import datetime  # function-scope import keeps this block self-contained

    invalid_end_dates = []

    for index, row in data.iterrows():
        end_date = row[end_date_column]

        if isinstance(end_date, str):
            try:
                # Parse the date-only format and pin the time to midnight.
                parsed_date = pd.to_datetime(end_date, format='%Y-%m-%d', errors='raise')
                data.at[index, end_date_column] = parsed_date.replace(hour=0, minute=0, second=0)
            except Exception:
                # Any failure to parse or store the value marks the row invalid.
                is_invalid = True
            else:
                is_invalid = False
        else:
            # pd.Timestamp and datetime.datetime are both datetime.date
            # subclasses, so genuine date cells pass; other non-missing
            # scalars cannot be a date.
            is_invalid = bool(pd.notna(end_date)) and not isinstance(end_date, datetime.date)

        if is_invalid:
            invalid_end_dates.append({
                'Row': index,
                'Anonymized Patient Identifier': row['Anonymized Patient Identifier'],
                end_date_column: end_date,
                'Error': 'Invalid date format'
            })

    return pd.DataFrame(invalid_end_dates)

# Step 3: Check for missing 'End Date of Treatment' values
def validate_missing_end_dates(data, end_date_column):
    """Print part 3 of the end-date report: rows whose end date is missing.

    Every missing row is listed together with its index label.

    Args:
        data: DataFrame being validated; must contain an
            'Anonymized Patient Identifier' column when any value is missing.
        end_date_column: Label of the end-date column.
    """
    missing_end_dates = data[data[end_date_column].isna()]
    if not missing_end_dates.empty:
        print(f"3. Rows with missing '{end_date_column}':")
        # to_string() renders every row; printing the frame directly would
        # abbreviate long listings according to the pandas display options
        # and hide some of the affected rows.
        print(missing_end_dates[['Anonymized Patient Identifier', end_date_column]].to_string())
    else:
        print(f"3. No missing '{end_date_column}' values found (null values are not allowed).")


def main():
    """Validate both loaded datasets, printing a divider between the reports."""
    divider = "\n----------------------------------------\n"

    validate_dataset(dataset_1, "Hospital Patient Dataset Validata")
    print(divider)
    validate_dataset(dataset_2, "Larger Dataset")

# Run the validations only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


=== Validating Dataset: Hospital Patient Dataset Validata ===
Validating for DiagStaging_StagingTCategory
1. Recognized 'Staging-T Category' as 'DiagStaging_StagingTCategory' for validation, but the column name is incorrect.
   The correct column name should be 'DiagStaging_StagingTCategory'
2. No rows with invalid 'Staging-T Category' values found.

3. No missing 'Staging-T Category' values found (null values are allowed).

Validating for DiagStaging_StagingT
1. Recognized 'Staging-T' as 'DiagStaging_StagingT' for validation, but the column name is incorrect.
   The correct column name should be 'DiagStaging_StagingT'
2. No rows with invalid 'Staging-T' values found.

3. No missing 'Staging-T' values found (null values are allowed).

Validating for DiagStaging_StagingN
1. Recognized 'Staging-N' as 'DiagStaging_StagingN' for validation, but the column name is incorrect.
   The correct column name should be 'DiagStaging_StagingN'
2. No rows with invalid 'Staging-N' values found.

3. No