In [4]:
import pandas as pd

# Load the patient dataset from the Excel workbook into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): the same workbook is re-read at the top of each validation cell
# visible in this notebook; loading it once would avoid the repeated disk reads.
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Step 1: Resolve which dataset column holds 'DiagStaging_StagingTCategory' and flag incorrect names
def find_equivalent_column_staging_t(data, expected_column):
    """Return the name of the dataset column to validate as ``expected_column``.

    Resolution order:
      1. ``expected_column`` itself, when the dataset already uses the correct
         name (previously this case was reported as "not found").
      2. The first column, in dataset column order, whose name is a known
         alias; the alias is reported as an incorrect column name.
      3. ``None`` when neither is present; the available columns are printed.

    Args:
        data: DataFrame whose ``columns`` are searched.
        expected_column: The canonical column name.

    Returns:
        The matching column name, or ``None`` when no match exists.
    """
    alternative_names = ["Staging-T Category", "T Category"]  # Add more alternatives as needed

    # A correctly named column must be accepted, not reported as a mismatch.
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be '{expected_column}'")
            return col  # Validate under the alias actually present in the data

    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {list(data.columns)}")
    print(f"   The correct column name should be '{expected_column}'")
    return None

# Step 2: Validate 'Staging-T Category' values (must be one of the allowed values)
def validate_staging_t_values(data, staging_t_column):
    """Collect the rows whose Staging-T Category is not an allowed value.

    Null values are not flagged. Matching is exact (case- and
    whitespace-sensitive) against the allowed list.

    NOTE: a later cell in this notebook defines another function with this
    same name and a different allowed list; whichever cell ran last is the
    definition in effect.

    Args:
        data: DataFrame holding ``staging_t_column``. The
            'Anonymized Patient Identifier' column is read only when at least
            one invalid value exists.
        staging_t_column: Name of the column to validate.

    Returns:
        DataFrame with columns 'Row' (the DataFrame index label of the
        offending row, not a spreadsheet row number),
        'Anonymized Patient Identifier', ``staging_t_column`` and 'Error'.
        It has zero rows, but the same columns, when everything is valid.
    """
    allowed_values = ["Clinical", "Pathologic", "Radiographic"]
    id_column = 'Anonymized Patient Identifier'

    values = data[staging_t_column]
    # Vectorized check: avoids iterrows(), which is slow and may upcast dtypes per row.
    invalid_mask = values.notna() & ~values.isin(allowed_values)

    if not invalid_mask.any():
        # Keep a stable schema so callers can index the result even when it is empty.
        return pd.DataFrame(columns=['Row', id_column, staging_t_column, 'Error'])

    invalid_rows = data.loc[invalid_mask]
    return pd.DataFrame({
        'Row': invalid_rows.index.tolist(),
        id_column: invalid_rows[id_column].tolist(),
        staging_t_column: invalid_rows[staging_t_column].tolist(),
        'Error': 'Invalid Staging-T Category',
    })

# Step 3: Report rows where 'DiagStaging_StagingTCategory' has no value
def validate_missing_staging_t(data, staging_t_column):
    """Print the rows of ``data`` whose ``staging_t_column`` is null.

    Prints a header followed by the identifier and staging columns of the
    affected rows, or a single "no missing values" line. Returns nothing.

    NOTE: a later cell in this notebook defines another function with this
    same name whose "no missing values" message differs.
    """
    null_mask = data[staging_t_column].isna()
    rows_without_value = data[null_mask]

    # Guard clause: nothing to list when every row has a value.
    if rows_without_value.empty:
        print(f"3. No missing '{staging_t_column}' values found (null values are allowed).")
        return

    print(f"3. Rows with missing '{staging_t_column}':")
    print(rows_without_value[['Anonymized Patient Identifier', staging_t_column]])

# Drive the 'Staging-T Category' checks against the loaded dataset
expected_column_staging_t = 'DiagStaging_StagingTCategory'

# Check 1: resolve which dataset column stands in for the expected name
recognized_column_staging_t = find_equivalent_column_staging_t(
    dataset,
    expected_column_staging_t,
)

# Checks 2 and 3 only run once a column has been resolved
if recognized_column_staging_t:
    # Check 2: collect the rows whose value is outside the allowed list
    invalid_staging_t_values = validate_staging_t_values(dataset, recognized_column_staging_t)
    invalid_count_staging_t = len(invalid_staging_t_values)

    if invalid_count_staging_t == 0:
        print("2. No rows with invalid 'Staging-T Category' values found.\n")
    else:
        # Summarise the offending rows as a comma-separated list of 'Row' values
        invalid_row_numbers_staging_t = invalid_staging_t_values['Row'].tolist()
        invalid_row_numbers_str_staging_t = ','.join(
            str(row_number) for row_number in invalid_row_numbers_staging_t
        )

        print(f"2. {invalid_count_staging_t} rows have an invalid 'Staging-T Category':")
        print(f"   Row numbers: {invalid_row_numbers_str_staging_t}\n")

        # Full detail table, printed without the positional index
        print(invalid_staging_t_values.to_string(index=False))

    # Check 3: report rows that have no value in the column
    validate_missing_staging_t(dataset, recognized_column_staging_t)

    # No duplicate check: more than one value is allowed for this field

1. Recognized 'Staging-T Category' as 'DiagStaging_StagingTCategory' for validation, but the column name is incorrect.
   The correct column name should be 'DiagStaging_StagingTCategory'
2. No rows with invalid 'Staging-T Category' values found.

3. No missing 'Staging-T Category' values found (null values are allowed).


In [5]:
import pandas as pd

# Load the patient dataset from the Excel workbook into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): the same workbook is also read at the top of the other validation
# cells visible in this notebook; loading it once would avoid the repeated disk reads.
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Step 1: Resolve which dataset column holds 'DiagStaging_StagingT' and flag incorrect names
def find_staging_t_column(data, expected_column):
    """Return the name of the dataset column to validate as ``expected_column``.

    Resolution order:
      1. ``expected_column`` itself, when the dataset already uses the correct
         name (previously this case was reported as "not found").
      2. The first column, in dataset column order, whose name is a known
         alias; the alias is reported as an incorrect column name.
      3. ``None`` when neither is present; the available columns are printed.

    Args:
        data: DataFrame whose ``columns`` are searched.
        expected_column: The canonical column name.

    Returns:
        The matching column name, or ``None`` when no match exists.
    """
    alternative_names = ["Staging-T", "T Staging", "Tumor Staging"]  # Add more alternatives as needed

    # A correctly named column must be accepted, not reported as a mismatch.
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be '{expected_column}'")
            return col  # Validate under the alias actually present in the data

    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {list(data.columns)}")
    print(f"   The correct column name should be '{expected_column}'")
    return None

# Step 2: Validate that the 'Staging-T' values are part of the standard list
def validate_staging_t_values(data, staging_t_column):
    """Collect the rows whose Staging-T value is not in the standard list.

    Null values are not flagged. Matching is exact (case- and
    whitespace-sensitive) against the standard list.

    NOTE: an earlier cell in this notebook defines another function with this
    same name and a different allowed list; whichever cell ran last is the
    definition in effect.

    Args:
        data: DataFrame holding ``staging_t_column``. The
            'Anonymized Patient Identifier' column is read only when at least
            one invalid value exists.
        staging_t_column: Name of the column to validate.

    Returns:
        DataFrame with columns 'Row' (the DataFrame index label of the
        offending row, not a spreadsheet row number),
        'Anonymized Patient Identifier', ``staging_t_column`` and 'Error'.
        It has zero rows, but the same columns, when everything is valid.
    """
    standard_values = [
        "T1a", "T1b", "T1c", "T2a", "T2b", "T2c", "T3a", "T3b", "T4",
        "Tx (only if prostate has been removed)", "Information Not Available"
    ]
    id_column = 'Anonymized Patient Identifier'

    values = data[staging_t_column]
    # Vectorized check: avoids iterrows(), which is slow and may upcast dtypes per row.
    invalid_mask = values.notna() & ~values.isin(standard_values)

    if not invalid_mask.any():
        # Keep a stable schema so callers can index the result even when it is empty.
        return pd.DataFrame(columns=['Row', id_column, staging_t_column, 'Error'])

    invalid_rows = data.loc[invalid_mask]
    return pd.DataFrame({
        'Row': invalid_rows.index.tolist(),
        id_column: invalid_rows[id_column].tolist(),
        staging_t_column: invalid_rows[staging_t_column].tolist(),
        'Error': 'Invalid Staging-T Value',
    })

# Step 3: Report rows where 'DiagStaging_StagingT' has no value
def validate_missing_staging_t(data, staging_t_column):
    """Print the rows of ``data`` whose ``staging_t_column`` is null.

    Prints a header followed by the identifier and staging columns of the
    affected rows, or a single "no missing values" line. Returns nothing.

    NOTE: an earlier cell in this notebook defines another function with this
    same name whose "no missing values" message differs.
    """
    null_mask = data[staging_t_column].isna()
    rows_without_value = data[null_mask]

    # Guard clause: nothing to list when every row has a value.
    if rows_without_value.empty:
        print(f"3. No missing '{staging_t_column}' values found.")
        return

    print(f"3. Rows with missing '{staging_t_column}':")
    print(rows_without_value[['Anonymized Patient Identifier', staging_t_column]])

# Drive the 'Staging-T' checks against the loaded dataset
expected_staging_t_column = 'DiagStaging_StagingT'

# Check 1: resolve which dataset column stands in for the expected name
recognized_staging_t_column = find_staging_t_column(
    dataset,
    expected_staging_t_column,
)

# Checks 2 and 3 only run once a column has been resolved
if recognized_staging_t_column:
    # Check 2: collect the rows whose value is outside the standard list
    invalid_staging_t = validate_staging_t_values(dataset, recognized_staging_t_column)
    invalid_count = len(invalid_staging_t)

    if invalid_count == 0:
        print("2. No rows with invalid Staging-T values found.\n")
    else:
        # Summarise the offending rows as a comma-separated list of 'Row' values
        invalid_row_numbers = invalid_staging_t['Row'].tolist()
        invalid_row_numbers_str = ','.join(
            str(row_number) for row_number in invalid_row_numbers
        )

        print(f"2. {invalid_count} rows have an invalid Staging-T value:")
        print(f"   Row numbers: {invalid_row_numbers_str}\n")

        # Full detail table, printed without the positional index
        print(invalid_staging_t.to_string(index=False))

    # Check 3: report rows that have no value in the column
    validate_missing_staging_t(dataset, recognized_staging_t_column)

1. Recognized 'Staging-T' as 'DiagStaging_StagingT' for validation, but the column name is incorrect.
   The correct column name should be 'DiagStaging_StagingT'
2. No rows with invalid Staging-T values found.

3. No missing 'Staging-T' values found.


In [6]:
import pandas as pd

# Load the patient dataset from the Excel workbook into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): the same workbook is also read at the top of the other validation
# cells visible in this notebook; loading it once would avoid the repeated disk reads.
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Step 1: Resolve which dataset column holds 'DiagStaging_StagingN' and flag incorrect names
def find_equivalent_column(data, expected_column):
    """Return the name of the dataset column to validate as ``expected_column``.

    Resolution order:
      1. ``expected_column`` itself, when the dataset already uses the correct
         name (previously this case was reported as "not found").
      2. The first column, in dataset column order, whose name is a known
         Staging-N alias; the alias is reported as an incorrect column name.
      3. ``None`` when neither is present; the available columns are printed.

    NOTE: a later cell in this notebook defines another function with this
    same name but a different alias list; whichever cell ran last is the
    definition in effect.

    Args:
        data: DataFrame whose ``columns`` are searched.
        expected_column: The canonical column name.

    Returns:
        The matching column name, or ``None`` when no match exists.
    """
    alternative_names = ["Staging-N", "Staging N"]  # Add more alternatives as needed

    # A correctly named column must be accepted, not reported as a mismatch.
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be: '{expected_column}'")
            return col  # Validate under the alias actually present in the data

    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {list(data.columns)}")
    print(f"   The correct column name should be: '{expected_column}'")
    return None

# Step 2: Flag Staging-N values that are not in the valid list
def convert_or_flag_staging_n_values(data, staging_n_column):
    """Collect the rows whose Staging-N value is not in the valid list.

    Despite the name, no value is converted and ``data`` is not modified;
    the function only reports invalid entries. Null values are not flagged.
    Matching is exact (case- and whitespace-sensitive).

    Args:
        data: DataFrame holding ``staging_n_column``. The
            'Anonymized Patient Identifier' column is read only when at least
            one invalid value exists.
        staging_n_column: Name of the column to validate.

    Returns:
        DataFrame with columns 'Row' (the DataFrame index label of the
        offending row, not a spreadsheet row number),
        'Anonymized Patient Identifier', ``staging_n_column`` and 'Error'.
        It has zero rows, but the same columns, when everything is valid.
    """
    valid_values = [
        "N0", "N1", "Nx", "Information Not Available"
    ]
    id_column = 'Anonymized Patient Identifier'

    values = data[staging_n_column]
    # Vectorized check: avoids iterrows(), which is slow and may upcast dtypes per row.
    invalid_mask = values.notna() & ~values.isin(valid_values)

    if not invalid_mask.any():
        # Keep a stable schema so callers can index the result even when it is empty.
        return pd.DataFrame(columns=['Row', id_column, staging_n_column, 'Error'])

    invalid_rows = data.loc[invalid_mask]
    return pd.DataFrame({
        'Row': invalid_rows.index.tolist(),
        id_column: invalid_rows[id_column].tolist(),
        staging_n_column: invalid_rows[staging_n_column].tolist(),
        'Error': 'Invalid Staging-N value',
    })

# Step 3: Report rows where the 'Staging-N' column has no value
def validate_missing_staging_n(data, staging_n_column):
    """Print the rows of ``data`` whose ``staging_n_column`` is null.

    Prints a header followed by the identifier and staging columns of the
    affected rows, or a single "no missing values" line. Returns nothing.
    """
    null_mask = data[staging_n_column].isna()
    rows_without_value = data[null_mask]

    # Guard clause: nothing to list when every row has a value.
    if rows_without_value.empty:
        print(f"3. No missing '{staging_n_column}' values found.")
        return

    print(f"3. Rows with missing '{staging_n_column}':")
    print(rows_without_value[['Anonymized Patient Identifier', staging_n_column]])

# Drive the 'Staging-N' checks against the loaded dataset
expected_column = 'DiagStaging_StagingN'

# Check 1: resolve which dataset column stands in for the expected name
recognized_column = find_equivalent_column(
    dataset,
    expected_column,
)

# Checks 2 and 3 only run once a column has been resolved
if recognized_column:
    # Check 2: collect the rows whose value is outside the valid list
    invalid_staging_n = convert_or_flag_staging_n_values(dataset, recognized_column)
    invalid_count = len(invalid_staging_n)

    if invalid_count == 0:
        print("2. No rows with invalid Staging-N values found.\n")
    else:
        # Summarise the offending rows as a comma-separated list of 'Row' values
        invalid_row_numbers = invalid_staging_n['Row'].tolist()
        invalid_row_numbers_str = ','.join(
            str(row_number) for row_number in invalid_row_numbers
        )

        print(f"2. {invalid_count} rows have an invalid Staging-N value:")
        print(f"   Row numbers: {invalid_row_numbers_str}\n")
        # Full detail table, printed without the positional index
        print(invalid_staging_n.to_string(index=False))

    # Check 3: report rows that have no value in the column
    validate_missing_staging_n(dataset, recognized_column)

1. Recognized 'Staging-N' as 'DiagStaging_StagingN' for validation, but the column name is incorrect.
   The correct column name should be: 'DiagStaging_StagingN'
2. No rows with invalid Staging-N values found.

3. No missing 'Staging-N' values found.


In [7]:
import pandas as pd

# Load the patient dataset from the Excel workbook into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): the same workbook is also read at the top of the other validation
# cells visible in this notebook; loading it once would avoid the repeated disk reads.
dataset = pd.read_excel('Hospital Patient Dataset Validata.xlsx')

# Step 1: Resolve which dataset column holds 'DiagStaging_StagingM' and flag incorrect names
def find_equivalent_column(data, expected_column):
    """Return the name of the dataset column to validate as ``expected_column``.

    Resolution order:
      1. ``expected_column`` itself, when the dataset already uses the correct
         name (previously this case was reported as "not found").
      2. The first column, in dataset column order, whose name is a known
         Staging-M alias; the alias is reported as an incorrect column name.
      3. ``None`` when neither is present; the available columns are printed.

    NOTE: an earlier cell in this notebook defines another function with this
    same name but a different alias list; whichever cell ran last is the
    definition in effect.

    Args:
        data: DataFrame whose ``columns`` are searched.
        expected_column: The canonical column name.

    Returns:
        The matching column name, or ``None`` when no match exists.
    """
    alternative_names = ["Staging-M", "Staging M"]  # Add more alternatives as needed

    # A correctly named column must be accepted, not reported as a mismatch.
    if expected_column in data.columns:
        print(f"1. Column '{expected_column}' found with the correct name.")
        return expected_column

    for col in data.columns:
        if col in alternative_names:
            print(f"1. Recognized '{col}' as '{expected_column}' for validation, but the column name is incorrect.")
            print(f"   The correct column name should be: '{expected_column}'")
            return col  # Validate under the alias actually present in the data

    print(f"1. Column name mismatch: Expected '{expected_column}', but not found in dataset.")
    print(f"   Found columns: {list(data.columns)}")
    print(f"   The correct column name should be: '{expected_column}'")
    return None

# Step 2: Flag Staging-M values that are not in the valid list
def convert_or_flag_staging_m_values(data, staging_m_column):
    """Collect the rows whose Staging-M value is not in the valid list.

    Despite the name, no value is converted and ``data`` is not modified;
    the function only reports invalid entries. Null values are not flagged.
    Matching is exact (case- and whitespace-sensitive).

    Args:
        data: DataFrame holding ``staging_m_column``. The
            'Anonymized Patient Identifier' column is read only when at least
            one invalid value exists.
        staging_m_column: Name of the column to validate.

    Returns:
        DataFrame with columns 'Row' (the DataFrame index label of the
        offending row, not a spreadsheet row number),
        'Anonymized Patient Identifier', ``staging_m_column`` and 'Error'.
        It has zero rows, but the same columns, when everything is valid.
    """
    valid_values = [
        "M0", "M1a", "M1b", "M1c", "Mx", "Information Not Available"
    ]
    id_column = 'Anonymized Patient Identifier'

    values = data[staging_m_column]
    # Vectorized check: avoids iterrows(), which is slow and may upcast dtypes per row.
    invalid_mask = values.notna() & ~values.isin(valid_values)

    if not invalid_mask.any():
        # Keep a stable schema so callers can index the result even when it is empty.
        return pd.DataFrame(columns=['Row', id_column, staging_m_column, 'Error'])

    invalid_rows = data.loc[invalid_mask]
    return pd.DataFrame({
        'Row': invalid_rows.index.tolist(),
        id_column: invalid_rows[id_column].tolist(),
        staging_m_column: invalid_rows[staging_m_column].tolist(),
        'Error': 'Invalid Staging-M value',
    })

# Step 3: Report rows where the 'Staging-M' column has no value
def validate_missing_staging_m(data, staging_m_column):
    """Print the rows of ``data`` whose ``staging_m_column`` is null.

    Prints a header followed by the identifier and staging columns of the
    affected rows, or a single "no missing values" line. Returns nothing.
    """
    null_mask = data[staging_m_column].isna()
    rows_without_value = data[null_mask]

    # Guard clause: nothing to list when every row has a value.
    if rows_without_value.empty:
        print(f"3. No missing '{staging_m_column}' values found.")
        return

    print(f"3. Rows with missing '{staging_m_column}':")
    print(rows_without_value[['Anonymized Patient Identifier', staging_m_column]])

# Drive the 'Staging-M' checks against the loaded dataset
expected_column = 'DiagStaging_StagingM'

# Check 1: resolve which dataset column stands in for the expected name
recognized_column = find_equivalent_column(
    dataset,
    expected_column,
)

# Checks 2 and 3 only run once a column has been resolved
if recognized_column:
    # Check 2: collect the rows whose value is outside the valid list
    invalid_staging_m = convert_or_flag_staging_m_values(dataset, recognized_column)
    invalid_count = len(invalid_staging_m)

    if invalid_count == 0:
        print("2. No rows with invalid Staging-M values found.\n")
    else:
        # Summarise the offending rows as a comma-separated list of 'Row' values
        invalid_row_numbers = invalid_staging_m['Row'].tolist()
        invalid_row_numbers_str = ','.join(
            str(row_number) for row_number in invalid_row_numbers
        )

        print(f"2. {invalid_count} rows have an invalid Staging-M value:")
        print(f"   Row numbers: {invalid_row_numbers_str}\n")
        # Full detail table, printed without the positional index
        print(invalid_staging_m.to_string(index=False))

    # Check 3: report rows that have no value in the column
    validate_missing_staging_m(dataset, recognized_column)

1. Recognized 'Staging-M' as 'DiagStaging_StagingM' for validation, but the column name is incorrect.
   The correct column name should be: 'DiagStaging_StagingM'
2. No rows with invalid Staging-M values found.

3. No missing 'Staging-M' values found.
