# Problem Statement

The functionality that backs up REDCap data to the database is constantly being maintained and updated. **There needs to be a way to quickly verify that the data in the backup database is the same as the data on REDCap.**

This comparison goes both ways--data found in REDCap should be in the database, and data found in the database should be on REDCap.

# Solution

This notebook pulls data from both REDCap and the database and cross-checks them. Any differences found are then exported to an Excel file.

# Code
### 1. Setup: get project's records and database's data

In [None]:
import AMBRA_Backups
import pandas as pd
import logging
from datetime import datetime
from redcap import Project
from AMBRA_Backups.Database.database import Database
from AMBRA_Backups.redcap_funcs import get_project_instru_field_map
from REDCap.utils import (
    extract_variable,
    get_field_instru_map,
    check_checkbox_redcap_var,
)

logger = logging.getLogger(__name__)

In [None]:
# REDCap project to verify and the backup database it is compared against.
# Both names are also used to build the output file names written by this notebook.
project_name = "Personal Test"
db_name = "TESTED"

# REDCap project handle, looked up by project name through AMBRA_Backups.
project = AMBRA_Backups.redcap_funcs.get_redcap_project(project_name)

# Fields to remove from REDCap
# This is necessary as REDCap retains deleted fields if they have data in them.
# NOTE(review): no field list is actually defined in this cell — confirm whether one is still needed.

Get & process project's data

In [None]:
def get_project_records(project: Project):
    """
    Export all records of a REDCap project as a DataFrame.

    The export is requested with `export_blank_for_gray_form_status=True`
    (presumably so that `<form>_complete` is blank for forms that were never
    saved — confirm against the PyCap documentation).

    Side effect: a snapshot of the export is written to
    `{project_name}_REDCap_<timestamp>.xlsx` in the working directory.

    Args:
        project: REDCap project handle to export from.

    Returns:
        The exported records with the index reset into regular columns.
    """
    project_records = project.export_records(
        format_type="df", export_blank_for_gray_form_status=True
    )
    project_records.reset_index(inplace=True)

    # str(datetime.now()) contains ':' and spaces, which are not valid in
    # Windows file names; use a filesystem-safe timestamp instead.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    project_records.to_excel(f"{project_name}_REDCap_{timestamp}.xlsx")
    return project_records

In [None]:
def process_project_records(project_records: pd.DataFrame, field_instru_map: dict):
    """
    Reshape the wide REDCap export into one row per variable value.

    1. Turn data from wide to long for easier comparison with database data.
    2. Rename, sort based on patient_name.
    3. Drop residual rows.
    4. Convert data types for easier comparisons.

    Side effects: writes `before_drop_<timestamp>.xlsx`,
    `after_drop_<timestamp>.xlsx` and `{project_name}_redcap.xlsx` to the
    working directory.

    Args:
        project_records: wide export containing the columns `record_id`,
            `redcap_repeat_instrument` and `redcap_repeat_instance`, plus one
            column per REDCap variable.
        field_instru_map: maps a variable name (as returned by
            `extract_variable`) to the name of the form it belongs to.

    Returns:
        A long-format DataFrame with the columns `patient_name`, `crf_name`,
        `instance`, `redcap_variable`, `value` and `residual` (always False in
        the returned frame), sorted by `patient_name`.

    Raises:
        KeyError: a variable is missing from `field_instru_map`.
        ValueError: an empty value sits on a correctly paired repeating form
            for which no blank status row was found.
    """
    # Wide to long
    records = pd.melt(
        project_records,
        id_vars=["record_id", "redcap_repeat_instrument", "redcap_repeat_instance"],
    )

    # Rename for less confusion when comparing
    records.rename(
        columns={
            "record_id": "patient_name",
            "redcap_repeat_instrument": "crf_name",
            "redcap_repeat_instance": "instance",
            "variable": "redcap_variable",
        },
        inplace=True,
    )

    # Filter out 'residual rows'
    # Residual rows are defined as rows that have a repeating instrument paired with a
    # redcap variable not belonging to it.
    # These rows do not contain the actual value of said variable.
    records["residual"] = False
    for index, row in records.iterrows():
        variable = extract_variable(row["redcap_variable"])
        crf_name = row["crf_name"]
        correct_crf_name = field_instru_map[variable]

        # If crf_name is empty, then assign the correct crf_name based on variable.
        # pd.isna is checked first so a missing value is never compared with "".
        if pd.isna(crf_name) or crf_name == "":
            records.at[index, "crf_name"] = correct_crf_name
            continue

        # A non-empty value on a named form is real data: nothing to flag.
        if not (pd.isna(row["value"]) or row["value"] == ""):
            continue

        # Empty value with a crf_name: likely a residual of a repeating form
        # paired with a variable that does not belong to it.
        if correct_crf_name != crf_name:
            records.at[index, "residual"] = True
            continue

        # Correct (form, variable) pair: check whether the form that contains
        # this variable has gray status for this record.
        # NOTE(review): this matches crf_name against "<form>_complete", which
        # looks like a variable name rather than a form name — confirm whether
        # the "redcap_variable" column was intended here.
        status_df = records[
            (records["patient_name"] == row["patient_name"])
            & (records["crf_name"] == f"{correct_crf_name}_complete")
            & (pd.isna(records["value"]))
        ]
        if len(status_df) != 0:
            # Gray status was found
            records.at[index, "residual"] = True
        else:
            # Should not be possible, unless the wrong correct_crf_name was grabbed
            raise ValueError(
                f"{correct_crf_name} is not gray but {variable} has value and pairs with {crf_name}."
            )

    # str(datetime.now()) contains ':' and spaces, which are not valid in
    # Windows file names; one filesystem-safe stamp also pairs the two dumps.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    records.to_excel(f"before_drop_{timestamp}.xlsx")

    # Drop residual rows. The explicit copy makes the result an independent
    # frame, so the assignments below do not raise SettingWithCopyWarning.
    records = records[~records["residual"]].copy()
    records.to_excel(f"after_drop_{timestamp}.xlsx")

    # Sort
    records = records.sort_values(by="patient_name")

    # Data type conversion
    # object -> str
    for column in ("value", "redcap_variable", "crf_name", "patient_name"):
        records[column] = records[column].astype("string")

    records.to_excel(f"{project_name}_redcap.xlsx")

    return records

In [None]:
# Build the two lookup tables used below. field_instru_map is indexed by variable name
# to obtain the form a variable belongs to; instru_field_map is presumably the inverse
# (form -> fields) — confirm against AMBRA_Backups.redcap_funcs.
instru_field_map = get_project_instru_field_map(project)
field_instru_map = get_field_instru_map(instru_field_map)

In [None]:
# Display the mapping for manual inspection.
instru_field_map

In [None]:
# Display the mapping for manual inspection.
field_instru_map

In [None]:
# Export the REDCap records, reshape them to long format, then display the result.
project_records = get_project_records(project)
project_records = process_project_records(project_records, field_instru_map)
project_records

In [None]:
# NOTE(review): this unconditionally raises, so running the notebook top to bottom stops
# here. It looks like a temporary debugging stop — confirm and remove once it is no
# longer needed.
raise Exception("hello")

Get & process database data

In [None]:
# Database handle for the backup database named by db_name.
db = Database(db_name)

In [None]:
def get_database_records(db: Database):
    """
    Fetch every backed-up REDCap value together with its form and patient rows.

    CRF_Data_RedCap is joined to CRF_RedCap on the form id, and CRF_RedCap to
    patients on the patient id. The schema name comes from the module-level
    `db_name`.

    Args:
        db: database handle used to run the SELECT query.

    Returns:
        A DataFrame built from the rows returned by the query.
    """
    join_query = f"""
        SELECT * FROM {db_name}.CRF_Data_RedCap 
        INNER JOIN {db_name}.CRF_RedCap 
            ON {db_name}.CRF_Data_RedCap.id_crf = {db_name}.CRF_RedCap.id
        INNER JOIN {db_name}.patients
            ON {db_name}.CRF_RedCap.id_patient = {db_name}.patients.id
        """
    rows = db.run_select_query(join_query, column_names=True)
    return pd.DataFrame.from_records(rows)

In [None]:
def process_database_records(db_records: pd.DataFrame):
    """
    Prepare the database rows for comparison with the REDCap records.

    1. Drop irrelevant columns and sort based on patient_name
    2. Convert data types for easier comparisons

    The frame is modified in place and also returned. A summary of the frame is
    printed before and after the type conversion, and the result is written to
    `{db_name}_database.xlsx` in the working directory.

    Args:
        db_records: joined database rows; must contain the bookkeeping columns
            dropped below as well as `value`, `redcap_variable`, `crf_name`
            and `patient_name`.

    Returns:
        The same DataFrame, sorted by `patient_name`, with the four comparison
        columns converted to the pandas "string" dtype.

    Raises:
        KeyError: one of the columns to drop is missing.
    """
    # Drop & Sort
    # drop=True discards the old index instead of materialising it as an
    # "index" column that would only have to be dropped again.
    db_records.reset_index(drop=True, inplace=True)
    db_records.drop(
        columns=[
            "id",
            "id_crf",
            "record_created",
            "is_phantom",
            "record_updated",
            "id_patient",
            "patient_id",
        ],
        inplace=True,
    )
    db_records.sort_values(by="patient_name", inplace=True)

    # Data type conversion
    # Before converting. DataFrame.info() prints the summary itself and returns
    # None, so wrapping it in print() would only add a stray "None" line.
    db_records.info(verbose=True)

    # object -> str
    for column in ("value", "redcap_variable", "crf_name", "patient_name"):
        db_records[column] = db_records[column].astype("string")

    # After converting
    db_records.info(verbose=True)
    db_records.to_excel(f"{db_name}_database.xlsx")

    return db_records

In [None]:
# Pull the joined rows from the backup database and normalise them for comparison.
db_records = get_database_records(db)
db_records = process_database_records(db_records)

In [None]:
# Display the processed database rows for manual inspection.
db_records

### 2. Comparison

Map `project_records` to `db_records`.

In [None]:
def _integral_value_as_str(value):
    """
    Return `value` as an integer string if it is numerically integral, else None.

    For example "1.0" -> "1". Missing values, non-numeric text and
    non-integral numbers such as "1.5" yield None, so they are never truncated.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    if not number.is_integer():
        return None
    return str(int(number))


def map_project_to_db(project_records: pd.DataFrame, db_records: pd.DataFrame):
    """
    Iterate through each project_records and check if
    that data is found in db_records

    Every REDCap row and its lookup result are written to the log file
    `{project_name}REDCap-{db_name}db.log`, which is overwritten on each run
    (`filemode="w"`).

    The function stops at the first REDCap row that has no matching,
    non-deleted database row: it logs the candidate rows and returns.
    NOTE(review): this may be a debugging aid — confirm whether the remaining
    rows should be checked as well.

    Args:
        project_records: long-format REDCap records with the columns
            `patient_name`, `redcap_variable`, `instance`, `crf_name`, `value`.
        db_records: processed database rows with the same columns plus
            `deleted`.

    Returns:
        None.
    """

    logging.basicConfig(
        filename=f"{project_name}REDCap-{db_name}db.log",
        filemode="w",
        level=logging.INFO,
    )

    logger.info("===========================")
    for row in project_records.itertuples():
        patient_name = row.patient_name
        original_variable = row.redcap_variable
        redcap_variable = row.redcap_variable
        instance = row.instance
        crf_name = row.crf_name
        value = row.value

        # If the value is an int, it might be stored as a float in the project_records in df,
        # but an int in db_records. In this case, convert the float into the int.
        # Only integral numbers are converted; "1.5" must stay "1.5" rather than
        # being truncated to "1".
        integral_value = _integral_value_as_str(value)
        if integral_value is None:
            logger.info(f"Value {value} is not integer")
        else:
            value = integral_value

        # If the variable is a checkbox variable, then in project_records it would look like
        # checkbox___1 but in db_records it is stored as checkbox(1). Convert it into
        # the db convention for easier viewing.
        if check_checkbox_redcap_var(redcap_variable):
            before_choice = redcap_variable[: (redcap_variable.rindex("___"))]
            choice = redcap_variable[(redcap_variable.rindex("_") + 1) :]
            redcap_variable = f"{before_choice}({choice})"

        # TODO: project_records can hold default values for variables of
        # instruments that were never filled in; such rows are not skipped yet.

        # Log the row being looked up so that a mismatch can be traced back to it.
        logger.info(f"""
            ##########################
            #   Subject:     {patient_name}
            #   Instance:    {instance}
            #   OG Var:      {original_variable}
            $   Variable:    {redcap_variable}
            #   CRF_Name:    {crf_name}
            #   Value:       {value}
            ##########################
            """)

        # If a repeating instrument
        if pd.notna(instance):
            logger.info("--------REPEATING")

            db_record = db_records[
                (db_records["patient_name"] == patient_name)
                & (db_records["redcap_variable"] == redcap_variable)
                & (db_records["instance"] == instance)
                & (db_records["crf_name"] == crf_name)
                & (db_records["value"] == value)
                & (db_records["deleted"] == 0)  # Value must be not deleted in DB
            ]
            if len(db_record) == 0:
                # Same patient, variable and form, regardless of instance,
                # value or deleted flag.
                potential_rows = db_records[
                    (db_records["patient_name"] == patient_name)
                    & (db_records["redcap_variable"] == redcap_variable)
                    & (db_records["crf_name"] == crf_name)
                ]
                logger.info(
                    f"""
                    Not found in DB. Potential rows:

                    {potential_rows}
                    """
                )
                # Stop at the first mismatch (see docstring).
                return
            else:
                logger.info(f"Length of db_record: {len(db_record)}")
                logger.info("db_record:")
                logger.info(f"\n{db_record}")

        # If not a repeating instrument
        else:
            db_record = db_records[
                (db_records["patient_name"] == patient_name)
                & (db_records["redcap_variable"] == redcap_variable)
                &
                # Use .isna() instead of instance == instance because NaN != NaN is True.
                # https://stackoverflow.com/questions/10034149/why-is-nan-not-equal-to-nan
                (pd.isna(db_records["instance"]))
                & (db_records["value"] == value)
                & (db_records["deleted"] == 0)  # Value must be not deleted in DB
            ]
            if len(db_record) == 0:
                # Every non-repeating row of the patient; the variable is
                # deliberately not part of this filter.
                potential_rows = db_records[
                    (db_records["patient_name"] == patient_name)
                    & (pd.isna(db_records["instance"]))
                ]

                logger.info(
                    f"""
                    Not found in DB. Potential rows:
                    
                    {potential_rows}
                    """
                )
                # Stop at the first mismatch (see docstring).
                return
            else:
                logger.info(f"Length of record found: {len(db_record)}")
                logger.info("db_record:")
                logger.info(f"\n{db_record}")

In [None]:
# Run the REDCap -> database check; results are written to the log file configured inside.
map_project_to_db(project_records, db_records)