# Patient Data Export

A notebook exporting patient data for study analyses.

## Imports

In [None]:
import dataclasses
from enum import Enum
import getpass
import io
import itertools
import json
import operator
import pathlib
import re
from typing import List, Optional, Union

import IPython.display
import ipywidgets
import nbformat
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
import pandas as pd
import pyzipper

from scope.documents import document_set
from scope.populate.data.archive import Archive

## Utilities

### Utility: excel_dataframe

Returns bytes containing an Excel export of a dataframe.

In [None]:
def excel_dataframe(df: pd.DataFrame) -> bytes:
    """
    Serialize a dataframe to an in-memory Excel workbook.

    The dataframe index is not written. Returns the bytes of the workbook.
    """
    with io.BytesIO() as workbook_buffer:
        df.to_excel(workbook_buffer, index=False)

        # Capture the contents before the buffer is closed.
        return workbook_buffer.getvalue()

### Utility: markdown_documentation

Returns a string containing markdown content recovered from a cell in this notebook.

Intended to allow including the content of markdown cells as documentation in an export.

In [None]:
def markdown_documentation(documentation_name: str) -> str:
    """
    Return the source of the markdown cell documenting `documentation_name`.

    This notebook is re-read from "patientdata.ipynb" and its cells are scanned
    in order. The first markdown cell whose source begins with a heading of the
    form "<zero or more #> Documentation: <documentation_name>" followed by a
    newline is returned in full, heading included.

    Raises ValueError if no such cell is found.
    """
    # Load this same notebook.
    # NOTE: the file name is hard-coded and relative, so it is resolved against
    # the current working directory.
    notebook = nbformat.read("patientdata.ipynb", nbformat.NO_CONVERT)

    # Escape the name so it is matched literally; otherwise a name containing
    # regex metacharacters such as "(" or "+" would not match its own heading.
    heading_pattern = re.compile(
        "^(#*) Documentation: ({})\\n(.*)".format(re.escape(documentation_name))
    )

    # Go through each cell, looking for a match.
    for cell_current in notebook["cells"]:
        if cell_current["cell_type"] != "markdown":
            continue

        if heading_pattern.match(cell_current["source"]):
            return cell_current["source"]

    # If no match was found, raise a ValueError.
    raise ValueError(
        "No matching documentation cell found: {}".format(documentation_name)
    )

### Utility: patient_data_export_file

The path and contents of a file to be exported.

In [None]:
class ExportFileType(Enum):
    """The kind of content held by an ExportFile."""

    EXCEL = "EXCEL"
    MARKDOWN = "MARKDOWN"


@dataclasses.dataclass(frozen=True)
class ExportFile:
    """
    The path and contents of a single file to be exported.

    Instances are immutable. Use `from_excel` or `from_markdown` to create one,
    so that `type` and the populated content field stay consistent.
    """

    # Path of the file within the export.
    path: pathlib.Path
    # Which of the content fields below is populated.
    type: ExportFileType
    # Binary content, set by `from_excel`; None for markdown files.
    bytes: Optional[bytes]
    # Text content, set by `from_markdown`; None for Excel files.
    text: Optional[str]

    @classmethod
    def from_excel(
        cls,
        path: Union[pathlib.Path, str],
        excel: bytes,
    ) -> "ExportFile":
        """Create an EXCEL export file at `path` holding the bytes `excel`."""
        # Construct through `cls` so that subclasses get instances of themselves.
        return cls(
            path=pathlib.Path(path),
            type=ExportFileType.EXCEL,
            bytes=excel,
            text=None,
        )

    @classmethod
    def from_markdown(
        cls,
        path: Union[pathlib.Path, str],
        markdown: str,
    ) -> "ExportFile":
        """Create a MARKDOWN export file at `path` holding the text `markdown`."""
        # Construct through `cls` so that subclasses get instances of themselves.
        return cls(
            path=pathlib.Path(path),
            type=ExportFileType.MARKDOWN,
            bytes=None,
            text=markdown,
        )


def patient_data_export_file(file: ExportFile) -> None:
    """
    Register a file for inclusion in the export.

    Appends `file` to the notebook-level `patient_data_export_file_list`, which
    must have been created before this function is called.
    """
    patient_data_export_file_list.append(file)

### Utility: sanitize_dataframe

Sanitize contents of a dataframe that cannot be written to Excel.

In [None]:
def sanitize_cell(value):
    """
    Return `value` with characters that cannot be written to Excel replaced.

    Strings have every match of openpyxl's ILLEGAL_CHARACTERS_RE replaced by
    "?". Values that are not strings are returned unchanged.
    """
    # isinstance also covers str subclasses (for example numpy.str_), which an
    # exact type comparison would leave unsanitized.
    if isinstance(value, str):
        value = ILLEGAL_CHARACTERS_RE.sub("?", value)

    return value


def sanitize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return the result of applying `sanitize_cell` to every cell of `df`.
    """
    return df.map(sanitize_cell)

## Input

### Obtain Archive Suffix: archive_suffix 

In [None]:
# Obtain suffix indicating desired version of encrypted archives.
# Do not include the '.zip' suffix.
# The value is substituted into the archive file names (for example
# "archive_multicare_{}.zip") and into the name of the export zip file.
archive_suffix = input("Encrypted archive suffix: ")

### Obtain Archive Password: archive_password

In [None]:
# Obtain password to encrypted archives.
# Prompt with getpass rather than input so the password is not echoed.
archive_password = getpass.getpass("Encrypted archive password: ")

### Utility: archive_dir_path

In [None]:
# Directory containing the encrypted archives, relative to the location of this notebook.
# The export zip file is written to this same directory.
archive_dir_path = "../../../secrets/data"

## Load Archives

### Decrypt Archives

In [None]:
# Locate the encrypted archives.
#
# Each archive file is named for its site and the requested suffix.
# Paths are relative to the location of the notebook, and the archives are
# expected to be in the "secrets/data" directory.

# MultiCare archive.
archive_multicare_file_name = "archive_multicare_{}.zip".format(archive_suffix)
archive_multicare_path = pathlib.Path(archive_dir_path) / archive_multicare_file_name

# SCCA archive.
archive_scca_file_name = "archive_scca_{}.zip".format(archive_suffix)
archive_scca_path = pathlib.Path(archive_dir_path) / archive_scca_file_name

In [None]:
# Report which archive is about to be read, as an absolute path.
print("Decrypting archive:")
print("{}".format(archive_multicare_path.resolve()))

# Obtain the archive, using the password provided above.
archive_multicare = Archive.read_archive(
    archive_path=archive_multicare_path,
    password=archive_password,
)

# Report how many entries the archive contains.
print("{} documents.".format(len(archive_multicare.entries.values())))

In [None]:
# Report which archive is about to be read, as an absolute path.
print("Decrypting archive:")
print("{}".format(archive_scca_path.resolve()))

# Obtain the archive, using the password provided above.
archive_scca = Archive.read_archive(
    archive_path=archive_scca_path,
    password=archive_password,
)

# Report how many entries the archive contains.
print("{} documents.".format(len(archive_scca.entries.values())))

### Process Archives

#### Combine Patients in Datatable

In [None]:
# Patient documents from MultiCare, after remove_sentinel() and remove_revisions().
documents_multicare_patients = (
    archive_multicare.collection_documents(collection="patients")
    .remove_sentinel()
    .remove_revisions()
)
# One row per document, tagged with the database it came from.
df_multicare_patients = pd.DataFrame.from_records(
    documents_multicare_patients.documents
).assign(database="multicare")

# Patient documents from SCCA, prepared the same way.
documents_scca_patients = (
    archive_scca.collection_documents(collection="patients")
    .remove_sentinel()
    .remove_revisions()
)
df_scca_patients = pd.DataFrame.from_records(
    documents_scca_patients.documents
).assign(database="fhcc")

# Unify the patients of both databases under a fresh 0..n-1 index,
# sanitizing once here so the resulting dataframe can be exported.
df_archive_patients = sanitize_dataframe(
    pd.concat([df_multicare_patients, df_scca_patients], ignore_index=True)
)

#### Filter Pilot Patients

Remove the 6 pilot patients.

In [None]:
# Identifiers of the 6 pilot patients.
pilot_patient_ids = [
    "ymzwx6e6w6kqi",
    "mmmb54v52l7re",
    "ouoa4ucldbhie",
    "zazst4yu23a5q",
    "wf4btxqjtd2oa",
    "s3bcmgmp7gdss",
]

# Keep only rows that are not pilot patients, then renumber the index.
is_pilot_patient = df_archive_patients["patientId"].isin(pilot_patient_ids)
df_archive_patients = df_archive_patients[~is_pilot_patient].reset_index(drop=True)

#### Utility: patient_documents

In [None]:
def patient_documents(row_patient) -> document_set.DocumentSet:
    """
    Return the document collection of a unified patient.

    `row_patient` is a mapping with a "database" key, selecting the archive
    ("multicare" or "fhcc"), and a "collection" key, naming the collection to
    retrieve from that archive.

    Raises ValueError if "database" is not one of the known values.
    """
    if row_patient["database"] == "multicare":
        archive = archive_multicare
    elif row_patient["database"] == "fhcc":
        archive = archive_scca
    else:
        # Name the offending value so the failing row can be identified.
        raise ValueError(
            "Unknown patient database: {!r}".format(row_patient["database"])
        )

    return archive.collection_documents(collection=row_patient["collection"])

## Prepare DocumentSets and DataFrames

#### Prepare Patients: df_patients

A dataframe containing the identity of each patient.

In [None]:
# In development, it can be helpful to sample a subset of patients.
# df_patients = df_archive_patients.groupby("database").sample(n=5).reset_index(drop=True)

# By default every patient is included. This binds the same dataframe object
# rather than making a copy.
df_patients = df_archive_patients

IPython.display.display(df_patients)

#### Prepare Per-Patient DocumentSet: patient_id_to_documentset

In [None]:
# Map each patientId to the DocumentSet of that patient's collection.
patient_id_to_documentset = {}

progress_patient_count = ipywidgets.IntProgress(min=0, max=len(df_patients))
IPython.display.display(progress_patient_count)

# Count iterations with enumerate rather than relying on the index labels, so
# progress is reported correctly even if df_patients is not indexed 0..n-1.
for patient_count, (patient_index, patient_current) in enumerate(
    df_patients.iterrows()
):
    patient_id_current = patient_current["patientId"]
    patient_collection_current = patient_documents(patient_current.to_dict())

    patient_id_to_documentset[patient_id_current] = patient_collection_current

    progress_patient_count.value = patient_count + 1

#### Prepare Per-Patient DataFrames: patient_id_to_df

In [None]:
# Map each patientId to a dataframe of that patient's documents.
patient_id_to_df = {}

progress_patient_count = ipywidgets.IntProgress(min=0, max=len(df_patients))
IPython.display.display(progress_patient_count)

for patient_count, (patient_id_current, patient_documentset_current) in enumerate(
    patient_id_to_documentset.items()
):
    # One row per document, sanitized so the dataframe can be written to Excel.
    df_patient_current = pd.DataFrame.from_records(patient_documentset_current.documents)
    df_patient_current = sanitize_dataframe(df_patient_current)

    patient_id_to_df[patient_id_current] = df_patient_current

    progress_patient_count.value = patient_count + 1

## Build Export

### Reset Export File List

In [None]:
# Files accumulated for the export. Running this cell again discards anything
# previously added through patient_data_export_file.
patient_data_export_file_list: List[ExportFile] = []

### Documentation: Overall Export

- Data is originally taken from two database exports: one from FHCC and one from MultiCare.
- A "database" column is added to indicate the origin of each patient.
- There were 6 pilot patients. These have been completely removed from the export.

In [None]:
# Export the "Documentation: Overall Export" markdown cell of this notebook
# as documentation.md.
patient_data_export_file(
    ExportFile.from_markdown(
        "documentation.md",
        markdown_documentation("Overall Export"),
    )
)

### Documentation: Raw Export

- Raw export files have not been organized or processed.
- They are approximately "everything", but will need to be transformed to be useful for different analyses.

In [None]:
# Export the "Documentation: Raw Export" markdown cell of this notebook as raw.md.
patient_data_export_file(
    ExportFile.from_markdown(
        "raw.md",
        markdown_documentation("Raw Export"),
    )
)

# Export the dataframe of patients as an Excel workbook.
patient_data_export_file(
    ExportFile.from_excel(
        "raw.patients.xlsx",
        excel_dataframe(df_patients),
    )
)

### Sandbox for Per-Patient Export

In [None]:
progress_patient_count = ipywidgets.IntProgress(min=0, max=len(df_patients))
IPython.display.display(progress_patient_count)

for patient_count, (patient_id_current, df_patient_current) in enumerate(
    patient_id_to_df.items()
):
    # Each patient gets a directory named for its id, holding a raw workbook.
    patient_export_path = (
        pathlib.Path("patient_{}".format(patient_id_current)) / "raw.xlsx"
    )
    patient_export_excel = excel_dataframe(df_patient_current)

    patient_data_export_file(
        ExportFile.from_excel(patient_export_path, patient_export_excel)
    )

    progress_patient_count.value = patient_count + 1

## Export

In [None]:
# The export is stored in a single zip file, alongside the encrypted archives.
export_path = pathlib.Path(archive_dir_path, "export_{}.zip".format(archive_suffix))

# Mode "xb" creates the file and fails if it already exists.
with open(export_path, mode="xb") as archive_file:
    with pyzipper.AESZipFile(
        archive_file,
        "w",
        compression=pyzipper.ZIP_LZMA,
        encryption=pyzipper.WZ_AES,
    ) as archive_zipfile:
        # The export is protected with the same password as the archives.
        archive_zipfile.setpassword(archive_password.encode("utf-8"))

        for file_current in patient_data_export_file_list:
            # Choose the bytes to store according to the type of file.
            if file_current.type == ExportFileType.EXCEL:
                file_content = file_current.bytes
            elif file_current.type == ExportFileType.MARKDOWN:
                file_content = file_current.text.encode("utf-8")
            else:
                raise ValueError("Unknown ExportFileType")

            archive_zipfile.writestr(str(file_current.path), file_content)