# Patient Data Export

A notebook exporting patient data for study analyses.

## Imports

In [None]:
import getpass
import io
import itertools
import json
import operator
import pathlib

import nbformat
import pandas as pd
from scope.documents import document_set
from scope.populate.data.archive import Archive

## Utilities

### Documentation Utility

This utility retrieves the content of markdown cells within this notebook.
It is intended to allow including the content of those cells as documentation in the resulting export.

In [None]:
def documentation_markdown(documentation_name, notebook_path="patientdata.ipynb"):
    """
    Retrieve the source of a documentation markdown cell from a notebook.

    A documentation cell is a markdown cell whose first line is a heading of
    the form "## Documentation: <documentation_name>". Any markdown heading
    level ("#" through "######") is accepted, because documentation cells are
    nested under section headings of differing depth. Trailing whitespace on
    the heading line is ignored, and the heading may be the only line of the cell.

    Args:
        documentation_name: Name that follows "Documentation: " in the heading.
        notebook_path: Path of the notebook to search. Defaults to this notebook.

    Returns:
        The complete source of the first matching markdown cell, heading included.

    Raises:
        ValueError: If no markdown cell has a matching documentation heading.
    """
    # Load the notebook without converting between notebook format versions.
    notebook = nbformat.read(notebook_path, nbformat.NO_CONVERT)

    # Text expected on the heading line after the leading "#" characters.
    heading_expected = " Documentation: {}".format(documentation_name).rstrip()

    # Go through each cell, looking for a match.
    for cell_current in notebook["cells"]:
        if cell_current["cell_type"] != "markdown":
            continue

        # Only the first line of a cell can be its documentation heading.
        heading_line = cell_current["source"].split("\n", 1)[0].rstrip()
        heading_level = len(heading_line) - len(heading_line.lstrip("#"))

        if 1 <= heading_level <= 6 and heading_line[heading_level:] == heading_expected:
            return cell_current["source"]

    # If no match was found, raise a ValueError.
    raise ValueError(
        "No matching documentation cell found: {}".format(documentation_name)
    )

## Input

### Obtain Archive Suffix 

In [None]:
# Obtain suffix indicating desired version of encrypted archives.
# Do not include the '.zip' suffix.
archive_suffix = input("Encrypted archive suffix: ").strip()

# The archive file names append '.zip' themselves, so tolerate a suffix entered
# with the extension rather than building a file name ending in '.zip.zip'.
if archive_suffix.endswith(".zip"):
    archive_suffix = archive_suffix[: -len(".zip")]

### Obtain Archive Password 

In [None]:
# Obtain password to encrypted archives.
# Prompt with getpass rather than input so the password is not echoed as it is
# typed and does not end up displayed in the notebook's cell output.
archive_password = getpass.getpass("Encrypted archive password: ")

## Load Archives

### Documentation: Overall Export

- Data is originally taken from two database exports: one from FHCC and one from MultiCare.
- Data is then combined in a single datatable.
- A "database" column is added to indicate the origin of each patient.
- There were 6 pilot patients; these have been completely removed from the export.

### Decrypt Archives

In [None]:
# Build the location of each encrypted archive.

# Encrypted archives are expected in the "secrets/data" directory,
# which is located relative to the location of the notebook.
archive_directory = pathlib.Path("../../../secrets/data")

# Each archive file is named for its database and the requested suffix.
archive_multicare_file_name = "archive_multicare_{}.zip".format(archive_suffix)
archive_scca_file_name = "archive_scca_{}.zip".format(archive_suffix)

# Join the directory and file name into the path of each encrypted archive.
archive_multicare_path = archive_directory / archive_multicare_file_name
archive_scca_path = archive_directory / archive_scca_file_name

In [None]:
# Report which archive is about to be decrypted, as an absolute path.
print("Decrypting archive:")
archive_multicare_resolved = archive_multicare_path.resolve()
print("{}".format(archive_multicare_resolved))

# Decrypt and load the MultiCare archive using the provided password.
archive_multicare = Archive.read_archive(
    password=archive_password,
    archive_path=archive_multicare_path,
)

# Report how many entries the archive contains.
archive_multicare_count = len(archive_multicare.entries.values())
print("{} documents.".format(archive_multicare_count))

In [None]:
# Report which archive is about to be decrypted, as an absolute path.
print("Decrypting archive:")
archive_scca_resolved = archive_scca_path.resolve()
print("{}".format(archive_scca_resolved))

# Decrypt and load the SCCA archive using the provided password.
archive_scca = Archive.read_archive(
    password=archive_password,
    archive_path=archive_scca_path,
)

# Report how many entries the archive contains.
archive_scca_count = len(archive_scca.entries.values())
print("{} documents.".format(archive_scca_count))

### Combined in Datatable

In [None]:
# Build one row per patient document from each archive, tagging every row with
# the database it came from, then combine both sources into a single DataFrame.
# NOTE(review): remove_sentinel / remove_revisions presumably leave only the
# current revision of each patient document - confirm against DocumentSet.

# Get patient documents from MultiCare.
documents_multicare_patients = (
    archive_multicare.collection_documents(
        collection="patients",
    )
    .remove_sentinel()
    .remove_revisions()
)
df_multicare_patients = pd.DataFrame.from_records(
    documents_multicare_patients.documents
)
df_multicare_patients["database"] = "multicare"

# Get patient documents from SCCA.
# These rows are labeled "fhcc" in the "database" column.
documents_scca_patients = (
    archive_scca.collection_documents(
        collection="patients",
    )
    .remove_sentinel()
    .remove_revisions()
)
df_scca_patients = pd.DataFrame.from_records(documents_scca_patients.documents)
df_scca_patients["database"] = "fhcc"

# Unify all current patient documents.
# ignore_index=True renumbers the combined rows 0..n-1 and discards the
# per-source row numbers, instead of keeping them as a meaningless "index"
# column that would otherwise be carried into the export.
df_patients = pd.concat(
    [df_multicare_patients, df_scca_patients],
    ignore_index=True,
)

In [None]:
# Create a helper for accessing the document collection of a unified patient.
def patient_documents(row_patient) -> document_set.DocumentSet:
    """
    Obtain the documents of a patient from the archive the patient came from.

    Args:
        row_patient: Mapping with a "database" entry ("multicare" or "fhcc")
            and a "collection" entry naming the collection to retrieve.
            Presumably a row of the unified patient DataFrame - confirm with callers.

    Returns:
        The result of collection_documents for that collection, taken from the
        archive selected by the "database" entry.

    Raises:
        ValueError: If the "database" entry is not a known database.
    """
    database = row_patient["database"]

    # Select the archive that the patient was loaded from.
    if database == "multicare":
        archive = archive_multicare
    elif database == "fhcc":
        archive = archive_scca
    else:
        # Name the offending value so a bad row can be diagnosed.
        raise ValueError("Unknown patient database: {!r}".format(database))

    return archive.collection_documents(collection=row_patient["collection"])

### Filter Pilot Patients

Remove the 6 pilot patients.

In [None]:
# Identifiers of the 6 pilot patients, which are excluded from the export.
pilot_patient_ids = [
    "ymzwx6e6w6kqi",
    "mmmb54v52l7re",
    "ouoa4ucldbhie",
    "zazst4yu23a5q",
    "wf4btxqjtd2oa",
    "s3bcmgmp7gdss",
]

# Keep only the rows whose patientId is not a pilot patient.
# drop=True discards the previous index instead of inserting it as an extra
# column; without it each run adds another index column ("index", "level_0")
# to the export, and re-running this cell eventually raises a ValueError.
df_patients = df_patients[
    ~df_patients["patientId"].isin(pilot_patient_ids)
].reset_index(drop=True)

## Export

In [None]:
# Write the patient table as an Excel workbook into an in-memory buffer.
# The buffer is deliberately not named "bytes": that would shadow the builtin
# type for every cell executed after this one.
export_bytes = io.BytesIO()
df_patients.to_excel(export_bytes)

# Display the buffer object to confirm the export was produced.
print(export_bytes)

# Junk