# Patient Summary

A notebook summarizing patient data.

## Imports

In [None]:
import getpass
import itertools
import json
import operator
import pathlib

import pandas as pd
from scope.documents import document_set
from scope.populate.data.archive import Archive

## Obtain Archive Password 

In [None]:
# Obtain password to encrypted archives.
# Prompt with getpass rather than input() so the typed password is not echoed
# into the cell output, where it would be saved along with the notebook.
archive_password = getpass.getpass("Encrypted archive password: ")

## Obtain Archive Paths

In [None]:
# Build the path to each encrypted archive.

# Directory expected to hold the encrypted archives ("secrets/data"),
# expressed relative to the location of the notebook.
archive_directory = pathlib.Path("../../../secrets/data")

# File name of each archive.
archive_multicare_file_name = "archive_multicare_v0.8.0_20230821.zip"
archive_scca_file_name = "archive_scca_v0.8.0_20230821.zip"

# Path of each archive within that directory.
archive_multicare_path = archive_directory / archive_multicare_file_name
archive_scca_path = archive_directory / archive_scca_file_name

## Decrypt Archives

In [None]:
# Read the MultiCare archive using the provided password,
# then report how many entries it contains.
print("Decrypting archive:")
print(archive_multicare_path.resolve())

archive_multicare = Archive.read_archive(
    archive_path=archive_multicare_path,
    password=archive_password,
)

multicare_document_count = len(archive_multicare.entries.values())
print(f"{multicare_document_count} documents.")

In [None]:
# Read the SCCA archive using the provided password,
# then report how many entries it contains.
print("Decrypting archive:")
print(archive_scca_path.resolve())

archive_scca = Archive.read_archive(
    archive_path=archive_scca_path,
    password=archive_password,
)

scca_document_count = len(archive_scca.entries.values())
print(f"{scca_document_count} documents.")

## Obtain Patient Datatable

Unify patient documents from the two databases.

In [None]:
# Get patient documents from MultiCare.
# remove_sentinel() and remove_revisions() are applied so that, presumably,
# only the current revision of each patient document remains.
documents_multicare_patients = (
    archive_multicare.collection_documents(
        collection="patients",
    )
    .remove_sentinel()
    .remove_revisions()
)
df_multicare_patients = pd.DataFrame.from_records(
    documents_multicare_patients.documents
)
# Record which database each row came from.
df_multicare_patients["database"] = "multicare"

# Get patient documents from SCCA, in the same manner.
documents_scca_patients = (
    archive_scca.collection_documents(
        collection="patients",
    )
    .remove_sentinel()
    .remove_revisions()
)
df_scca_patients = pd.DataFrame.from_records(documents_scca_patients.documents)
# Record which database each row came from.
df_scca_patients["database"] = "scca"

# Unify all current patient documents.
# ignore_index=True gives the unified table a fresh 0..n-1 index, rather than
# keeping each table's own row positions and then inserting them as a
# meaningless "index" column.
df_patients = pd.concat(
    [df_multicare_patients, df_scca_patients],
    ignore_index=True,
)

In [None]:
# Create a helper for accessing the document collection of a unified patient.
def patient_documents(row_patient) -> document_set.DocumentSet:
    """
    Obtain the documents in the collection of a unified patient.

    row_patient is a row of the unified patient table.
    Its "database" value selects which archive is consulted,
    and its "collection" value names the collection to obtain from that archive.

    Raises ValueError if "database" is neither "multicare" nor "scca".
    """
    database = row_patient["database"]

    if database == "multicare":
        archive = archive_multicare
    elif database == "scca":
        archive = archive_scca
    else:
        # Name the unexpected value so the failure can be diagnosed.
        raise ValueError("Unknown patient database: {!r}".format(database))

    return archive.collection_documents(collection=row_patient["collection"])

## Filter Pilot Patients

Remove the 6 pilot patients.

In [None]:
# Identifiers of the 6 pilot patients, which are excluded from the summary.
PILOT_PATIENT_IDS = [
    "ymzwx6e6w6kqi",
    "mmmb54v52l7re",
    "ouoa4ucldbhie",
    "zazst4yu23a5q",
    "wf4btxqjtd2oa",
    "s3bcmgmp7gdss",
]

# Keep only the rows whose patientId is not a pilot patient.
# drop=True discards the old index instead of inserting it as a new column,
# so no stray index column is added and the cell can safely be run again.
df_patients = df_patients[
    ~df_patients["patientId"].isin(PILOT_PATIENT_IDS)
].reset_index(drop=True)

## Filter Patient Columns

Filter columns to those which are most relevant.

In [None]:
# Restrict the table to the most relevant columns, in this order.
relevant_columns = ["database", "patientId", "collection", "name"]
df_patients = df_patients[relevant_columns]

## Summarize Patients

In [None]:
# Collect the "_type" of every raw document across both archives,
# building the set of known document types.
document_types = set(
    map(
        operator.itemgetter("_type"),
        itertools.chain(
            archive_multicare.entries.values(),
            archive_scca.entries.values(),
        ),
    )
)

# Display the known types in sorted order.
sorted(document_types)

In [None]:
def apply_num_documents(row_patient):
    """
    Add document counts to the row of a patient.

    Sets "numDocumentRevisions", "numDocuments", and a count column
    for each of several document types, then returns the same row.
    """
    documents = patient_documents(row_patient).remove_sentinel()

    # Count every remaining document.
    # This includes revisions and even deletion markers.
    row_patient["numDocumentRevisions"] = len(documents.documents)

    # Revisions are grouped under a key.
    # For singleton document types, the key is the type.
    # For set document types, the key is the type plus the set id.
    keys = documents.group_revisions().keys()
    row_patient["numDocuments"] = len(keys)

    # Count keys of certain document types, matching on the first element of the key.
    # Because the keys come from group_revisions(), each count covers documents
    # that may have evolved over time (including possible deletion).
    counted_types = {
        "numActivityDocuments": "activity",
        "numActivityLogDocuments": "activityLog",
        "numAssessmentLogDocuments": "assessmentLog",
        "numMoodLogDocuments": "moodLog",
        "numValueDocuments": "value",
    }
    for column_name, document_type in counted_types.items():
        row_patient[column_name] = sum(
            1 for key in keys if key[0] == document_type
        )

    return row_patient

In [None]:
# Compute the document counts of each patient, one row at a time.
df_patients = df_patients.apply(apply_num_documents, axis=1)

## Display Patients

In [None]:
# Intentionally do not display patient names.
df_patients_display = df_patients.drop(columns=["name"])

# Display with the row, column, and width display options set to None.
with pd.option_context(
    "display.max_rows", None,
    "display.max_columns", None,
    "display.width", None,
):
    display(df_patients_display)

## Output CSV

In [None]:
# Write the patient summary to a CSV file.
# Intentionally do not export patient names.
df_patients.drop(columns=["name"]).to_csv("patientsummary-export.csv")