## Extracting data from SafetyPAD API Pull

The previous script _pulled_ data from the SafetyPAD API. This script _parses_ that
data into a CSV format.

Please note that this script should still work in the future, but it is not part
of our front-to-back runs: it outputs `demographic_data.csv`, and all of those
runs begin at that point.

In [None]:
import json

import pandas as pd

from femsntl.datafiles import PRIVATE_DATA_DIR

In [None]:
# Load the raw PCR pull (presumably written by the previous pull script --
# confirm) from PRIVATE_DATA_DIR/pcrs/final.json.
# The encoding is pinned to UTF-8, the standard JSON encoding, so the read does
# not depend on the platform's locale default.
with open(PRIVATE_DATA_DIR / "pcrs" / "final.json", "rt", encoding="utf-8") as infile:
    data = json.load(infile)

In [None]:
# Output column names, in the order they are written to the CSV.
column_names = [
    "pcr_id",
    "fems_id",
    "last_name",
    "first_name",
    "middle_name",
    "home_address",
    "home_city",
    "home_county",
    "home_state",
    "home_zip",
    "home_country",
    "home_tract",
    "ssn",
    "gender",
    "race",
    "age",
    "age_units",
    "date_of_birth",
    "phone_number",
    "email_address",
    "drivers_license_state",
    "drivers_license_number",
    "method_of_payment",
    "insurance_company_name",
    "insurance_group_number",
    "insurance_policy_number",
]

# Source tags, positionally aligned with `column_names`: two leading identifier
# tags, a consecutive run of ePatient tags starting at ePatient.02, and four
# ePayment tags.
leading_tags = ["eCase.01m", "eResponse.03"]
payment_tags = ["ePayment.{:02d}".format(i) for i in [1, 10, 17, 18]]

# Every column not covered by the leading or payment tags is an ePatient field.
# The earlier bound (len(column_names) + 2 - 4) did not subtract the two leading
# tags, so it generated two extra ePatient tags: 28 tags for 26 columns.
n_patient_tags = len(column_names) - len(leading_tags) - len(payment_tags)
patient_tags = ["ePatient.{:02d}".format(i) for i in range(2, 2 + n_patient_tags)]

tag_names = leading_tags + patient_tags + payment_tags

# Fail early if the two lists ever drift apart (e.g. too few columns), rather
# than producing misaligned or rejected records later on.
if len(tag_names) != len(column_names):
    raise ValueError(
        "{} tags do not line up with {} columns".format(
            len(tag_names), len(column_names)
        )
    )

In [None]:
def is_none(x):
    """Map the literal string "None" to ``None``; return anything else unchanged."""
    if x == "None":
        return None
    return x


# One dict per entry in `data`, mapping each parsed element's tag to its text,
# with the placeholder string "None" replaced by a real None.
dict_data = [
    dict((element["tag"], is_none(element["text"])) for element in pcr["parsed"])
    for pcr in data
]

In [None]:
# Number of parsed records (one tag -> text dict per entry in `data`).
len(dict_data)

In [None]:
# Flatten each tag -> text dict into a row whose values follow the order of
# `tag_names`; a record lacking any of the tags raises KeyError.
records = [
    [parsed_pcr[tag] for tag in tag_names]
    for parsed_pcr in dict_data
]

In [None]:
# Number of row lists built from `dict_data`.
len(records)

In [None]:
# Assemble the rows into a DataFrame, labelling the columns with `column_names`.
formatted_df = pd.DataFrame.from_records(
    data=records,
    columns=column_names,
)

In [None]:
# Number of rows in the assembled DataFrame.
len(formatted_df)

In [None]:
# Frequency of each method-of-payment value; missing values are counted too.
formatted_df["method_of_payment"].value_counts(dropna=False)

In [None]:
# Write the table to PRIVATE_DATA_DIR/demographic_data.csv without the index column.
demographics_csv = PRIVATE_DATA_DIR / "demographic_data.csv"
formatted_df.to_csv(demographics_csv, index=False)

In [None]:
# Column labels of the assembled DataFrame.
formatted_df.columns

In [None]:
# Fraction of rows whose date of birth is missing.
formatted_df["date_of_birth"].isnull().mean()

In [None]:
# Fraction of rows whose driver's license number is missing.
formatted_df["drivers_license_number"].isnull().mean()