# Imports

In [1]:
import re
from typing import List, Optional

import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '../../')
from IPython.core.interactiveshell import InteractiveShell

from femsntl.datafiles import INTERMEDIATE_DIR, PUBLIC_DATA_DIR
from femsntl.df_verbs import case_when
from femsntl.utils import get_mostrec

# Display the value of every bare expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# Render floats with four decimal places when pandas objects are displayed
pd.options.display.float_format = "{:.4f}".format

# Functions to migrate

Three units of analysis:

- `code-level`: the raw form of the claims data. Each unique claim (`ClaimTCNText`) is repeated across rows, with each row carrying different information about that claim --- e.g., one claim might be tied to multiple codes indicating different procedures performed or specialists seen
- `claim-level`: aggregates up from codes to unique claims
- `beneficiary (bene)-level`: aggregates up from claims to unique medicaid beneficiaries (`MedicaidSystemID`)

In [2]:
def construct_edvisit(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse code-level claims to one row per claim and flag emergency dept (ED) visits

    Args:
        df: code-level df; must contain RevenueCode, ClaimTCNText,
            PrimaryDiagnosisCode and MedicaidSystemID

    Returns:
        output_df: one row per ClaimTCNText with
            * is_ed_visit: any revenue code on the claim is in ED_VISIT_CODES
            * is_inpatient_admit: any revenue code on the claim is in ROOM_BOARD_CODES
            * is_ed_visit_not_inpatient / is_ed_visit_with_inpatient: ED visit
              without / with an inpatient revenue code on the same claim
            * MedicaidSystemID and PrimaryDiagnosisCode (first value within the claim);
              the diagnosis code is used later to code ED necessity of the visit

    Raises:
        AssertionError: if any claim does not have exactly one distinct
            PrimaryDiagnosisCode
    """
    coded = df.copy()

    # row-level flags: one revenue code per row
    coded["is_ed_visit"] = coded["RevenueCode"].isin(ED_VISIT_CODES)
    coded["is_inpatient_admit"] = coded["RevenueCode"].isin(ROOM_BOARD_CODES)

    by_claim = coded.groupby("ClaimTCNText")

    # Taking the "first" diagnosis code below is only meaningful if every
    # claim carries a single primary diagnosis code, so check that up front
    dx_codes_per_claim = by_claim["PrimaryDiagnosisCode"].nunique()
    assert (dx_codes_per_claim == 1).all()

    # "max" over booleans == "any row on the claim had the flag"
    how_to_aggregate = {
        "is_ed_visit": "max",
        "is_inpatient_admit": "max",
        "MedicaidSystemID": "first",
        "PrimaryDiagnosisCode": "first",
    }
    output_df = by_claim.agg(how_to_aggregate).reset_index()

    any_ed = output_df["is_ed_visit"]
    any_inpatient = output_df["is_inpatient_admit"]
    output_df["is_ed_visit_not_inpatient"] = any_ed & ~any_inpatient
    output_df["is_ed_visit_with_inpatient"] = any_ed & any_inpatient

    return output_df


def mergeclaims_beneficiaries(
    claims_df: pd.DataFrame, benef_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Left-join claims onto beneficiaries using MedicaidSystemID

    Args:
        claims_df: code- or claim-level df (several rows per beneficiary allowed);
            must contain MedicaidSystemID
        benef_df: beneficiary-level df; must contain num_1, dispo_broad,
            event_status and MedicaidSystemID

    Returns:
        merged_df: the four beneficiary fields plus every claims_df column, with
            each beneficiary repeated once per matching claim. Beneficiaries
            without claims are kept (claim columns are missing); claims whose
            MedicaidSystemID is not in benef_df are dropped.

    Raises:
        AssertionError: if the number of distinct MedicaidSystemID values
            changes in the merge
    """
    id_col = "MedicaidSystemID"

    # only these beneficiary fields are carried into the merged data
    benef_fields = ["num_1", "dispo_broad", "event_status", id_col]
    benef_subset = benef_df[benef_fields]

    # "left" keeps beneficiaries with no claims and discards claims that are
    # not tied to one of our beneficiaries (the claims pull used a more
    # inclusive match than we use here)
    merged_df = benef_subset.merge(claims_df, on=id_col, how="left")

    # no beneficiary should be lost or invented by the merge
    n_ids_before = benef_subset[id_col].nunique()
    n_ids_after = merged_df[id_col].nunique()
    assert n_ids_before == n_ids_after

    return merged_df


def construct_nonED_care(
    df: pd.DataFrame, claim_types_to_remove: List[str]
) -> pd.DataFrame:
    """
    Collapse code-level claims to one row per claim and flag general (non-ED) care

    A row counts as general care unless its RevenueCode is in ED_VISIT_CODES,
    its ProcedureCode is in ED_PHYSICIAN_CODES, or its ClaimTypeDescription is
    one of claim_types_to_remove.

    Args:
        df: code-level df with RevenueCode, ProcedureCode, ClaimTypeDescription,
            ClaimTCNText and MedicaidSystemID
        claim_types_to_remove: ClaimTypeDescription values that should not
            count as general care

    Returns:
       claim-level df (one row per ClaimTCNText) with is_general_care, True if
       any row of the claim qualifies, and the claim's first MedicaidSystemID
    """
    coded = df.copy()

    # three separate reasons a row is NOT general care
    has_ed_revenue_code = coded["RevenueCode"].isin(ED_VISIT_CODES)
    has_ed_physician_code = coded["ProcedureCode"].isin(ED_PHYSICIAN_CODES)
    has_excluded_claim_type = coded["ClaimTypeDescription"].isin(
        claim_types_to_remove
    )

    coded["is_not_general_care"] = (
        has_ed_revenue_code | has_ed_physician_code | has_excluded_claim_type
    )
    coded["is_general_care"] = ~coded["is_not_general_care"]

    # input is one row per code; "max" over booleans flags the claim if any
    # of its rows is general care
    claim_level = coded.groupby("ClaimTCNText").agg(
        {"is_general_care": "max", "MedicaidSystemID": "first"}
    )
    return claim_level.reset_index()


DEFAULT_VARS_TO_INCLUDE = ["MedicaidSystemID", "dispo_broad", "event_status", "num_1"]


def code_noclaims(
    df: pd.DataFrame,
    outcome_varnames: List[str],
    code_to: bool = False,
    vars_to_include: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Add constant-valued outcome columns for ntl-ers who matched to the beneficiary
    file but have no claims in a particular window

    This differs from imputation: for these people we affirmatively observe
    "no claims", so every outcome gets the same known value.

    Args:
        df: people matched to the beneficiaries file but w/ no claims in window
        outcome_varnames: names of the outcome columns to add
        code_to: value written into every added outcome column
        vars_to_include: columns of df to retain; DEFAULT_VARS_TO_INCLUDE is
            used when this is None or empty

    Returns:
        df_pluscols: copy of the retained columns followed by one column per
        outcome, each filled with code_to (the input df is not modified)
    """
    if not vars_to_include:
        vars_to_include = DEFAULT_VARS_TO_INCLUDE

    retained = df[vars_to_include].copy()

    # reindexing onto the wider column list creates the outcome columns and
    # fills them with the requested constant
    target_columns = list(retained.columns) + list(outcome_varnames)
    df_pluscols = retained.reindex(columns=target_columns, fill_value=code_to)

    return df_pluscols


def summarize_bygroup(df: pd.DataFrame, col: str, col_type: str = "binary"):
    """
    Print basic summary statistics of `col` broken out by "dispo_broad"

    Nothing is returned; the summaries are printed.

    Args:
        df: The data frame to summarize; needs columns "dispo_broad" and `col`
        col: The column to summarize against "dispo_broad"
        col_type: One of:
            * "binary": prints a cross tab of counts, then the same cross tab
              normalized within each row
            * "continuous": prints the mean of `col` for each "dispo_broad"

    Raises:
        ValueError: if col_type is neither "binary" nor "continuous"
    """
    if col_type not in ("binary", "continuous"):
        raise ValueError(
            f"col_type must be one of ['binary', 'continuous'] not {col_type}"
        )

    if col_type == "continuous":
        print(df.groupby("dispo_broad")[col].mean())
        return

    # binary: raw counts first, then row-normalized shares
    for normalize in (False, "index"):
        print(pd.crosstab(df.dispo_broad, df[col], normalize=normalize))


def merge_dx_classification(
    data: pd.DataFrame, icd_classify_data: pd.DataFrame, time_horizon: str
) -> pd.DataFrame:
    """
    Clean the primary dx code attached to each claim and merge on ICD classifications

    Args:
        data: code- or claim-level data with a PrimaryDiagnosisCode column.
            NOTE: a cleaned `primarydx_tomerge` column is added to this frame
            in place.
        icd_classify_data: classification table keyed by `icd10cm` (codes
            written without a "."), e.g. the NYU ED algorithm table
        time_horizon: label for the claims window; only used in the printed
            match-rate message

    Returns:
        claims_wEDclass: `data` left-joined to `icd_classify_data` on the
        cleaned dx code, so rows whose code has no classification are kept
        with missing classification columns
    """

    ## clean dx code: drop the literal "." so codes line up with icd10cm.
    ## regex=False is explicit because the default of `regex` differs across
    ## pandas versions; with a non-regex default the old pattern "\\." would
    ## look for a backslash followed by a dot and strip nothing
    data["primarydx_tomerge"] = data["PrimaryDiagnosisCode"].str.replace(
        ".", "", regex=False
    )

    ## print match rate
    n_unique_codes = len(data["primarydx_tomerge"].unique())
    n_matched_codes = len(
        set(data["primarydx_tomerge"]).intersection(icd_classify_data["icd10cm"])
    )
    print(
        f"Out of {n_unique_codes} unique dx codes in callers claims in "
        f"{time_horizon} we matched {n_matched_codes} with the NYU classifications"
    )

    ## merge onto claims using icd code
    ## only merge for people with some claims since we'll later code
    ## outcomes for those with no claims
    claims_wEDclass = pd.merge(
        data,
        icd_classify_data,
        left_on="primarydx_tomerge",
        right_on="icd10cm",
        how="left",
    )
    return claims_wEDclass


def classify_visit_approp(
    data: pd.DataFrame, edcol_touse: str = "is_ed_visit_not_inpatient"
):
    """
    Classify each visit's ED appropriateness from the merged classification columns

    Adds, in place, three label columns and four boolean flags:
        * edvisit_nonemergent: from Non_Emergent (> 0 / == 0 / otherwise unknown)
        * edvisit_pctreatable: from Emergent__PC_Treatable (same pattern)
        * unnecessary_ED: combines the two labels above
        * is_unnecessary_ED, is_necessary_ED, is_unclassified_ED, is_not_ED:
          one boolean per unnecessary_ED label, to help with aggregation later

    NOTE(review): the label logic assumes `case_when` assigns the label of the
    first condition that holds and falls back to its last argument -- confirm
    against femsntl.df_verbs.

    Args:
        data: The data frame to code; needs `edcol_touse`, Non_Emergent and
            Emergent__PC_Treatable
        edcol_touse: name of the boolean ED-visit column to use (the default
            excludes visits with an inpatient admission)

    Returns:
        data: the same data frame with the columns above added
    """
    is_ed = data[edcol_touse]
    unknown_label = "ED visit; unknown status"
    not_ed_label = "Not ED visit"

    data["edvisit_nonemergent"] = case_when(
        is_ed & (data["Non_Emergent"] > 0),
        "ED visit; non-emergent",
        is_ed & (data["Non_Emergent"] == 0),
        "ED visit; emergent",
        is_ed,
        unknown_label,
        not_ed_label,
    )

    data["edvisit_pctreatable"] = case_when(
        is_ed & (data["Emergent__PC_Treatable"] > 0),
        "ED visit; PC treatable",
        is_ed & (data["Emergent__PC_Treatable"] == 0),
        "ED visit; not PC treatable",
        is_ed,
        unknown_label,
        not_ed_label,
    )

    ## unnecessary is if it's non-emergent OR pc treatable; the remaining
    ## conditions are only reached when that is not the case
    nonemergent = data["edvisit_nonemergent"]
    pctreatable = data["edvisit_pctreatable"]
    data["unnecessary_ED"] = case_when(
        (nonemergent == "ED visit; non-emergent")
        | (pctreatable == "ED visit; PC treatable"),
        "Unnecessary ED visit (non-emergent or PC treatable)",
        (nonemergent == "ED visit; emergent")
        | (pctreatable == "ED visit; not PC treatable"),
        "Necessary ED visit (emergent; not PC treatable)",
        (nonemergent == unknown_label) | (pctreatable == unknown_label),
        "Unclassified ED visit",
        not_ed_label,
    )

    ## one boolean flag per category of the factor above
    flag_for_label = {
        "is_unnecessary_ED": "Unnecessary ED visit (non-emergent or PC treatable)",
        "is_necessary_ED": "Necessary ED visit (emergent; not PC treatable)",
        "is_unclassified_ED": "Unclassified ED visit",
        "is_not_ED": "Not ED visit",
    }
    for flag_col, label in flag_for_label.items():
        data[flag_col] = np.where(data["unnecessary_ED"] == label, True, False)

    return data


def construct_binary_measures(data, outcome_vars):
    """
    Add a "one or more" boolean version of each count/continuous outcome

    Args:
        data: The data frame to code (modified in place)
        outcome_vars: names of the outcome columns to binarize

    Returns:
        data: the same data frame, with a `<outcome>_1ormore` column per
        outcome that is True where the outcome is >= 1 and False otherwise
        (missing values therefore become False)
    """
    for outcome in outcome_vars:
        at_least_one = data[outcome].ge(1)
        data[outcome + "_1ormore"] = np.where(at_least_one, True, False)
    return data


def impute_nonmatch(
    df: pd.DataFrame,
    inclusion_method: str,
    outcome_varnames: List[str],
    code_to_fornumeric: float = np.nan,  # will change after bounding
    vars_to_include: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Select which non-matches to Medicaid beneficiaries get imputed outcomes and
    add placeholder plus bounded outcome columns for them

    Args:
        df: The data frame to code; needs `event_status`, the columns in
            `vars_to_include` and, for the two "analytic_*" methods, the
            boolean column `is_miss_nameordob`
        inclusion_method: One of the following that tells which non-matches to
            impute outcomes for
            * "all": include all non-matches, even if they had both name and dob, so a chance to be matched
            * "analytic_liberal": include non-matches flagged by `is_miss_nameordob` or with a call response
                                (event_status) that was associated with lower match rates
            * "analytic_conservative": only include non-matches flagged by `is_miss_nameordob`
            NOTE(review): earlier documentation described the flag as "missing
            both name and dob" while the column name reads "name or dob" --
            confirm which is meant.
        outcome_varnames: list of outcomes to code
        code_to_fornumeric: value written to the unsuffixed outcome columns
        vars_to_include: columns of df to retain; DEFAULT_VARS_TO_INCLUDE is
            used when this is None or empty. Must include "dispo_broad", which
            `bound_binary` reads.

    Returns:
        df: Data frame with the retained columns, one column per outcome filled
        with `code_to_fornumeric`, and the `_optimistic` / `_pessimistic`
        versions added by `bound_binary`

    Raises:
        ValueError: if inclusion_method is not one of the three options
    """
    # Honor the caller's column list; previously the argument was overwritten
    # with the default and silently ignored (same fallback as code_noclaims)
    vars_to_include = vars_to_include or DEFAULT_VARS_TO_INCLUDE

    # call responses that were associated with lower match rates
    low_match_event_statuses = [
        "NTL Handled - Clinical Referral",
        "Field Requested NTL - NTL Clinical Referral",
        "NTL Handled - RSC",
        "NTL - Other",
        "Field Requested NTL - NTL Other",
    ]

    if inclusion_method == "all":
        pass
    elif inclusion_method == "analytic_liberal":
        df = df[
            (df.is_miss_nameordob) | df.event_status.isin(low_match_event_statuses)
        ]
    elif inclusion_method == "analytic_conservative":
        df = df[df.is_miss_nameordob]
    else:
        raise ValueError(
            f"inclusion_method must be one of ['all', 'analytic_liberal', 'analytic_conservative'] not {inclusion_method}"
        )

    df = df[vars_to_include].copy()

    ## first, add columns generally
    df_add = df.reindex(
        columns=[*df.columns.tolist(), *outcome_varnames], fill_value=code_to_fornumeric
    )

    ## then, add bounded versions
    df_wbounds = bound_binary(df_add, outcome_varnames)

    return df_wbounds


def bound_binary(
    data: pd.DataFrame, cols_tobound: List["str"], optimistic_tx: bool = False
) -> pd.DataFrame:

    """
    Add optimistic and pessimistic bounds of binary outcomes for respondents who:
    (1) did not match to Medicaid file but
    (2) were flagged using the impute_nonmatch function as ones who had little chance of matching due to
    missing identifiers

    For every column in cols_tobound two columns are written in place:
        * `<col>_optimistic`: optimistic_tx for rows whose dispo_broad is
          "NTL treatment", its negation for every other row
        * `<col>_pessimistic`: the reverse
    With the default (False) the optimistic scenario is that tx had none and
    control had some, which is the relevant direction for "bad" outcomes like
    ed visits.

    Args:
        data: The data frame to code; needs a dispo_broad column
        cols_tobound: list of columns to create bounds for
        optimistic_tx: value of optimistic bound for tx group members (others are variants of that)

    Returns:
        data: the same data frame with bounded versions of columns

    Assumes that all cols in cols_tobound are imputed in the same direction (for us, makes less sense for
    the outcome of necessary ED visits but that's an unregistered outcome)

    If need to impute in diff directions, run with separate lists of cols_tobound
    """
    # (suffix, value for tx rows, value for all other rows)
    scenarios = (
        ("_optimistic", optimistic_tx, not optimistic_tx),
        ("_pessimistic", not optimistic_tx, optimistic_tx),
    )

    for col in cols_tobound:
        for suffix, tx_value, other_value in scenarios:
            is_treatment = data.dispo_broad == "NTL treatment"
            data[col + suffix] = np.where(is_treatment, tx_value, other_value)
    return data


def add_bounds_suffix(
    vars_torename: List[str], df: pd.DataFrame, participant_group: str
) -> pd.DataFrame:
    """
    Give outcomes that need no bounding the same `_optimistic` / `_pessimistic`
    column names as bounded outcomes, so the data can be row-bound with them

    For medicaid beneficiaries both suffixed columns are plain copies of the
    original variable (content is unchanged).

    Args:
        vars_torename: list of vars we need to create suffixes for
        df: main df; the suffixed columns are also added to this frame in place
        participant_group: label stored in a `participant_group` column so that
            different groups of participants (e.g., beneficiaries; nonbene)
            can be separated later

    Returns:
        df_toret: new dataframe with the suffixed variables added, the orig
        vars dropped, and `participant_group` set
    """
    for var in vars_torename:
        for suffix in ("_optimistic", "_pessimistic"):
            df[var + suffix] = df[[var]]

    df_toret = df.drop(columns=vars_torename)
    df_toret["participant_group"] = participant_group
    return df_toret


def code_PCP_visits(df: pd.DataFrame) -> pd.DataFrame:
    """
    For medicaid beneficiaries who have some claims, flag whether each row is
    associated with a primary care (PCP) visit

    Three boolean columns are added to df in place:
        * is_PCP_nofqhc: (rendering specialty in `specialty_codes` OR billing
          provider type in `type_code`) AND procedure in `procedure_codes`
        * is_FQHC: billing provider type in `type_code` AND rendering
          specialty in `fqhc_specialty_code`
        * is_PCP_visit: either of the two above

    The four code lists are read from module-level variables.

    Args:
        df: dataframe with DetailRenderingSpecialtyCode,
            BillingProviderTypeCode and ProcedureCode
    Returns:
        df: the same df with additional pcp-relevant columns
    """
    rendering_specialty = df["DetailRenderingSpecialtyCode"]
    billing_type = df["BillingProviderTypeCode"]

    ## is pcp
    pcp_specialty = rendering_specialty.isin(specialty_codes)
    pcp_billing_type = billing_type.isin(type_code)
    pcp_procedure = df["ProcedureCode"].isin(procedure_codes)
    df["is_PCP_nofqhc"] = (pcp_specialty | pcp_billing_type) & pcp_procedure

    ## is fqhc
    fqhc_specialty = rendering_specialty.isin(fqhc_specialty_code)
    df["is_FQHC"] = pcp_billing_type & fqhc_specialty

    ## is pcp visit
    df["is_PCP_visit"] = df["is_PCP_nofqhc"] | df["is_FQHC"]
    return df


def construct_ptlevel_PCP(df: pd.DataFrame) -> pd.DataFrame:

    """
    Two aggregations: (1) first, aggregate up to claim level (ClaimTCNText) so that one or more
    PCP rows within a claim count as a single PCP claim, (2) then, aggregate up to patient
    level to count PCP claims and construct a boolean indicator for one or more pcp visits

    Args:
        df: dataframe with ClaimTCNText, MedicaidSystemID and the boolean
            is_PCP_visit (rows below the claim level are allowed)
    Returns:
        pt-level df with columns
            * MedicaidSystemID
            * total_PCP: number of the patient's claims with any PCP row
            * is_PCP_oneormore: True when total_PCP >= 1
    """

    ## first aggregate to claim level and code whether a claim had any PCP visits
    ## attached (don't want to count multiple PCP codes tied to
    ## same claim as multiple PCP visits)
    pcp_claimlevel = (
        df.groupby("ClaimTCNText")
        .agg({"is_PCP_visit": "sum", "MedicaidSystemID": "first"})
        .reset_index()
    )
    pcp_claimlevel["is_PCP_visit_claim"] = pcp_claimlevel["is_PCP_visit"] >= 1

    ## then count PCP claims per patient. The aggregation is named by string
    ## ("sum") rather than by passing np.sum, which newer pandas deprecates
    pcp_ptlevel = (
        pcp_claimlevel.groupby("MedicaidSystemID")
        .agg({"is_PCP_visit_claim": "sum"})
        .reset_index()
        .rename(columns={"is_PCP_visit_claim": "total_PCP"})
    )
    pcp_ptlevel["is_PCP_oneormore"] = pcp_ptlevel["total_PCP"] >= 1
    return pcp_ptlevel


def _sum_reimbursement_by_beneficiary(
    claims: pd.DataFrame, process_level: str, amount_col: str
) -> pd.DataFrame:
    """Sum `amount_col` per MedicaidSystemID over claims processed at
    `process_level`, counting each ClaimTCNText once (its first row)."""
    claim_rows = claims.loc[
        claims["ClaimProcessLevel"] == process_level,
        ["MedicaidSystemID", "ClaimProcessLevel", "ClaimTCNText", amount_col],
    ].drop_duplicates(subset="ClaimTCNText")

    # aggregation named by string; passing np.sum is deprecated in newer pandas
    return claim_rows.groupby("MedicaidSystemID").agg({amount_col: "sum"}).reset_index()


def summarize_expenditures(df: pd.DataFrame) -> pd.DataFrame:
    """
    Constructs total expenditures per beneficiary through following steps:
    (1) Filter out managed care (ClaimTypeDescription "Capitation (MC)")
    (2) Sum expenditures for claims processed at header level (dropping duplicates on ClaimTCNText, which can result from added
    details like different providers being tied to same claim/expenditure)
    (3) Sum expenditures for claims processed at the detail level (also after
    dropping duplicates on ClaimTCNText)
    (4) For both, aggregate by beneficiary
    (5) Merge a beneficiary's reimbursements across both levels

    NOTE(review): step (3) keeps only the first row of each detail-level claim.
    If a claim's detail lines each carry their own DetailReimbursementAmount,
    this undercounts -- confirm against the claims data dictionary. Behavior is
    left unchanged here.

    Args:
        df: claim-level input data with MedicaidSystemID, ClaimTypeDescription,
            ClaimProcessLevel, ClaimTCNText, HeaderTotalReimbursement and
            DetailReimbursementAmount
    Returns:
        Patient-level data (one row per beneficiary with any non-managed-care
        claim) with HeaderTotalReimbursement, DetailReimbursementAmount (each
        0 when the beneficiary has no claims at that level) and their sum,
        total_expenditures

    """

    ## filter out managed care
    data_noMC = df[df.ClaimTypeDescription != "Capitation (MC)"].copy()

    ## per-beneficiary sums at each processing level
    header_sum_by_benefic = _sum_reimbursement_by_beneficiary(
        data_noMC, "Header", "HeaderTotalReimbursement"
    )
    detail_sum_by_benefic = _sum_reimbursement_by_beneficiary(
        data_noMC, "Detail", "DetailReimbursementAmount"
    )

    ## left join each onto the list of beneficiaries so that people with
    ## claims at only one level are kept
    claims_wheader = pd.merge(
        data_noMC[["MedicaidSystemID"]].drop_duplicates(),
        header_sum_by_benefic,
        on="MedicaidSystemID",
        how="left",
    )
    claims_both = pd.merge(
        claims_wheader, detail_sum_by_benefic, on="MedicaidSystemID", how="left"
    )

    ## no claims at a level means zero reimbursement at that level
    claims_both_final = claims_both.fillna(0)
    claims_both_final["total_expenditures"] = (
        claims_both_final.HeaderTotalReimbursement
        + claims_both_final.DetailReimbursementAmount
    )

    return claims_both_final


def code_topquantile(df: pd.DataFrame, quantiles: dict) -> pd.DataFrame:

    """
    Flag rows whose pre-call expenditures are at or above given quantile cutoffs

    One column is added in place per entry of `quantiles`, named
    "is_expend_precall_quantile_" + the key rounded to 2 decimals with the "."
    removed (e.g. key 0.9 -> "is_expend_precall_quantile_09"). The
    expenditure column compared against is the one named by the module-level
    variable `precall_expend`.

    Args:
        df: dataframe
        quantiles: dictionary where keys are quantile names; items are values for that quant;
        subsetted to those we want to create vars for

    Returns:
        df: the same dataframe with indicators added

    """
    column_prefix = "is_expend_precall_quantile_"

    for quantile, cutoff in quantiles.items():
        quantile_label = str(round(quantile, 2)).replace(".", "")
        df[column_prefix + quantile_label] = case_when(
            df[precall_expend] >= cutoff, True, False
        )

    return df

# 0. Inputs and Outputs

Here we load the inputs to the script, which are largely created in script 060

We use the helper `get_mostrec` ("get most recent") to look up the name of the most recent version of each output
from the earlier scripts

In [3]:
## INPUTS
## Each input path is resolved with get_mostrec, given a file-name stem and
## INTERMEDIATE_DIR (described in this notebook as returning the name of the
## most recent version of that output)

## claims for people with claims in the analysis window
CLAIMS_IN_WINDOW_FINAL_FILE = INTERMEDIATE_DIR / get_mostrec(
    "Medicaid_analytic_peoplewclaims", INTERMEDIATE_DIR
)

### Using output of script 050 so not restricted to first call
DECORATED_NTL_OUTPUT_FILE = INTERMEDIATE_DIR / get_mostrec(
    "ntl_withmedicaidIDS", INTERMEDIATE_DIR
)
## used below to restrict people to their first call
FIRSTCALL_FILE = INTERMEDIATE_DIR / get_mostrec("all_analytic", INTERMEDIATE_DIR)
CLAIMS_BEFORECALL_FILE = INTERMEDIATE_DIR / get_mostrec(
    "Medicaid_analytic_precallclaims", INTERMEDIATE_DIR
)
STATIC_ATTRIBUTES_FILE = INTERMEDIATE_DIR / get_mostrec(
    "Medicaid_staticattributes", INTERMEDIATE_DIR
)

## NYU ED classification table, merged onto primary dx codes
NYU_ED_CODES_FILE = PUBLIC_DATA_DIR / "nyu_ed.xlsx"

## FLAGS FOR WHETHER TO RUN CERTAIN CODE SECTIONS
## when True, the list of callers without a Medicaid ID is written to
## MISSING_NTL_IDS_FOR_CHRYSANTHI
write_forIDsearch = False


## Outputs
MISSING_NTL_IDS_FOR_CHRYSANTHI = INTERMEDIATE_DIR / "ntl_missingidentifiers_forCH.csv"
PTLEVEL_WOUTCOMES_BENEFICONLY = INTERMEDIATE_DIR / "ptlevel_beneficonly.csv"
PTLEVEL_WOUTCOMES_FORROBUST = INTERMEDIATE_DIR / "ptlevel_forrobust.csv"

# 1. Read data and separate into three analytic samples

Here, we read the full claims data `Medicaid_analytic_peoplewclaims`, stored as constant `CLAIMS_IN_WINDOW_FINAL_FILE`,  where each person (`MedicaidSystemID`) is repeated across their Medicaid claims

NTL-ers fall into three groups:

1. People we matched to the beneficiaries file but who have no claims within a 6-month window of their first NTL call
2. People we matched to the beneficiaries file and who have 1+ claims within a 6-month window of their first NTL call
3. People we did not match to the beneficiaries file (might be true non-beneficiaries or nonmatches due to insufficient/poor identifiers)

## 1.1 Read files

In [4]:
## first load those with at least one claim within 6 months of call
## The two id/code columns are read as strings ("object") so that pandas does
## not convert them to integers and strip leading 0's, which are meaningful
claims_inwindow = pd.read_csv(
    CLAIMS_IN_WINDOW_FINAL_FILE,
    dtype={"MedicaidSystemID": "object", "DetailRenderingSpecialtyCode": "object"},
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
## then, load the full NTL dataset (MedicaidSystemID kept as a string)
ntl_medicaidstatus_init = pd.read_csv(
    DECORATED_NTL_OUTPUT_FILE, dtype={"MedicaidSystemID": "object"}
)
## Sanity check: the file must hold the expected # of tx and control rows
assert (
    ntl_medicaidstatus_init["dispo_broad"].value_counts().loc["NTL treatment"] == 3030
)
assert (
    ntl_medicaidstatus_init["dispo_broad"].value_counts().loc["NTL control"] == 3023
)

In [6]:
## finally, load data that restricts ppl to first call (already done
## for those with claims in window in the claims_inwindow file; not
## done for non-matches)
firstcall_file = pd.read_csv(FIRSTCALL_FILE, dtype={"MedicaidSystemID": "object"})

## create version of ntl_medicaidstatus restricted to first call:
## keep only rows whose num_1 appears in the first-call file (we use num_1
## since those are non-repeated across rows), then attach the new
## constructed ids from that file
## NOTE(review): the left merge assumes num_1 is unique in firstcall_file;
## duplicates there would repeat rows -- confirm
ntl_medicaidstatus = pd.merge(
    ntl_medicaidstatus_init.loc[
        ntl_medicaidstatus_init.num_1.isin(firstcall_file.num_1)
    ].copy(),
    firstcall_file[["num_1", "constructed_id_new"]],
    how="left",
    on="num_1",
)

## display both unique counts: constructed_id_new is not less unique than
## num_1, so use num_1 for remainder
len(ntl_medicaidstatus.num_1.unique())
len(ntl_medicaidstatus.constructed_id_new.unique())

5352

5352

## 1.2 Create three analytic datasets: benef with claims in window; benef no claims in window; non-match



In [7]:
## claims within 24 hours of the call
claims_24hours = claims_inwindow[claims_inwindow["is_within_24_hours"].eq(True)].copy()

## claims within 6 months of the call; includes both 24 hours and after
claims_6mo = claims_inwindow[
    claims_inwindow["is_within_6_months"].eq(True)
    | claims_inwindow["is_within_24_hours"].eq(True)
].copy()

In [8]:
## every claim in the 24-hour window must also be in the 6-month window
assert claims_24hours["ClaimTCNText"].isin(claims_6mo["ClaimTCNText"]).all()

## parse the call date into a datetime column (unparseable values become NaT)
## since we sort on it below when restricting to first appearance
ntl_medicaidstatus["date_call_dateformat"] = pd.to_datetime(
    ntl_medicaidstatus["date"], errors="coerce"
)

In [9]:
## Code each row into one of 5 mutually exclusive categories
## (four named statuses plus the "Other" fallback)
ntl_medicaidstatus["claims_status"] = case_when(
    ntl_medicaidstatus["MedicaidSystemID"].isin(claims_24hours["MedicaidSystemID"]),
    "Benefic. with claims in 24-hours window",
    ntl_medicaidstatus["MedicaidSystemID"].isin(claims_6mo["MedicaidSystemID"]),
    "Benefic. with claims not in 24 hours but in 6-months window",  # 6 months but not 24 hours
    ntl_medicaidstatus["has_medicaid_id"].eq("Has Medicaid ID")
    & ~(
        ntl_medicaidstatus["MedicaidSystemID"].isin(claims_24hours["MedicaidSystemID"])
        | ntl_medicaidstatus["MedicaidSystemID"].isin(claims_6mo["MedicaidSystemID"])
    ),
    "Benefic. no claims in 6-months window",
    ntl_medicaidstatus["has_medicaid_id"].eq("Missing Medicaid ID"),
    "Not matched",
    "Other",
)

In [10]:
## Display row counts per claims_status. These are counts of rows, not of
## unique people: a given beneficiary can be repeated across calls, so treat
## them as a rough estimate; rows are de-duplicated to one per beneficiary later
ntl_medicaidstatus.claims_status.value_counts()

Not matched                                                    2285
Benefic. with claims in 24-hours window                        2260
Benefic. with claims not in 24 hours but in 6-months window     419
Benefic. no claims in 6-months window                           388
Name: claims_status, dtype: int64

In [11]:
## Optional export (off by default via write_forIDsearch): list of callers
## without a Medicaid ID, for a manual ID search
if write_forIDsearch == True:

    ## subset to callers not matched to medicaid
    ntl_searchIDs = ntl_medicaidstatus.loc[
        ntl_medicaidstatus.has_medicaid_id == "Missing Medicaid ID",
        [
            "num_1",
            "date",
            "event_status",
            "dispo_broad",
            "all_appearances",
            "name_giventoMedicaid",
            "dob_giventoMedicaid",
        ],
    ].copy()

    ## recode the 1/other indicators as booleans to help with id status
    ntl_searchIDs["name_giventoMedicaid"] = np.where(
        ntl_searchIDs.name_giventoMedicaid == 1, True, False
    )
    ntl_searchIDs["dob_giventoMedicaid"] = np.where(
        ntl_searchIDs.dob_giventoMedicaid == 1, True, False
    )

    ## file to write
    ## NOTE(review): "id_status" is selected here but is neither among the
    ## columns subset above nor created in this cell, so this selection raises
    ## a KeyError when the flag is True. It was presumably meant to be built
    ## from the two booleans above -- confirm the intended coding and add it.
    ntl_searchIDs_towrite = ntl_searchIDs[
        ["num_1", "date", "id_status", "event_status", "dispo_broad", "all_appearances"]
    ].drop_duplicates()

    ## write
    ntl_searchIDs_towrite.to_csv(MISSING_NTL_IDS_FOR_CHRYSANTHI, index=False)

In [12]:
## Build the three groups of callers, since their outcomes are coded differently
##
## "firstappear" frames keep one row per MedicaidSystemID after sorting by call date
## NOTE(review): groupby(...).first() returns the first non-null value of each
## column within the group, which is not necessarily one intact row when the
## earliest call has missing fields -- confirm this is acceptable

## Group one: matched medicaid beneficiaries with claims in the 24 hour or
## 6 month window; kept separate so each is mutually exclusive with the
## matching no-claims dataset
medicaid_patients_wclaims_24hour = ntl_medicaidstatus[
    ntl_medicaidstatus["claims_status"].isin(
        ["Benefic. with claims in 24-hours window"]
    )
].copy()
medicaid_patients_firstappear_24hour = (
    medicaid_patients_wclaims_24hour.sort_values("date_call_dateformat")
    .groupby("MedicaidSystemID")
    .first()
    .reset_index()
)

medicaid_patients_wclaims_6mo = ntl_medicaidstatus[
    ntl_medicaidstatus["claims_status"].isin(
        [
            "Benefic. with claims in 24-hours window",
            "Benefic. with claims not in 24 hours but in 6-months window",
        ]
    )
].copy()
medicaid_patients_firstappear_6mo = (
    medicaid_patients_wclaims_6mo.sort_values("date_call_dateformat")
    .groupby("MedicaidSystemID")
    .first()
    .reset_index()
)

## Group two: matched medicaid beneficiaries with no claims in the 24 hour or
## 6 month window, i.e. everyone who is matched but not in the
## corresponding group-one frame
medicaid_patients_noclaims_24hour = ntl_medicaidstatus[
    ~ntl_medicaidstatus["MedicaidSystemID"].isin(
        medicaid_patients_firstappear_24hour["MedicaidSystemID"]
    )
    & ntl_medicaidstatus["claims_status"].ne("Not matched")
].copy()

medicaid_patients_noclaims_24ho_firstappear = (
    medicaid_patients_noclaims_24hour.sort_values("date_call_dateformat")
    .groupby("MedicaidSystemID")
    .first()
    .reset_index()
)

medicaid_patients_noclaims_6mo = ntl_medicaidstatus[
    ~ntl_medicaidstatus["MedicaidSystemID"].isin(
        medicaid_patients_firstappear_6mo["MedicaidSystemID"]
    )
    & ntl_medicaidstatus["claims_status"].ne("Not matched")
].copy()

medicaid_patients_noclaims_6mo_firstappear = (
    medicaid_patients_noclaims_6mo.sort_values("date_call_dateformat")
    .groupby("MedicaidSystemID")
    .first()
    .reset_index()
)

## Group three: those not matched
medicaid_nonmatch = ntl_medicaidstatus[
    ntl_medicaidstatus["claims_status"].eq("Not matched")
].copy()

In [13]:
## Check that groups meant to be mutually exclusive really are
## (this does not check that they are exhaustive)

### nobody classified as a non-match to medicaid may also appear as a
### beneficiary with claims
overlap_claimnone = set(medicaid_nonmatch["num_1"]) & set(
    medicaid_patients_wclaims_6mo["num_1"]
)
assert not overlap_claimnone

### nobody classified as having claims within 6 months may also be
### classified as having no claims in 6 months
overlap_claimwindow = set(medicaid_patients_firstappear_6mo["num_1"]) & set(
    medicaid_patients_noclaims_6mo_firstappear["num_1"]
)
assert not overlap_claimwindow

In [14]:
## Collect beneficiary ids by claims status so later steps can verify that
## everyone is still included
bene_24ho_init = ntl_medicaidstatus.loc[
    ntl_medicaidstatus["claims_status"] == "Benefic. with claims in 24-hours window",
    "MedicaidSystemID",
].to_list()
bene_6mo_init = (
    bene_24ho_init
    + ntl_medicaidstatus.loc[
        ntl_medicaidstatus["claims_status"]
        == "Benefic. with claims not in 24 hours but in 6-months window",
        "MedicaidSystemID",
    ].to_list()
)
bene_all_init = (
    bene_6mo_init
    + ntl_medicaidstatus.loc[
        ntl_medicaidstatus["claims_status"] == "Benefic. no claims in 6-months window",
        "MedicaidSystemID",
    ].to_list()
)

## deduplicate (ids repeat when a beneficiary has several calls)
bene_24ho = list(set(bene_24ho_init))
bene_6mo = list(set(bene_6mo_init))
bene_all = list(set(bene_all_init))

## every beneficiary must appear in the union of (1) patients w/ some claims
## and (2) patients w/ no claims, for each window
missing_noclaims_24ho = set(bene_all) - (
    set(medicaid_patients_noclaims_24ho_firstappear["MedicaidSystemID"].to_list())
    | set(medicaid_patients_firstappear_24hour["MedicaidSystemID"].to_list())
)
missing_noclaims_6mo = set(bene_all) - (
    set(medicaid_patients_noclaims_6mo_firstappear["MedicaidSystemID"].to_list())
    | set(medicaid_patients_firstappear_6mo["MedicaidSystemID"].to_list())
)

assert not missing_noclaims_24ho
assert not missing_noclaims_6mo

In [15]:
## reminder of the number of unique beneficiaries to expect downstream
print(
    f"In later analyses, check to make sure there are {len(bene_all)} unique beneficiaries"
)

In later analyses, check to make sure there are 3067 unique beneficiaries


# 2. Code whether a claim is associated with an ED visit 

## 2.1 First restrict to revenue codes that indicate ED visit; filter out inpatient admit, and merge with respondent ids

In [16]:
# revenue codes indicating emergency department visit: "0450" through "0459",
# plus the professional-fee ED code
ED_CODES = ["045" + str(last_digit) for last_digit in range(10)]
PROF_ED_CODE = ["0981"]
ED_VISIT_CODES = [*ED_CODES, *PROF_ED_CODE]

# Procedure codes indicating an ED physician: "99281" through "99285"
ED_PHYSICIAN_CODES = [str(code) for code in range(99281, 99286)]

# revenue codes for whether the visit resulted in an inpatient stay:
# "0100" through "0219"
ROOM_BOARD_CODES = [f"{code:04d}" for code in range(100, 220)]

In [17]:
## claim-level ED visit flags for each time window
edvisits_24hours = construct_edvisit(df=claims_24hours)
edvisits_6months = construct_edvisit(df=claims_6mo)

In [18]:
## attach beneficiary information to the claim-level ED flags; beneficiaries
## come from the first-appearance frame of the matching window
edvisits_24hours_wntl = mergeclaims_beneficiaries(
    claims_df=edvisits_24hours, benef_df=medicaid_patients_firstappear_24hour
)
edvisits_6mo_wntl = mergeclaims_beneficiaries(
    claims_df=edvisits_6months, benef_df=medicaid_patients_firstappear_6mo
)

In [19]:
## making sure that more visits are in longer time aggregation.
## Compare row counts explicitly: comparing the full .shape tuples is a
## lexicographic comparison, so with equal row counts the column count
## would decide the result.
assert edvisits_6mo_wntl.shape[0] > edvisits_24hours_wntl.shape[0], (
    "expected more rows in the 6-month window than in the 24-hour window"
)

## 2.2 Then, construct a general care, non-ED use measure (diff from pcp)

Not preregistered but requested analysis for budget hearings

In [20]:
## claim types excluded when building the general (non-ED) care measure
## NOTE(review): "Transportion" is kept verbatim; presumably it mirrors the
## label used in the claims data -- confirm before correcting the spelling
claim_types_to_remove = [
    "Transportion (includes Amb)",
    "Capitation (MC)",
    "Nursing Fac & Long Term Care",
]

### construct general (non-ED) care measures in different windows
generalcare_24hours, generalcare_6months = (
    construct_nonED_care(claims, claim_types_to_remove)
    for claims in (claims_24hours, claims_6mo)
)

## merge with beneficiary
generalcare_24hours_wntl, generalcare_6mo_wntl = (
    mergeclaims_beneficiaries(care, beneficiaries)
    for care, beneficiaries in (
        (generalcare_24hours, medicaid_patients_firstappear_24hour),
        (generalcare_6months, medicaid_patients_firstappear_6mo),
    )
)

## not using after so maybe delete code once confirmed

## 2.3 Staying at the ED-visit level, code appropriateness of visit using NYU alg 

Want to code appropriateness at visit level before aggregating visits within a patient

### 2.3.1 Load and merge nyu codes onto primary dx code at visit level

In [21]:
## load the NYU ED classification table that is merged onto dx codes below
nyu_ed = pd.read_excel(io=NYU_ED_CODES_FILE)

In [22]:
## merge the NYU classifications onto the visit-level data for each window;
## time_horizon is passed through as a label for that window
edvisits_24hours_wntl_wicd, edvisits_6mo_wntl_wicd = (
    merge_dx_classification(
        data=visits, icd_classify_data=nyu_ed, time_horizon=horizon
    )
    for visits, horizon in (
        (edvisits_24hours_wntl, "24 hours"),
        (edvisits_6mo_wntl, "6 months"),
    )
)

Out of 754 unique dx codes in callers claims in 24 hours we matched 719 with the NYU classifications
Out of 2016 unique dx codes in callers claims in 6 months we matched 1920 with the NYU classifications


### 2.3.2 Staying at the visit level, code categories of ED visits

In [23]:
## classify appropriateness at the visit level, once per window
edvisits_24ho_wcats, edvisits_6mo_wcats = (
    classify_visit_approp(data=visits)
    for visits in (edvisits_24hours_wntl_wicd, edvisits_6mo_wntl_wicd)
)

## 2.4 Aggregate up from visit level to the patient level and code binary outcomes for all three groups

Related to the three groups above, we construct outcome variables as follows:

- For people matched and with claims, use observed outcome var
- For people matched but with no claims in window, impute to 0 (since we had chance to observe but affirmatively do not observe ED use, PCP use, etc.)
- For people not matched, make two decisions:

    1. Who do we want to impute outcomes for? (`inclusion_method`): described below
    2. What bound do we want to impute?

        - Pessimistic bound: impute the value that works against finding a treatment effect --- for the treatment group, the bad outcome (e.g., ED use = 1); for the control group, the good outcome (e.g., ED use = 0)
        - Optimistic bound: vice versa

### 2.4.1 Group 1: matched and some claims in window(s)

Aggregate from visit level to patient level for patients with 1+ claim in window

In [24]:
## sums: visit-level flags are summed per beneficiary; descriptive columns
## take the first value seen within each beneficiary
## (presumably these are constant within a beneficiary -- TODO confirm)
carry_first_cols = ["dispo_broad", "event_status", "num_1"]

## the order of the summed columns differs between the two windows and is
## kept as-is so the resulting column order is unchanged
sum_cols_24ho = [
    "is_unnecessary_ED",
    "is_necessary_ED",
    "is_unclassified_ED",
    "is_not_ED",
]
sum_cols_6mo = [
    "is_unnecessary_ED",
    "is_necessary_ED",
    "is_not_ED",
    "is_unclassified_ED",
]

agg_spec_24ho = {
    **dict.fromkeys(sum_cols_24ho, "sum"),
    **dict.fromkeys(carry_first_cols, "first"),
}
agg_spec_6mo = {
    **dict.fromkeys(sum_cols_6mo, "sum"),
    **dict.fromkeys(carry_first_cols, "first"),
}

visits_by_bene_24ho = edvisits_24ho_wcats.groupby("MedicaidSystemID")
ptlevel_ed_24ho = visits_by_bene_24ho.agg(agg_spec_24ho).reset_index()

visits_by_bene_6mo = edvisits_6mo_wcats.groupby("MedicaidSystemID")
ptlevel_ed_6mo = visits_by_bene_6mo.agg(agg_spec_6mo).reset_index()

## binary: derive indicator versions of the summed ED outcomes
ed_outcome_vars = ["is_unnecessary_ED", "is_necessary_ED", "is_unclassified_ED"]
ptlevel_ed_24ho = construct_binary_measures(
    outcome_vars=ed_outcome_vars, data=ptlevel_ed_24ho
)
ptlevel_ed_6mo = construct_binary_measures(
    outcome_vars=ed_outcome_vars, data=ptlevel_ed_6mo
)

## names used for the binary outcome columns in the steps below
ed_outcome_vars_binary = [name + "_1ormore" for name in ed_outcome_vars]

### 2.4.2 Group 2: matched and no claims in window(s)

For this group, coding outcomes rather than "imputing"/bounding the outcomes, since we affirmatively observed no claims in that window


In [25]:
## matched beneficiaries with no claims in the window get the ED outcome
## columns coded for them (see code_noclaims), once per window
ptlevel_ed_noclaims_24ho, ptlevel_ed_noclaims_6mo = (
    code_noclaims(df=noclaims_benes, outcome_varnames=ed_outcome_vars_binary)
    for noclaims_benes in (
        medicaid_patients_noclaims_24ho_firstappear,
        medicaid_patients_noclaims_6mo_firstappear,
    )
)

### 2.4.3 Group 3: not matched

In the function documentation for `impute_nonmatch`, we specify three options for inclusion methods for ntl callers not matched to the beneficiary file

Here, we use the "conservative" definition of who we should impute outcomes for (people with neither name nor DOB to give to Medicaid to try to match) and code both optimistic and pessimistic bounds as separate datasets

Then, after specifying who we include and setting placeholder values for them, we use the `bound_binary` function to impute optimistic and pessimistic bounds for them

In [26]:
## first, flag non-matches that are missing a name OR a DOB
## (True when at least one of the two is missing, False when both are present)
## NOTE(review): the surrounding text describes people with "neither name nor
## DOB"; this flag is an OR, not an AND -- confirm which is intended
lacks_name = ~medicaid_nonmatch.has_medicaid_name
lacks_dob = ~medicaid_nonmatch.has_medicaid_dob
medicaid_nonmatch["is_miss_nameordob"] = np.where(lacks_name | lacks_dob, True, False)

In [27]:
## create the outcome columns for the non-matches selected by the
## specified inclusion method
ptlevel_ed_noclaims_notbene = impute_nonmatch(
    medicaid_nonmatch,
    outcome_varnames=ed_outcome_vars_binary,
    inclusion_method="analytic_conservative",
)

## then, add optimistic and pessimistic bounds in two passes, because the
## necessary-ED outcome is bounded with an explicit optimistic_tx=False
cols_bound_first_pass = ["is_unnecessary_ED_1ormore", "is_unclassified_ED_1ormore"]
cols_bound_second_pass = ["is_necessary_ED_1ormore"]

ptlevel_ed_noclaims_notbene_wbounds_init = bound_binary(
    ptlevel_ed_noclaims_notbene, cols_tobound=cols_bound_first_pass
)  # bound these so that tx = 0

ptlevel_ed_noclaims_notbene_wbounds = bound_binary(
    ptlevel_ed_noclaims_notbene_wbounds_init,
    optimistic_tx=False,
    cols_tobound=cols_bound_second_pass,
)

## drop nonbounded version (in place) and label the group for rowbinding
ptlevel_ed_noclaims_notbene_wbounds.drop(columns=ed_outcome_vars_binary, inplace=True)
ptlevel_ed_noclaims_notbene_wbounds["participant_group"] = "Not matched"

## 2.5 Rowbind and summarize 

We rowbind the three groups using the following process:

- Patients not matched are the only group with pessimistic/optimistic bounds
- For the other two groups---benefic w/ claims and those matched but w/ no claims--- we copy the cols and add the suffix optimistic and pessimistic
- This creates a wide dataset with both sets of outcomes
- We also construct a flag indicating which of the three groups the record belongs to


### 2.5.1 Create equivalent columns across the three groups

In [28]:
## matched with some claims: copy the observed cols over as the
## optimistic and pessimistic versions
ptlevel_ed_24ho_torbind, ptlevel_ed_6mo_torbind = (
    add_bounds_suffix(
        ed_outcome_vars_binary, observed, participant_group="Matched + claims"
    )
    for observed in (ptlevel_ed_24ho, ptlevel_ed_6mo)
)

## matched with no claims: same copying, different group label
ptlevel_ed_noclaims_24ho_torbind, ptlevel_ed_noclaims_6mo_torbind = (
    add_bounds_suffix(
        ed_outcome_vars_binary, coded, participant_group="Matched no claims"
    )
    for coded in (ptlevel_ed_noclaims_24ho, ptlevel_ed_noclaims_6mo)
)

### 2.5.2 Datasets 1: only matched beneficiaries

In [29]:
## keep only the columns the two frames have in common (this drops
## extraneous ones like the 1 or more flags we created during aggregation)
cols_touse = set(ptlevel_ed_noclaims_24ho_torbind.columns) & set(
    ptlevel_ed_24ho_torbind.columns
)

In [30]:
## rowbind beneficiaries with claims and beneficiaries without claims.
## cols_touse is a set: pandas 2.0 no longer accepts a set as a column
## indexer, and a set's iteration order is arbitrary, so select the shared
## columns through a sorted list instead.
ed_cols_ordered = sorted(cols_touse)

ptlevel_ed_allbenefic_24ho = pd.concat(
    [
        ptlevel_ed_24ho_torbind[ed_cols_ordered],
        ptlevel_ed_noclaims_24ho_torbind[ed_cols_ordered],
    ],
    axis=0,
    sort=True,
)
ptlevel_ed_allbenefic_6mo = pd.concat(
    [
        ptlevel_ed_6mo_torbind[ed_cols_ordered],
        ptlevel_ed_noclaims_6mo_torbind[ed_cols_ordered],
    ],
    axis=0,
    sort=True,
)

In [31]:
## both windows should contain the same number of beneficiary rows
n_rows_ed_24ho = ptlevel_ed_allbenefic_24ho.shape[0]
n_rows_ed_6mo = ptlevel_ed_allbenefic_6mo.shape[0]
assert n_rows_ed_24ho == n_rows_ed_6mo

## make sure there are no beneficiaries missing from aggregation
missing_bene_ed = set(bene_all) - set(
    ptlevel_ed_allbenefic_6mo.MedicaidSystemID.to_list()
)

assert not missing_bene_ed

In [32]:
## print outcomes
### for beneficiaries the optimistic and pessimistic cols are identical,
### so the optimistic ones are summarized for convenience; these values are
### observed rather than bounded
for banner, frame in (
    (
        "-------------------------------\nrate within 24 hours when we include all beneficiaries",
        ptlevel_ed_allbenefic_24ho,
    ),
    (
        "-------------------------------\nrate within 6 months when we include all beneficiaries",
        ptlevel_ed_allbenefic_6mo,
    ),
):
    for var in [name + "_optimistic" for name in ed_outcome_vars_binary]:
        print(banner)
        summarize_bygroup(df=frame, col=var)

-------------------------------
rate within 24 hours when we include all beneficiaries
is_unnecessary_ED_1ormore_optimistic  False  True 
dispo_broad                                       
NTL control                            1155    488
NTL treatment                          1066    358
is_unnecessary_ED_1ormore_optimistic  False  True 
dispo_broad                                       
NTL control                          0.7030 0.2970
NTL treatment                        0.7486 0.2514
-------------------------------
rate within 24 hours when we include all beneficiaries
is_necessary_ED_1ormore_optimistic  False  True 
dispo_broad                                     
NTL control                          1317    326
NTL treatment                        1203    221
is_necessary_ED_1ormore_optimistic  False  True 
dispo_broad                                     
NTL control                        0.8016 0.1984
NTL treatment                      0.8448 0.1552
--------------------------

### 2.5.3 Datasets 2: including non-matches based on inclusion criteria above


In [33]:
## append the bounded non-matches to the beneficiary rows; the same
## non-match frame is used for both windows
ptlevel_ed_all_24ho, ptlevel_ed_all_6mo = (
    pd.concat([benefic, ptlevel_ed_noclaims_notbene_wbounds], axis=0, sort=True)
    for benefic in (ptlevel_ed_allbenefic_24ho, ptlevel_ed_allbenefic_6mo)
)

## summarize the unnecessary-ED outcome under each bound (24-hour window)
for bound, banner in (
    (
        "_optimistic",
        "-------------------------------\nrate within 24 hours when we include beneficiaries + opt bounds on some non-matches",
    ),
    (
        "_pessimistic",
        "-------------------------------\nrate within 24 hours when we include beneficiaries + pess bounds on some non-matches",
    ),
):
    for var in [name + bound for name in ["is_unnecessary_ED_1ormore"]]:
        print(banner)
        summarize_bygroup(df=ptlevel_ed_all_24ho, col=var)

## remove version of variabl (note is truncated in the original)

-------------------------------
rate within 24 hours when we include beneficiaries + opt bounds on some non-matches
is_unnecessary_ED_1ormore_optimistic  False  True 
dispo_broad                                       
NTL control                            1155    870
NTL treatment                          1658    358
is_unnecessary_ED_1ormore_optimistic  False  True 
dispo_broad                                       
NTL control                          0.5704 0.4296
NTL treatment                        0.8224 0.1776
-------------------------------
rate within 24 hours when we include beneficiaries + pess bounds on some non-matches
is_unnecessary_ED_1ormore_pessimistic  False  True 
dispo_broad                                        
NTL control                             1537    488
NTL treatment                           1066    950
is_unnecessary_ED_1ormore_pessimistic  False  True 
dispo_broad                                        
NTL control                           0.7590 0.

## 2.6 Rename columns and merge beneficiaries data

Merging beneficiaries on `MedicaidSystemID`

In [34]:
## identifier/descriptive columns that must NOT receive a time horizon suffix
cols_toshield = ["MedicaidSystemID", "dispo_broad", "event_status", "num_1"]

## every other column is tagged with the window it was measured in
ptlevel_ed_allbenefic_24ho.columns = [
    name if name in cols_toshield else name + "_24ho"
    for name in ptlevel_ed_allbenefic_24ho.columns
]
ptlevel_ed_allbenefic_6mo.columns = [
    name if name in cols_toshield else name + "_6mo"
    for name in ptlevel_ed_allbenefic_6mo.columns
]

In [35]:
## beneficiaries-only ED dataset: join the 6-month outcomes onto the 24-hour
## ones by MedicaidSystemID. The shared descriptive columns are removed from
## the 6-month side so they are not duplicated in the result.
ed_benefic_6mo_outcomes = ptlevel_ed_allbenefic_6mo.drop(
    columns=["dispo_broad", "event_status", "num_1"]
)
ptlevel_ed_allbenefic_forfin = ptlevel_ed_allbenefic_24ho.merge(
    ed_benefic_6mo_outcomes,
    on="MedicaidSystemID",
    how="left",
    indicator="check_merge",
)

## every 24-hour row must have found a 6-month counterpart
assert (ptlevel_ed_allbenefic_forfin.check_merge == "both").all()

## 2.7 Rename columns and merge "all" data

Since MedicaidSystemID is NA for non-beneficiaries, merge on ntl identifier num_1

In [36]:
## tag each non-shielded column with the window it was measured in
ptlevel_ed_all_24ho.columns = [
    name if name in cols_toshield else name + "_24ho"
    for name in ptlevel_ed_all_24ho.columns
]
ptlevel_ed_all_6mo.columns = [
    name if name in cols_toshield else name + "_6mo"
    for name in ptlevel_ed_all_6mo.columns
]

In [37]:
## beneficiaries + included non-matches: join the 6-month outcomes onto the
## 24-hour ones by num_1; columns shared by both sides are removed from the
## 6-month frame before joining
ed_all_6mo_outcomes = ptlevel_ed_all_6mo.drop(
    columns=["dispo_broad", "event_status", "MedicaidSystemID"]
)
ptlevel_ed_all_forfin = ptlevel_ed_all_24ho.merge(
    ed_all_6mo_outcomes,
    on="num_1",
    how="left",
    indicator="check_merge_all",
)

## every 24-hour row must have found a 6-month counterpart
assert (ptlevel_ed_all_forfin.check_merge_all == "both").all()

# 3. Primary care visits


## 3.1 Code whether line item is PCP visit


In [38]:
## code lists used to identify PCP visits
specialty_codes = ["004", "006", "012", "013", "031", "044"]
type_code = ["X05"]
## "T1015" plus the consecutive ranges 99201-99205 and 99211-99215
procedure_codes = [
    "T1015",
    *(str(code) for code in range(99201, 99206)),
    *(str(code) for code in range(99211, 99216)),
]
fqhc_specialty_code = ["801"]

In [39]:
## codes can get warped from strings -> int during write/read, which would
## silently break matching; with the amount of claims data we have, each code
## list should overlap with the values of its claims column at least once
for expected_codes, claims_col in (
    (specialty_codes, "DetailRenderingSpecialtyCode"),
    (type_code, "BillingProviderTypeCode"),
    (procedure_codes, "ProcedureCode"),
    (fqhc_specialty_code, "DetailRenderingSpecialtyCode"),
):
    observed_codes = getattr(claims_24hours, claims_col).unique()
    ## a non-empty intersection is truthy
    assert set(expected_codes).intersection(observed_codes)

In [40]:
## code PCP visits on the claims for each window
claims_24hours_wPCP, claims_6mo_wPCP = (
    code_PCP_visits(df=claims) for claims in (claims_24hours, claims_6mo)
)

## 3.2 Aggregate PCP visits up to patient level and construct binary measure


In [41]:
## aggregate the PCP-coded claims to the patient level, once per window
ptlevel_PCP_24ho, ptlevel_PCP_6mo = (
    construct_ptlevel_PCP(df=coded_claims)
    for coded_claims in (claims_24hours_wPCP, claims_6mo_wPCP)
)

In [42]:
## merge with NTL data, pairing each window's patient-level PCP data with
## that window's beneficiary table
ptlevel_PCP_24ho_wNTL, ptlevel_PCP_6mo_wNTL = (
    mergeclaims_beneficiaries(pcp, beneficiaries)
    for pcp, beneficiaries in (
        (ptlevel_PCP_24ho, medicaid_patients_firstappear_24hour),
        (ptlevel_PCP_6mo, medicaid_patients_firstappear_6mo),
    )
)

## name of the binary PCP outcome column used in the steps below
pcp_outcome_vars_binary = ["is_PCP_oneormore"]

## 3.3 Create similar ptlevel data for beneficiaries with no claims and non-beneficiaries

Follow same process as for ED visit outcomes:

- For beneficiaries with no claims, code them as false
- For non-matches we're including, create bounds: (1) optimistic is that treatment group always has PCP visit; control group doesn't; (2) pessimistic vice versa

### 3.3.1 Beneficiaries with no claims

In [43]:
## matched beneficiaries with no claims in the window get the PCP outcome
## column coded for them (see code_noclaims), once per window
ptlevel_PCP_noclaims_24ho, ptlevel_PCP_noclaims_6mo = (
    code_noclaims(df=noclaims_benes, outcome_varnames=pcp_outcome_vars_binary)
    for noclaims_benes in (
        medicaid_patients_noclaims_24ho_firstappear,
        medicaid_patients_noclaims_6mo_firstappear,
    )
)

### 3.3.2 Nonmatches

In [44]:
## create the PCP outcome column for the non-matches selected by the
## specified inclusion method
ptlevel_PCP_noclaims_notbene = impute_nonmatch(
    medicaid_nonmatch,
    outcome_varnames=pcp_outcome_vars_binary,
    inclusion_method="analytic_conservative",
)

## then, add optimistic and pessimistic bounds
ptlevel_PCP_noclaims_notbene_wbounds = bound_binary(
    ptlevel_PCP_noclaims_notbene,
    optimistic_tx=True,
    cols_tobound=pcp_outcome_vars_binary,
)  # bound these so that tx = true visit to pcp

## drop nonbounded version (in place) and label the group for rowbinding
ptlevel_PCP_noclaims_notbene_wbounds.drop(columns=pcp_outcome_vars_binary, inplace=True)
ptlevel_PCP_noclaims_notbene_wbounds["participant_group"] = "Not matched"

## 3.4 Rowbind and summarize

### 3.4.1 Rename cols to bindable with bounding vars df

In [45]:
## matched with some claims: copy the observed cols over as the
## optimistic and pessimistic versions
ptlevel_PCP_24ho_torbind, ptlevel_PCP_6mo_torbind = (
    add_bounds_suffix(
        pcp_outcome_vars_binary, observed, participant_group="Matched + claims"
    )
    for observed in (ptlevel_PCP_24ho_wNTL, ptlevel_PCP_6mo_wNTL)
)

## matched with no claims: same copying, different group label
ptlevel_PCP_noclaims_24ho_torbind, ptlevel_PCP_noclaims_6mo_torbind = (
    add_bounds_suffix(
        pcp_outcome_vars_binary, coded, participant_group="Matched no claims"
    )
    for coded in (ptlevel_PCP_noclaims_24ho, ptlevel_PCP_noclaims_6mo)
)

### 3.4.2 Dataset 1: beneficiaries

In [46]:
## columns present in both the with-claims and the no-claims PCP frames
cols_touse_PCP = set(ptlevel_PCP_24ho_torbind.columns) & set(
    ptlevel_PCP_noclaims_24ho_torbind.columns
)

In [47]:
## rowbind beneficiaries with claims and beneficiaries without claims.
## cols_touse_PCP is a set: pandas 2.0 no longer accepts a set as a column
## indexer, and a set's iteration order is arbitrary, so select the shared
## columns through a sorted list instead.
pcp_cols_ordered = sorted(cols_touse_PCP)

ptlevel_PCP_allbenefic_24ho = pd.concat(
    [
        ptlevel_PCP_24ho_torbind[pcp_cols_ordered],
        ptlevel_PCP_noclaims_24ho_torbind[pcp_cols_ordered],
    ],
    axis=0,
    sort=True,
)
ptlevel_PCP_allbenefic_6mo = pd.concat(
    [
        ptlevel_PCP_6mo_torbind[pcp_cols_ordered],
        ptlevel_PCP_noclaims_6mo_torbind[pcp_cols_ordered],
    ],
    axis=0,
    sort=True,
)

In [48]:
## make sure they're the same nrows and match length of beneficiaries
assert ptlevel_PCP_allbenefic_24ho.shape[0] == ptlevel_PCP_allbenefic_6mo.shape[0]
missing_bene_PCP = set(bene_all).difference(
    ptlevel_PCP_allbenefic_6mo.MedicaidSystemID.to_list()
)
## check the PCP set computed just above; the earlier version asserted on
## missing_bene_ed, which left the PCP aggregation unchecked
assert len(missing_bene_PCP) == 0

In [49]:
## print raw rates; as with ED, the "_optimistic" suffix is only there for
## rowbinding convenience -- these values are observed rather than bounded
for banner, frame in (
    (
        "-------------------------------\nrate within 24 hours when we include all beneficiaries",
        ptlevel_PCP_allbenefic_24ho,
    ),
    (
        "-------------------------------\nrate within 6 months when we include all beneficiaries",
        ptlevel_PCP_allbenefic_6mo,
    ),
):
    for var in [name + "_optimistic" for name in pcp_outcome_vars_binary]:
        print(banner)
        summarize_bygroup(df=frame, col=var)

-------------------------------
rate within 24 hours when we include all beneficiaries
is_PCP_oneormore_optimistic  False  True 
dispo_broad                              
NTL control                   1602     41
NTL treatment                 1307    117
is_PCP_oneormore_optimistic  False  True 
dispo_broad                              
NTL control                 0.9750 0.0250
NTL treatment               0.9178 0.0822
-------------------------------
rate within 6 months when we include all beneficiaries
is_PCP_oneormore_optimistic  False  True 
dispo_broad                              
NTL control                    931    712
NTL treatment                  815    609
is_PCP_oneormore_optimistic  False  True 
dispo_broad                              
NTL control                 0.5666 0.4334
NTL treatment               0.5723 0.4277


### 3.4.3 Dataset 2: beneficiaries + select non-matches

In [50]:
## append the bounded non-matches to the beneficiary rows; the same
## non-match frame is used for both windows
ptlevel_PCP_all_24ho, ptlevel_PCP_all_6mo = (
    pd.concat([benefic, ptlevel_PCP_noclaims_notbene_wbounds], axis=0, sort=True)
    for benefic in (ptlevel_PCP_allbenefic_24ho, ptlevel_PCP_allbenefic_6mo)
)

## summarize the PCP outcome under each bound (24-hour window)
for bound, banner in (
    (
        "_optimistic",
        "-------------------------------\nrate within 24 hours when we include beneficiaries + opt bounds on some non-matches",
    ),
    (
        "_pessimistic",
        "-------------------------------\nrate within 24 hours when we include beneficiaries + pess bounds on some non-matches",
    ),
):
    for var in [name + bound for name in pcp_outcome_vars_binary]:
        print(banner)
        summarize_bygroup(df=ptlevel_PCP_all_24ho, col=var)

-------------------------------
rate within 24 hours when we include beneficiaries + opt bounds on some non-matches
is_PCP_oneormore_optimistic  False  True 
dispo_broad                              
NTL control                   1984     41
NTL treatment                 1307    709
is_PCP_oneormore_optimistic  False  True 
dispo_broad                              
NTL control                 0.9798 0.0202
NTL treatment               0.6483 0.3517
-------------------------------
rate within 24 hours when we include beneficiaries + pess bounds on some non-matches
is_PCP_oneormore_pessimistic  False  True 
dispo_broad                               
NTL control                    1602    423
NTL treatment                  1899    117
is_PCP_oneormore_pessimistic  False  True 
dispo_broad                               
NTL control                  0.7911 0.2089
NTL treatment                0.9420 0.0580


## 3.5 Rename columns and merge beneficiaries data

In [51]:
## tag each non-shielded column with the window it was measured in
ptlevel_PCP_allbenefic_24ho.columns = [
    name if name in cols_toshield else name + "_24ho"
    for name in ptlevel_PCP_allbenefic_24ho.columns
]
ptlevel_PCP_allbenefic_6mo.columns = [
    name if name in cols_toshield else name + "_6mo"
    for name in ptlevel_PCP_allbenefic_6mo.columns
]

In [52]:
## beneficiaries-only PCP dataset: join the 6-month outcomes onto the 24-hour
## ones by MedicaidSystemID. The shared descriptive columns are removed from
## the 6-month side so they are not duplicated in the result.
pcp_benefic_6mo_outcomes = ptlevel_PCP_allbenefic_6mo.drop(
    columns=["dispo_broad", "event_status", "num_1"]
)
ptlevel_PCP_allbenefic_forfin = ptlevel_PCP_allbenefic_24ho.merge(
    pcp_benefic_6mo_outcomes,
    on="MedicaidSystemID",
    how="left",
    indicator="check_merge",
)

## every 24-hour row must have found a 6-month counterpart
assert (ptlevel_PCP_allbenefic_forfin.check_merge == "both").all()

## 3.6 Rename columns and merge with non-beneficiaries

Similar to ED, merge using `num_1` rather than `MedicaidSystemID` since latter is NA

In [53]:
## tag each non-shielded column with the window it was measured in
ptlevel_PCP_all_24ho.columns = [
    name if name in cols_toshield else name + "_24ho"
    for name in ptlevel_PCP_all_24ho.columns
]
ptlevel_PCP_all_6mo.columns = [
    name if name in cols_toshield else name + "_6mo"
    for name in ptlevel_PCP_all_6mo.columns
]

In [54]:
## beneficiaries + included non-matches: join the 6-month outcomes onto the
## 24-hour ones by num_1; columns shared by both sides are removed from the
## 6-month frame before joining
pcp_all_6mo_outcomes = ptlevel_PCP_all_6mo.drop(
    columns=["dispo_broad", "event_status", "MedicaidSystemID"]
)
ptlevel_PCP_all_forfin = ptlevel_PCP_all_24ho.merge(
    pcp_all_6mo_outcomes,
    on="num_1",
    how="left",
    indicator="check_merge_all",
)

## every 24-hour row must have found a 6-month counterpart
assert (ptlevel_PCP_all_forfin.check_merge_all == "both").all()

# 4. Expenditures

For expenditures, since there are no meaningful values to impute / bound non-beneficiaries with, our sole dataset is comprised of medicaid beneficiaries

## 4.1 Aggregate expenditures by beneficiary

In [55]:
## summarize expenditures from the claims for each window
claims_24hours_wexpend, claims_6mo_wexpend = (
    summarize_expenditures(df=claims) for claims in (claims_24hours, claims_6mo)
)

In [56]:
## merge with NTL data, pairing each window's expenditure summary with
## that window's beneficiary table
ptlevel_expend_24ho_wNTL, ptlevel_expend_6mo_wNTL = (
    mergeclaims_beneficiaries(expend, beneficiaries)
    for expend, beneficiaries in (
        (claims_24hours_wexpend, medicaid_patients_firstappear_24hour),
        (claims_6mo_wexpend, medicaid_patients_firstappear_6mo),
    )
)

## look at dim: fewer rows after the merge means some beneficiaries with
## claims ended up with no expenditure row
n_rows_with_expend = ptlevel_expend_24ho_wNTL.shape[0]
n_rows_with_claims = medicaid_patients_firstappear_24hour.shape[0]
if n_rows_with_expend != n_rows_with_claims:
    print("need to add 0's for people with no expenditures during the aggregation")

## 4.2 Add information for beneficiaries with no claims 

Unlike the binary outcomes, we do NOT construct a dataset with bounds for non-beneficiaries, since there's not a clear way to bound their outcomes

In [57]:
## unlike the other outcomes, some patients in medicaid_patients_firstappear_X
## drop out when expenditures are aggregated because they have no rows of
## expenditures for their claims (could be due to differences between
## servicedate and payment date). Those patients are appended to the
## no-claims group here.
has_expend_24ho = medicaid_patients_firstappear_24hour.MedicaidSystemID.isin(
    ptlevel_expend_24ho_wNTL.MedicaidSystemID
)
medicaid_patients_noclaims_24ho_tobind = pd.concat(
    [
        medicaid_patients_noclaims_24ho_firstappear,
        medicaid_patients_firstappear_24hour[~has_expend_24ho],
    ],
    axis=0,
    sort=True,
)

has_expend_6mo = medicaid_patients_firstappear_6mo.MedicaidSystemID.isin(
    ptlevel_expend_6mo_wNTL.MedicaidSystemID
)
medicaid_patients_noclaims_6mo_tobind = pd.concat(
    [
        medicaid_patients_noclaims_6mo_firstappear,
        medicaid_patients_firstappear_6mo[~has_expend_6mo],
    ],
    axis=0,
    sort=True,
)

In [58]:
## add in matched beneficiaries with no claims, with expenditures coded to 0
expend_outcomes = ["total_expenditures"]
ptlevel_expend_noclaims_24ho, ptlevel_expend_noclaims_6mo = (
    code_noclaims(noclaims_benes, outcome_varnames=expend_outcomes, code_to=0)
    for noclaims_benes in (
        medicaid_patients_noclaims_24ho_tobind,
        medicaid_patients_noclaims_6mo_tobind,
    )
)

## 4.3 Rowbind and summarize outcomes

In [59]:
## label which group each row belongs to before rowbinding
for frame in (ptlevel_expend_24ho_wNTL, ptlevel_expend_6mo_wNTL):
    frame["participant_group"] = "Matched + claims"

for frame in (ptlevel_expend_noclaims_24ho, ptlevel_expend_noclaims_6mo):
    frame["participant_group"] = "Matched no claims"

## columns present in both the no-claims and the with-claims frames
cols_touse_expend = set(ptlevel_expend_noclaims_24ho.columns) & set(
    ptlevel_expend_24ho_wNTL.columns
)

In [60]:
## rowbind beneficiaries with expenditures and beneficiaries coded to zero.
## cols_touse_expend is a set: pandas 2.0 no longer accepts a set as a column
## indexer, and a set's iteration order is arbitrary, so select the shared
## columns through a sorted list instead.
expend_cols_ordered = sorted(cols_touse_expend)

ptlevel_expend_allbenefic_24ho = pd.concat(
    [
        ptlevel_expend_24ho_wNTL[expend_cols_ordered],
        ptlevel_expend_noclaims_24ho[expend_cols_ordered],
    ],
    axis=0,
    sort=True,
)

ptlevel_expend_allbenefic_6mo = pd.concat(
    [
        ptlevel_expend_6mo_wNTL[expend_cols_ordered],
        ptlevel_expend_noclaims_6mo[expend_cols_ordered],
    ],
    axis=0,
    sort=True,
)

In [61]:
## both windows should contain the same number of beneficiary rows
n_rows_expend_24ho = ptlevel_expend_allbenefic_24ho.shape[0]
n_rows_expend_6mo = ptlevel_expend_allbenefic_6mo.shape[0]
assert n_rows_expend_24ho == n_rows_expend_6mo

## and no beneficiary may be missing from the expenditures data
missing_bene_exp = set(bene_all) - set(
    ptlevel_expend_allbenefic_6mo.MedicaidSystemID.to_list()
)
assert not missing_bene_exp

In [62]:
## summarize total expenditures by group for each window
for banner, frame in (
    (
        "-------------------------------\nmean within 24 hours when we include all beneficiaries",
        ptlevel_expend_allbenefic_24ho,
    ),
    (
        "-------------------------------\nmean within 6 months when we include all beneficiaries",
        ptlevel_expend_allbenefic_6mo,
    ),
):
    for var in ["total_expenditures"]:
        print(banner)
        summarize_bygroup(df=frame, col=var, col_type="continuous")

-------------------------------
mean within 24 hours when we include all beneficiaries
dispo_broad
NTL control     1091.4425
NTL treatment   1253.0608
Name: total_expenditures, dtype: float64
-------------------------------
mean within 6 months when we include all beneficiaries
dispo_broad
NTL control     9056.1040
NTL treatment   8839.2208
Name: total_expenditures, dtype: float64


# 5. Create indicators for different quantiles of pre-call expenditures

As prespecified, later will look at heterogeneous impacts on expenditures among different percentiles of pre-call expenditures

## 5.1 Load data,  aggregate expenditures to the beneficiary level, and left join onto expenditures analytic df

In [63]:
## read the pre-call claims with MedicaidSystemID forced to object dtype
## (presumably so the IDs match the key used in the merges below -- confirm)
precall_dtypes = {"MedicaidSystemID": "object"}
claims_precall = pd.read_csv(CLAIMS_BEFORECALL_FILE, dtype=precall_dtypes)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [64]:
## column name used for the pre-call expenditure total from here on
precall_expend = "total_expenditures_precall"
## summarize expenditures from the pre-call claims
ptlevel_expend_precall = summarize_expenditures(df=claims_precall)

In [65]:
## rename the total in place so it does not collide with the post-call
## total_expenditures column once the frames are merged
precall_rename_map = {"total_expenditures": precall_expend}
ptlevel_expend_precall.rename(columns=precall_rename_map, inplace=True)

In [66]:
## left join pre-call expenditures onto each window's expenditures dataset
## using the medicaid system ID, so every beneficiary row carries its
## pre-call total
precall_cols = ptlevel_expend_precall[["MedicaidSystemID", precall_expend]]

ptlevel_expend_allbenefic_24ho_wprecall = ptlevel_expend_allbenefic_24ho.merge(
    precall_cols, how="left", on="MedicaidSystemID"
)
ptlevel_expend_allbenefic_6mo_wprecall = ptlevel_expend_allbenefic_6mo.merge(
    precall_cols, how="left", on="MedicaidSystemID"
)

## those missing from the pre-call expenditures data had 0 claims in the
## 6 months before the call, so fill NA with zero in both windows
for merged in (
    ptlevel_expend_allbenefic_24ho_wprecall,
    ptlevel_expend_allbenefic_6mo_wprecall,
):
    merged[precall_expend] = merged[precall_expend].fillna(0)

## 5.2 Code whether people are in the top 10% or top 5% of pre-call expenditures (inclusive)

In [67]:
## quantiles (in steps of 0.05) of pre-call expenditures; they are the same
## in both windows since the same 6 months pre-call data was left joined
## onto the same beneficiary data
quantiles = np.arange(0, 1, 0.05).tolist()
precall_totals = ptlevel_expend_allbenefic_6mo_wprecall[precall_expend]
quantiles_expend = precall_totals.quantile(quantiles).to_dict()
print("Quantiles of expenditures in 6 months pre-call are:----------------------------")
quantiles_expend
## keep only the cutoffs at or above the 0.9 quantile
quantiles_expend_touse = {
    level: cutoff for level, cutoff in quantiles_expend.items() if level >= 0.9
}

Quantiles of expenditures in 6 months pre-call are:----------------------------


{0.0: 0.0,
 0.05: 0.0,
 0.1: 0.0,
 0.15000000000000002: 0.0,
 0.2: 0.0,
 0.25: 6.835,
 0.30000000000000004: 57.72800000000006,
 0.35000000000000003: 141.24199999999993,
 0.4: 244.60600000000022,
 0.45: 426.335,
 0.5: 686.72,
 0.55: 1111.1250000000005,
 0.6000000000000001: 1744.4500000000007,
 0.65: 2586.7059999999997,
 0.7000000000000001: 3702.724,
 0.75: 5297.91,
 0.8: 7753.93,
 0.8500000000000001: 12398.957000000002,
 0.9: 18346.434000000005,
 0.9500000000000001: 39541.05299999991}

In [68]:
## code top-quantile indicators on each window's data using the retained
## cutoffs
ptlevel_expend_24ho_windicators, ptlevel_expend_6mo_windicators = (
    code_topquantile(frame, quantiles_expend_touse)
    for frame in (
        ptlevel_expend_allbenefic_24ho_wprecall,
        ptlevel_expend_allbenefic_6mo_wprecall,
    )
)

## 5.3 Rename columns and merge

In [69]:
## tag each non-shielded column with the window it was measured in
ptlevel_expend_24ho_windicators.columns = [
    name if name in cols_toshield else name + "_24ho"
    for name in ptlevel_expend_24ho_windicators.columns
]
ptlevel_expend_6mo_windicators.columns = [
    name if name in cols_toshield else name + "_6mo"
    for name in ptlevel_expend_6mo_windicators.columns
]

In [70]:
## merge
ptlevel_expend_allbenefic_forfin = pd.merge(
    ptlevel_expend_24ho_windicators,
    ptlevel_expend_6mo_windicators.drop(
        columns=["dispo_broad", "event_status", "num_1"], axis=1, inplace=False
    ),
    how="left",
    on="MedicaidSystemID",
    indicator="check_merge",
)

assert all(x == "both" for x in ptlevel_expend_allbenefic_forfin.check_merge)

# 6. Merge all outcomes into same dataset to write


## 6.1 Main focus: beneficiaries only

Merge on `MedicaidSystemID` 

In [71]:
## merge ED + PCP: the ED frame is the base; from the PCP frame drop the
## columns the ED frame already carries, plus its old merge indicator
pcp_cols_already_in_ed = [
    "dispo_broad",
    "event_status",
    "num_1",
    "participant_group_24ho",
    "participant_group_6mo",
    "check_merge",
]
ed_for_merge = ptlevel_ed_allbenefic_forfin.drop(columns=["check_merge"])
pcp_for_merge = ptlevel_PCP_allbenefic_forfin.drop(columns=pcp_cols_already_in_ed)

ptlevel_benefic_edpcp = ed_for_merge.merge(
    pcp_for_merge,
    how="left",
    on="MedicaidSystemID",
    indicator="check_merge",
)

## every ED row must have found a PCP counterpart
assert (ptlevel_benefic_edpcp.check_merge == "both").all()

## Add expenditure outcomes: left-join them onto the ED + PCP frame by
## MedicaidSystemID, dropping the columns the expenditure frame shares with it.
## A second indicator name is used because "check_merge" already exists on the left.
ptlevel_benefic_edpcpexp = ptlevel_benefic_edpcp.merge(
    ptlevel_expend_allbenefic_forfin.drop(
        columns=[
            "dispo_broad",
            "event_status",
            "num_1",
            "participant_group_24ho",
            "participant_group_6mo",
        ]
    ),
    how="left",
    on="MedicaidSystemID",
    indicator="check_merge_2",
)

## every ED + PCP row must have an expenditure counterpart
assert (ptlevel_benefic_edpcpexp.check_merge_2 == "both").all()

## Load the static attributes of Medicaid beneficiaries, reading the ID as
## object dtype rather than letting pandas infer a numeric type.
ptlevel_stat_attributes = pd.read_csv(
    STATIC_ATTRIBUTES_FILE, dtype={"MedicaidSystemID": "object"}
)

## Left join the static attributes onto the outcomes; "static_status" records
## whether each outcome row found a matching attributes row.
## NOTE(review): no `on=` is given, so pandas joins on every column name the two
## frames share -- presumably only MedicaidSystemID; confirm.
ptlevel_benefic_wattributes = ptlevel_benefic_edpcpexp.merge(
    ptlevel_stat_attributes,
    how="left",
    indicator="static_status",
)

## Tabulate 6-month participant group against the static-attribute merge status.
## Rows without attributes ("left_only") are presumably beneficiaries with no
## claims in the 6 months before the call (not reflected in this table) or in
## the 6 months after it (reflected here), given that the attributes come from
## the claims data.
pd.crosstab(
    index=ptlevel_benefic_wattributes["participant_group_6mo"],
    columns=ptlevel_benefic_wattributes["static_status"],
)

## Split the columns into merge-status indicators (names starting with
## "check_merge") and everything else; only the latter is written out.
cols_mergestatus = [
    name
    for name in ptlevel_benefic_wattributes.columns
    if str(name).startswith("check_merge")
]
cols_tokeep = [
    name
    for name in ptlevel_benefic_wattributes.columns
    if not str(name).startswith("check_merge")
]

static_status,left_only,both
participant_group_6mo,Unnamed: 1_level_1,Unnamed: 2_level_1
Matched + claims,0,2679
Matched no claims,308,80


In [72]:
## Write the beneficiary-only outcomes (merge-status columns excluded) to CSV,
## without the row index.
ptlevel_benefic_wattributes.loc[:, cols_tokeep].to_csv(
    path_or_buf=PTLEVEL_WOUTCOMES_BENEFICONLY,
    index=False,
)

##  6.2 Beneficiaries + imputed values for non-matches

Merge on `num_1` rather than `MedicaidSystemID`


In [73]:
## Combine ED and PCP outcomes for the full sample: left-join PCP onto ED by
## num_1. The earlier merge indicator is removed from both sides, and the
## columns the PCP frame shares with the ED frame are removed from the PCP side.
ptlevel_all_edpcp = ptlevel_ed_all_forfin.drop(columns=["check_merge_all"]).merge(
    ptlevel_PCP_all_forfin.drop(
        columns=[
            "dispo_broad",
            "event_status",
            "MedicaidSystemID",
            "participant_group_24ho",
            "participant_group_6mo",
            "check_merge_all",
        ]
    ),
    how="left",
    on="num_1",
    indicator="check_merge",
)


## every ED row must have a PCP counterpart
assert (ptlevel_all_edpcp.check_merge == "both").all()

## Drop only the merge-status indicator before writing; the frame is otherwise
## written as is because it is only used for robustness checks.
## Compare with `!=` rather than `not in "check_merge"`: `in` against a string
## is a substring test, so any column whose name happens to be a substring of
## "check_merge" (e.g. "check" or "merge") would have been silently dropped too.
cols_tokeep = [col for col in ptlevel_all_edpcp.columns if col != "check_merge"]

## write to csv
ptlevel_all_edpcp[cols_tokeep].to_csv(PTLEVEL_WOUTCOMES_FORROBUST, index=False)