# 0. Imports and functions

In [None]:
import re
from typing import Union

import numpy as np
import pandas as pd

## output
from IPython.core.interactiveshell import InteractiveShell

from femsntl.datafiles import INTERMEDIATE_DIR, PRIVATE_DATA_DIR

# Display the result of every top-level expression in a cell, not only the last
# one (later cells rely on this to show several tables from a single cell).
InteractiveShell.ast_node_interactivity = "all"
# Show full frames when inspecting data: no cap on columns, rows, or cell width.
pd.set_option("display.max_columns", None)  # or 1000
pd.set_option("display.max_rows", None)  # or 1000
# None (not -1) is the supported "no limit" value; -1 was deprecated in
# pandas 1.0 and is rejected by current releases.
pd.set_option("display.max_colwidth", None)

In [None]:
# Presumably forces a fresh SQL pull even when a cached copy already exists.
# NOTE(review): not referenced anywhere in the visible cells -- confirm it is
# still needed.
PULL_SQL_EVEN_IF_EXISTS = False

# 1. Load data from previous script

In [None]:
# Load the analytic sample from the intermediate directory.
# NOTE(review): unpickling can execute arbitrary code; only load files produced
# by this project's own pipeline.
ntl_analytic = pd.read_pickle(INTERMEDIATE_DIR / "ntl_withsafetypad.pkl")

In [None]:
## Report the number of unique participants overall and in each study arm
print("Number of unique participants at start:", ntl_analytic.num_1.nunique())
ntl_tx = ntl_analytic.loc[ntl_analytic.dispo_broad == "NTL treatment"].copy()
ntl_control = ntl_analytic.loc[ntl_analytic.dispo_broad == "NTL control"].copy()
print("Number of unique tx group", ntl_tx.num_1.nunique())
# Count from ntl_control; this previously re-used ntl_tx and therefore repeated
# the treatment-group count under the control-group label.
print("Number of unique control group", ntl_control.num_1.nunique())

## get number in other categories and why
## (one row per participant among those in neither the treatment nor control arm)
non_txcont = (
    ntl_analytic.loc[~ntl_analytic.dispo_broad.isin(["NTL control", "NTL treatment"])]
    .copy()
    .drop_duplicates(subset="num_1")
)
pd.crosstab(non_txcont.dispo_broad, non_txcont.event_status)

# 2. Load AMR data without medicaid ids

In [None]:
# Read the first sheet of the AMR workbook (the extract without Medicaid ids).
amr_df = pd.read_excel(PRIVATE_DATA_DIR / "amr_df.xlsx", sheet_name=0)

# 3. Load AMR data with Medicaid ids

In [None]:
# Read the first sheet of the Medicaid-id workbook as-is.
# NOTE(review): the next cell re-reads the same file with skiprows=2 and
# overwrites this frame -- presumably this read is only for inspecting the raw
# layout; confirm whether it can be dropped.
amr_df_withmedicaid = pd.read_excel(
    PRIVATE_DATA_DIR / "dc_fems_medicaidids.xlsx", sheet_name=0
)

In [None]:
# Re-read the Medicaid-id workbook, skipping the first two rows of the sheet
# (presumably non-data header rows -- TODO confirm against the file).
amr_df_withmedicaid = pd.read_excel(
    PRIVATE_DATA_DIR / "dc_fems_medicaidids.xlsx", sheet_name=0, skiprows=2
)

# 4. Create flags for which participants are in which data

## 4.1 Medicaid ids

In [None]:
## Keep only rows whose "Personal ID Number" is not the "-" marker
## (presumably the placeholder for a missing Medicaid id)
amr_respondents_withmedids = amr_df_withmedicaid[
    amr_df_withmedicaid["Personal ID Number"].ne("-")
].copy()

## Report how many distinct Medicaid ids and distinct FEMS ids remain
print(
    f"There are {len(amr_respondents_withmedids['Personal ID Number'].unique())}"
    " unique medicaid ids, corresponding to"
    f" {len(amr_respondents_withmedids['FEMSID'].unique())} calls"
)

In [None]:
## FEMS ids attached to rows that have a Medicaid id
fems_ids_forresp_withmedicaid = amr_respondents_withmedids["FEMSID"].unique()

## How many of those ids also appear as num_1 in the analytic sample
found_in_analytic = set(fems_ids_forresp_withmedicaid) & set(ntl_analytic["num_1"])
print(
    f"But {len(found_in_analytic)}"
    " of their FEMS IDs are found in analytic sample of NTL callers"
)

### 4.1.1 Since the IDs are off, try merging with the AMR data on an exact match of name and DOB

In [None]:
## Medicaid-id rows: upper-cased first/last name plus DOB, joined with "_"
amr_respondents_withmedids["fname_cap"] = (
    amr_respondents_withmedids["PatientFName"].astype(str).str.upper()
)
amr_respondents_withmedids["lname_cap"] = (
    amr_respondents_withmedids["PatientLName"].astype(str).str.upper()
)
amr_respondents_withmedids["name_dob"] = (
    amr_respondents_withmedids["fname_cap"]
    + "_"
    + amr_respondents_withmedids["lname_cap"]
    + "_"
    + amr_respondents_withmedids["DOB"].astype(str)
)

## AMR rows: same upper-cased name columns, and the date of birth as text with
## any " 00:00:00" time suffix removed
amr_df["fname_cap"] = amr_df["PatientFName"].astype(str).str.upper()
amr_df["lname_cap"] = amr_df["PatientLName"].astype(str).str.upper()
amr_df["dob_strip0"] = [
    str(dob).replace(" 00:00:00", "") for dob in amr_df["DateofBirth"]
]

In [None]:
## Same FIRST_LAST_DOB key on the AMR rows
amr_df["name_dob"] = (
    amr_df["fname_cap"] + "_" + amr_df["lname_cap"] + "_" + amr_df["dob_strip0"]
)

## Count the keys shared by the two sources
print(
    f"There are {len(set(amr_df['name_dob']) & set(amr_respondents_withmedids['name_dob']))}"
    " exact matches on name and dob to add medicaid ids out of"
    f" {len(amr_respondents_withmedids['Personal ID Number'].unique())}"
    " Medicaid IDs"
)

In [None]:
## left join relevant columns so that those participants
## have medicaid ids
## Attach the Medicaid id to every AMR row that shares a name_dob key;
## a left join keeps AMR rows without a match (their id is NA)
amr_df_withmedid = amr_df.merge(
    amr_respondents_withmedids[["Personal ID Number", "name_dob"]].drop_duplicates(),
    how="left",
    on="name_dob",
)

## The number of distinct calls should be the same before and after the merge
print(f"There are {len(amr_df['FEMSID'].unique())} unique calls pre-merge")
print(f"There are {len(amr_df_withmedid['FEMSID'].unique())} unique calls post-merge")

## 4.2 create categories of identifiers

In [None]:
## create different status codes
## Every participant id in the analytic sample
all_ids = ntl_analytic["num_1"].unique().tolist()

## Ids with a non-null incident_number (i.e. with a Safety PAD record)
ids_insafetyPAD = (
    ntl_analytic.loc[ntl_analytic["incident_number"].notna(), "num_1"]
    .unique()
    .tolist()
)

## AMR ids split by whether a Medicaid id was attached in the merge
ids_inAMR_nomedicaidid = (
    amr_df_withmedid.loc[amr_df_withmedid["Personal ID Number"].isna(), "FEMSID"]
    .unique()
    .tolist()
)
ids_inAMR_medicaidid = (
    amr_df_withmedid.loc[amr_df_withmedid["Personal ID Number"].notna(), "FEMSID"]
    .unique()
    .tolist()
)

## All AMR ids, with or without a Medicaid id
ids_inAMR = [*ids_inAMR_medicaidid, *ids_inAMR_nomedicaidid]

In [None]:
## create different categories
## Split ids into four groups by presence in the AMR and Safety PAD data
ids_AMRsafetyPAD = np.unique(list(set(ids_insafetyPAD) & set(ids_inAMR)))
print(f"There are {len(ids_AMRsafetyPAD)} ids in both AMR data and safety pad data")

ids_safetyPAD_notAMR = np.unique(list(set(ids_insafetyPAD) - set(ids_inAMR)))
print(f"There are {len(ids_safetyPAD_notAMR)} ids in safetyPAD but not AMR")

ids_AMR_notsafetyPAD = np.unique(list(set(ids_inAMR) - set(ids_insafetyPAD)))
print(f"There are {len(ids_AMR_notsafetyPAD)} ids in AMR but not safetypad")

## Ids found in either source, then the analytic-sample ids found in neither
ids_AMR_safetyPAD = ids_insafetyPAD + ids_inAMR
ids_neitherAMR_norsafetyPAD = np.unique(list(set(all_ids) - set(ids_AMR_safetyPAD)))
print(f"There are {len(ids_neitherAMR_norsafetyPAD)} ids in neither AMR nor safetypad")

In [None]:
## use the ntl analytic as base, and code id status
## Label each analytic row by the data sources its id appears in.
## Conditions are tested in order and the first match wins; the "neither" group
## is listed explicitly even though it should be the remainder, so anything
## unmatched falls through to "Other".
ntl_analytic["data_status"] = np.select(
    [
        ntl_analytic["num_1"].isin(ids_AMRsafetyPAD),
        ntl_analytic["num_1"].isin(ids_safetyPAD_notAMR),
        ntl_analytic["num_1"].isin(ids_AMR_notsafetyPAD),
        ntl_analytic["num_1"].isin(ids_neitherAMR_norsafetyPAD),
    ],
    [
        "In both Safety PAD and AMR data",
        "In Safety PAD but not AMR",
        "In AMR but not SafetyPAD",
        "In neither AMR nor safetyPAD",
    ],
    default="Other",
)

## Flag whether the id is one of the AMR ids that received a Medicaid id
ntl_analytic["medicaid_id_status"] = (
    ntl_analytic["num_1"]
    .isin(ids_inAMR_medicaidid)
    .map({True: "Has Medicaid id", False: "No Medicaid id"})
)

In [None]:
## keep each ids first appearance
## for purposes of summarizing ID status
## One row per id: order by date and keep the earliest row for each num_1
ntl_analytic_firstappearance = ntl_analytic.sort_values("date").drop_duplicates(
    "num_1"
)

In [None]:
## Counts per disposition (check that equal to 3032; 3023 before summarizing)
ntl_analytic_firstappearance["dispo_broad"].value_counts()
## Disposition by data-source status
pd.crosstab(
    ntl_analytic_firstappearance["dispo_broad"],
    ntl_analytic_firstappearance["data_status"],
)

In [None]:
## Disposition by Medicaid-id status
pd.crosstab(
    ntl_analytic_firstappearance["dispo_broad"],
    ntl_analytic_firstappearance["medicaid_id_status"],
)

In [None]:
## Event status by data-source status
pd.crosstab(
    ntl_analytic_firstappearance["event_status"],
    ntl_analytic_firstappearance["data_status"],
)

# 5. Merge the data, creating different columns for identifiers from different sources

Left join so that all respondents are retained. Those with records in both sources will have identifiers from both; those with
records in neither will be NA in both; and so on.

In [None]:
## Prefix every AMR column with "amr_" except the FEMSID join key
amr_newcolnames = [
    col if col == "FEMSID" else "amr_" + col for col in amr_df_withmedid.columns
]

In [None]:
# Apply the prefixed names in place.
# NOTE(review): amr_newcolnames is derived from the current column names, so
# re-running the previous cell after this one would add the prefix a second time.
amr_df_withmedid.columns = amr_newcolnames

In [None]:
## left join onto main data
## Left join the AMR columns onto the analytic data (num_1 matches FEMSID),
## keeping every analytic row whether or not it has an AMR record
ntl_analytic_withamr = ntl_analytic.merge(
    amr_df_withmedid, how="left", left_on="num_1", right_on="FEMSID"
)

In [None]:
def convert_phone_to_int(phone_number: Union[int, str]) -> Union[int, str]:
    """Normalise a phone number to a string containing only its digits.

    Despite the name, the result is a ``str`` (the digits are passed through
    ``int`` and back, so leading zeros are dropped).

    Args:
        phone_number: The raw phone number, e.g. ``2025551234`` or
            ``"202-555-1234"``.

    Returns:
        The digits-only string when the value can be interpreted as a number;
        otherwise ``phone_number`` itself, unchanged (for example a string with
        no digits, ``None``, or a float NaN).
    """
    try:
        return str(int(phone_number))
    except (TypeError, ValueError, OverflowError):
        # Not directly convertible (formatted string, None, NaN, inf);
        # fall through to the digit-stripping attempt.
        pass

    try:
        # Assume a string and drop every non-digit character.
        return str(int(re.sub(r"[^0-9]", "", phone_number)))
    except (TypeError, ValueError):
        # TypeError: not a string at all (e.g. NaN); ValueError: no digits left.
        # Best effort: hand the value back untouched rather than raising.
        return phone_number


## Normalise the AMR phone number to a digit string; missing values are skipped
ntl_analytic_withamr["amr_ApplicantsPhone"] = ntl_analytic_withamr[
    "amr_ApplicantsPhone"
].map(convert_phone_to_int, na_action="ignore")

## Force non-missing last names to str
ntl_analytic_withamr["amr_PatientLName"] = ntl_analytic_withamr[
    "amr_PatientLName"
].map(str, na_action="ignore")

In [None]:
## also write medicaid id data in case useful after names clean up
## Also save the AMR rows that have a Medicaid id, in case they are useful
## once the names have been cleaned up
amr_df_withmedid.loc[amr_df_withmedid["amr_Personal ID Number"].notna()].to_csv(
    INTERMEDIATE_DIR / "medicaid_ids.csv", index=False
)

In [None]:
## write data to use in lookup script (that script will do cleaning)
# Persist the merged analytic + AMR frame to the intermediate directory for the
# lookup script to consume.
ntl_analytic_withamr.to_pickle(INTERMEDIATE_DIR / "ntl_withsafetypad_withamr.pkl")