# 0. Imports and functions

In [None]:
import re
from typing import Iterable, Optional, Union

import gender_guesser.detector as gender
import numpy as np
import pandas as pd
from dateutil import parser
from fuzzywuzzy import fuzz, process
from IPython.core.interactiveshell import InteractiveShell

from femsntl.datafiles import EXTERNAL_DIR, INTERMEDIATE_DIR, PRIVATE_DATA_DIR
from femsntl.utils import (
    clean_amr_names,
    extract_DOB_fromname,
    process_safetypad_names,
    standardize_month,
    standardize_year,
)

# Notebook-wide display configuration.
# "all" makes IPython echo every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Passing None lifts pandas' display limits so frames and long strings print untruncated.
pd.set_option("display.max_columns", None)  # or 1000
pd.set_option("display.max_rows", None)  # or 1000
pd.set_option("display.max_colwidth", None)

# NOTE(review): this flag is not read anywhere in the visible notebook; presumably a
# leftover switch for re-pulling SQL data -- confirm before relying on or removing it.
PULL_SQL_EVEN_IF_EXISTS = False

## Functions

In [None]:
def non_na_unique(all_rows):
    """Return ``all_rows`` with its missing entries removed (``dropna()``).

    NOTE(review): despite the name, the result is NOT de-duplicated; repeated
    values are kept. Confirm with callers whether ``.unique()`` was intended.
    """
    return all_rows.dropna()


def try_parser(date):
    """Parse a free-form date and return it as a ``YYYY-MM-DD`` string.

    Parameters:
        - date: value handed to ``dateutil.parser.parse`` (normally a string)

    Returns:
        - The date formatted with ``"%Y-%m-%d"``, or ``None`` when the value
          cannot be parsed or formatted (e.g. NaN, empty or malformed text).
    """
    try:
        return parser.parse(date).strftime("%Y-%m-%d")
    # Only swallow the failures a bad date value can produce (ParserError is a
    # ValueError subclass; non-string input raises TypeError). A bare `except:`
    # would also hide KeyboardInterrupt and genuine programming errors.
    except (ValueError, TypeError, OverflowError):
        return None


## gets list of ids for each matched pair
def find_ids_foramatch(
    orig_name,
    matched_name,
    id_lookup: pd.DataFrame,
    colname_originaldf: str,
    id_colname: str = "num_1",
):
    """
    Collect every id attached to either name of one fuzzy-matched pair.

    Takes in:
        - orig_name, matched_name: the two names of a fuzzy-matching result row
          (typically passed in via ``apply`` and a lambda)
        - id_lookup: original data holding one name column and one id column
        - colname_originaldf: name of the name column in ``id_lookup``
        - id_colname: name of the id column in ``id_lookup`` (default "num_1")

    Returns:
        - pd.Series indexed by ["all_names", "all_ids"]:
            all_names -- the two names, sorted and joined with "; "
            all_ids   -- the unique ids of both names, sorted and joined with "; "
    """

    name_col = id_lookup[colname_originaldf]
    in_pair = (name_col == orig_name) | (name_col == matched_name)

    # sort before stringifying so numeric ids keep their numeric order
    pair_ids = sorted(set(id_lookup.loc[in_pair, id_colname].tolist()))
    # sorting the names makes (A, B) and (B, A) produce the same row
    pair_names = sorted([orig_name, matched_name])

    return pd.Series(
        [
            "; ".join(pair_names),
            # str() keeps the join from failing on non-string ids
            "; ".join(str(one_id) for one_id in pair_ids),
        ],
        index=["all_names", "all_ids"],
    )

# 1. Load data from previous notebook 

The input is the result of left joins of 1) the base NTL data with 2) the SafetyPAD data (script 0) and 3) the AMR data (script 1). It does not incorporate Medicaid ids because of the issues documented in the previous script (none of the ids overlap with the analytic window/sample).



In [None]:
# Load the merged analytic frame from the intermediate directory.
# NOTE(review): unpickling can execute arbitrary code -- only load files produced by
# this project's own pipeline.
df_analytic = pd.read_pickle(INTERMEDIATE_DIR / "ntl_withsafetypad_withamr.pkl")

# 2. Clean names from safety PAD

In [None]:
## SafetyPAD name columns: first_name and last_name
## (AMR name columns carry an amr prefix)
## build the key as 1) upper case, 2) FIRST_LAST,
## 3) with leading/trailing whitespace stripped
## NOTE(review): strip() only trims the ends of the combined string, so spaces
## around the underscore (e.g. "JOHN _DOE") survive -- confirm this is acceptable
df_analytic["name_safetypad_init"] = (
    df_analytic["first_name"].str.upper() + "_" + df_analytic["last_name"].str.upper()
)
df_analytic["name_safetypad"] = df_analytic["name_safetypad_init"].str.strip()

In [None]:
# (an older version didn't clip leading spaces)
# One row per distinct (id, SafetyPAD name) pair
name_and_id = (
    df_analytic[["num_1", "name_safetypad"]]
    .dropna(subset=["name_safetypad"])
    .drop_duplicates()
    .copy()
)
# Number the names within each id: safetypad_name_1, safetypad_name_2, ...
name_and_id["obs"] = (
    name_and_id.groupby("num_1")
    .transform("cumcount")
    .apply(lambda position: "safetypad_name_" + str(position + 1))
)
# Wide form: one row per id, one column per numbered name
name_and_id = name_and_id.pivot(
    index="num_1", columns="obs", values="name_safetypad"
).reset_index()
df_analytic_withnames = df_analytic.merge(name_and_id, on="num_1", how="left")

# 3. Clean names from AMR

## 3.1  Explore 3 names fields:

1. amr_PatientFName
2. amr_PatientLName
3. amr_ApplicantsName

In [None]:
# Report how many distinct values each of the three AMR name fields holds
for amr_col, label in (
    ("amr_PatientFName", "first names (but usually contains other info)"),
    ("amr_PatientLName", "last names"),
    ("amr_ApplicantsName", "combined names"),
):
    print(
        f"There are {df_analytic_withnames[amr_col].nunique()} unique {label} in AMR data"
    )

## 3.2  Clean firstnames column
1. remove DOB
2. Split at comma into last name, first name
3. Capitalize and paste into form: FIRSTNAME_LASTNAME for similarity with safety pad identifiers


In [None]:
## list of non-names appearing in the name field to remove
## (placeholders, insurance/payer words, age/DOB words and common misspellings);
## passed as `non_names=` to clean_amr_names in the next cell
## NOTE(review): order is left untouched in case the helper is order-sensitive
non_names = [
    "UNK",
    "CALLER",
    "UNKNOWN",
    "MEDICAID",
    "FEMS",
    "DOB",
    "YEARS",
    "OLD",
    "MEDCAID",
    "MEDICIAD",
    "UNINSURED",
    "REFUSED",
    "FFS",
    "BLUE",
    "CROSS",
    "SHIELD",
    "PRIVATE",
    "INSURANCE",
    "MEDICARE",
    "AND",
    "MD",
    "CARE",
    "NUMBER",
    "FIRST",
    "NO",
    "COMMERCIAL",
    "AMERIGROUP",
    "VA",
    "AMRIHEALTH",
    "KAISER",
    "DC",
    "TRUSTED",
    "PER",
    "NURSE",
]

In [None]:
## run the project's name cleaner over every raw AMR first-name entry
df_analytic_withnames["amr_fname_cleaned"] = [
    clean_amr_names(raw_name, non_names=non_names)
    for raw_name in df_analytic_withnames["amr_PatientFName"].tolist()
]

## one column per space-delimited token: amr_cleaned_name1, amr_cleaned_name2, ...
df_amr_splitnames = df_analytic_withnames["amr_fname_cleaned"].str.split(
    " ", expand=True
)
df_amr_splitnames.columns = [
    f"amr_cleaned_name{token_no}"
    for token_no in range(1, df_amr_splitnames.shape[1] + 1)
]

## column bind the tokens onto the analytic data
df_analytic_withnames_withcleaned = pd.concat(
    [df_analytic_withnames, df_amr_splitnames], axis=1
)


## build token2_token1 so the AMR name has the same shape as the
## SafetyPAD FIRST_LAST key; stays missing when token 1 (or token 2) is missing
df_analytic_withnames_withcleaned["amr_tocompare_wsafetypad"] = np.where(
    df_analytic_withnames_withcleaned["amr_cleaned_name1"].notnull(),
    df_analytic_withnames_withcleaned["amr_cleaned_name2"]
    + "_"
    + df_analytic_withnames_withcleaned["amr_cleaned_name1"],
    np.nan,
)

# 4. Clean non-name identifiers

Right now, this section only cleans the identifiers that come from CAD.
There are also some SafetyPAD addresses that we could add later.
The purpose of cleaning these before doing more with names is to pave the way for incorporating them in the future.

## 4.1 Clean address from CAD

In [None]:
## first, clean up the address string:
## upper-case, collapse whitespace runs, drop periods,
## and remove a trailing "WASHINGTON" / "WASHINGTON DC"
def _clean_cad_address(one_address):
    """Return the normalized form of one CAD address ("" when missing)."""
    if pd.isna(one_address):
        return ""
    # trim BEFORE applying the end-anchored city pattern: a trailing space
    # would otherwise stop "WASHINGTON$" from ever matching
    normalized = re.sub(r"\s+", " ", str(one_address).upper()).rstrip()
    # remove periods in their own pass so "WASHINGTON D.C." becomes
    # "WASHINGTON DC" and can be caught by the city pattern below
    normalized = normalized.replace(".", "")
    return re.sub(r"WASHINGTON$|WASHINGTON DC$", "", normalized).rstrip()


cleaning_one = [
    _clean_cad_address(one_address)
    for one_address in df_analytic_withnames_withcleaned.cstr_add
]

In [None]:
# Store the cleaned addresses and report how much the cleaning collapsed them
df_analytic_withnames_withcleaned["cleaned_address_CAD"] = cleaning_one
print(
    f"Originally there were {df_analytic_withnames_withcleaned['cstr_add'].nunique()}"
    " unique addresses"
)
print(
    f"Reduced to {df_analytic_withnames_withcleaned['cleaned_address_CAD'].nunique()}"
    " unique addresses after cleaning"
)

## 4.2 Clean phone numbers from CAD

In [None]:
# Strip dashes, whitespace and periods from each caller number; missing values
# and known placeholder entries become None
clean_numbers = []
for number in df_analytic_withnames_withcleaned["clrnum"]:
    if not pd.isna(number):
        number = re.sub(r"\-|\s+|\.", "", str(number))
        is_placeholder = number in (
            "1111111111",
            "NOPHONE",
            "TESTCALL",
            "RADIO",
            "11111111111111",
        )
        clean_numbers.append(None if is_placeholder else number)
    else:
        clean_numbers.append(None)

df_analytic_withnames_withcleaned["cleaned_numbers_CAD"] = clean_numbers

# unique() counts a missing value as one distinct entry
print(
    f"Before cleaning, there were {len(df_analytic_withnames_withcleaned['clrnum'].unique())}"
    " unique numbers"
)
print(
    f"After cleaning, there are {len(df_analytic_withnames_withcleaned['cleaned_numbers_CAD'].unique())}"
    " unique numbers"
)

## 4.3: Clean DOB (in this section, only available for AMR)

In [None]:
# Pull DOB candidates out of the two AMR name fields with the project helper
for source_col, target_col in (
    ("amr_PatientFName", "digits_fromfirstname"),
    ("amr_PatientLName", "digits_fromlastname"),
):
    df_analytic_withnames_withcleaned[target_col] = df_analytic_withnames_withcleaned[
        source_col
    ].apply(extract_DOB_fromname)

# Cascade for birthday: explicit DOB first, then the first-name value, then the last-name value
df_analytic_withnames_withcleaned["amr_dob_updated"] = (
    df_analytic_withnames_withcleaned["amr_DateofBirth"]
    .fillna(df_analytic_withnames_withcleaned["digits_fromfirstname"])
    .fillna(df_analytic_withnames_withcleaned["digits_fromlastname"])
)

In [None]:
## create copy -- use this data later when adding dob to lookup table
df_datecleaning = df_analytic_withnames_withcleaned.copy()

## unify the delimiter (dashes -> slashes), then split into date pieces
df_datecleaning["dob_backslashdelim"] = df_datecleaning.amr_dob_updated.str.replace(
    "-", "/"
)
## plain "/" replaces the invalid "\/" escape; n=3 plus reindex guarantees exactly
## four columns, so the column assignment below cannot fail when the data
## happen to split into fewer (or more) pieces
df_dob_split = df_datecleaning.dob_backslashdelim.str.split(
    "/", n=3, expand=True
).reindex(columns=range(4))
df_dob_split.columns = ["month", "day", "year", "other"]

## a 4-character first piece means the value is year-first (Y/M/D):
## take month from the 2nd piece, year from the 1st, day from the 3rd
df_dob_split["reverse_order"] = np.where(
    df_dob_split.month.str.len() == 4, "switch_yearandmonth", "keep"
)
df_dob_split["month_toclean"] = np.where(
    df_dob_split.reverse_order == "switch_yearandmonth",
    df_dob_split.day,
    df_dob_split.month,
)
df_dob_split["year_toclean"] = np.where(
    df_dob_split.reverse_order == "switch_yearandmonth",
    df_dob_split.month,
    df_dob_split.year,
)
df_dob_split["day_toclean"] = np.where(
    df_dob_split.reverse_order == "switch_yearandmonth",
    # year-first values may carry a midnight timestamp after the day
    df_dob_split.year.str.replace(" 00:00:00", ""),
    df_dob_split.day,
)


## standardize month and year with the project helpers, then rebuild M/D/Y
month_clean = [standardize_month(str(month)) for month in df_dob_split.month_toclean]
df_dob_split["month_clean"] = month_clean
year_clean = [standardize_year(str(year)) for year in df_dob_split.year_toclean]
df_dob_split["year_clean"] = year_clean
df_datecleaning["updated_dob_toparse"] = (
    df_dob_split.month_clean
    + "/"
    + df_dob_split.day_toclean
    + "/"
    + df_dob_split.year_clean
)

In [None]:
# Parse each rebuilt M/D/Y string; try_parser yields None when parsing fails
dob_ymd = list(map(try_parser, df_datecleaning["updated_dob_toparse"]))
df_datecleaning["dob_ymd"] = dob_ymd

## 4.4 Code identifier status for different observations before merging in safetyPAD API data

Right now, we don't use addresses (they seem much less unique than phone numbers); we use phone number and name.

In [None]:
# Label each row by whether a cleaned CAD phone number is present
df_analytic_withnames_withcleaned["missing_CAD_phone"] = np.where(
    df_analytic_withnames_withcleaned["cleaned_numbers_CAD"].notnull(),
    "Has phone num.",
    "Missing phone num.",
)

In [None]:
# Presence labels for the first SafetyPAD name, the AMR comparison name and the DOB
df_analytic_withnames_withcleaned["missing_safetyPAD_name"] = np.where(
    df_analytic_withnames_withcleaned["safetypad_name_1"].notnull(),
    "Has safety pad name",
    "Missing safety pad name",
)

df_analytic_withnames_withcleaned["missing_AMR_name"] = np.where(
    df_analytic_withnames_withcleaned["amr_tocompare_wsafetypad"].notnull(),
    "Has AMR name",
    "Missing AMR name",
)

## DOB presence is judged on the cascaded (not yet parsed) DOB value
df_analytic_withnames_withcleaned["missing_DOB"] = np.where(
    df_analytic_withnames_withcleaned["amr_dob_updated"].notnull(),
    "Has DOB",
    "Missing DOB",
)

In [None]:
## redo identifier categories but with date of birth
## Four categories from (has any name?) x (has DOB?). The labels compared here
## must match the ones written into missing_DOB / missing_*_name exactly: the
## earlier version compared against "Missing DOB." (stray period), so
## "Missing name (either) and DOB" was never assigned and those rows were
## mislabelled "Missing name but has DOB".
dob_present = df_analytic_withnames_withcleaned.missing_DOB == "Has DOB"
dob_absent = df_analytic_withnames_withcleaned.missing_DOB == "Missing DOB"
any_name_present = (
    df_analytic_withnames_withcleaned.missing_safetyPAD_name == "Has safety pad name"
) | (df_analytic_withnames_withcleaned.missing_AMR_name == "Has AMR name")
no_name_present = (
    df_analytic_withnames_withcleaned.missing_safetyPAD_name
    == "Missing safety pad name"
) & (df_analytic_withnames_withcleaned.missing_AMR_name == "Missing AMR name")

## np.select takes the first matching condition, like the nested np.where it replaces
df_analytic_withnames_withcleaned["identifier_categories"] = np.select(
    [
        dob_present & any_name_present,
        dob_absent & any_name_present,
        dob_absent & no_name_present,
    ],
    [
        "Has both name (either amr or safety pad) and DOB",
        "Has name (either amr or safety pad) but no DOB",
        "Missing name (either) and DOB",
    ],
    default="Missing name but has DOB",
)
df_analytic_withnames_withcleaned.identifier_categories.value_counts()


## before crosstab, find unique (first row per NTL id)
df_forcrosstab = df_analytic_withnames_withcleaned.drop_duplicates(
    subset=["num_1"], keep="first"
)

In [None]:
# (rows, cols) of NTL control/treatment ids that lack a DOB
df_forcrosstab[
    df_forcrosstab["missing_DOB"].eq("Missing DOB")
    & df_forcrosstab["dispo_broad"].isin(["NTL control", "NTL treatment"])
].shape

# 5. For now, write non-matched data to .csv for ambulance use analysis

Ok to use since we're currently only looking at ambulance use for call $c$ rather than for an individual across calls

In [None]:
# Write the cleaned, not-yet-deduplicated frame. index=False is not passed, so the
# DataFrame index is written as the first (unnamed) CSV column.
df_analytic_withnames_withcleaned.to_csv(INTERMEDIATE_DIR / "df_forambulanceuse.csv")

# 6. Focusing on names, perform fuzzy matching to deduplicate names

## 6.1 clean data to prepare for fuzzy matching

In [None]:
df_forfuzzy = df_analytic_withnames_withcleaned.copy()  # copy df so has shorter name

## classify each row by which name sources are available
## (np.select picks the first condition that holds; default covers the rest)
df_forfuzzy["name_status"] = np.select(
    [
        (df_forfuzzy["missing_AMR_name"] == "Has AMR name")
        & (df_forfuzzy["missing_safetyPAD_name"] == "Has safety pad name"),
        (df_forfuzzy["missing_AMR_name"] == "Missing AMR name")
        & (df_forfuzzy["missing_safetyPAD_name"] == "Has safety pad name"),
        (df_forfuzzy["missing_AMR_name"] == "Has AMR name")
        & (df_forfuzzy["missing_safetyPAD_name"] == "Missing safety pad name"),
    ],
    ["Both names", "Safety PAD but not AMR", "AMR not safety PAD"],
    default="Neither",
)

pd.crosstab(df_forfuzzy["name_status"], df_forfuzzy["dispo_broad"])


## when both sources have a name, record whether the two agree exactly
df_forfuzzy["identicalname_iftwonames"] = np.select(
    [
        (df_forfuzzy["name_status"] == "Both names")
        & (df_forfuzzy["amr_tocompare_wsafetypad"] == df_forfuzzy["safetypad_name_1"]),
        (df_forfuzzy["name_status"] == "Both names")
        & (df_forfuzzy["amr_tocompare_wsafetypad"] != df_forfuzzy["safetypad_name_1"]),
    ],
    ["AMR and safety pad same name", "AMR and safety pad diff names"],
    default="Doesn't have two names",
)

df_forfuzzy["identicalname_iftwonames"].value_counts()

In [None]:
## note for kevin: it is a toss-up which name source to prefer; both could be
## useful, so keep both name columns here and convert to long form next so
## that each id is repeated once per name
df_forfuzzy_keycols = df_forfuzzy.loc[
    :, ["num_1", "amr_tocompare_wsafetypad", "safetypad_name_1"]
].copy()

In [None]:
## make long form (one row per id and name source), then drop repeated
## (id, name) pairs: an id whose two sources agree keeps a single row, an id
## whose sources differ keeps one row per distinct name
df_forfuzzy_keycols_long = (
    df_forfuzzy_keycols.melt(id_vars=["num_1"])
    .drop_duplicates(subset=["num_1", "value"])
    .sort_values(by="num_1")
)

## remove the col that indicates the source of the name; rename() returns an
## independent frame, so adding columns to it later does not raise
## SettingWithCopyWarning the way mutating a column slice does
df_forfuzzy_keycols_long_touse = df_forfuzzy_keycols_long[["num_1", "value"]].rename(
    columns={"value": "first_andlast_name"}
)

In [None]:
# Fuzzy matching runs on "FIRST LAST": swap the underscore for a space
df_forfuzzy_keycols_long_touse["name_nounderscore"] = df_forfuzzy_keycols_long_touse[
    "first_andlast_name"
].str.replace("_", " ")
df_forfuzzy_forremote = df_forfuzzy_keycols_long_touse.loc[
    :, ["num_1", "name_nounderscore"]
].copy()

# Persist the matching input (ids + names) without the index column
df_forfuzzy_forremote.to_csv(INTERMEDIATE_DIR / "df_forfuzzy.csv", index=False)

## 6.2 Run fuzzy deduplication



In [None]:
# minimum score for a candidate to count as a match (passed as score_cutoff)
match_threshold = 90
output_df_name = INTERMEDIATE_DIR / "data_withmatches_amrupdates.csv"

## matching input: the id/name frame built above
data_formatch = df_forfuzzy_forremote.copy()

## distinct names, with missing entries dropped and everything coerced to str
all_names = data_formatch["name_nounderscore"].unique().tolist()
names_tocheck = [
    str(name) for name in all_names if not (name is None or pd.isna(name))
]


def find_fuzzy_namematches(one_name: str) -> pd.DataFrame:
    """Fuzzy-match ``one_name`` against every other name in ``names_tocheck``.

    Reads the module-level ``names_tocheck`` (candidate pool) and
    ``match_threshold`` (passed as ``score_cutoff``).

    Returns a frame with columns ``matched_name``, ``score`` and
    ``original_name``; it has no rows when nothing reaches the cutoff.

    NOTE(review): ``process.extractBests`` is called without ``limit``; its
    default presumably caps the number of returned matches -- confirm whether
    all matches above the cutoff are wanted.
    """
    ## candidate pool: every name except the query itself
    candidates = [other for other in names_tocheck if other != one_name]

    ## (name, score) pairs at or above the cutoff
    hits = process.extractBests(one_name, candidates, score_cutoff=match_threshold)

    ## tabulate the hits and tag each with the query name
    return pd.DataFrame(list(hits), columns=["matched_name", "score"]).assign(
        original_name=one_name
    )


import time

from joblib import Parallel, delayed

## one joblib task per name, spread over all available cores (n_jobs=-1)
print("Starting fuzzy matching")
t0 = time.time()
fuzzymatch_results_list = Parallel(n_jobs=-1, verbose=True)(
    delayed(find_fuzzy_namematches)(name) for name in names_tocheck
)
t1 = time.time()
print(f"Fuzzy matching took {t1 - t0} seconds to run")

## row-bind the per-name frames
fuzzymatch_results_df = pd.concat(fuzzymatch_results_list)

## Fuzzy matching can lead to a few arbitrary, order-dependent decisions, so
## pin the row order to the position of original_name in names_tocheck
## (stable sort keeps the within-name order of the matches)
fuzzymatch_results_df = (
    fuzzymatch_results_df.merge(
        pd.DataFrame(
            {
                "original_name": names_tocheck,
                "imposed_order": np.arange(len(names_tocheck), dtype=int),
            }
        ),
        on="original_name",
    )
    .sort_values(by="imposed_order", kind="stable")
    .drop(columns=["imposed_order"])
)

## persist the match table
fuzzymatch_results_df.to_csv(output_df_name, index=False)
print("Script completed")

## 6.3  Read in results and clean

In [None]:
## for now, just matches with other ntl participants
## (work on a copy so the in-memory fuzzy-matching output is left untouched)
matching_results = fuzzymatch_results_df.copy()

In [None]:
## see that it accidentally matched some na's (probably weird thing with writing file)
## remove NA's
matching_results_complete = matching_results.loc[
    (matching_results.matched_name.notnull())
    & (matching_results.original_name.notnull())
].copy()

## most look good, but some are close spelling-wise but different gender
## -> keep only the first token of each name for the gender lookup.
## regex=True is required: pandas >= 2.0 treats the pattern as a literal string
## by default, which would leave the full name in place and make every gender
## lookup below come back as unknown
matching_results_complete[
    "matched_name_first"
] = matching_results_complete.matched_name.str.replace(r"\s.*", "", regex=True)
matching_results_complete[
    "original_name_first"
] = matching_results_complete.original_name.str.replace(r"\s.*", "", regex=True)

## long-run have RA go through and hand code which
## seem like matches versus not (or maybe two and only ones
## where they agree)
## for now, use heuristic where if one is definitively male,
## another definitively female, then they don't count
## as matches  (avoiding mark and marika distinction)
gen_detect = gender.Detector(case_sensitive=False)
matching_results_complete[
    "matched_name_first_gender"
] = matching_results_complete.matched_name_first.apply(gen_detect.get_gender)
matching_results_complete[
    "original_name_first_gender"
] = matching_results_complete.original_name_first.apply(gen_detect.get_gender)

## 0 = rejected (one side "male", the other "female"); 1 = kept.
## Any other label combination still counts as a match.
opposite_genders = (
    (matching_results_complete.matched_name_first_gender == "male")
    & (matching_results_complete.original_name_first_gender == "female")
) | (
    (matching_results_complete.matched_name_first_gender == "female")
    & (matching_results_complete.original_name_first_gender == "male")
)
matching_results_complete["countas_match"] = np.where(opposite_genders, 0, 1)


## subset to ones that count as matches and create lookup table
matching_results_complete_truematch = matching_results_complete.loc[
    matching_results_complete.countas_match == 1
].copy()

In [None]:
## inputs for the lookup table
## goal: e.g. one row for BD with all of their ids
## ids_names = every (id, name) pair that actually has a name
has_name = df_forfuzzy_forremote["name_nounderscore"].notnull()
ids_names = df_forfuzzy_forremote.loc[has_name, ["num_1", "name_nounderscore"]]


## accepted matches, reduced to the columns needed for the lookup
matching_results_tomerge = matching_results_complete_truematch.loc[
    :, ["matched_name", "original_name", "score"]
]

In [None]:
# For each accepted pair, gather the joined names and joined ids, then drop
# repeated rows
lookup_wideform = matching_results_tomerge.apply(
    lambda pair: find_ids_foramatch(
        pair.original_name, pair.matched_name, ids_names, "name_nounderscore"
    ),
    axis=1,
).drop_duplicates(keep="first")

In [None]:
## Expand the joined strings back into one column per name / per id.
## Split on "; " -- the exact delimiter find_ids_foramatch joins with. Splitting
## on ";" alone left a leading space on every id/name after the first, so those
## ids never matched the raw num_1 values in later isin() checks.
## NOTE(review): matched ids are now excluded correctly from the "unmatched"
## set, which changes row counts (and hence constructed_id numbering) relative
## to runs made with the old split.
lookup_wideform_nameexpanded = lookup_wideform.all_names.str.split("; ", expand=True)
lookup_wideform_idsexpanded = lookup_wideform.all_ids.str.split("; ", expand=True)
lookup_wideform_nameexpanded.columns = [
    "name" + str(i) for i in range(1, lookup_wideform_nameexpanded.shape[1] + 1)
]
lookup_wideform_idsexpanded.columns = [
    "ntlid_" + str(i) for i in range(1, lookup_wideform_idsexpanded.shape[1] + 1)
]

### combine: name1, name2, ntlid_1, ntlid_2, ...
lookup_expanded = pd.concat(
    [lookup_wideform_nameexpanded, lookup_wideform_idsexpanded], axis=1
)

## 6.4 Add back respondents for whom we didn't find matches

In [None]:
## next step: rowbind with the others, i.e. respondents
## who either don't have a name
## or whose name wasn't fuzzy matched
## -> first collect every id in the matched lookup (order-preserving de-dup)
melted_lookup_ids = lookup_expanded.melt(id_vars=["name1", "name2"])["value"]
lookup_allids = list(dict.fromkeys(melted_lookup_ids.tolist()))
print(f"There are {len(lookup_allids)} ids with fuzzy matched names")

In [None]:
# Rows whose id is not in the matched lookup, one per distinct (id, names) combination
df_forfuzzy_tomerge = (
    df_forfuzzy.loc[~df_forfuzzy["num_1"].isin(lookup_allids)][
        ["num_1", "amr_tocompare_wsafetypad", "safetypad_name_1"]
    ]
    .drop_duplicates(keep="first")
    .sort_values(by="num_1")
)

# name1: SafetyPAD name when present, otherwise the AMR name, otherwise missing
# (underscores become spaces in both)
df_forfuzzy_tomerge["name1"] = np.where(
    df_forfuzzy_tomerge["safetypad_name_1"].isnull(),
    np.where(
        df_forfuzzy_tomerge["amr_tocompare_wsafetypad"].isnull(),
        np.nan,
        df_forfuzzy_tomerge["amr_tocompare_wsafetypad"].str.replace("_", " "),
    ),
    df_forfuzzy_tomerge["safetypad_name_1"].str.replace("_", " "),
)

# final frame to rowbind with the matched ones
df_forfuzzy_tomerge_final = df_forfuzzy_tomerge.loc[:, ["num_1", "name1"]].copy()


df_forfuzzy_tomerge_final.columns = ["ntlid_1", "name1"]

## 6.5 Create long-format lookup table with a new id we construct

The long table repeats each respondent (constructed id) once for each of their NTL ids.

In [None]:
## frame of empty placeholder columns (name2, ntlid_2 ... ntlid_13) so the
## unmatched respondents -- who only have name1 / ntlid_1 --
## line up with the matched lookup when row-bound
df_idsnotinlookup_extracols = pd.DataFrame(
    columns=["name2", *(f"ntlid_{i}" for i in range(2, 14))]
)

df_idsnotinlookup_tobind_final = pd.concat(
    [df_forfuzzy_tomerge_final, df_idsnotinlookup_extracols], axis=1
)


## matched lookup rows on top, unmatched respondents below
df_lookup_all = pd.concat([lookup_expanded, df_idsnotinlookup_tobind_final])

## order by first id and renumber rows 0..n-1 (basis for the constructed id)
df_lookup_all_reshuffled = df_lookup_all.sort_values(by="ntlid_1").reset_index(
    drop=True
)

In [None]:
# constructed_id = 1-based row position of the lookup row (index + 1)
df_lookup_all_reshuffled["constructed_id"] = df_lookup_all_reshuffled.index + 1

In [None]:
# Keep only the constructed id and the ntlid_* columns
id_like_cols = [
    col
    for col in df_lookup_all_reshuffled.columns
    if any(marker in col for marker in ("constructed_id", "ntlid"))
]
df_lookup_all_forlong = df_lookup_all_reshuffled[id_like_cols]


## long format (better for analyses): one row per constructed id and ntlid_* slot
df_lookup_all_long = df_lookup_all_forlong.melt(id_vars="constructed_id").sort_values(
    by="constructed_id"
)

In [None]:
# Drop the empty id slots created by the wide-to-long reshape
df_lookup_all_long_complete = df_lookup_all_long.loc[
    df_lookup_all_long["value"].notnull()
].copy()

In [None]:
## merge back with names: constructed_id plus every column that is not an ntlid_*
name_side_cols = [
    col
    for col in df_lookup_all_reshuffled.columns
    if "constructed_id" in col or "ntlid" not in col
]
df_allnames = df_lookup_all_reshuffled[name_side_cols]
df_lookup_all_wnames = df_lookup_all_long_complete.merge(
    df_allnames, on="constructed_id", how="left"
)

df_lookup_all_wnames.rename(
    columns={"variable": "which_identifier", "value": "num_1"}, inplace=True
)

# stray whitespace made the same id appear twice -> strip, then keep the first row per id
df_lookup_all_wnames["num_1"] = df_lookup_all_wnames["num_1"].str.strip()
df_lookup_all_wnames_final = df_lookup_all_wnames.drop_duplicates(
    subset="num_1", keep="first"
)

print(
    f"Originally, there were {len(df_lookup_all_wnames_final['num_1'].unique())} NTL ids"
)
print(
    "In current lookup table, there are "
    f"{len(df_lookup_all_wnames_final['constructed_id'].unique())} unique individuals"
)

# 7. Add other attributes to lookup table



## 7.1 Phone numbers

In [None]:
## distinct (id, cleaned phone number) pairs from the cleaned analytic data
df_mergephone = (
    df_analytic_withnames_withcleaned.loc[:, ["num_1", "cleaned_numbers_CAD"]]
    .drop_duplicates(keep="first")
    .copy()
)

## left join the phone numbers onto the lookup table by NTL id
df_lookup_all_wnames_wnumbers = df_lookup_all_wnames_final.merge(
    df_mergephone, on="num_1", how="left"
)

## 7.2 Treatment status

In [None]:
## distinct id / disposition / event status / date rows from the loaded data
df_mergetxstatus = (
    df_analytic.loc[:, ["num_1", "dispo_broad", "event_status", "date"]]
    .drop_duplicates(keep="first")
    .copy()
)

## left join onto the lookup table by NTL id
df_lookup_all_wnames_wnumbers_wtx = df_lookup_all_wnames_wnumbers.merge(
    df_mergetxstatus, on="num_1", how="left"
)

## 7.3 DOB

### 7.3.1 DOB based on AMR data

In [None]:
# Distinct (id, parsed AMR DOB) pairs; dob_ymd is the try_parser output (YYYY-MM-DD or None)
df_mergedob = df_datecleaning[["num_1", "dob_ymd"]].drop_duplicates(keep="first").copy()

In [None]:
# Left join the parsed AMR DOB onto the lookup table by NTL id
df_lookup_all_wnames_wnumbers_wtx_wdob = df_lookup_all_wnames_wnumbers_wtx.merge(
    df_mergedob, on="num_1", how="left"
)

### 7.3.2 DOB based on safetyPAD API pull

In [None]:
# Demographics file from the private data directory; only fems_id and
# date_of_birth are used from it below
extra_dem_safetypad_comprehensive = pd.read_csv(
    PRIVATE_DATA_DIR / "dem_fromsafetyPAD_20191115.csv"
)

In [None]:
## distinct (fems_id, date_of_birth) pairs from the API pull
extra_dem_safetypad_tomerge = (
    extra_dem_safetypad_comprehensive.loc[:, ["fems_id", "date_of_birth"]]
    .copy()
    .drop_duplicates()
)

## tag the DOB column with its source before merging
extra_dem_safetypad_tomerge.rename(
    columns={"date_of_birth": "date_of_birth_safetypad"}, inplace=True
)

In [None]:
## rename in place (the same frame is reused for the DHCR export), then
## left join the API DOB with num_1 matched to fems_id
df_lookup_all_wnames_wnumbers_wtx_wdob.rename(
    columns={"dob_ymd": "date_of_birth_AMR", "date": "date_call"}, inplace=True
)
df_lookup_wAPIdob = df_lookup_all_wnames_wnumbers_wtx_wdob.merge(
    extra_dem_safetypad_tomerge,
    left_on="num_1",
    right_on="fems_id",
    how="left",
).drop_duplicates()


## final dob: AMR value when present, otherwise the safetyPAD value, otherwise missing
df_lookup_wAPIdob["final_dob"] = np.where(
    df_lookup_wAPIdob["date_of_birth_AMR"].isnull(),
    np.where(
        df_lookup_wAPIdob["date_of_birth_safetypad"].isnull(),
        np.nan,
        df_lookup_wAPIdob["date_of_birth_safetypad"],
    ),
    df_lookup_wAPIdob["date_of_birth_AMR"],
)

## code identifier status with dob added (first matching condition wins)
df_lookup_wAPIdob["id_status"] = np.select(
    [
        df_lookup_wAPIdob["name1"].notnull() & df_lookup_wAPIdob["final_dob"].notnull(),
        df_lookup_wAPIdob["name1"].notnull() & df_lookup_wAPIdob["final_dob"].isnull(),
        df_lookup_wAPIdob["name1"].isnull()
        & df_lookup_wAPIdob["final_dob"].isnull()
        & df_lookup_wAPIdob["cleaned_numbers_CAD"].notnull(),
    ],
    ["Name and DOB", "Name but no DOB", "Only phone number"],
    default="None",
)

## one row per NTL id, restricted to NTL control / treatment
## (the boolean mask is built on the full frame and aligned on index by .loc)
df_idsummary_nondup = df_lookup_wAPIdob.drop_duplicates(
    subset="num_1", keep="first"
).loc[df_lookup_wAPIdob["dispo_broad"].isin(["NTL control", "NTL treatment"])]

## LaTeX tables: column shares first, then raw counts
print(
    pd.crosstab(
        df_idsummary_nondup["id_status"],
        df_idsummary_nondup["dispo_broad"],
        normalize="columns",
    ).to_latex()
)
print(
    pd.crosstab(
        df_idsummary_nondup["id_status"], df_idsummary_nondup["dispo_broad"]
    ).to_latex()
)


## write the lookup table with both DOB sources to csv
df_lookup_wAPIdob.to_csv(INTERMEDIATE_DIR / "df_forrepeatcalls.csv", index=False)

# 8. Create version for DHCR medicaid id lookup

(Not changing code so that we can re-create the file we sent them/so that the file is static)


In [None]:
df_fordhcr = df_lookup_all_wnames_wnumbers_wtx_wdob.copy()

## split name1 on spaces: first token = first name, remaining tokens = last name
## (could be made more efficient with a shared function)
name1_tokens = df_fordhcr["name1"].str.split(" ")
df_fordhcr["firstname_name1"] = name1_tokens.str[0]
lastnames_name1 = name1_tokens.str[1:]
## missing names are not lists -> None
lastnames_name1_clean = [
    None if not isinstance(name, list) else " ".join(name) for name in lastnames_name1
]
df_fordhcr["lastname_name1"] = lastnames_name1_clean

In [None]:
## same first/last split for name2, after stripping surrounding whitespace
name2_tokens = df_fordhcr["name2"].str.strip().str.split(" ")
df_fordhcr["firstname_name2"] = name2_tokens.str[0]
lastnames_name2 = name2_tokens.str[1:]
lastnames_name2_clean = [
    None if not isinstance(name, list) else " ".join(name) for name in lastnames_name2
]
df_fordhcr["lastname_name2"] = lastnames_name2_clean

# The rename back to dob_ymd is for historical consistency
df_fordhcr_relcols = df_fordhcr.loc[
    :,
    [
        "firstname_name1",
        "lastname_name1",
        "firstname_name2",
        "lastname_name2",
        "date_of_birth_AMR",
        "cleaned_numbers_CAD",
        "constructed_id",
        "which_identifier",
        "num_1",
        "dispo_broad",
    ],
].rename(columns={"date_of_birth_AMR": "dob_ymd"})

## identifier status (first matching condition wins; everything else is "Other")
df_fordhcr_relcols["id_status"] = np.select(
    [
        df_fordhcr_relcols["firstname_name1"].notnull()
        & df_fordhcr_relcols["dob_ymd"].notnull(),
        df_fordhcr_relcols["firstname_name1"].notnull()
        & df_fordhcr_relcols["dob_ymd"].isnull(),
        df_fordhcr_relcols["firstname_name1"].isnull()
        & df_fordhcr_relcols["dob_ymd"].isnull()
        & df_fordhcr_relcols["cleaned_numbers_CAD"].notnull(),
    ],
    ["Name and DOB", "Name but no DOB", "Only phone number"],
    default="Other",
)


## external-facing column names
cols_rename = {
    "dob_ymd": "date_of_birth",
    "cleaned_numbers_CAD": "phone_number",
    "num_1": "ntl_id",
}

df_fordhcr_relcols["id_status"].value_counts(normalize=True)

## just write the rows that have a name (with or without DOB)
df_fordhcr_relcols.rename(columns=cols_rename, inplace=True)
df_fordhcr_relrows = df_fordhcr_relcols.loc[
    df_fordhcr_relcols["id_status"].isin(["Name and DOB", "Name but no DOB"])
].copy()
df_fordhcr_relrows.to_csv(EXTERNAL_DIR / "identifiers_fordhcr.csv", index=False)

## 8.1 subset to those missing DOB and try to add based on safetypad api info

In [None]:
## rows that have a name but no DOB
missing_dob_init = df_fordhcr_relrows.loc[
    df_fordhcr_relrows["id_status"] == "Name but no DOB"
].copy()


extra_dem_safetypad = pd.read_csv(PRIVATE_DATA_DIR / "dem_fromsafetyPAD.csv")

## build an upper-case LAST_FIRST key on both sides
extra_dem_safetypad["name_key"] = (
    extra_dem_safetypad["last_name"].str.upper()
    + "_"
    + extra_dem_safetypad["first_name"].str.upper()
)
missing_dob_init["name_key"] = (
    missing_dob_init["lastname_name1"].str.upper()
    + "_"
    + missing_dob_init["firstname_name1"].str.upper()
)


## keys present on both sides
name_keys_match = set(missing_dob_init["name_key"]).intersection(
    extra_dem_safetypad["name_key"]
)

## report how many name keys match exactly
print(
    f"There are {len(name_keys_match)}"
    " name keys that match exactly between base data missing dobs and api pull"
)

## drop the (empty) DOB column so the API value can take its place
missing_dob_toadd = missing_dob_init.drop(columns="date_of_birth")


## API DOBs keyed by name
## NOTE(review): keys are names only, so one key may carry several DOBs and
## fan out rows in the join below -- confirm this is intended
dob_tomerge = extra_dem_safetypad.loc[:, ["name_key", "date_of_birth"]].copy()

## left join onto the rows that were missing a DOB
missing_dob_toadd_withdob = missing_dob_toadd.merge(
    dob_tomerge, on="name_key", how="left"
)

## drop name key and recompute id status now that a DOB may be present
missing_dob_tomerge = missing_dob_toadd_withdob.drop(columns="name_key")
missing_dob_tomerge["id_status"] = np.where(
    missing_dob_tomerge["firstname_name1"].notnull()
    & missing_dob_tomerge["date_of_birth"].notnull(),
    "Name and DOB",
    "Name but no DOB",
)


## rowbind back with the rows that already had a DOB
missing_dob_tomerge_nodup = missing_dob_tomerge.drop_duplicates()
observed_dob_tomerge = df_fordhcr_relrows.loc[
    df_fordhcr_relrows["id_status"] != "Name but no DOB"
].copy()

## combine
df_fordhcr_updateddob = pd.concat(
    [missing_dob_tomerge_nodup, observed_dob_tomerge]
).drop_duplicates()


## status shares before vs after adding API DOBs
df_fordhcr_relrows["id_status"].value_counts(normalize=True)
df_fordhcr_updateddob["id_status"].value_counts(normalize=True)


## save
df_fordhcr_updateddob.to_csv(EXTERNAL_DIR / "df_fordhcr_DOBsadded.csv", index=False)