In [None]:
from datetime import datetime

import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

from femsntl.datafiles import INTERMEDIATE_DIR, PRIVATE_DATA_DIR
from femsntl.df_verbs import append_max_and_count, case_when, cross_join, find_ids
from femsntl.readers import read_file

# Render floats with four decimal places in displayed frames/series
pd.set_option("display.float_format", "{:.4f}".format)
# Have IPython display every top-level expression in a cell, not only the last
InteractiveShell.ast_node_interactivity = "all"

# 0. Inputs and Outputs

In this section we describe the inputs and outputs of this file

In [None]:
## Timestamp (MM-DD-YY-HH-MM-SS) embedded in the final output filename so
## that every run writes a new file; downstream code can then read the most
## current version by root filename
TIMESTAMP_FORMAT = "%m-%d-%y-%H-%M-%S"
current_time = datetime.now().strftime(TIMESTAMP_FORMAT)

In [None]:
## All paths below are built from INTERMEDIATE_DIR / PRIVATE_DATA_DIR, which
## are imported from femsntl.datafiles in the notebook's top import cell.

## INPUTS
BENEFICIARY_FILE = PRIVATE_DATA_DIR / "Member_Matches_wDHCF.xlsx"
MEDICARE_FILE = PRIVATE_DATA_DIR / "MedicareEnrollmentForNTLMembersList.csv"
FILE_SHARED_DHCF = INTERMEDIATE_DIR / "df_fordhcr_DOBsadded.csv"
CALL_RESPONSE_POST_TX_FILE = INTERMEDIATE_DIR / "callresponse_forposttx.csv"
CONSTRUCTED_IDS_FILE = INTERMEDIATE_DIR / "df_forrepeatcalls.csv"

## OUTPUTS
## (the "MUTLIPLE" spelling in the constant names is kept as-is because
## later cells refer to these names)
MEDICAID_MULTIPLE_NTL_FILE = INTERMEDIATE_DIR / "medicaid_multiplentl.xlsx"
NTL_BENEFITS_MUTLIPLE_REVIEW_FILE = INTERMEDIATE_DIR / "ntlbene_mult_toreview.csv"
## timestamped so that each run writes a new file instead of overwriting
DECORATED_NTL_OUTPUT_FILE = INTERMEDIATE_DIR / f"ntl_withmedicaidIDS_{current_time}.csv"

## Hand-edited intermediate inputs
## Two independently coded copies of MEDICAID_MULTIPLE_NTL_FILE (one per rater)
MEDICAID_MULTIPLE_NTL_FILE_CODED_R1 = (
    INTERMEDIATE_DIR / "medicaid_multiplentl_RAreview_MR.xlsx"
)
MEDICAID_MULTIPLE_NTL_FILE_CODED_R2 = (
    INTERMEDIATE_DIR / "medicaid_multiplentl_RAreview_JG.xlsx"
)

## Hand-coded version of NTL_BENEFITS_MUTLIPLE_REVIEW_FILE
NTL_BENEFITS_MUTLIPLE_REVIEW_FILE_CODED = (
    INTERMEDIATE_DIR / "ntlbene_mult_reviewed_by_kevin.csv"
)

# 1. Read data and merge back with NTL ids

In all of what follows, our goal is to answer the question: "What is going on with multiple matches?" On the left hand side, there is the Medicaid beneficiary file, which is keyed by `MedicaidSystemID`. On the right hand side, there are the NTL call logs, which are keyed by `date_of_birth` and `FirstLastName1`, which we combine into `name_dob_id`.

We examine two types of matches:
  1. Cases where a single `name_dob_id` matches to multiple `MedicaidSystemID`s, and
  2. Cases where a single `MedicaidSystemID` matches to multiple `name_dob_id`s.

In the first case, we send multiple matches back to DHCF and request more information after some manual review.

In the second case, we manually review the matches and hand code which ones to keep (see section 1.2).

Finally, in section 1.3, below, we join these results back together.

In [None]:
## Medicare enrollment data, used further down to flag dual enrollees.
## (Medicaid enrollment spells also exist, but they are used in the next
## script because they depend more on the ntl study dates.)
medicare_file = read_file(MEDICARE_FILE)

## Beneficiary file: contains every candidate match
beneficiary_file = read_file(BENEFICIARY_FILE)

In [None]:
## Per-field exact-match flags, used to adjudicate between candidates when a
## record has multiple matches. Each flag compares a call-log column with the
## corresponding Member* column of the beneficiary file.
FIELD_PAIRS = {
    "matched_fname": ("firstname_name1", "MemberFirstName"),
    "matched_lname": ("lastname_name1", "MemberLastName"),
    "matched_dob": ("date_of_birth", "MemberDateofBirth"),
}
for flag_col, (ntl_col, member_col) in FIELD_PAIRS.items():
    beneficiary_file[flag_col] = beneficiary_file[ntl_col] == beneficiary_file[member_col]

## match points range from 0-3 and count how many of the three fields
## (first name, last name, dob) matched exactly
beneficiary_file["match_points"] = beneficiary_file[list(FIELD_PAIRS)].sum(axis=1)

## build the composite "<dob>_<name>" key unless the file already ships one
if "name_dob_id" not in beneficiary_file.columns:
    dob_as_text = beneficiary_file["date_of_birth"].astype(str)
    beneficiary_file["name_dob_id"] = dob_as_text + "_" + beneficiary_file["FirstLastName1"]

In [None]:
## compare dimensions for beneficiary file
## (note the trailing space after "file": adjacent f-strings are concatenated
## verbatim, so without it the message reads "fileresulting")
print(
    f"There are {len(beneficiary_file)} rows in Medicaid beneficiary file "
    f"resulting from match, corresponding to {beneficiary_file.name_dob_id.nunique()} unique name-dob pairs"
)

## subset data to relevant fields. This is our primary dataframe
COMMON_COLS = [
    "MedicaidSystemID",
    "MemberFullName",
    "MemberDateofBirth",
    "match_points",
    "name_dob_id",
]
df = beneficiary_file[COMMON_COLS].drop_duplicates().copy()

## number of (deduplicated) rows per NTL key and per Medicaid id
count_eachname = df.name_dob_id.value_counts()
count_eachID = df.MedicaidSystemID.value_counts()

## name and dobs of ntlers who appear multiple times,
## i.e. one name/dob matched to several rows
ntl_ids_in_many_rows = count_eachname[count_eachname > 1].index

## medicaid system ids of medicaid beneficiaries who appear multiple times
medicaid_ids_in_many_rows = count_eachID[count_eachID > 1].index

## 1.1: Look at case one: One NTL ID, Many Medicaid ids

We need to deduplicate these: unlike the reverse case (section 1.2), one NTL id matching several Medicaid ids can't be explained by repeat calls.

In [None]:
# Row-level flags: does the row's NTL key / Medicaid id occur in several rows?
has_repeated_ntl_id = df["name_dob_id"].isin(ntl_ids_in_many_rows)
has_repeated_medicaid_id = df["MedicaidSystemID"].isin(medicaid_ids_in_many_rows)

# Rows that are unambiguous on both sides
one_of_both_id_df = df[~(has_repeated_ntl_id | has_repeated_medicaid_id)]

# Rows whose NTL key matched more than once
multi_ntl_df = df[has_repeated_ntl_id].copy()

# Determine the maximum number of matches in a group
# (later cells read max_match_points / count_of_match_points from this)
multi_ntl_df_wmax = append_max_and_count(multi_ntl_df)

# To view examples: multi_ntl_df_wmax.sort_values(by="name_dob_id").head()

In [None]:
# Now we divide NTL ids into three cases:
#   a) There is _not_ a  unique Medicaid ID for the top match value.
#      That is, the same ntl-er is matched to multiple medicaid beneficiaries with same score
#      We will ship these to DHCF for further evaluation (count_of_match_points > 1)
#   b) There is a unique Medicaid ID for the top match value (count_of_match_points == 1) _and_
#      that Medicaid ID is the top match for _only one_ NTL id
#   c) Case when there is a unique Medicaid ID _but_
#      that unique Medicaid ID is the top match for _multiple_ NTL ids

# Read the helper columns from the frame returned by append_max_and_count
# (multi_ntl_df_wmax) instead of relying on multi_ntl_df having been mutated
# in place as a side effect of that call.
is_top_match = (
    multi_ntl_df_wmax["match_points"] == multi_ntl_df_wmax["max_match_points"]
)

# Case (a): the top score is shared by more than one row
one_to_many_df = multi_ntl_df_wmax[
    is_top_match & (multi_ntl_df_wmax["count_of_match_points"] > 1)
]

# Start of cases b and c
# looks at ntlers who have one top match
one_topmatch = multi_ntl_df_wmax[
    is_top_match & (multi_ntl_df_wmax["count_of_match_points"] == 1)
]

# Then, separate into Case (b) (ntl-ers whose medicaid top match is only a match for them,
# which we call no_confusion_df_init)
# and Case (c) (ntl-ers whose medicaid top match is a match for multiple ntl-ers)
# for Case (c) there is only one ntl-er who this happens to,
# so we manually examine
dup_top = one_topmatch.MedicaidSystemID.value_counts()

no_confusion_df_init = one_topmatch[
    one_topmatch.MedicaidSystemID.isin(dup_top.index[dup_top == 1])
]

repeated_df = one_topmatch[
    one_topmatch.MedicaidSystemID.isin(dup_top.index[dup_top > 1])
]

# The denominator is everyone with a single top match (cases b and c together);
# no_confusion_df_init already excludes the repeated rows, so it cannot be
# the population that "of those" refers to.
print(
    f"Out of the {len(one_topmatch)} ntlers with one top medicaid match, for "
    f"{len(repeated_df)} of those, the medicaid benef. was matched to multiple ntl ids"
)

## hand examine the single repeated match
# print(repeated_df)

## clear match among the two so we just keep one of them
## NOTE(review): this hard-codes a fragment of an individual's name_dob_id;
## consider moving it to a private config/hand-coded file before sharing.
repeated_tokeep = repeated_df[
    repeated_df.name_dob_id.astype(str).str.contains("RICE")
].copy()

## rowbind
no_confusion_df = pd.concat([no_confusion_df_init, repeated_tokeep])

# Each row should have a unique MedicaidSystemID
assert len(no_confusion_df) == no_confusion_df["MedicaidSystemID"].nunique()

In [None]:
# Stack the rows that were unambiguous from the start on top of the rows
# resolved to a single best match above
resolved_best_matches = no_confusion_df.loc[:, COMMON_COLS]
deduped_ntl_df = pd.concat([one_of_both_id_df, resolved_best_matches])

# Every row must now carry a distinct MedicaidSystemID
assert deduped_ntl_df["MedicaidSystemID"].nunique() == len(deduped_ntl_df)

print(f"After reducing to best matches, we have {len(deduped_ntl_df)} rows")

## 1.2: One Medicaid id, Many NTL ids

These could be true multiple matches: one Medicaid ID matching several NTL ids may indicate repeat calls by the same person, recorded with a slightly different NTL name or date of birth. We will hand code these.

In [None]:
# Rows whose Medicaid id is matched to more than one row of df
multi_medicaid_df = df[df["MedicaidSystemID"].isin(medicaid_ids_in_many_rows)]


assert not multi_medicaid_df.duplicated().any()

# Write out for hand coding.
# MEDICAID_MULTIPLE_NTL_FILE has an .xlsx extension (and the hand-coded copies
# read back below are .xlsx too), so write a real Excel workbook; to_csv would
# put CSV text into a file that spreadsheet tools expect to be a workbook.
multi_medicaid_df.to_excel(MEDICAID_MULTIPLE_NTL_FILE, index=False)

In [None]:
# Load the two hand-coded copies; two RAs coded the same file independently
deduped_medicaid_df_r1 = read_file(MEDICAID_MULTIPLE_NTL_FILE_CODED_R1)
deduped_medicaid_df_r2 = read_file(MEDICAID_MULTIPLE_NTL_FILE_CODED_R2)

## Inter-rater reliability: put each rater's keep_match side by side
## (rows are paired on the frames' index) and flag where they agree
compare_keep = pd.DataFrame(
    {
        "keep_r1": deduped_medicaid_df_r1["keep_match"],
        "keep_r2": deduped_medicaid_df_r2["keep_match"],
    }
)
compare_keep["agree"] = compare_keep["keep_r1"].eq(compare_keep["keep_r2"])

## share of rows on which the two raters gave the same code
irr = compare_keep["agree"].sum() / len(compare_keep)
print(f"The IRR between RAs hand coding which match to keep was {round(irr, 4)}")

## index labels of the rows on which the raters agree
matches_keep = compare_keep.index[compare_keep["agree"]].tolist()

## Start from rater 1's coding and keep a match only when rater 1 coded it
## as keep (1) _and_ rater 2 agreed
deduped_medicaid_df = deduped_medicaid_df_r1.rename(
    columns={"keep_match": "keep_match_init"}
)
raters_agree = deduped_medicaid_df.index.isin(matches_keep)
deduped_medicaid_df["keep_match"] = np.where(
    (deduped_medicaid_df["keep_match_init"] == 1) & raters_agree,
    1,
    0,
)
deduped_medicaid_df = deduped_medicaid_df.drop(columns=["keep_match_init"])

In [None]:
# Check the hand-coded file before using it: ids must be present and must be
# strings without a "." (presumably guarding against ids that were read as
# floats and stringified, e.g. "123.0" -- TODO confirm)
assert deduped_medicaid_df["MedicaidSystemID"].notna().all()
assert not deduped_medicaid_df.MedicaidSystemID.str.contains(".", regex=False).any()

# keep_match must be fully coded as 0/1
assert deduped_medicaid_df["keep_match"].notna().all()
assert deduped_medicaid_df["keep_match"].isin([0, 1]).all()

# Keep the indicated one
deduped_medicaid_df = deduped_medicaid_df[deduped_medicaid_df.keep_match == 1].copy()


# Combine with previous section and perform some sanity checks
deduped_df = pd.concat([deduped_ntl_df, deduped_medicaid_df[COMMON_COLS]])

# Check: Each NTL id has *one* Medicaid id. This turns out to be FALSE!
# assert (deduped_df.groupby('name_dob_id')['MedicaidSystemID'].nunique() == 1).all()

# What's left? NTL ids that still map to more than one distinct Medicaid id
medicaid_ids_per_ntl = deduped_df.groupby("name_dob_id")["MedicaidSystemID"].transform(
    "nunique"
)
still_has_dupes = deduped_df[medicaid_ids_per_ntl > 1].sort_values(by="name_dob_id")

# Use the returned frame explicitly rather than relying on the helper
# mutating its argument in place
still_has_dupes = append_max_and_count(still_has_dupes)

# Among the remaining duplicates keep only the row(s) with the top match score
kept_dupes = still_has_dupes[
    still_has_dupes["match_points"] == still_has_dupes["max_match_points"]
]

# Replace every still-duplicated NTL id in deduped_df with its kept row(s)
deduped_df = pd.concat(
    [
        deduped_df[~deduped_df["name_dob_id"].isin(still_has_dupes["name_dob_id"])],
        kept_dupes[COMMON_COLS],
    ]
)

print(
    f"The final file has {deduped_df.MedicaidSystemID.nunique()} beneficiaries "
    f"appearing {len(deduped_df)} times"
)

## 1.3: Merging DHCF responses to (1.1)

In this section, we merge back in the results we obtained from DHCF after sending them the output of `1.1`.

In [None]:
## Load the file we shared with DHCF; it carries first name, last name and dob
dhcf_df = read_file(FILE_SHARED_DHCF)

## Rebuild the combined "<first> <last>" name used in the name_dob_id key
first_names = dhcf_df["firstname_name1"]
last_names = dhcf_df["lastname_name1"]
dhcf_df["FirstLastName1"] = first_names + " " + last_names

In [None]:
## Build the "<dob>_<name>" key; the dob is cast to string first so it can be
## concatenated with the name
dob_as_text = dhcf_df["date_of_birth"].astype(str)
dhcf_df["name_dob_id"] = dob_as_text + "_" + dhcf_df["FirstLastName1"]

n_unique_ntl_ids = dhcf_df["ntl_id"].nunique()
n_unique_name_dob_ids = dhcf_df["name_dob_id"].nunique()

print(
    f"There are {n_unique_ntl_ids} unique NTL ids in file shared with them to merge"
)
print(
    f"There are {n_unique_name_dob_ids} unique names in file shared with them to merge"
)

### 1.3.1 Cleaning up duplicates from DHCF

By examining the data by hand, we remove several sources of duplicate `ntl_id`s. In order:
  1. The presence vs non-presence of a `phone_number` seems to drive a lot of dupes. Drop it.
  2. Sometimes other identifiers, in particular date of birth, were missing in one source but
     not another. So keep the row with the _most_ identifiers present (upto 1)
  3. There are a small number of NTL ids that seem to have slightly different DOBs depending
     on the source of the birthday. Make a flag and move on for now.

In [None]:
# Step 1: drop phone_number, then collapse the rows that become identical
after_step_one = dhcf_df.drop(columns=["phone_number"]).drop_duplicates().copy()

# Step 2: Our identifiers are ['FirstLastName1', 'date_of_birth']
id_cols = ["FirstLastName1", "date_of_birth"]
all_cols = ["ntl_id", *id_cols]

# NOTE(khw): This is _slightly_ different than the original file but I think what was intended
# Count how many identifiers are non-missing in each row ...
after_step_one["num_ids_present"] = after_step_one[id_cols].notna().sum(axis=1)
# ... and attach the per-ntl_id helper columns for that count
after_step_one = append_max_and_count(
    after_step_one, id_col="ntl_id", value_col="num_ids_present"
)

# Keep, per ntl_id, only the rows with the most identifiers present, then
# drop the helper columns again
has_most_ids = (
    after_step_one["num_ids_present"] == after_step_one["max_num_ids_present"]
)
helper_cols = ["num_ids_present", "max_num_ids_present", "count_of_num_ids_present"]
after_step_two = (
    after_step_one[has_most_ids].drop(columns=helper_cols).drop_duplicates().copy()
)

# Step 3: flag ntl_ids that still appear in more than one row
rows_per_ntl_id = after_step_two.groupby("ntl_id")["ntl_id"].transform(len)
after_step_two["is_repeated_ntl"] = rows_per_ntl_id > 1

repeated_share = after_step_two["is_repeated_ntl"].mean()
print(
    f"After attempted deduplication, the proportion of duplicated NTL ids "
    f"is {repeated_share:0.4f}"
)

### 1.3.2 Perform DHCF and Original File merger

Now that we've cleaned everything up, merge the two sets of deduplicated files.

In [None]:
# Attach the deduplicated Medicaid matches to the cleaned DHCF rows; rows
# without a match are kept (left join)
decorated_df = after_step_two.merge(
    deduped_df, on="name_dob_id", how="left"
).drop_duplicates()

# The join must not have added or removed rows
assert len(after_step_two) == len(decorated_df)

### 1.3.3 Deduplicate the Decorated output of (1.3.2)

There are _still_ some duplicates after we've performed the work in 1.3.2. In particular, we
perform the following steps:
  1. Pull out the rows which don't have duplicate `ntl_id`s
  2. Of the remaining rows, grab the rows with the most `match_point`s
  3. If there's just one row remaining here, we're golden
  4. Else, perform a hand review
  5. Combine the three outputs above
  6. For any remaining NTL ids, deduplicate down to ntl_id and FirstLastName1 and null out other info

In [None]:
COMMON_COLS_PLUS = [*COMMON_COLS, "ntl_id", "FirstLastName1"]

# Step 1: rows whose ntl_id occurs only once need no further work
has_repeated_ntl_id = decorated_df.duplicated("ntl_id", keep=False)
non_dupe_ntl_df = decorated_df[~has_repeated_ntl_id]

# Step 2: for repeated ntl_ids keep only the rows with the top match_points
dupe_ntl_df = decorated_df[has_repeated_ntl_id]
dupe_ntl_df = append_max_and_count(dupe_ntl_df, id_col="ntl_id", inplace=False)
is_top_match = dupe_ntl_df["match_points"] == dupe_ntl_df["max_match_points"]
dupe_ntl_df = dupe_ntl_df[is_top_match]

# Step 3: a single row at the top score means the ntl_id is resolved
top_match_count = dupe_ntl_df["count_of_match_points"]
step_three_df = dupe_ntl_df.loc[top_match_count == 1, COMMON_COLS_PLUS]

# Step 4: several rows at the top score go out for hand review
REVIEW_COLS = [
    "ntl_id",
    "MedicaidSystemID",
    "MemberFullName",
    "FirstLastName1",
    "MemberDateofBirth",
    "date_of_birth",
    "name_dob_id",
]
needs_hand_review = dupe_ntl_df.loc[top_match_count > 1, REVIEW_COLS].sort_values(
    by=["ntl_id", "MedicaidSystemID"]
)
needs_hand_review.to_csv(NTL_BENEFITS_MUTLIPLE_REVIEW_FILE, index=False)

# Read the hand-coded file back in, check keep_match is fully coded as 0/1,
# and keep only the rows marked as keep
step_four_df = read_file(NTL_BENEFITS_MUTLIPLE_REVIEW_FILE_CODED)
coded_keep = step_four_df["keep_match"]
assert coded_keep.notna().all()
assert coded_keep.isin([0, 1]).all()
step_four_df = step_four_df[coded_keep == 1]

# Step 5: everything resolved so far
step_five_df = pd.concat([non_dupe_ntl_df, step_three_df, step_four_df])

# Step 6: ntl_ids not covered by step 5 are reduced to (ntl_id, FirstLastName1)
is_unresolved = ~decorated_df["ntl_id"].isin(step_five_df["ntl_id"])
step_six_df = decorated_df.loc[is_unresolved, ["ntl_id", "FirstLastName1"]].drop_duplicates()

# NOTE(review): the right-hand frame has the remaining columns but zero rows;
# presumably cross_join keeps the left rows and fills these columns with
# missing values -- a strict Cartesian product with an empty frame would
# return no rows at all. Confirm against femsntl.df_verbs.cross_join.
empty_info_df = pd.DataFrame.from_records([], columns=["date_of_birth", *COMMON_COLS])
step_six_df = cross_join(step_six_df, empty_info_df)
step_six_df["name_dob_id"] = (
    step_six_df["date_of_birth"].astype(str) + "_" + step_six_df["FirstLastName1"]
)

final_decorated_df = pd.concat([step_five_df, step_six_df])

In [None]:
# Ids present in the Medicare enrollment file. Missing ids are dropped first:
# pandas' isin treats NaN as equal to NaN, so a single null id in the Medicare
# file would otherwise flag every row _without_ a Medicaid match as a
# Medicare enrollee.
known_medicare_ids = medicare_file["MedicaidSystemID"].dropna()

# Label each row by Medicare enrollment status; rows with no Medicaid id are
# labelled as unknown
final_decorated_df["medicare_dualenrollee"] = case_when(
    final_decorated_df.MedicaidSystemID.isin(known_medicare_ids),
    "Medicare enrollee",
    final_decorated_df.MedicaidSystemID.isnull(),
    "Unknown (not matched to Medicaid)",
    "Not medicare enrollee",
)

# Share of rows in each category
final_decorated_df.medicare_dualenrollee.value_counts(normalize=True)

# 2: Merge participants file with treatment status

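In this section, we take the output of the previous section, `final_decorated_df`, which consists of the NTL ids decorated with their best Medicaid match (where one was found) and a Medicare dual-enrollment flag, and merge it with the treatment statuses we received from FEMS. This consists of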

0. Reading the data
1. Doing some sanity checks
2. Merging the data and writing it out for future use

### 2.0: Reading in the data

In [None]:
# Call responses: most recent ids and create_date variable
call_response_df = read_file(CALL_RESPONSE_POST_TX_FILE)

# Constructed ids from fuzzy matching
constructed_ids_df = read_file(CONSTRUCTED_IDS_FILE)

# Distinct (num_1, constructed_id) pairs, attached to every call response
# (left join, so responses without a constructed id are kept)
id_lookup = constructed_ids_df[["num_1", "constructed_id"]].drop_duplicates()
participants_df = call_response_df.merge(id_lookup, on="num_1", how="left")

## 2.1 Summarize which identifiers we gave to medicaid

Create three flags:

* `has_medicaid_name`: Do they have a first name (NOTE(khw): Original says 'either first or last name' but code says different)
* `has_medicaid_dob`: Do they have a date of birth
* `has_medicaid_phone`: Do they have a phone number

cf. slightly different scheme in `02_create_identifiers` which limits to people with a name
TODO(khw): Is this a correct description of what was true before?

Finally, merge with the above decorated data and write it out.

In [None]:
# Classify main data: one has_medicaid_<flag> column per identifier, set by
# checking num_1 against the ids find_ids returns for the source column
# (presumably the ntl_ids with a non-missing value in it -- TODO confirm)
FLAG_SOURCE_COLS = {
    "name": "firstname_name1",
    "dob": "date_of_birth",
    "phone": "phone_number",
}
for target_col, source_col in FLAG_SOURCE_COLS.items():
    ids_with_identifier = find_ids(dhcf_df, "ntl_id", source_col)
    participants_df[f"has_medicaid_{target_col}"] = participants_df["num_1"].isin(
        ids_with_identifier
    )

# "No medicaid identifiers" is derived from the three flags above rather
# than from dhcf_df directly: true only when all three flags are false
has_any_medicaid_id = (
    participants_df["has_medicaid_name"]
    | participants_df["has_medicaid_dob"]
    | participants_df["has_medicaid_phone"]
)
participants_df["has_no_medicaid_ids"] = ~has_any_medicaid_id

In [None]:
# Columns carried over from the decorated Medicaid match results
MEDICAID_INFO_COLS = [
    "ntl_id",
    "MedicaidSystemID",
    "medicare_dualenrollee",
    "MemberFullName",
    "MemberDateofBirth",
    "name_dob_id",
]

# Left join so that every participant row is kept, matched or not
decorated_ntl = participants_df.merge(
    final_decorated_df[MEDICAID_INFO_COLS],
    left_on="num_1",
    right_on="ntl_id",
    how="left",
)

decorated_ntl["has_medicaid_id"] = case_when(
    decorated_ntl["MedicaidSystemID"].isnull(), "Missing Medicaid ID", "Has Medicaid ID"
)

## crosstabs of presence
pd.crosstab(decorated_ntl["has_medicaid_id"], decorated_ntl["dispo_broad"])

## crosstab of proportions
pd.crosstab(
    decorated_ntl["has_medicaid_id"], decorated_ntl["dispo_broad"], normalize="columns"
)

## which of name / dob were available for each participant
has_name = decorated_ntl["has_medicaid_name"]
has_dob = decorated_ntl["has_medicaid_dob"]
decorated_ntl["id_status"] = case_when(
    ~has_name & ~has_dob,
    "Missing dob and name",
    has_name & ~has_dob,
    "Missing dob; has name",
    ~has_name & has_dob,
    "Missing name; has dob",
    "Has both",
)

# Preview intentionally left out for pushing to github
# (previously: ntl_withmedicaid.head())

In [None]:
## write new version since updated since round 1 to be able to compare
## (the row index is not written to the file)
decorated_ntl.to_csv(DECORATED_NTL_OUTPUT_FILE, index=False)