# Merging files

## Importing the libraries

In [1]:
## importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

## setting the maximum columns and rows
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

## removing warnings
import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None

In [2]:
## importing datasets
diagnosis_dict = {"ClmDiagnosisCode_1":str,"ClmDiagnosisCode_2":str,"ClmDiagnosisCode_3":str,"ClmDiagnosisCode_4":str,
             "ClmDiagnosisCode_5":str,"ClmDiagnosisCode_6":str,"ClmDiagnosisCode_7":str,"ClmDiagnosisCode_8":str,
             "ClmDiagnosisCode_9":str,"ClmDiagnosisCode_10":str}
PATH_DIR = "..\\data\\Raw Data"
df_ben_train = pd.read_csv(PATH_DIR + "\\Train_Beneficiarydata-1542865627584.csv", parse_dates = ["DOB","DOD"])
df_ben_test = pd.read_csv(PATH_DIR + "\\Test_Beneficiarydata-1542969243754.csv", parse_dates = ["DOB","DOD"])
df_in_train = pd.read_csv(PATH_DIR + "\\Train_Inpatientdata-1542865627584.csv",dtype = diagnosis_dict)
df_in_test = pd.read_csv(PATH_DIR + "\\Test_Inpatientdata-1542969243754.csv",dtype = diagnosis_dict)
df_out_train = pd.read_csv(PATH_DIR + "\\Train_Outpatientdata-1542865627584.csv",dtype = diagnosis_dict)
df_out_test = pd.read_csv(PATH_DIR + "\\Test_Outpatientdata-1542969243754.csv",dtype = diagnosis_dict)
df_icd9_icd10 = pd.read_csv("..\data\icd9_to_icd10_cm_mapping.csv",dtype = {'icd9cm': str, 'icd10cm': str})
df_icd10_hcc = pd.read_excel("..\data\PY 2024 Proposed Clinical Revision Part C Model ICD-10 Mappings.xlsx"
                             ,skiprows=3,dtype = {'2020_CMS-HCC': str, '2024_CMS-HCC': str})
df_SDoH = pd.read_excel(r'C:\Users\MR321NC\OneDrive - EY\POC\General - Suspect analysis - Risk adjustment\Health Care HCC recommendation\data\SDOH_working_file.xlsx',
                        sheet_name = 1,dtype = {'STATE CODE': str, 'mapping_state_code': str} )
df_disease_cofficients = pd.read_excel(r'C:\Users\MR321NC\OneDrive - EY\POC\General - Suspect analysis - Risk adjustment\Health Care HCC recommendation\data\RAF Factors data\V28_HCC_Factors_unpivoted.xlsx',
                        sheet_name = "Disease Coefficients_p",dtype = {'HCC_code': str} )

df_icd10_hcc = df_icd10_hcc[:-5]

In [3]:
## appending train and test files
df_ben = df_ben_train.append(df_ben_test[df_ben_train.columns.to_list()]).reset_index(drop = True)
df_in = df_in_train.append(df_in_test[df_in_train.columns.to_list()]).reset_index(drop = True)
df_out = df_out_train.append(df_out_test[df_out_train.columns.to_list()]).reset_index(drop = True)

# PATIENTS

## Adding flags to _InPatient_ and _OutPatient_

In [4]:
## creating inpatient and out patient flag
df_in["flag_ip"] = "in"
df_out["flag_ip"] = "out"

In [5]:
## creating empty extra columns in outpatient data present in inpatient data
for col in list(set(df_in.columns) - set(df_out.columns)):
    df_out[col] = ""

In [6]:
## Reordering the columns base on InPatients' dataframe
df_out = df_out[df_in.columns]

## Appending the _InPatient_ and _OutPatient_ data

In [7]:
## appening Inpatient and out patient data
df_patients = df_in.append(df_out).reset_index(drop = True)

In [8]:
## sorting data by BENEFICIARY ID
df_patients = df_patients.sort_values(by = "BeneID").reset_index(drop = True)

In [9]:
## calculating age of beneficiaries

def calculate_age(row):
    if row["DOD"] is not pd.NaT:
        val = round(((row["DOD"] - row["DOB"]).days)/365, 0)
    else:
        date = pd.to_datetime("2009-12-01")
        val = round(((date - row["DOB"]).days)/365, 0)
    return val

parse_dates = ["DOB", "DOD"]
df_ben[parse_dates] = df_ben[parse_dates].apply(pd.to_datetime)
df_ben["Age"] = df_ben.apply(calculate_age, axis=1)
df_ben["Aged"] = np.where((df_ben["Age"] > 65), 1,0)

In [10]:
## creating community_institutional column
df_ben["community_institutional"] = np.random.choice(['Community', 'Institutional'],
                                                             size=len(df_ben), p=[0.9,0.1])

In [11]:
## creating Disability_condn column
df_nonAged_community = df_ben[(df_ben["Aged"]==0) & (df_ben["community_institutional"]=="Community")]
df_nonAged_community_opposite = df_ben[~((df_ben["Aged"]==0) & (df_ben["community_institutional"]=="Community"))]

df_nonAged_community["Disability_condn"] = np.random.choice(['Non_originally_disabled', 'Originally_disabled '],
                                                             size=len(df_nonAged_community), p=[0.745,0.255])
df_nonAged_community_opposite["Disability_condn"] = np.random.choice(['Non_disabled ', 'Non_originally_disabled', 'Originally_disabled '],
                                                             size=len(df_nonAged_community_opposite), p=[0.7745,0.1765,0.049])

df_ben = df_nonAged_community.append(df_nonAged_community_opposite[df_nonAged_community.columns.to_list()])

In [12]:
## creating Benefits column
df_community = df_ben[df_ben["community_institutional"]=="Community"]
df_institutional = df_ben[df_ben["community_institutional"] !="Community"]
df_community["Benefits"] = np.random.choice(['NonDual', 'FBDual', 'PBDual'],
                                                             size=len(df_community), p=[0.66, 0.12, 0.22])
df_institutional["Benefits"] = ""

In [13]:
## creating medicaif_flag
df_community["medicaid_flag"] = np.where((df_community["Benefits"] =='NonDual'), 0,1)
df_institutional["medicaid_flag"] = np.random.choice([0,1],
                                                    size=len(df_institutional), p=[0.66, 0.34])

df_ben = df_community.append(df_institutional[df_community.columns.to_list()])

# Merging the Patients Dataframe with Beneficiary

In [15]:
## merging the patient data with beneficiary data
df_patients_ben = pd.merge(df_patients, df_ben,how = "inner", on = "BeneID")

In [16]:
## putting all diagnosis codes and procedure codes into one column
diagnosis = ["ClmDiagnosisCode_1","ClmDiagnosisCode_2","ClmDiagnosisCode_3","ClmDiagnosisCode_4",
             "ClmDiagnosisCode_5","ClmDiagnosisCode_6","ClmDiagnosisCode_7","ClmDiagnosisCode_8",
             "ClmDiagnosisCode_9","ClmDiagnosisCode_10"]

claim_procedure = ["ClmProcedureCode_1","ClmProcedureCode_2","ClmProcedureCode_3",
                   "ClmProcedureCode_4","ClmProcedureCode_5","ClmProcedureCode_6"]

df_patients_ben["ClmDiagnosisCode"] = df_patients_ben[diagnosis].apply(lambda x: x.values.tolist(), axis=1)
df_patients_ben['ClmDiagnosisCode'] = df_patients_ben['ClmDiagnosisCode'].apply(lambda x: list(filter(lambda y: not pd.isna(y), x)))
df_patients_ben.drop(columns=diagnosis, inplace= True)

df_patients_ben["ClmProcedureCode"] = df_patients_ben[claim_procedure].apply(lambda x: x.values.tolist(), axis=1)
df_patients_ben['ClmProcedureCode'] = df_patients_ben['ClmProcedureCode'].apply(lambda x: list(filter(lambda y: not pd.isna(y), x)))
df_patients_ben.drop(columns=claim_procedure, inplace= True)

In [17]:
## replacing values 1 to 0 and 2 to 1
value_replacement_col = ['Gender', 'Race','ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'ChronicCond_Diabetes',
       'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
       'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke']
df_patients_ben[value_replacement_col] = df_patients_ben[value_replacement_col].replace({1:0, 2:1})

In [18]:
## creating a dataframe of diagnosis codes and procedure codes
# Diagnosis_Code = pd.DataFrame(list(set([a for x in df_patients_ben["ClmDiagnosisCode"]
#                                         for a in x]))).rename(columns={0:"Diagnosis Code"})
# Procedure_Code = pd.DataFrame(list(set([int(a) for x in df_patients_ben["ClmProcedureCode"] for a in x]))).rename(columns={0:"Procedure Code"})

In [19]:
# ## exporting data
# df_patients_ben.to_csv(r"../data/processed_data/master_data.csv",index=False)
# Diagnosis_Code.to_csv(r"../data/processed_data/Diagnosis_Code.csv",index=False)
# Procedure_Code.to_csv(r"../data/processed_data/Procedure_Code.csv",index=False)

In [20]:
df_patients_ben.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DischargeDt,DiagnosisGroupCode,flag_ip,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,Age,Aged,community_institutional,Disability_condn,Benefits,medicaid_flag,ClmDiagnosisCode,ClmProcedureCode
0,BENE100000,CLM126832,2009-01-08,2009-01-08,PRV57172,50,PHY383401,,,,,0.0,,,out,1938-01-03,NaT,0,0,0,49,430,12,12,1,1,1,1,1,1,1,0,1,1,1,0,0,120,30,72.0,1,Community,Non_originally_disabled,NonDual,0,[79678],[]
1,BENE100000,CLM351838,2009-05-10,2009-05-10,PRV57172,70,PHY370909,,PHY363377,,,0.0,,,out,1938-01-03,NaT,0,0,0,49,430,12,12,1,1,1,1,1,1,1,0,1,1,1,0,0,120,30,72.0,1,Community,Non_originally_disabled,NonDual,0,[71656],[]
2,BENE100001,CLM389298,2009-05-31,2009-06-05,PRV55158,20,PHY425169,,,,V573,0.0,,,out,1939-01-08,NaT,0,0,0,33,420,12,12,0,1,1,1,1,1,1,0,0,1,0,0,0,2530,540,71.0,1,Community,Non_originally_disabled,NonDual,0,[31400],[]
3,BENE100001,CLM389298,2009-05-31,2009-06-05,PRV55158,20,PHY425169,,,,V573,0.0,,,out,1939-08-01,NaT,0,0,0,33,420,12,12,0,1,1,1,1,1,1,0,0,1,0,0,0,2530,540,70.0,1,Community,Originally_disabled,NonDual,0,[31400],[]
4,BENE100001,CLM258631,2009-03-21,2009-03-21,PRV54966,70,PHY347511,,,,36401,0.0,,,out,1939-01-08,NaT,0,0,0,33,420,12,12,0,1,1,1,1,1,1,0,0,1,0,0,0,2530,540,71.0,1,Community,Non_originally_disabled,NonDual,0,"[36401, V4569]",[]


## mapping icd 9 to icd 10 and hcc

In [21]:
## converting strings in lower case
df_icd9_icd10["icd10cm"] = df_icd9_icd10["icd10cm"].str.lower()
df_icd10_hcc["Diagnosis_Code"] = df_icd10_hcc["Diagnosis_Code"].str.lower()

In [22]:
## exploding the data on claim diagnosis code
df_patients_ben_exploded = df_patients_ben.explode("ClmDiagnosisCode")

In [23]:
## mergning the exploded data and icd9 to 10 mapping to get icd 10 mapping in exploded data
df_patients_ben_exploded = df_patients_ben_exploded.merge(df_icd9_icd10, how = "inner",left_on = "ClmDiagnosisCode",right_on = "icd9cm")

In [24]:
## dropping the duplicates
columns = ['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider',
       'InscClaimAmtReimbursed', 'AttendingPhysician', 'OperatingPhysician',
       'OtherPhysician', 'AdmissionDt', 'ClmAdmitDiagnosisCode',
       'DeductibleAmtPaid', 'DischargeDt', 'DiagnosisGroupCode', 'flag_ip', 'DOB',
       'DOD', 'Gender', 'Race', 'RenalDiseaseIndicator', 'State', 'County',
       'NoOfMonths_PartACov', 'NoOfMonths_PartBCov', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'ChronicCond_Diabetes',
       'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
       'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke',
       'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
       'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'ClmDiagnosisCode','icd9cm', 'icd10cm']
df_patients_ben_exploded = df_patients_ben_exploded.drop_duplicates(subset = columns)

In [25]:
## mapping ICD10 to HCC thorugh files mergning
df_patients_ben_final = df_patients_ben_exploded.merge(df_icd10_hcc[["Diagnosis_Code","2024_CMS-HCC"]],how = "left",left_on = "icd10cm",right_on="Diagnosis_Code")
df_patients_ben_final = df_patients_ben_exploded.merge(df_icd10_hcc[["Diagnosis_Code","2020_CMS-HCC"]],how = "left",left_on = "icd10cm",right_on="Diagnosis_Code")

In [26]:
## dropping the rows which do not have HCC's mapped since all icd10 can't be mapped to HCC's
df_patients_ben_final = df_patients_ben_final.dropna(subset = ["2024_CMS-HCC"])

In [27]:
## groupping the data and putting all HCC's into a list then merging it with original file
df_patients_ben_final_1 = (df_patients_ben_final.groupby('ClaimID')
                          .agg({'2024_CMS-HCC': lambda x: list(set(x))})
                           .rename({'2024_CMS-HCC' : 'list_hcc_code'},axis=1)
                           .reset_index())
df_patients_ben_final = df_patients_ben_final.merge(df_patients_ben_final_1, on = "ClaimID", how = "inner")

In [28]:
## dropping the unnecessary columns
df_patients_ben_final = df_patients_ben_final.drop(columns = ["ClmDiagnosisCode","Diagnosis_Code","ClmProcedureCode"])

## age and claim duration calculation

In [26]:
# ## calculating age of beneficiaries

# def calculate_age(row):
#     if row["DOD"] is not pd.NaT:
#         val = round(((row["DOD"] - row["DOB"]).days)/365, 0)
#     else:
#         date = pd.to_datetime("2009-12-01")
#         val = round(((date - row["DOB"]).days)/365, 0)
#     return val

# parse_dates = ["DOB", "DOD"]
# df_patients_ben_final[parse_dates] = df_patients_ben_final[parse_dates].apply(pd.to_datetime)
# df_patients_ben_final["Age"] = df_patients_ben_final.apply(calculate_age, axis=1)
# df_patients_ben_final["Aged"] = np.where((df_patients_ben_final["Age"] > 65), 1,0)

In [29]:
## calculating claim duration
df_patients_ben_final["ClaimStartDt"] = pd.to_datetime(
    df_patients_ben_final['ClaimStartDt'], errors='ignore')
df_patients_ben_final["ClaimEndDt"] = pd.to_datetime(
    df_patients_ben_final['ClaimEndDt'], errors='ignore')
df_patients_ben_final["claim_duration_num"] = (
    df_patients_ben_final["ClaimEndDt"] - df_patients_ben_final["ClaimStartDt"]).dt.days

## mockup data

In [28]:
# ## creating community_institutional column
# df_patients_ben_final["community_institutional"] = np.random.choice(['Community', 'Institutional'],
#                                                              size=len(df_patients_ben_final), p=[0.9,0.1])

In [29]:
# ## creating Disability_condn column
# df_nonAged_community = df_patients_ben_final[(df_patients_ben_final["Aged"]==0) & (df_patients_ben_final["community_institutional"]=="Community")]
# df_nonAged_community_opposite = df_patients_ben_final[~((df_patients_ben_final["Aged"]==0) & (df_patients_ben_final["community_institutional"]=="Community"))]

# df_nonAged_community["Disability_condn"] = np.random.choice(['Non_originally_disabled', 'Originally_disabled '],
#                                                              size=len(df_nonAged_community), p=[0.745,0.255])
# df_nonAged_community_opposite["Disability_condn"] = np.random.choice(['Non_disabled ', 'Non_originally_disabled', 'Originally_disabled '],
#                                                              size=len(df_nonAged_community_opposite), p=[0.7745,0.1765,0.049])

# df_patients_ben_final = df_nonAged_community.append(df_nonAged_community_opposite[df_nonAged_community.columns.to_list()])

In [30]:
# ## creating Benefits column
# df_community = df_patients_ben_final[df_patients_ben_final["community_institutional"]=="Community"]
# df_institutional = df_patients_ben_final[df_patients_ben_final["community_institutional"] !="Community"]
# df_community["Benefits"] = np.random.choice(['NonDual', 'FBDual', 'PBDual'],
#                                                              size=len(df_community), p=[0.66, 0.12, 0.22])
# df_institutional["Benefits"] = ""

In [31]:
# ## creating medicaif_flag
# df_community["medicaid_flag"] = np.where((df_community["Benefits"] =='NonDual'), 0,1)
# df_institutional["medicaid_flag"] = np.random.choice([0,1],
#                                                     size=len(df_institutional), p=[0.66, 0.34])

# df_patients_ben_final = df_community.append(df_institutional[df_community.columns.to_list()])

## creating disease and disease_disable traction columns

In [30]:
## creaing list of standard disease code list

DIABETES = ["35","36", "37", "38"]
HF = ['221','222', '223','224','225','226'] 
LUNG_DISORDER = ['276','277','278','279','280']
KIDNEY = ['326', '327', '328', '329']
Cardio_Respiratory = ['211','212','213']
Heart_Arrhythmias = ['238']
SUBSTANCE_Use_Disorder = ['137', '138', '139']
SKIN_ULCER = ['379', '380', '381', '382']
CANCER = ['17', '18', '19', '20', '21', '22', '23']
neuological = ['180','181','182','183','184','185','186','187','188','189','190','191','192',
              '195','196','198','199']
psychiatric = ['151','152','153','154','155']
substance_use = ['135','136','137','138','139']

In [31]:
## creating disease intractions column

# Define a function to check if all elements of a list are present in another list
def check_in_list(lst1, lst2):
    return any([elem in lst2 for elem in lst1])

# Apply the function to each row of the dataframe
df_patients_ben_final['Disease_intraction_DIABETES_HF'] = df_patients_ben_final['list_hcc_code'].apply(lambda x: 1 if check_in_list(DIABETES, x) and check_in_list(HF, x) else 0)
df_patients_ben_final['Disease_intraction_HF_CHR_LUNG'] = df_patients_ben_final['list_hcc_code'].apply(lambda x: 1 if check_in_list(LUNG_DISORDER, x) and check_in_list(HF, x) else 0)
df_patients_ben_final['Disease_intraction_HF_KIDNEY'] = df_patients_ben_final['list_hcc_code'].apply(lambda x: 1 if check_in_list(KIDNEY, x) and check_in_list(HF, x) else 0)
df_patients_ben_final['Disease_intraction_CHR_LUNG_CARD_RESP_FAIL'] = df_patients_ben_final['list_hcc_code'].apply(lambda x: 1 if check_in_list(LUNG_DISORDER, x) and check_in_list(Cardio_Respiratory, x) else 0)
df_patients_ben_final['Disease_intraction_HF_HCC238'] = df_patients_ben_final['list_hcc_code'].apply(lambda x: 1 if check_in_list(Heart_Arrhythmias, x) and check_in_list(HF, x) else 0)
df_patients_ben_final['Disease_intraction_gSubUseDisorder_gPsych_'] = df_patients_ben_final['list_hcc_code'].apply(lambda x: 1 if check_in_list(substance_use, x) and check_in_list(psychiatric, x) else 0)

In [32]:
## disable disease intraction

# Define a function to check if all elements of a list are present in another list
def check_in_list(lst1, lst2):
    return any([elem in lst2 for elem in lst1])

df_patients_ben_final_disable = df_patients_ben_final[~(df_patients_ben_final["Disability_condn"] =='Non_disabled ')]
df_patients_ben_final_Non_disable = df_patients_ben_final[(df_patients_ben_final["Disability_condn"] =='Non_disabled ')]

# Apply the function to each row of the dataframe
df_patients_ben_final_disable['DISABLED_HF'] = df_patients_ben_final_disable['list_hcc_code'].apply(lambda x: 1 if check_in_list(HF, x) else 0)
df_patients_ben_final_disable['DISABLED_ULCER_'] = df_patients_ben_final_disable['list_hcc_code'].apply(lambda x: 1 if check_in_list(SKIN_ULCER, x) else 0)
df_patients_ben_final_disable['DISABLED_CANCER'] = df_patients_ben_final_disable['list_hcc_code'].apply(lambda x: 1 if check_in_list(CANCER, x) else 0)
df_patients_ben_final_disable['DISABLED_NEURO'] = df_patients_ben_final_disable['list_hcc_code'].apply(lambda x: 1 if check_in_list(neuological, x) else 0)
df_patients_ben_final_disable['DISABLED_CHR_LUNG'] = df_patients_ben_final_disable['list_hcc_code'].apply(lambda x: 1 if check_in_list(LUNG_DISORDER, x) else 0)

df_patients_ben_final_Non_disable['DISABLED_HF'] = 0
df_patients_ben_final_Non_disable['DISABLED_ULCER_'] = 0
df_patients_ben_final_Non_disable['DISABLED_CANCER'] = 0
df_patients_ben_final_Non_disable['DISABLED_NEURO'] = 0
df_patients_ben_final_Non_disable['DISABLED_CHR_LUNG'] = 0

df_patients_ben_final = df_patients_ben_final_disable.append(
                            df_patients_ben_final_Non_disable[df_patients_ben_final_disable.columns.to_list()]).sort_index()

## merging SDoH data based on population density of each zipcode in each state

In [33]:
## creating zip list
df_state_ziplist = df_SDoH.groupby("mapping_state_code")["ZIPCODE"].agg(lambda x: x.to_list()).reset_index()

# create empty dictionary to store subgroups
subgroups = {}

# loop through each unique value in the state column
for State in df_patients_ben_final['State'].unique():
    # create sub_data based on category value
    subgroup = df_patients_ben_final[df_patients_ben_final['State'] == State]
    # add sub_data to dictionary
    subgroups[State] = subgroup

In [34]:
## creating zipcode lists and population density of each zipcode in each state
df_state_ziplist = df_SDoH.groupby("mapping_state_code")[
    "ZIPCODE","ACS_TOT_POP_WT_ZC"].agg({"ZIPCODE" : (lambda x: x.to_list()),
                                        "ACS_TOT_POP_WT_ZC" : (lambda x: (x/sum(x)).to_list())}).reset_index()

In [35]:
## putting zipcodes based on population denity in subgroups
for state in subgroups.keys():
    ziplist = df_state_ziplist.loc[df_state_ziplist["mapping_state_code"] == str(state), "ZIPCODE"]
    zip_codes_weightage = df_state_ziplist.loc[df_state_ziplist["mapping_state_code"] == str(state), "ACS_TOT_POP_WT_ZC"]
    if not ziplist.empty:
        subgroups[state]["ZIPCODE"] = np.random.choice((ziplist.tolist())[0], size=len(subgroups[state]), p=(zip_codes_weightage.tolist())[0])
    else:
        # handle the case when ziplist is empty
        subgroups[state]["ZIPCODE"] = ""

In [36]:
## creating fulldata from subgroups
df_patients_ben_final = pd.concat(subgroups.values(), ignore_index=True)

In [37]:
## mergning SDoH data with master data on ZIP code
df_patients_ben_final = df_patients_ben_final.merge(df_SDoH, on = "ZIPCODE", how = "left")

## creating other hcc's RAF factors column

In [38]:
## converting HCC's to string
df_patients_ben_final["2024_CMS-HCC"] = df_patients_ben_final["2024_CMS-HCC"].apply(lambda x: {str(x)})

In [39]:
## creating a column of other hcc's than the target HCC
other_hcc = []
for i in range(len(df_patients_ben_final)):
    
    list_hcc_code = set(df_patients_ben_final.loc[i,"list_hcc_code"])
    current_hcc = df_patients_ben_final.loc[i,"2024_CMS-HCC"]
    if (len(list_hcc_code) > 1):
        other_hcc.append(list_hcc_code - current_hcc)
    else:
        other_hcc.append(current_hcc)
        
df_patients_ben_final["hcc_otherthan_dependent_hcc"] = other_hcc

In [40]:
## other extar feature engineering
df_patients_ben_final['disability'] = np.where(df_patients_ben_final['Disability_condn'] == "Non_disabled", 0, 1)
df_disease_cofficients["HCC_code"] = df_disease_cofficients["HCC_code"].apply(lambda x: x[3:])
df_disease_cofficients['disability'] = np.where(df_disease_cofficients['Aged_disabled'] == "Disabled", 1, 0)
df_disease_cofficients['Aged'] = np.where(df_disease_cofficients['Aged_disabled'] == "Aged", 1, 0)

In [41]:
## exploding data on hcc_otherthan_dependent_hcc and deviding data on community and institutional
df_hcc_otherthan_dependent_hcc_exploded = pd.DataFrame(df_patients_ben_final[["Benefits","disability",
                                     "community_institutional",
                                     "Aged", "hcc_otherthan_dependent_hcc",
                                     ]].explode("hcc_otherthan_dependent_hcc").reset_index())


df_hcc_otherthan_dependent_hcc_exploded_community = df_hcc_otherthan_dependent_hcc_exploded[df_hcc_otherthan_dependent_hcc_exploded
                                                                                            ["community_institutional"]=="Community"]
df_hcc_otherthan_dependent_hcc_exploded_Institutional = df_hcc_otherthan_dependent_hcc_exploded[df_hcc_otherthan_dependent_hcc_exploded
                                                                                                ["community_institutional"]=="Institutional"]

In [42]:
## merging hcc's cofficients on exploded data and appending both datasets
df_hcc_otherthan_dependent_hcc_exploded_community = df_hcc_otherthan_dependent_hcc_exploded_community.merge(df_disease_cofficients,
                                              left_on = ["community_institutional","Aged","Benefits","hcc_otherthan_dependent_hcc"],
                                              right_on = ["community_institutional","Aged","Benefits","HCC_code"],
                                              how = "left"
                                             )

df_hcc_otherthan_dependent_hcc_exploded_Institutional = df_hcc_otherthan_dependent_hcc_exploded_Institutional.merge(df_disease_cofficients,
                                              left_on = ["community_institutional","hcc_otherthan_dependent_hcc"],
                                              right_on = ["community_institutional","HCC_code"],
                                              how = "left"
                                             )

df_factors = df_hcc_otherthan_dependent_hcc_exploded_Institutional[["index","HCC_factor"]].append(
    df_hcc_otherthan_dependent_hcc_exploded_community[["index","HCC_factor"]]).sort_values("index")

df_factors["HCC_factor"] = df_factors["HCC_factor"].astype(float)

In [43]:
## finding the mean of other HCC's
df_patients_ben_final["other_disease_mean_hcc_factor"] = df_factors.groupby("index")["HCC_factor"].mean()

In [44]:
## renaming the column
df_patients_ben_final.rename(columns = {"2024_CMS-HCC":"HCC"}, inplace=True)

In [45]:
## selecting and ordering the columns
selected_cols = ['BeneID','HCC', 'InscClaimAmtReimbursed', 'DeductibleAmtPaid','flag_ip','Gender','Race', 'RenalDiseaseIndicator','NoOfMonths_PartACov',
'NoOfMonths_PartBCov','ChronicCond_Alzheimer','ChronicCond_Heartfailure','ChronicCond_KidneyDisease','ChronicCond_Cancer',
'ChronicCond_ObstrPulmonary','ChronicCond_Depression','ChronicCond_Diabetes','ChronicCond_IschemicHeart','ChronicCond_Osteoporasis',
'ChronicCond_rheumatoidarthritis','ChronicCond_stroke','IPAnnualReimbursementAmt','IPAnnualDeductibleAmt','OPAnnualReimbursementAmt',
'OPAnnualDeductibleAmt','Age','Aged','claim_duration_num','community_institutional','Disability_condn','Benefits',
'medicaid_flag','Disease_intraction_DIABETES_HF','Disease_intraction_HF_CHR_LUNG','Disease_intraction_HF_KIDNEY','Disease_intraction_CHR_LUNG_CARD_RESP_FAIL',
'Disease_intraction_HF_HCC238','Disease_intraction_gSubUseDisorder_gPsych_','DISABLED_HF','DISABLED_ULCER_','DISABLED_CANCER','DISABLED_NEURO','DISABLED_CHR_LUNG','ACS_TOT_POP_WT_ZC','ACS_TOT_WORKER_HH_ZC',
'ACS_TOT_HH_ZC','WORKER_PER_HH','ACS_PCT_FEMALE_ZC','ACS_PCT_MALE_ZC','MALE_TO_FEMALE_RATIO','ACS_PCT_HH_LIMIT_ENGLISH_ZC','ACS_PCT_AGE_ABOVE65_ZC',
'ACS_PCT_AGE_ABOVE80_ZC','ACS_PCT_HH_NO_COMP_DEV_ZC','ACS_PCT_HH_SMARTPHONE_ZC','ACS_PCT_HH_PC_ZC','ACS_PCT_HH_NO_INTERNET_ZC','PCT_HAS_INTERNET',
'ACS_MEDIAN_HH_INC_ZC','ACS_PCT_INC50_ABOVE65_ZC','ACS_PCT_HEALTH_INC_BELOW137_ZC','ACS_PCT_HEALTH_INC_138_199_ZC','ACS_PCT_HEALTH_INC_200_399_ZC',
'ACS_PCT_HEALTH_INC_ABOVE400_ZC','ACS_PER_CAPITA_INC_ZC','ACS_PCT_COLLEGE_ASSOCIATE_DGR_ZC','ACS_PCT_BACHELOR_DGR_ZC','ACS_PCT_HS_GRADUATE_ZC',
'ACS_PCT_LT_HS_ZC','ACS_PCT_POSTHS_ED_ZC','ACS_PCT_HH_ABOVE65_ZC','ACS_PCT_HH_ALONE_ABOVE65_ZC','CEN_POPDENSITY_ZC','ACS_PCT_HU_NO_VEH_ZC',
'ACS_PCT_PUBL_TRANSIT_ZC','ACS_PCT_TAXICAB_2WORK_ZC','AVG_DIST_TO_MEDI_CARE','ACS_PCT_MEDICARE_ONLY_ZC','PCT_ANY_OTHER_INSUR','ACS_PCT_UNINSURED_ZC',
'disability','other_disease_mean_hcc_factor']

df_patients_ben_final = df_patients_ben_final[df_patients_ben_final["ZCTA"].notnull()]
df_patients_ben_final = df_patients_ben_final[selected_cols]
df_patients_ben_final["HCC"] = [list(x)[0] for x in df_patients_ben_final["HCC"]]

In [46]:
df_patients_ben_final.head()

Unnamed: 0,BeneID,HCC,InscClaimAmtReimbursed,DeductibleAmtPaid,flag_ip,Gender,Race,RenalDiseaseIndicator,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,Age,Aged,claim_duration_num,community_institutional,Disability_condn,Benefits,medicaid_flag,Disease_intraction_DIABETES_HF,Disease_intraction_HF_CHR_LUNG,Disease_intraction_HF_KIDNEY,Disease_intraction_CHR_LUNG_CARD_RESP_FAIL,Disease_intraction_HF_HCC238,Disease_intraction_gSubUseDisorder_gPsych_,DISABLED_HF,DISABLED_ULCER_,DISABLED_CANCER,DISABLED_NEURO,DISABLED_CHR_LUNG,ACS_TOT_POP_WT_ZC,ACS_TOT_WORKER_HH_ZC,ACS_TOT_HH_ZC,WORKER_PER_HH,ACS_PCT_FEMALE_ZC,ACS_PCT_MALE_ZC,MALE_TO_FEMALE_RATIO,ACS_PCT_HH_LIMIT_ENGLISH_ZC,ACS_PCT_AGE_ABOVE65_ZC,ACS_PCT_AGE_ABOVE80_ZC,ACS_PCT_HH_NO_COMP_DEV_ZC,ACS_PCT_HH_SMARTPHONE_ZC,ACS_PCT_HH_PC_ZC,ACS_PCT_HH_NO_INTERNET_ZC,PCT_HAS_INTERNET,ACS_MEDIAN_HH_INC_ZC,ACS_PCT_INC50_ABOVE65_ZC,ACS_PCT_HEALTH_INC_BELOW137_ZC,ACS_PCT_HEALTH_INC_138_199_ZC,ACS_PCT_HEALTH_INC_200_399_ZC,ACS_PCT_HEALTH_INC_ABOVE400_ZC,ACS_PER_CAPITA_INC_ZC,ACS_PCT_COLLEGE_ASSOCIATE_DGR_ZC,ACS_PCT_BACHELOR_DGR_ZC,ACS_PCT_HS_GRADUATE_ZC,ACS_PCT_LT_HS_ZC,ACS_PCT_POSTHS_ED_ZC,ACS_PCT_HH_ABOVE65_ZC,ACS_PCT_HH_ALONE_ABOVE65_ZC,CEN_POPDENSITY_ZC,ACS_PCT_HU_NO_VEH_ZC,ACS_PCT_PUBL_TRANSIT_ZC,ACS_PCT_TAXICAB_2WORK_ZC,AVG_DIST_TO_MEDI_CARE,ACS_PCT_MEDICARE_ONLY_ZC,PCT_ANY_OTHER_INSUR,ACS_PCT_UNINSURED_ZC,disability,other_disease_mean_hcc_factor
0,BENE100001,238,10,0.0,out,0,0,0,12,12,0,1,1,1,1,1,1,0,0,1,0,0,0,2530,540,71.0,1,0,Community,Non_originally_disabled,NonDual,0,0,0,0,0,0,0,0,0,0,0,0,40179.0,20293.0,12872.0,2.0,49.48,50.52,1.021019,6.21,17.91,6.5,8.46,82.09,86.63,9.01,90.99,118286.0,1.68,5.71,6.69,23.84,63.75,43498.0,24.63,24.9,27.55,6.75,65.7,40.56,10.22,6055.89,4.37,15.85,1.24,2.2725,4.86,74.64,4.59,1,0.299
1,BENE100001,238,10,0.0,out,0,0,0,12,12,0,1,1,1,1,1,1,0,0,1,0,0,0,2530,540,70.0,1,0,Community,Originally_disabled,NonDual,0,0,0,0,0,0,0,0,0,0,0,0,21223.0,10163.0,7442.0,1.0,49.03,50.97,1.039568,3.37,16.11,4.38,5.2,86.13,87.77,7.85,92.15,83240.0,2.71,6.92,6.29,32.34,54.44,34082.0,27.97,25.29,22.23,6.58,71.19,31.31,11.17,1445.17,5.35,1.81,1.65,3.6425,3.15,75.96,3.45,1,0.299
2,BENE100170,238,20,0.0,out,1,0,0,12,12,1,0,1,1,1,0,0,0,0,1,0,0,0,640,350,62.0,0,0,Community,Non_originally_disabled,FBDual,1,0,0,0,0,1,0,1,0,0,0,0,64728.0,26951.0,23835.0,1.0,52.92,47.08,0.889645,6.08,11.05,2.52,11.94,81.72,76.09,14.93,85.07,57010.0,4.04,29.06,11.04,23.27,36.62,37106.0,16.63,24.36,20.27,12.63,67.11,24.16,12.07,74275.08,77.92,69.19,3.43,0.5275,3.21,60.2,5.04,1,0.537
3,BENE100170,238,20,0.0,out,1,0,0,12,12,1,0,1,1,1,0,0,0,0,1,0,0,0,640,350,62.0,0,0,Community,Non_originally_disabled,NonDual,0,0,0,0,0,1,0,1,0,0,0,0,60714.0,34335.0,28850.0,1.0,56.91,43.09,0.75716,2.29,20.36,5.61,4.7,89.56,90.79,5.33,94.67,137126.0,3.73,10.81,4.19,9.7,75.31,128591.0,7.94,35.77,5.4,4.11,90.5,32.05,17.72,70774.7,71.63,71.96,10.08,1.00375,6.42,79.07,2.64,1,0.442
4,BENE100170,226,20,0.0,out,1,0,0,12,12,1,0,1,1,1,0,0,0,0,1,0,0,0,640,350,62.0,0,0,Community,Non_originally_disabled,FBDual,1,0,0,0,0,1,0,1,0,0,0,0,11052.0,5088.0,3672.0,1.0,50.29,49.71,0.988467,5.23,16.75,3.9,2.91,89.43,90.36,2.97,97.03,145000.0,0.0,4.91,3.37,16.12,75.6,67399.0,22.6,23.93,12.53,4.83,82.64,31.89,8.33,4549.6,6.37,24.6,0.95,3.365,2.47,82.72,2.94,1,0.304


In [47]:
df_patients_ben_final[df_patients_ben_final["HCC"] != 'nan'].reset_index(drop=True).to_csv(r"../data/processed_data/master_data_for_modelling.csv",index=False)
df_patients_ben_final.to_csv(r"../data/processed_data/data_for_powerBI.csv",index=False)

In [48]:
print(len(df_patients_ben_final[df_patients_ben_final["HCC"] != 'nan'].reset_index(drop=True)))
print(len(df_patients_ben_final))

719386
719386
