# Overview
The purpose of this analysis is to compare counts of unique claims per beneficiary by service type.<br>
Visit Data.gov to download the data used for this analysis.

In [1]:
from IPython.display import HTML

# Markup for a "toggle raw code" button. `code_show` starts true and
# `code_toggle` runs once when the document is ready, so the input cells
# (`div.input`) are hidden on load; each button click flips them back.
# The script calls `$`, which it does not define itself, so the page that
# renders this output has to supply it.
code_toggle_markup = '''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>'''

# Keep the HTML object as the cell's final expression so the notebook renders it.
HTML(code_toggle_markup)

# Load libraries

In [2]:
# --- Imports (standard library, data stack, plotting, notebook display) ---
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.pyplot import figure

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

from IPython.display import display, HTML

# --- Global configuration ---
# Render plotly figures through the 'iframe' renderer.
pio.renderers.default = 'iframe'

# Apply the ggplot style first; the explicit rcParams below are set afterwards
# so they take precedence over whatever the style sheet defines.
plt.style.use('ggplot')
dpi = 200
rcParams['figure.figsize'] = (12, 8)
rcParams['figure.dpi'] = dpi

# Widen the notebook container to 90% of the page.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Load data

In [3]:
# Load the beneficiary file and one file per claim type. All files are
# pipe-delimited; only the identifier/date columns listed here are read.
CLAIM_COLUMNS = ['CLM_ID', 'BENE_ID', 'CLM_FROM_DT']
CLAIM_DTYPES = {'CLM_ID': 'Int64', 'BENE_ID': 'Int64', 'CLM_FROM_DT': 'str'}


def _read_pipe_csv(path, columns, dtypes):
    """Read `columns` from the pipe-delimited file at `path` using `dtypes`."""
    return pd.read_csv(path, sep="|", usecols=columns, dtype=dtypes, low_memory=False)


# Beneficiary roster: only BENE_ID is read (the shared dtype mapping is passed as-is).
synthetic_bene_df = _read_pipe_csv('data/beneficiary_2022.csv', ['BENE_ID'], CLAIM_DTYPES)

# Claim files that share the CLM_ID / BENE_ID / CLM_FROM_DT layout.
synthetic_inpatient_df = _read_pipe_csv('data/inpatient.csv', CLAIM_COLUMNS, CLAIM_DTYPES)
synthetic_outpatient_df = _read_pipe_csv('data/outpatient.csv', CLAIM_COLUMNS, CLAIM_DTYPES)
synthetic_carrier_df = _read_pipe_csv('data/carrier.csv', CLAIM_COLUMNS, CLAIM_DTYPES)
synthetic_dme_df = _read_pipe_csv('data/dme.csv', CLAIM_COLUMNS, CLAIM_DTYPES)
synthetic_hha_df = _read_pipe_csv('data/hha.csv', CLAIM_COLUMNS, CLAIM_DTYPES)
synthetic_hospice_df = _read_pipe_csv('data/hospice.csv', CLAIM_COLUMNS, CLAIM_DTYPES)
synthetic_snf_df = _read_pipe_csv('data/snf.csv', CLAIM_COLUMNS, CLAIM_DTYPES)

# The PDE file uses PDE_ID / PD_DT instead; only BENE_ID gets an explicit dtype.
synthetic_pde_df = _read_pipe_csv('data/pde.csv', ['BENE_ID', 'PDE_ID', 'PD_DT'], {'BENE_ID': 'Int64'})

In [4]:
# Sanity check on the beneficiary frame: compare the number of distinct
# BENE_ID values with the number of rows (equal counts mean no duplicate ids).
bene_id_column = synthetic_bene_df['BENE_ID']
unique_ids = bene_id_column.unique()

print(f"{len(unique_ids)} unique ids and {synthetic_bene_df.shape[0]} total rows")

8671 unique ids and 8671 total rows


# Filter synthetic data

In [5]:
# Years of service included in the analysis (2022 only).
values = [2022]


def summarize_claims_per_bene(bene_df, claims_df, id_col='CLM_ID',
                              date_col='CLM_FROM_DT', year_col='CLM_FROM_YR',
                              years=(2022,)):
    """Return one row per beneficiary with the number of distinct claims in `years`.

    Parameters
    ----------
    bene_df : DataFrame
        Beneficiary roster; must contain a ``BENE_ID`` column.
    claims_df : DataFrame
        Claim-level rows containing ``BENE_ID``, `id_col` and `date_col`.
    id_col : str
        Column holding the claim identifier that is counted.
    date_col : str
        Column holding the claim date; it is parsed with ``pd.to_datetime``.
    year_col : str
        Name of the helper column that receives the year of `date_col`.
    years : list-like of int
        Calendar years to keep.

    Returns
    -------
    DataFrame
        One row per ``BENE_ID`` found in `bene_df`, with a ``tot_claims``
        column. Beneficiaries without a claim in `years` get ``tot_claims == 0``
        because ``nunique`` ignores the missing ids produced by the left merge.
        The remaining columns hold the first non-null value seen for that
        beneficiary (``GroupBy.first`` semantics), not a specific claim row.

    Notes
    -----
    This cell stores its result under the same name as its input, so running
    it a second time would feed the already-aggregated frame back in and
    collapse every count to 0 or 1. A frame that already carries
    ``tot_claims`` is therefore returned unchanged. `claims_df` itself is
    never modified.
    """
    if 'tot_claims' in claims_df.columns:
        # Already summarised by a previous run of this cell.
        return claims_df

    # Parse the claim date and derive its year without touching the input frame.
    dated = claims_df.assign(**{date_col: pd.to_datetime(claims_df[date_col])})
    dated[year_col] = dated[date_col].dt.year

    # Keep only claims whose year is in scope.
    in_scope = dated[dated[year_col].isin(years)]

    # Left merge so beneficiaries with no in-scope claims are retained.
    merged = pd.merge(bene_df, in_scope, on='BENE_ID', how='left')

    # Distinct claim ids per beneficiary, broadcast onto every row of the group.
    merged['tot_claims'] = merged.groupby(['BENE_ID'])[id_col].transform('nunique')

    # Collapse to a single row per beneficiary and restore BENE_ID as a column.
    return merged.groupby('BENE_ID').first().reset_index()


# Claim types sharing the CLM_ID / CLM_FROM_DT layout.
synthetic_carrier_df = summarize_claims_per_bene(synthetic_bene_df, synthetic_carrier_df, years=values)
synthetic_inpatient_df = summarize_claims_per_bene(synthetic_bene_df, synthetic_inpatient_df, years=values)
synthetic_outpatient_df = summarize_claims_per_bene(synthetic_bene_df, synthetic_outpatient_df, years=values)
synthetic_dme_df = summarize_claims_per_bene(synthetic_bene_df, synthetic_dme_df, years=values)
synthetic_snf_df = summarize_claims_per_bene(synthetic_bene_df, synthetic_snf_df, years=values)
synthetic_hospice_df = summarize_claims_per_bene(synthetic_bene_df, synthetic_hospice_df, years=values)
synthetic_hha_df = summarize_claims_per_bene(synthetic_bene_df, synthetic_hha_df, years=values)

# PDE events are identified by PDE_ID and dated by PD_DT.
synthetic_pde_df = summarize_claims_per_bene(
    synthetic_bene_df, synthetic_pde_df,
    id_col='PDE_ID', date_col='PD_DT', year_col='PD_YR', years=values,
)

# NOTE: upper-quantile outlier trimming on tot_claims (cut-offs between the
# 98th and 99.5th percentile, depending on claim type) was previously tried
# here and is intentionally disabled.

plt.close()

# Synthetic number of claims per beneficiary distributions

In [6]:
# Histogram grid: share of beneficiaries (%) by number of claims, one panel
# per claim type. Panels are listed in row-major order for the 2 x 4 grid.
claim_panels = [
    ("Carrier", synthetic_carrier_df),
    ("Inpatient", synthetic_inpatient_df),
    ("Outpatient", synthetic_outpatient_df),
    ("DME", synthetic_dme_df),
    ("SNF", synthetic_snf_df),
    ("Hospice", synthetic_hospice_df),
    ("HHA", synthetic_hha_df),
    ("PDE", synthetic_pde_df),
]
n_panel_rows, n_panel_cols = 2, 4

# establish subplot axes
fig = make_subplots(
    rows=n_panel_rows,
    cols=n_panel_cols,
    horizontal_spacing=0.1,
    vertical_spacing=0.2,
    subplot_titles=tuple(panel_title for panel_title, _summary in claim_panels),
)

for panel_idx, (panel_title, summary_df) in enumerate(claim_panels):
    # Convert the flat panel index into 1-based grid coordinates.
    grid_row, grid_col = divmod(panel_idx, n_panel_cols)
    grid_row += 1
    grid_col += 1

    # add_trace(row=, col=) replaces the deprecated append_trace.
    fig.add_trace(go.Histogram(
        histnorm='percent',
        hoverlabel=dict(namelength=0),
        marker_color='mediumseagreen',
        x=summary_df['tot_claims'],
    ), row=grid_row, col=grid_col)

    fig.update_xaxes(title_text="Total Claims", row=grid_row, col=grid_col)
    fig.update_yaxes(title_text="Beneficiaries (%)", row=grid_row, col=grid_col)

# Without row/col this applies to every y-axis, so one call gives all panels
# the same 0-100 % scale.
fig.update_yaxes(range=[0, 100])

# update plot layout; update_layout returns the figure, so leaving it as the
# final expression displays the plot.
fig.update_layout(title_text="2022 Percentage of Synthetic Claims per Beneficiary", bargap=0.30, width=1200, height=700, showlegend=False)