# Overview
The purpose of this analysis is to examine the distribution of synthetic claims per beneficiary for each claim type. Visit Data.gov to download the data used for this analysis. <br>
Note: Only 2021 data are included in this analysis.

# Load libraries

In [1]:
# Data handling
import pandas as pd
import numpy as np
from pathlib import Path
# Plotly (interactive figures)
import plotly.express as px
import plotly.io as pio
# Make 'iframe' the default plotly renderer for every figure shown in this notebook.
pio.renderers.default='iframe'
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Matplotlib
import matplotlib
import matplotlib.pyplot as plt
# Apply the 'ggplot' style sheet to matplotlib figures.
plt.style.use('ggplot')

from IPython.display import display, HTML
# Inject CSS that sets elements with the `.container` class to 90% width.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Load data

In [2]:
# All extracts are pipe-delimited and are read with the same dtype map.
# NOTE(review): the map only names CLM_ID, BENE_ID and CLM_FROM_DT, so the
# PDE columns (PDE_ID, CLM_GRP_ID, PD_DT) appear to get inferred dtypes —
# confirm that is intended.
_CLAIM_DTYPES = {'CLM_ID':'Int64','BENE_ID':'Int64','CLM_FROM_DT':'str'}
# Columns pulled from every claim-type file except PDE.
_CLAIM_COLUMNS = ['CLM_ID','BENE_ID', 'CLM_FROM_DT']


def _read_pipe_delimited(csv_path, columns):
    """Read only `columns` from the pipe-delimited file at `csv_path`.

    Returns a DataFrame loaded with the shared dtype map and
    `low_memory=False` (the file is parsed in one pass).
    """
    return pd.read_csv(
        csv_path,
        sep="|",
        usecols=list(columns),
        dtype=dict(_CLAIM_DTYPES),
        low_memory=False,
    )


# Beneficiary roster: only the identifier is needed.
synthetic_bene_df = _read_pipe_delimited('data/beneficiary_final.csv', ['BENE_ID'])

# One frame per claim type.
synthetic_inpatient_df = _read_pipe_delimited('data/inpatient.csv', _CLAIM_COLUMNS)
synthetic_outpatient_df = _read_pipe_delimited('data/outpatient.csv', _CLAIM_COLUMNS)
synthetic_carrier_df = _read_pipe_delimited('data/carrier.csv', _CLAIM_COLUMNS)
synthetic_dme_df = _read_pipe_delimited('data/dme.csv', _CLAIM_COLUMNS)
synthetic_hha_df = _read_pipe_delimited('data/hha.csv', _CLAIM_COLUMNS)
synthetic_hospice_df = _read_pipe_delimited('data/hospice.csv', _CLAIM_COLUMNS)
synthetic_snf_df = _read_pipe_delimited('data/snf.csv', _CLAIM_COLUMNS)

# PDE uses its own identifier and date columns.
synthetic_pde_df = _read_pipe_delimited('data/pde.csv', ['BENE_ID', 'PDE_ID', 'CLM_GRP_ID','PD_DT'])

# Filter data

In [3]:
def _claims_per_beneficiary(bene_df, claims_df, years, date_col='CLM_FROM_DT',
                            year_col='CLM_FROM_YR', claim_id_col='CLM_ID',
                            upper_quantile=None):
    """Build a one-row-per-beneficiary frame with a distinct-claim count.

    Parameters
    ----------
    bene_df : DataFrame
        Beneficiary roster; must contain `BENE_ID`.
    claims_df : DataFrame
        Claim rows; must contain `BENE_ID`, `date_col` and `claim_id_col`.
        The input frame is not modified.
    years : list of int
        Calendar years (taken from `date_col`) to keep.
    date_col : str
        Column parsed with `pd.to_datetime` and used for the year filter.
    year_col : str
        Name of the helper column that receives the extracted year.
    claim_id_col : str
        Column whose distinct values are counted per beneficiary.
    upper_quantile : float or None
        If given, only rows with `tot_claims` strictly below this quantile
        of `tot_claims` are kept. `None` disables the outlier trim.

    Returns
    -------
    DataFrame
        One row per `BENE_ID` with a `tot_claims` column. The remaining
        columns hold the first non-null value seen for that beneficiary.
    """
    # Parse the date once, then derive the year from it.
    claim_dates = pd.to_datetime(claims_df[date_col])
    in_scope = claims_df.assign(**{date_col: claim_dates, year_col: claim_dates.dt.year})
    in_scope = in_scope[in_scope[year_col].isin(years)]

    # Left join keeps beneficiaries with no claims in `years`; their claim id
    # is missing, so their distinct-claim count comes out as 0.
    merged = pd.merge(bene_df, in_scope, on='BENE_ID', how='left')

    # Count distinct claims per beneficiary (broadcast back onto every row).
    merged['tot_claims'] = merged.groupby(['BENE_ID'])[claim_id_col].transform('nunique')

    # Collapse to a single row per beneficiary and restore BENE_ID as a column.
    per_bene = merged.groupby('BENE_ID').first().reset_index()

    if upper_quantile is not None:
        # NOTE(review): the comparison is strict, so beneficiaries whose count
        # equals the quantile value are dropped as well. If the quantile lands
        # on the smallest count (e.g. 0), nothing survives — confirm this is
        # the intended cutoff.
        cutoff = per_bene['tot_claims'].quantile(upper_quantile)
        per_bene = per_bene[per_bene['tot_claims'] < cutoff]

    return per_bene


# NOTE: each synthetic_*_df name is rebound from raw claim rows to the
# per-beneficiary summary. Re-running this cell without re-running the load
# cell would aggregate the summaries again and produce wrong counts.

# evaluate 2021 only
values = [2021]

# CARRIER (drop counts at or above the 98th percentile)
synthetic_carrier_df = _claims_per_beneficiary(
    synthetic_bene_df, synthetic_carrier_df, values, upper_quantile=.98)

# INPATIENT (99th percentile)
synthetic_inpatient_df = _claims_per_beneficiary(
    synthetic_bene_df, synthetic_inpatient_df, values, upper_quantile=.99)

# OUTPATIENT (98th percentile)
synthetic_outpatient_df = _claims_per_beneficiary(
    synthetic_bene_df, synthetic_outpatient_df, values, upper_quantile=.98)

# DME (no outlier trim)
synthetic_dme_df = _claims_per_beneficiary(
    synthetic_bene_df, synthetic_dme_df, values, upper_quantile=None)

# SNF (no outlier trim)
synthetic_snf_df = _claims_per_beneficiary(
    synthetic_bene_df, synthetic_snf_df, values, upper_quantile=None)

# HOSPICE (no outlier trim)
synthetic_hospice_df = _claims_per_beneficiary(
    synthetic_bene_df, synthetic_hospice_df, values, upper_quantile=None)

# HHA (99.5th percentile)
synthetic_hha_df = _claims_per_beneficiary(
    synthetic_bene_df, synthetic_hha_df, values, upper_quantile=.995)

# PDE: filtered on PD_DT and counted by CLM_GRP_ID (99th percentile)
synthetic_pde_df = _claims_per_beneficiary(
    synthetic_bene_df, synthetic_pde_df, values,
    date_col='PD_DT', year_col='PD_YR', claim_id_col='CLM_GRP_ID',
    upper_quantile=.99)

# Close the current matplotlib figure, if any.
plt.close()

# Unique claims per beneficiary distributions

In [4]:
# Panels in row-major order of the 2 x 4 grid: (subplot title, per-beneficiary frame).
GRID_ROWS, GRID_COLS = 2, 4
panels = [
    ("Carrier", synthetic_carrier_df),
    ("Inpatient", synthetic_inpatient_df),
    ("Outpatient", synthetic_outpatient_df),
    ("DME", synthetic_dme_df),
    ("SNF", synthetic_snf_df),
    ("Hospice", synthetic_hospice_df),
    ("HHA", synthetic_hha_df),
    ("PDE", synthetic_pde_df),
]

# establish subplot axes
fig = make_subplots(
    rows=GRID_ROWS,
    cols=GRID_COLS,
    horizontal_spacing=0.07,
    vertical_spacing=0.2,
    subplot_titles=tuple(title for title, _ in panels),
)

for position, (_, panel_df) in enumerate(panels):
    # Convert the flat panel index into 1-based (row, col) grid coordinates.
    zero_row, zero_col = divmod(position, GRID_COLS)
    grid_row, grid_col = zero_row + 1, zero_col + 1

    # add_trace with row/col replaces the deprecated append_trace.
    fig.add_trace(
        go.Histogram(
            histnorm='percent',  # bar height = share of beneficiaries, in percent
            hoverlabel=dict(namelength=0),
            marker_color='mediumseagreen',
            x=panel_df['tot_claims'],
        ),
        row=grid_row,
        col=grid_col,
    )
    fig.update_xaxes(title_text="Total Claims", row=grid_row, col=grid_col)
    fig.update_yaxes(title_text="Beneficiaries (%)", row=grid_row, col=grid_col)

# Without row/col these apply to every subplot, so one call each is enough.
fig.update_yaxes(range=[0, 100])
# NOTE(review): the same 0-30 x-range is forced on all panels; counts above 30
# fall outside the initial view — confirm this is intended for every claim type.
fig.update_xaxes(range=[0, 30])

# update plot layout; the call returns the figure, which renders as the cell output
fig.update_layout(title_text="2021 Percentage of Synthetic Claims per Beneficiary", bargap=0.30, width=1500, height=800, showlegend=False)