# Overview
The purpose of this analysis is to compare counts of claim lines per claim by service type. <br>
Visit Data.gov to download the data used for this analysis.

# Load libraries

In [1]:
# Data handling and plotting libraries used throughout the notebook.
import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
# Make 'iframe' the default plotly renderer for figures shown in this notebook.
pio.renderers.default='iframe'
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import matplotlib
import matplotlib.pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to all matplotlib figures.
plt.style.use('ggplot')
from matplotlib.pyplot import figure
from matplotlib import rcParams
# Default matplotlib figure size (width, height).
matplotlib.rcParams['figure.figsize'] = (12,8)

from IPython.display import display, HTML
# Inject CSS that sets the width of the notebook's `.container` element to 90%.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Load data

In [2]:
# Read the pipe-delimited extracts, loading only the columns this analysis uses.
# Identifier and date columns are read as strings; dates are parsed later.
# The same dtype map is passed for the beneficiary file even though only
# BENE_ID is loaded from it.
id_dtypes = {'CLM_ID': 'str', 'BENE_ID': 'str', 'CLM_FROM_DT': 'str'}
claim_columns = ['CLM_ID', 'BENE_ID', 'CLM_FROM_DT']
read_options = dict(sep="|", dtype=id_dtypes, low_memory=False)

synthetic_bene_df = pd.read_csv('data/beneficiary_2022.csv', usecols=['BENE_ID'], **read_options)
synthetic_inpatient_df = pd.read_csv('data/inpatient.csv', usecols=claim_columns, **read_options)
synthetic_outpatient_df = pd.read_csv('data/outpatient.csv', usecols=claim_columns, **read_options)
synthetic_carrier_df = pd.read_csv('data/carrier.csv', usecols=claim_columns, **read_options)
synthetic_dme_df = pd.read_csv('data/dme.csv', usecols=claim_columns, **read_options)
synthetic_hha_df = pd.read_csv('data/hha.csv', usecols=claim_columns, **read_options)
synthetic_hospice_df = pd.read_csv('data/hospice.csv', usecols=claim_columns, **read_options)
synthetic_snf_df = pd.read_csv('data/snf.csv', usecols=claim_columns, **read_options)

# Report (rows, columns) for every frame that was just loaded.
loaded_frames = [
    ('beneficiary', synthetic_bene_df),
    ('inpatient', synthetic_inpatient_df),
    ('outpatient', synthetic_outpatient_df),
    ('carrier', synthetic_carrier_df),
    ('dme', synthetic_dme_df),
    ('hha', synthetic_hha_df),
    ('hospice', synthetic_hospice_df),
    ('snf', synthetic_snf_df),
]
for label, frame in loaded_frames:
    print(f'synthetic {label} {frame.shape}')

synthetic beneficiary (8671, 1)
synthetic inpatient (58066, 3)
synthetic outpatient (575092, 3)
synthetic carrier (1121004, 3)
synthetic dme (103828, 3)
synthetic hha (6215, 3)
synthetic hospice (12107, 3)
synthetic snf (12548, 3)


# Synthetic number of claim lines per claim

## Filter synthetic data

In [3]:
# Claim-from years kept in the analysis, and the quantile used to trim extreme claims.
CLAIM_YEARS = (2022,)
OUTLIER_QUANTILE = .999


def count_claim_lines(claims_df, bene_df, years=CLAIM_YEARS, trim_quantile=OUTLIER_QUANTILE):
    """Count the rows (claim lines) recorded for each claim.

    Parameters
    ----------
    claims_df : pandas.DataFrame
        Claim rows with 'CLM_ID', 'BENE_ID' and 'CLM_FROM_DT' columns.
        The frame is not modified.
    bene_df : pandas.DataFrame
        Frame whose 'BENE_ID' column lists the beneficiaries to keep.
    years : iterable of int
        Only rows whose CLM_FROM_DT year is in this collection are kept.
    trim_quantile : float or None
        When set, claims whose line count is not strictly below this quantile
        of all line counts are dropped. Pass None to skip trimming.

    Returns
    -------
    pandas.DataFrame
        One row per CLM_ID with columns 'CLM_ID' and 'tot_claim_lines'.
    """
    claim_year = pd.to_datetime(claims_df['CLM_FROM_DT']).dt.year
    # Restrict to the beneficiary cohort with a membership test instead of a
    # merge: a merge adds empty rows for beneficiaries without claims and
    # would multiply claim rows if a BENE_ID appeared more than once.
    in_scope = claim_year.isin(list(years)) & claims_df['BENE_ID'].isin(bene_df['BENE_ID'])
    line_counts = (
        claims_df.loc[in_scope]
        .groupby('CLM_ID')
        .size()
        .reset_index(name='tot_claim_lines')
    )
    if trim_quantile is not None:
        cutoff = line_counts['tot_claim_lines'].quantile(trim_quantile)
        # Strict inequality: claims at or above the cutoff are removed.
        line_counts = line_counts[line_counts['tot_claim_lines'] < cutoff]
    return line_counts


# NOTE: each name below is rebound from the raw claim rows to the per-claim
# line counts, so the load cell must be re-run before running this cell again.

# CARRIER
synthetic_carrier_df = count_claim_lines(synthetic_carrier_df, synthetic_bene_df)

# INPATIENT
synthetic_inpatient_df = count_claim_lines(synthetic_inpatient_df, synthetic_bene_df)

# OUTPATIENT
synthetic_outpatient_df = count_claim_lines(synthetic_outpatient_df, synthetic_bene_df)

# DME (outlier trimming is disabled for this claim type)
synthetic_dme_df = count_claim_lines(synthetic_dme_df, synthetic_bene_df, trim_quantile=None)

# SNF
synthetic_snf_df = count_claim_lines(synthetic_snf_df, synthetic_bene_df)

# HOSPICE
synthetic_hospice_df = count_claim_lines(synthetic_hospice_df, synthetic_bene_df)

# HHA (outlier trimming is disabled for this claim type)
synthetic_hha_df = count_claim_lines(synthetic_hha_df, synthetic_bene_df, trim_quantile=None)

## Create plots

In [4]:
# One histogram panel per claim type, placed left-to-right, top-to-bottom
# on a 2 x 4 grid (the eighth cell stays empty).
N_PLOT_COLS = 4
panels = [
    ("Carrier", synthetic_carrier_df),
    ("Inpatient", synthetic_inpatient_df),
    ("Outpatient", synthetic_outpatient_df),
    ("DME", synthetic_dme_df),
    ("SNF", synthetic_snf_df),
    ("Hospice", synthetic_hospice_df),
    ("HHA", synthetic_hha_df),
]

# establish subplot axes
fig = make_subplots(
    rows=2,
    cols=N_PLOT_COLS,
    horizontal_spacing=0.07,
    vertical_spacing=0.2,
    subplot_titles=[panel_title for panel_title, _ in panels],
)

for position, (_, claim_lines_df) in enumerate(panels):
    # Convert the running position into 1-based grid coordinates.
    grid_row, grid_col = divmod(position, N_PLOT_COLS)
    grid_row, grid_col = grid_row + 1, grid_col + 1

    # add_trace(row=, col=) replaces the deprecated append_trace.
    fig.add_trace(go.Histogram(
        histnorm='percent',
        hoverlabel=dict(namelength=0),
        marker_color='mediumseagreen',
        x=claim_lines_df['tot_claim_lines'],
    ), row=grid_row, col=grid_col)

    fig.update_xaxes(title_text="Total Claim Lines", row=grid_row, col=grid_col)
    fig.update_yaxes(title_text="Claims (%)", row=grid_row, col=grid_col)

# Without row/col this applies to every y-axis, so one call fixes all panels to 0-100 %.
fig.update_yaxes(range=[0, 100])

# update plot layout (kept as the last expression so the figure is displayed)
fig.update_layout(title_text="2022 Percentage of Synthetic Claim Lines per Claim", bargap=0.30, width=1100, height=800, showlegend=False)

In [5]:
# Print the mean number of claim lines per claim for each service type.
claim_line_frames = {
    'carrier': synthetic_carrier_df,
    'inpatient': synthetic_inpatient_df,
    'outpatient': synthetic_outpatient_df,
    'dme': synthetic_dme_df,
    'snf': synthetic_snf_df,
    'hospice': synthetic_hospice_df,
    'hha': synthetic_hha_df,
}
for service_type, claim_lines_df in claim_line_frames.items():
    print(f"Mean synthetic {service_type} claim lines: {claim_lines_df['tot_claim_lines'].mean()}")

Mean synthetic carrier claim lines: 12.182962652119178
Mean synthetic inpatient claim lines: 3.0274645851402138
Mean synthetic outpatient claim lines: 1.3996940923477654
Mean synthetic dme claim lines: 2.695109078114004
Mean synthetic snf claim lines: 7.713675213675214
Mean synthetic hospice claim lines: 11.161073825503356
Mean synthetic hha claim lines: 12.65625
