In [279]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split

from pgmpy.estimators import HillClimbSearch, ExhaustiveSearch, BayesianEstimator
from pgmpy.estimators import ConstraintBasedEstimator, K2Score, BicScore, BDeuScore
from pgmpy.estimators import MaximumLikelihoodEstimator

from pgmpy.models import BayesianModel


In [287]:
septicemia2017 = pd.read_csv('../data/sparcs/septicemia/summary_2017.csv')
septicemia2017.head()

Unnamed: 0,apr_drg_code,facility_name,payment_typology_1,apr_severity_of_illness,patients,mean_length_of_stay,mean_total_charges_day,System,area_sqmi,Number of Beds
0,720,Bellevue Hospital Center,Blue Cross/Blue Shield,Major,4,14.0,6541.6675,Health+,0.77,912
1,720,Bellevue Hospital Center,Blue Cross/Blue Shield,Minor,1,3.0,6103.62,Health+,0.77,912
2,720,Bellevue Hospital Center,Blue Cross/Blue Shield,Moderate,7,4.857143,6945.73,Health+,0.77,912
3,720,Bellevue Hospital Center,Medicare,Extreme,45,13.133333,7090.089778,Health+,0.77,912
4,720,Bellevue Hospital Center,Medicare,Major,76,10.039474,6249.888289,Health+,0.77,912


In [310]:
def flatten_categories(df, col):
    for category in df[col].unique():
        df['is_' + str(category)] = df.apply(lambda x: 1 if x[col] == category else 0, axis=1)
    return df

septicemia2017 = flatten_categories(septicemia2017, 'payment_typology_1')
septicemia2017 = flatten_categories(septicemia2017, 'apr_severity_of_illness')
# septicemia2017 = flatten_categories(septicemia2017, 'facility_name')
septicemia2017 = flatten_categories(septicemia2017, 'System')

In [311]:
septicemia2017.columns

Index(['apr_drg_code', 'facility_name', 'payment_typology_1',
       'apr_severity_of_illness', 'patients', 'mean_length_of_stay',
       'mean_total_charges_day', 'System', 'area_sqmi', 'Number of Beds',
       'is_Blue Cross/Blue Shield', 'is_Medicare',
       'is_Private Health Insurance', 'is_Self-Pay', 'is_Major', 'is_Minor',
       'is_Moderate', 'is_Extreme', 'is_Bellevue Hospital Center',
       'is_Bronx-Lebanon Hospital Center - Concourse Division',
       'is_Brookdale Hospital Medical Center',
       'is_Brooklyn Hospital Center - Downtown Campus',
       'is_Calvary Hospital Inc', 'is_Coney Island Hospital',
       'is_Elmhurst Hospital Center', 'is_Flushing Hospital Medical Center',
       'is_Harlem Hospital Center', 'is_Interfaith Medical Center',
       'is_Jacobi Medical Center', 'is_Jamaica Hospital Medical Center',
       'is_Kings County Hospital Center',
       'is_Kingsbrook Jewish Medical Center', 'is_Lenox Hill Hospital',
       'is_Lincoln Medical & Mental Hea

In [313]:
data = septicemia2017[['patients', 'mean_length_of_stay',
                       'mean_total_charges_day', 'Number of Beds',
                       'is_Blue Cross/Blue Shield', 'is_Medicare',
                       'is_Private Health Insurance', 'is_Self-Pay', 
                       'is_Major', 'is_Minor', 'is_Moderate', 'is_Extreme' 
#                        ,'is_Bellevue Hospital Center','is_Bronx-Lebanon Hospital Center - Concourse Division',
#                        'is_Brookdale Hospital Medical Center',
#                        'is_Brooklyn Hospital Center - Downtown Campus',
#                        'is_Calvary Hospital Inc', 'is_Coney Island Hospital',
#                        'is_Elmhurst Hospital Center', 'is_Flushing Hospital Medical Center',
#                        'is_Harlem Hospital Center', 'is_Interfaith Medical Center',
#                        'is_Jacobi Medical Center', 'is_Jamaica Hospital Medical Center',
#                        'is_Kings County Hospital Center',
#                        'is_Kingsbrook Jewish Medical Center', 'is_Lenox Hill Hospital',
#                        'is_Lincoln Medical & Mental Health Center',
#                        'is_Long Island Jewish Forest Hills',
#                        'is_Long Island Jewish Medical Center', 'is_Maimonides Medical Center',
#                        'is_Metropolitan Hospital Center',
#                        'is_Montefiore Med Center - Jack D Weiler Hosp of A Einstein College Div',
#                        'is_Montefiore Medical Center-Wakefield Hospital',
#                        'is_Montefiore Medical Center - Henry & Lucy Moses Div',
#                        'is_Mount Sinai Beth Israel', 'is_Mount Sinai Brooklyn',
#                        'is_Mount Sinai Hospital','is_Mount Sinai Hospital - Mount Sinai Hospital of Queens',
#                        "is_Mount Sinai St. Luke's", 'is_Mount Sinai West','is_New York-Presbyterian/Lower Manhattan Hospital','is_New York Community Hospital of Brooklyn, Inc',
#                        'is_North Central Bronx Hospital', 'is_NYU Langone Hospital-Brooklyn',
#                        'is_NYU Langone Hospitals', 'is_NYU Langone Orthopedic Hospital',
#                        'is_Queens Hospital Center', 'is_Richmond University Medical Center',
#                        'is_SBH Health System', 'is_Staten Island University Hosp-North',
#                        'is_Staten Island University Hosp-South',
#                        'is_University Hospital of Brooklyn',
#                        'is_Woodhull Medical & Mental Health Center',
#                        'is_Wyckoff Heights Medical Center'
                       
                       , 'is_Health+', 'is_Mount Sinai','is_Others', 'is_SUNY', 'is_Northwell', 'is_Montefiore','is_NewYork-Presbyterian', 'is_NYU Langone'                      
                      ]].copy()
data.head()

Unnamed: 0,patients,mean_length_of_stay,mean_total_charges_day,Number of Beds,is_Blue Cross/Blue Shield,is_Medicare,is_Private Health Insurance,is_Self-Pay,is_Major,is_Minor,is_Moderate,is_Extreme,is_Health+,is_Mount Sinai,is_Others,is_SUNY,is_Northwell,is_Montefiore,is_NewYork-Presbyterian,is_NYU Langone
0,4,14.0,6541.6675,912,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
1,1,3.0,6103.62,912,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0
2,7,4.857143,6945.73,912,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0
3,45,13.133333,7090.089778,912,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0
4,76,10.039474,6249.888289,912,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0


In [314]:
for i in data.iloc[:,:]:
    data[i] = pd.cut(data[i], bins=10, labels=False)
data.head()

Unnamed: 0,patients,mean_length_of_stay,mean_total_charges_day,Number of Beds,is_Blue Cross/Blue Shield,is_Medicare,is_Private Health Insurance,is_Self-Pay,is_Major,is_Minor,is_Moderate,is_Extreme,is_Health+,is_Mount Sinai,is_Others,is_SUNY,is_Northwell,is_Montefiore,is_NewYork-Presbyterian,is_NYU Langone
0,0,2,0,7,9,0,0,0,9,0,0,0,9,0,0,0,0,0,0,0
1,0,0,0,7,9,0,0,0,0,9,0,0,9,0,0,0,0,0,0,0
2,0,0,0,7,9,0,0,0,0,0,9,0,9,0,0,0,0,0,0,0
3,0,2,0,7,0,9,0,0,0,0,0,9,9,0,0,0,0,0,0,0
4,1,1,0,7,0,9,0,0,9,0,0,0,9,0,0,0,0,0,0,0


In [315]:
# hc = HillClimbSearch(data, scoring_method = K2Score(data))
# k2_best_model = hc.estimate()
# print(k2_best_model.edges(), "\n")

In [316]:
hc = HillClimbSearch(data, scoring_method = BicScore(data))
bic_best_model = hc.estimate()
print(bic_best_model.edges(), "\n")

[('patients', 'is_Medicare'), ('mean_total_charges_day', 'is_NYU Langone'), ('Number of Beds', 'is_Montefiore'), ('Number of Beds', 'is_Northwell'), ('Number of Beds', 'is_NewYork-Presbyterian'), ('Number of Beds', 'is_Others'), ('is_Blue Cross/Blue Shield', 'is_Private Health Insurance'), ('is_Medicare', 'is_Private Health Insurance'), ('is_Medicare', 'is_Blue Cross/Blue Shield'), ('is_Medicare', 'is_Self-Pay'), ('is_Self-Pay', 'is_Private Health Insurance'), ('is_Self-Pay', 'is_Blue Cross/Blue Shield'), ('is_Major', 'is_Moderate'), ('is_Major', 'mean_length_of_stay'), ('is_Minor', 'is_Moderate'), ('is_Minor', 'is_Major'), ('is_Minor', 'is_Extreme'), ('is_Extreme', 'mean_length_of_stay'), ('is_Extreme', 'is_Moderate'), ('is_Extreme', 'is_Major'), ('is_Health+', 'mean_total_charges_day'), ('is_Health+', 'is_SUNY'), ('is_Mount Sinai', 'Number of Beds'), ('is_Mount Sinai', 'is_Health+'), ('is_Mount Sinai', 'is_SUNY'), ('is_Others', 'is_Health+'), ('is_Others', 'mean_total_charges_day'), 

In [317]:
def LL(x,model,verbose=False):
    loglike = 0
    for cpd in model.get_cpds():
        temp_cpd = cpd.copy()
        thevariable = temp_cpd.variable
        theparents = model.predecessors(thevariable)
        for parent in theparents:
            temp_cpd.reduce([(parent, x[parent])])
#         print(temp_cpd.get_values())
#         print(x[thevariable])
        if x[thevariable] < len(temp_cpd.get_values()): # I added this to stop it from failing
            theprob = temp_cpd.get_values()[x[thevariable],0]
            if verbose:
                print (thevariable,theparents,theprob)
            loglike += np.log(theprob)
    return loglike

In [318]:
model = BayesianModel( bic_best_model.edges() )
model.fit(data, estimator=MaximumLikelihoodEstimator)
exmp = data.apply(lambda x: LL(x, model), axis=1)

In [319]:
exmp2=pd.Series(exmp)
exmp2.index = septicemia2017.index
exmp2.sort_values().head(5)

454   -12.313544
186   -12.194546
228   -11.560066
299   -11.442463
450   -11.180102
dtype: float64

In [320]:
print("Top Anomalies")
septicemia2017.iloc[exmp2.sort_values().head(5).index] #11483.51,4901.12,15590.689779

Top Anomalies


Unnamed: 0,apr_drg_code,facility_name,payment_typology_1,apr_severity_of_illness,patients,mean_length_of_stay,mean_total_charges_day,System,area_sqmi,Number of Beds,...,is_Woodhull Medical & Mental Health Center,is_Wyckoff Heights Medical Center,is_Health+,is_Mount Sinai,is_Others,is_SUNY,is_Northwell,is_Montefiore,is_NewYork-Presbyterian,is_NYU Langone
454,720,NYU Langone Hospitals,Private Health Insurance,Major,111,6.711712,18033.614144,NYU Langone,1.88,844,...,0,0,0,0,0,0,0,0,0,1
186,720,Lenox Hill Hospital,Private Health Insurance,Moderate,82,2.682927,17904.115854,Northwell,1.582,632,...,0,0,0,0,0,0,1,0,0,0
228,720,Long Island Jewish Medical Center,Private Health Insurance,Moderate,87,4.0,12153.605862,Northwell,13.07,1025,...,0,0,0,0,0,0,1,0,0,0
299,720,Montefiore Medical Center - Henry & Lucy Moses...,Self-Pay,Extreme,1,5.0,21072.35,Montefiore,1.721,816,...,0,0,0,0,0,0,0,1,0,0
450,720,NYU Langone Hospitals,Medicare,Major,374,6.550802,15892.865374,NYU Langone,1.88,844,...,0,0,0,0,0,0,0,0,0,1
