In [106]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split

from pgmpy.estimators import HillClimbSearch, ExhaustiveSearch, BayesianEstimator
from pgmpy.estimators import ConstraintBasedEstimator, K2Score, BicScore, BDeuScore
from pgmpy.estimators import MaximumLikelihoodEstimator

from pgmpy.models import BayesianModel


In [267]:
septicemia2017 = pd.read_csv('../data/sparcs/septicemia/summary_2017.csv')
septicemia2017.head()

Unnamed: 0,apr_drg_code,facility_name,race,ethnicity,payment_typology_1,apr_severity_of_illness,age_group,patients,mean_length_of_stay,mean_total_charges_day,System,area_sqmi,Number of Beds
0,720,Bellevue Hospital Center,Black/African American,Not Span/Hispanic,Blue Cross/Blue Shield,Minor,50 to 69,1,3.0,6103.62,Health+,0.77,912
1,720,Bellevue Hospital Center,Black/African American,Not Span/Hispanic,Blue Cross/Blue Shield,Moderate,18 to 29,1,10.0,5558.83,Health+,0.77,912
2,720,Bellevue Hospital Center,Black/African American,Not Span/Hispanic,Blue Cross/Blue Shield,Moderate,50 to 69,1,4.0,6585.12,Health+,0.77,912
3,720,Bellevue Hospital Center,Black/African American,Not Span/Hispanic,Medicare,Extreme,50 to 69,4,7.5,6733.095,Health+,0.77,912
4,720,Bellevue Hospital Center,Black/African American,Not Span/Hispanic,Medicare,Extreme,70 or Older,5,7.4,7639.716,Health+,0.77,912


In [268]:
def flatten_categories(df, col):
    for category in df[col].unique():
        df['is_' + str(category)] = df.apply(lambda x: 1 if x[col] == category else 0, axis=1)
    return df

septicemia2017 = flatten_categories(septicemia2017, 'race')
septicemia2017 = flatten_categories(septicemia2017, 'ethnicity')
septicemia2017 = flatten_categories(septicemia2017, 'payment_typology_1')
septicemia2017 = flatten_categories(septicemia2017, 'apr_severity_of_illness')
septicemia2017 = flatten_categories(septicemia2017, 'age_group')

In [269]:
septicemia2017.columns

Index(['apr_drg_code', 'facility_name', 'race', 'ethnicity',
       'payment_typology_1', 'apr_severity_of_illness', 'age_group',
       'patients', 'mean_length_of_stay', 'mean_total_charges_day', 'System',
       'area_sqmi', 'Number of Beds', 'is_Black/African American',
       'is_Other Race', 'is_White', 'is_Multi-racial', 'is_Not Span/Hispanic',
       'is_Spanish/Hispanic', 'is_Unknown', 'is_Multi-ethnic',
       'is_Blue Cross/Blue Shield', 'is_Medicare',
       'is_Private Health Insurance', 'is_Self-Pay', 'is_Minor', 'is_Moderate',
       'is_Extreme', 'is_Major', 'is_50 to 69', 'is_18 to 29',
       'is_70 or Older', 'is_30 to 49', 'is_0 to 17'],
      dtype='object')

In [270]:
data = septicemia2017[['Number of Beds', 'patients', 'mean_length_of_stay', 'mean_total_charges_day',
                       'is_Black/African American','is_Other Race', 'is_White',
                       'is_Blue Cross/Blue Shield', 'is_Medicare','is_Private Health Insurance', 'is_Self-Pay', 
                       'is_Minor', 'is_Moderate','is_Extreme', 'is_Major'
                      ]].copy()
data.head()

Unnamed: 0,Number of Beds,patients,mean_length_of_stay,mean_total_charges_day,is_Black/African American,is_Other Race,is_White,is_Blue Cross/Blue Shield,is_Medicare,is_Private Health Insurance,is_Self-Pay,is_Minor,is_Moderate,is_Extreme,is_Major
0,912,1,3.0,6103.62,1,0,0,1,0,0,0,1,0,0,0
1,912,1,10.0,5558.83,1,0,0,1,0,0,0,0,1,0,0
2,912,1,4.0,6585.12,1,0,0,1,0,0,0,0,1,0,0
3,912,4,7.5,6733.095,1,0,0,0,1,0,0,0,0,1,0
4,912,5,7.4,7639.716,1,0,0,0,1,0,0,0,0,1,0


In [271]:
for i in data.iloc[:,:]:
    data[i] = pd.cut(data[i], bins=10, labels=False)
data.head()

Unnamed: 0,Number of Beds,patients,mean_length_of_stay,mean_total_charges_day,is_Black/African American,is_Other Race,is_White,is_Blue Cross/Blue Shield,is_Medicare,is_Private Health Insurance,is_Self-Pay,is_Minor,is_Moderate,is_Extreme,is_Major
0,7,0,0,0,9,0,0,9,0,0,0,9,0,0,0
1,7,0,1,0,9,0,0,9,0,0,0,0,9,0,0
2,7,0,0,1,9,0,0,9,0,0,0,0,9,0,0
3,7,0,0,1,9,0,0,0,9,0,0,0,0,9,0
4,7,0,0,1,9,0,0,0,9,0,0,0,0,9,0


In [273]:
# hc = HillClimbSearch(data, scoring_method = K2Score(data))
# k2_best_model = hc.estimate()
# print(k2_best_model.edges(), "\n")

In [272]:
hc = HillClimbSearch(data, scoring_method = BicScore(data))
bic_best_model = hc.estimate()
print(bic_best_model.edges(), "\n")

[('Number of Beds', 'mean_total_charges_day'), ('Number of Beds', 'is_Black/African American'), ('mean_length_of_stay', 'is_Extreme'), ('is_Black/African American', 'is_White'), ('is_Black/African American', 'is_Other Race'), ('is_Other Race', 'is_White'), ('is_White', 'is_Blue Cross/Blue Shield'), ('is_Blue Cross/Blue Shield', 'is_Private Health Insurance'), ('is_Blue Cross/Blue Shield', 'is_Medicare'), ('is_Blue Cross/Blue Shield', 'is_Self-Pay'), ('is_Medicare', 'is_Private Health Insurance'), ('is_Medicare', 'patients'), ('is_Medicare', 'mean_length_of_stay'), ('is_Private Health Insurance', 'is_Minor'), ('is_Self-Pay', 'is_Private Health Insurance'), ('is_Self-Pay', 'is_Medicare'), ('is_Minor', 'is_Moderate'), ('is_Minor', 'is_Major'), ('is_Extreme', 'is_Moderate'), ('is_Extreme', 'is_Major'), ('is_Extreme', 'is_Minor'), ('is_Major', 'is_Moderate')] 



In [233]:
def LL(x,model,verbose=False):
    loglike = 0
    for cpd in model.get_cpds():
        temp_cpd = cpd.copy()
        thevariable = temp_cpd.variable
        theparents = model.predecessors(thevariable)
        for parent in theparents:
            temp_cpd.reduce([(parent, x[parent])])
#         print(temp_cpd.get_values())
#         print(x[thevariable])
        if x[thevariable] < len(temp_cpd.get_values()): # I added this to stop it from failing
            theprob = temp_cpd.get_values()[x[thevariable],0]
            if verbose:
                print (thevariable,theparents,theprob)
            loglike += np.log(theprob)
    return loglike

In [276]:
model = BayesianModel( bic_best_model.edges() )
model.fit(data, estimator=MaximumLikelihoodEstimator)
exmp = data.apply(lambda x: LL(x, model), axis=1)

In [277]:
exmp2=pd.Series(exmp)
exmp2.index = septicemia2017.index
exmp2.sort_values().head(5)

1829   -13.921009
3083   -13.819834
3086   -13.093038
1387   -13.002910
435    -12.419732
dtype: float64

In [278]:
print("Top Anomalies")
septicemia2017.iloc[exmp2.sort_values().head(5).index] #11483.51,4901.12,15590.689779

Top Anomalies


Unnamed: 0,apr_drg_code,facility_name,race,ethnicity,payment_typology_1,apr_severity_of_illness,age_group,patients,mean_length_of_stay,mean_total_charges_day,...,is_Self-Pay,is_Minor,is_Moderate,is_Extreme,is_Major,is_50 to 69,is_18 to 29,is_70 or Older,is_30 to 49,is_0 to 17
1829,720,Montefiore Medical Center - Henry & Lucy Moses...,White,Unknown,Medicare,Extreme,50 to 69,1,39.0,23939.23,...,0,0,0,1,0,1,0,0,0,0
3083,720,NYU Langone Hospitals,White,Not Span/Hispanic,Medicare,Extreme,70 or Older,153,9.294118,18346.925752,...,0,0,0,1,0,0,0,1,0,0
3086,720,NYU Langone Hospitals,White,Not Span/Hispanic,Medicare,Major,70 or Older,226,6.575221,15590.689779,...,0,0,0,0,1,0,0,1,0,0
1387,720,Maimonides Medical Center,White,Not Span/Hispanic,Medicare,Extreme,70 or Older,352,10.900568,11483.509915,...,0,0,0,1,0,0,0,1,0,0
435,720,Elmhurst Hospital Center,Other Race,Unknown,Medicare,Major,50 to 69,2,37.5,6363.895,...,0,0,0,0,1,1,0,0,0,0
