In [352]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split

from pgmpy.estimators import HillClimbSearch, ExhaustiveSearch, BayesianEstimator
from pgmpy.estimators import ConstraintBasedEstimator, K2Score, BicScore, BDeuScore
from pgmpy.estimators import MaximumLikelihoodEstimator

from pgmpy.models import BayesianModel


In [356]:
septicemia2017 = pd.read_csv('../data/sparcs/septicemia/summary_2017.csv')
septicemia2017.head()

Unnamed: 0,apr_drg_code,facility_name,payment_typology_1,apr_severity_of_illness,patients,mean_length_of_stay,mean_total_charges_day,System,area_sqmi,Number of Beds
0,720,Bellevue Hospital Center,Blue Cross/Blue Shield,Major,4,14.0,6541.6675,Health+,0.77,912
1,720,Bellevue Hospital Center,Blue Cross/Blue Shield,Minor,1,3.0,6103.62,Health+,0.77,912
2,720,Bellevue Hospital Center,Blue Cross/Blue Shield,Moderate,7,4.857143,6945.73,Health+,0.77,912
3,720,Bellevue Hospital Center,Medicare,Extreme,45,13.133333,7090.089778,Health+,0.77,912
4,720,Bellevue Hospital Center,Medicare,Major,76,10.039474,6249.888289,Health+,0.77,912


In [370]:
heart_fail2017 = pd.read_csv('../data/sparcs/heart_failure/summary_2017.csv')
heart_fail2017.head()

Unnamed: 0,apr_drg_code,facility_name,payment_typology_1,apr_severity_of_illness,patients,mean_length_of_stay,mean_total_charges_day,System,area_sqmi,Number of Beds
0,194,Bellevue Hospital Center,Blue Cross/Blue Shield,Major,1,4.0,7072.22,Health+,0.77,912
1,194,Bellevue Hospital Center,Blue Cross/Blue Shield,Moderate,3,4.666667,6670.496667,Health+,0.77,912
2,194,Bellevue Hospital Center,Medicare,Extreme,11,10.909091,6845.906364,Health+,0.77,912
3,194,Bellevue Hospital Center,Medicare,Major,54,10.518519,5975.812407,Health+,0.77,912
4,194,Bellevue Hospital Center,Medicare,Minor,14,3.214286,7439.227857,Health+,0.77,912


In [380]:
def flatten_categories(df, col):
    new_cols = []
    for category in df[col].unique():
        new_cols.append('is_' + str(category))
        df['is_' + str(category)] = df.apply(lambda x: 1 if x[col] == category else 0, axis=1)
    return (df,new_cols)

def prep_data(df):
    df, payment_cols = flatten_categories(df, 'payment_typology_1')
    df, severe_cols  = flatten_categories(df, 'apr_severity_of_illness')

    cols = ['patients', 'mean_length_of_stay','mean_total_charges_day']
    data = df[ cols + payment_cols + severe_cols].copy()
    return data

In [None]:
data_s17 = prep_data(septicemia2017)
data_s17.head()

In [371]:
data_hf17 = prep_data(heart_fail2017) 
data_hf17.head()

Unnamed: 0,patients,mean_length_of_stay,mean_total_charges_day,is_Blue Cross/Blue Shield,is_Medicare,is_Private Health Insurance,is_Self-Pay,is_Major,is_Moderate,is_Extreme,is_Minor
0,1,4.0,7072.22,1,0,0,0,1,0,0,0
1,3,4.666667,6670.496667,1,0,0,0,0,1,0,0
2,11,10.909091,6845.906364,0,1,0,0,0,0,1,0
3,54,10.518519,5975.812407,0,1,0,0,1,0,0,0
4,14,3.214286,7439.227857,0,1,0,0,0,0,0,1


In [363]:
def LL(x,model,verbose=False):
    loglike = 0
    for cpd in model.get_cpds():
        temp_cpd = cpd.copy()
        thevariable = temp_cpd.variable
        theparents = model.predecessors(thevariable)
        for parent in theparents:
            temp_cpd.reduce([(parent, x[parent])])
        if x[thevariable] < len(temp_cpd.get_values()): # I added this to stop it from failing
            theprob = temp_cpd.get_values()[x[thevariable],0]
            if verbose:
                print (thevariable,theparents,theprob)
            loglike += np.log(theprob)
    return loglike

In [384]:
def get_anomaly_ranks(data):
    data = prep_data(data)

    for i in data.iloc[:,:]:
        data[i] = pd.cut(data[i], bins=10, labels=False)

    hc = HillClimbSearch(data, scoring_method = BicScore(data))
    bic_best_model = hc.estimate()
    print("Edges: ", bic_best_model.edges(), "\n")

    model = BayesianModel( bic_best_model.edges() )
    model.fit(data, estimator=MaximumLikelihoodEstimator)
    exmp = data.apply(lambda x: LL(x, model), axis=1)
    exmp2=pd.Series(exmp)
    exmp2.index = data.index
    return exmp2

In [385]:
anomaly_ranks = get_anomaly_ranks(septicemia2017)

Edges:  [('is_Blue Cross/Blue Shield', 'is_Medicare'), ('is_Blue Cross/Blue Shield', 'is_Private Health Insurance'), ('is_Medicare', 'patients'), ('is_Private Health Insurance', 'is_Medicare'), ('is_Self-Pay', 'is_Medicare'), ('is_Self-Pay', 'is_Private Health Insurance'), ('is_Self-Pay', 'is_Blue Cross/Blue Shield'), ('is_Major', 'is_Moderate'), ('is_Major', 'mean_length_of_stay'), ('is_Minor', 'is_Moderate'), ('is_Minor', 'is_Major'), ('is_Minor', 'is_Extreme'), ('is_Extreme', 'mean_length_of_stay'), ('is_Extreme', 'is_Moderate'), ('is_Extreme', 'is_Major')] 



In [387]:
print("Top Anomalies")
septicemia2017.iloc[anomaly_ranks.sort_values().head(5).index]

# print("Least Anomalous")
# septicemia2017.iloc[exmp2.sort_values(ascending=False).head(5).index] 

Top Anomalies


Unnamed: 0,apr_drg_code,facility_name,payment_typology_1,apr_severity_of_illness,patients,mean_length_of_stay,mean_total_charges_day,System,area_sqmi,Number of Beds,...,is_Moderate,is_Extreme,is_Health+,is_Mount Sinai,is_Others,is_SUNY,is_Northwell,is_Montefiore,is_NewYork-Presbyterian,is_NYU Langone
264,720,Montefiore Med Center - Jack D Weiler Hosp of ...,Medicare,Major,426,8.410798,12862.37716,Montefiore,0.449,421,...,0,0,0,0,0,0,0,1,0,0
228,720,Long Island Jewish Medical Center,Private Health Insurance,Moderate,87,4.0,12153.605862,Northwell,13.07,1025,...,1,0,0,0,0,0,1,0,0,0
186,720,Lenox Hill Hospital,Private Health Insurance,Moderate,82,2.682927,17904.115854,Northwell,1.582,632,...,1,0,0,0,0,0,1,0,0,0
60,720,Coney Island Hospital,Medicare,Major,169,11.266272,6650.69497,Health+,9.415,371,...,0,0,1,0,0,0,0,0,0,0
335,720,Mount Sinai Hospital,Medicare,Major,460,7.363043,10700.668174,Mount Sinai,1.116,1134,...,0,0,0,1,0,0,0,0,0,0


In [393]:
results = pd.DataFrame()
for year in [2015,2016,2017]:
    print("Year: ", year)
    septicemia = pd.read_csv('../data/sparcs/septicemia/summary_{}.csv'.format(year))
    anomaly_ranks = get_anomaly_ranks(septicemia)
    
    top = (septicemia.iloc[anomaly_ranks.sort_values().head(5).index]).copy()
    top['HowAnomalous'] = "Most"
    top['year'] = year
    
    bottom = (septicemia.iloc[exmp2.sort_values(ascending=False).head(5).index]).copy()
    bottom['HowAnomalous'] = 'Least'
    bottom['year'] = year
    
    results = results.append(top)
    results = results.append(bottom)

Year:  2015
Edges:  [('is_Blue Cross/Blue Shield', 'is_Private Health Insurance'), ('is_Medicare', 'patients'), ('is_Medicare', 'is_Private Health Insurance'), ('is_Medicare', 'is_Blue Cross/Blue Shield'), ('is_Medicare', 'is_Self-Pay'), ('is_Self-Pay', 'is_Private Health Insurance'), ('is_Self-Pay', 'is_Blue Cross/Blue Shield'), ('is_Major', 'is_Moderate'), ('is_Extreme', 'mean_length_of_stay'), ('is_Extreme', 'is_Moderate'), ('is_Extreme', 'is_Major'), ('is_Extreme', 'is_Minor'), ('is_Extreme', 'mean_total_charges_day'), ('is_Minor', 'is_Moderate'), ('is_Minor', 'is_Major')] 

Year:  2016
Edges:  [('mean_length_of_stay', 'is_Extreme'), ('is_Blue Cross/Blue Shield', 'is_Private Health Insurance'), ('is_Blue Cross/Blue Shield', 'is_Medicare'), ('is_Blue Cross/Blue Shield', 'is_Self-Pay'), ('is_Medicare', 'patients'), ('is_Medicare', 'is_Private Health Insurance'), ('is_Self-Pay', 'is_Private Health Insurance'), ('is_Self-Pay', 'is_Medicare'), ('is_Extreme', 'is_Major'), ('is_Extreme', 

In [394]:
results[results['HowAnomalous'] == 'Most']

Unnamed: 0,apr_drg_code,facility_name,payment_typology_1,apr_severity_of_illness,patients,mean_length_of_stay,mean_total_charges_day,System,area_sqmi,Number of Beds,is_Blue Cross/Blue Shield,is_Medicare,is_Private Health Insurance,is_Self-Pay,is_Major,is_Moderate,is_Extreme,is_Minor,HowAnomalous,year
444,720,NYU Hospitals Center,Private Health Insurance,Extreme,89,10.247191,18227.42135,NYU Langone,1.88,844,0,0,1,0,0,0,1,0,Most,2015
445,720,NYU Hospitals Center,Private Health Insurance,Major,91,6.252747,13007.3011,NYU Langone,1.88,844,0,0,1,0,1,0,0,0,Most,2015
87,720,Flushing Hospital Medical Center,Medicare,Major,187,10.235294,4082.647059,Others,14.16,299,0,1,0,0,1,0,0,0,Most,2015
49,720,Calvary Hospital Inc,Medicare,Extreme,3,27.0,2195.803333,Others,4.495,200,0,1,0,0,0,0,1,0,Most,2015
52,720,Calvary Hospital Inc,Private Health Insurance,Moderate,1,14.0,2117.36,Others,4.495,200,0,0,1,0,0,1,0,0,Most,2015
442,720,NYU Hospitals Center,Private Health Insurance,Minor,87,8.275862,15861.01966,NYU Langone,1.88,844,0,0,1,0,0,0,0,1,Most,2016
441,720,NYU Hospitals Center,Private Health Insurance,Major,75,5.24,13888.134,NYU Langone,1.88,844,0,0,1,0,1,0,0,0,Most,2016
85,720,Flushing Hospital Medical Center,Medicare,Major,167,11.305389,4033.157186,Others,14.16,299,0,1,0,0,1,0,0,0,Most,2016
444,720,NYU Hospitals Center,Self-Pay,Extreme,4,23.0,24503.615,NYU Langone,1.88,844,0,0,0,1,0,0,1,0,Most,2016
106,720,Harlem Hospital Center,Self-Pay,Major,3,39.666667,6112.89,Health+,1.517,282,0,0,0,1,1,0,0,0,Most,2016


In [399]:
results_heart = pd.DataFrame()
for year in [2015,2016,2017]:
    print("Year: ", year)
    heart = pd.read_csv('../data/sparcs/heart_failure/summary_{}.csv'.format(year))
    anomaly_ranks = get_anomaly_ranks(heart)
    
    top = (heart.iloc[anomaly_ranks.sort_values().head(5).index]).copy()
    top['HowAnomalous'] = "Most"
    top['year'] = year
    
    bottom = (heart.iloc[exmp2.sort_values(ascending=False).head(5).index]).copy()
    bottom['HowAnomalous'] = 'Least'
    bottom['year'] = year
    
    results_heart = results_heart.append(top)
    results_heart = results_heart.append(bottom)

Year:  2015
Edges:  [('mean_length_of_stay', 'is_Extreme'), ('mean_length_of_stay', 'is_Major'), ('is_Blue Cross/Blue Shield', 'is_Private Health Insurance'), ('is_Medicare', 'patients'), ('is_Medicare', 'is_Private Health Insurance'), ('is_Medicare', 'is_Blue Cross/Blue Shield'), ('is_Medicare', 'is_Self-Pay'), ('is_Self-Pay', 'is_Private Health Insurance'), ('is_Self-Pay', 'is_Blue Cross/Blue Shield'), ('is_Extreme', 'is_Moderate'), ('is_Extreme', 'is_Minor'), ('is_Extreme', 'is_Major'), ('is_Major', 'is_Moderate'), ('is_Major', 'is_Minor'), ('is_Minor', 'is_Moderate')] 

Year:  2016
Edges:  [('mean_length_of_stay', 'is_Extreme'), ('mean_length_of_stay', 'is_Minor'), ('is_Blue Cross/Blue Shield', 'is_Private Health Insurance'), ('is_Blue Cross/Blue Shield', 'is_Medicare'), ('is_Blue Cross/Blue Shield', 'is_Self-Pay'), ('is_Medicare', 'patients'), ('is_Medicare', 'is_Private Health Insurance'), ('is_Self-Pay', 'is_Private Health Insurance'), ('is_Self-Pay', 'is_Medicare'), ('is_Extrem

  from ipykernel import kernelapp as app


In [402]:
results_heart[results_heart['HowAnomalous'] == 'Least']

Unnamed: 0,apr_drg_code,facility_name,payment_typology_1,apr_severity_of_illness,patients,mean_length_of_stay,mean_total_charges_day,System,area_sqmi,Number of Beds,is_Blue Cross/Blue Shield,is_Medicare,is_Private Health Insurance,is_Self-Pay,is_Extreme,is_Major,is_Minor,is_Moderate,HowAnomalous,year
389,194,North Central Bronx Hospital,Self-Pay,Major,1,7.0,6597.87,Health+,3.22,213,0,0,0,1,0,1,0,0,Least,2015
443,194,SBH Health System,Blue Cross/Blue Shield,Major,1,13.0,4952.37,Others,3.596,422,1,0,0,0,0,1,0,0,Least,2015
247,194,Metropolitan Hospital Center,Self-Pay,Minor,1,2.0,9759.89,Health+,1.074,338,0,0,0,1,0,0,1,0,Least,2015
478,194,University Hospital of Brooklyn,Medicare,Minor,44,3.045455,5836.652727,SUNY,2.265,342,0,1,0,0,0,0,1,0,Least,2015
119,194,Interfaith Medical Center,Private Health Insurance,Major,5,3.4,5272.648,Others,2.895,287,0,0,1,0,0,1,0,0,Least,2015
389,194,"New York Community Hospital of Brooklyn, Inc",Medicare,Major,105,7.228571,5493.969048,NewYork-Presbyterian,4.632,134,0,1,0,0,0,1,0,0,Least,2016
443,194,Queens Hospital Center,Private Health Insurance,Moderate,7,4.142857,6919.341429,Health+,15.7,269,0,0,1,0,0,0,0,1,Least,2016
247,194,Memorial Hospital for Cancer and Allied Diseases,Private Health Insurance,Minor,1,2.0,10957.59,Others,0.382,514,0,0,1,0,0,0,1,0,Least,2016
478,194,Staten Island University Hosp-North,Private Health Insurance,Extreme,3,13.0,11115.71333,Northwell,14.9,472,0,0,1,0,1,0,0,0,Least,2016
119,194,Jacobi Medical Center,Blue Cross/Blue Shield,Extreme,2,8.0,5815.65,Health+,7.016,457,1,0,0,0,1,0,0,0,Least,2016
