In [197]:
import sys
import matplotlib.pyplot as plt
import numpy as np
import os

import networkx as nx

import pickle
from PIL import Image, ImageDraw
import matplotlib.lines as lines
from mpl_toolkits import mplot3d
from matplotlib.lines import Line2D

from tqdm import tqdm
import glob

import statistics 

#For network adjacency checks
from skimage.transform import rotate
from sklearn.neighbors import KDTree
from scipy.spatial import distance

#For PCA
import pandas as pd
import seaborn as sns
import json
import re

import warnings

# Make NumPy raise FloatingPointError for every floating-point anomaly
# (divide, over, under, invalid) instead of warning or ignoring it.
# The call returns the previous settings, which the notebook echoes below.
np.seterr(all='raise')

{'divide': 'raise', 'over': 'raise', 'under': 'raise', 'invalid': 'raise'}

In [198]:
# [type(i) for i in list(census_df['S2704_C03_001E'])]
# list(census_df['S2704_C03_001E'])

In [199]:
# Read the two raw JSON exports. The files are opened in binary mode and
# json.load takes care of decoding the bytes.
with open('census_data.json', 'rb') as census_file:
    census_data = json.load(census_file)
with open('cms_Data.json', 'rb') as cms_file:
    cms_data = json.load(cms_file)
    
# Census table: rows come from the 'data' entry, headers from the 'columns' entry.
census_raw = pd.DataFrame.from_dict(census_data['data'])
census_raw.columns = census_data['columns']
census_df = (
    census_raw
    .replace('-', float('NaN'))    # '-' placeholders become NaN
    .drop(columns=['GEO_ID'])      # GEO_ID is removed before the numeric conversion
    .apply(pd.to_numeric)
)

# CMS table: transpose so the top-level keys of cms_data index the rows,
# then keep only the metric columns listed here.
cms_metric_columns = [
    'avg_medicare_payment', 'infection_ratio', 'emergency_wait_time',
    'imaging_efficiency', 'mortality_30', 'revenue', 'total_discharges',
    'revenue_per_discharge',
]
cms_df = (
    pd.DataFrame.from_dict(cms_data)
    .T[cms_metric_columns]
    .replace('none', float('NaN'))  # 'none' placeholders become NaN
    .apply(pd.to_numeric)
)

# Minimum absolute correlation for a variable pair to become a graph edge.
CORR_THRESHOLD = 0.3


def _filter_links(links, column_order, threshold=CORR_THRESHOLD):
    """Keep one row per unordered variable pair with |value| above `threshold`.

    Parameters
    ----------
    links : pandas.DataFrame
        Long-form correlation table with columns 'var1', 'var2', 'value'.
    column_order : iterable of str
        Column labels of the correlation matrix, in matrix order.
    threshold : float
        A pair is kept only when abs(value) is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        The selected rows of `links`, with their original index labels.

    Notes
    -----
    A correlation matrix lists every pair twice, (A, B) and (B, A), plus the
    diagonal. Only rows where var1 precedes var2 in `column_order` are kept
    (the strict upper triangle). This removes both the mirrored rows and the
    self-correlations. De-duplicating on the correlation *value* instead would
    also discard distinct pairs that happen to share the same coefficient.
    """
    rank = {name: position for position, name in enumerate(column_order)}
    is_upper = links['var1'].map(rank) < links['var2'].map(rank)
    is_strong = links['value'].abs() > threshold
    return links.loc[is_strong & is_upper]


census_corr = census_df.corr()
cms_corr = cms_df.corr()

# Long-form (var1, var2, value) view of each correlation matrix.
census_links = census_corr.stack().reset_index()
census_links.columns = ['var1', 'var2', 'value']
census_links_filtered = _filter_links(census_links, census_corr.columns)

cms_links = cms_corr.stack().reset_index()
cms_links.columns = ['var1', 'var2', 'value']
cms_links_filtered = _filter_links(cms_links, cms_corr.columns)

In [200]:
# Number of census variable pairs kept as graph edges after filtering.
len(census_links_filtered)

27

In [201]:
# Number of CMS variable pairs kept as graph edges after filtering.
len(cms_links_filtered)

21

In [202]:
# with open('cm_combined.data', 'rb') as pickle_file:
#     cm = pickle.load(pickle_file)

# corr = cm

# links = corr.stack().reset_index()
# links.columns = ['var1', 'var2', 'value']

# # Keep only correlation over a threshold and remove self correlation (cor(A,A)=1)
# links_filtered=links.loc[ (abs(links['value']) > 0.3) & (links['var1'] != links['var2']) ]

# links_filtered = links_filtered.drop_duplicates(subset=['value'])


In [203]:
# #Use for showing only 3D to 2D correlations

# twod = ['ncompk', 'mdeg_l', 'dmdeg_l', 'mdeg_h', 'dmdeg_h', 
#                 'fdim0',  'cnumh', 'cnumk','circum_h','area_l', 'circum_l', 'areavar','circumvar','asravar', 'circ1', 'asra1', 
#                 'area_h', 'circ0', 'asra0', 'circvar',]
# threed = ['protrusion_mean_len', 'protrusion_max_len','protrusion_num','discr_mean_dist', 'discr_max_dist', 'discr_mean_area', 
#       'discr_num','area', 'perimeter', 'aspect_ratio', 'roundness', 'circularity',]

# pop_index = []
# for index, row in links_filtered.iterrows():
    
#     if row['var1'] in twod:
#         if row['var2'] in threed:
#             continue
#     if row['var1'] in threed:
#         if row['var2'] in twod:
#             continue
#     pop_index.append(index)
# links_filtered = links_filtered.drop(pop_index)

In [204]:
# Sorted, de-duplicated variable names of each correlation matrix, stored as
# one-column frames; a name's row label is later used as its node index.
census_var_names = np.unique(census_corr.columns)
cms_var_names = np.unique(cms_corr.columns)
census_unique_vars = pd.DataFrame(census_var_names)
cms_unique_vars = pd.DataFrame(cms_var_names)

In [205]:
# Census variable code -> integer position (codes listed in ascending order).
census_translate = {
    code: position
    for position, code in enumerate([
        'P1_001N', 'P1_003N', 'P1_004N', 'P1_005N', 'P1_006N', 'P1_007N',
        'S1501_C01_015E', 'S1901_C01_012E',
        'S2701_C03_001E', 'S2703_C03_001E', 'S2704_C03_001E',
    ])
}

# CMS metric name -> integer position (names listed in ascending order).
cms_translate = {
    metric: position
    for position, metric in enumerate([
        'avg_medicare_payment', 'emergency_wait_time', 'imaging_efficiency',
        'infection_ratio', 'mortality_30', 'revenue', 'revenue_per_discharge',
        'total_discharges',
    ])
}

In [206]:
def _links_to_records(links_filtered, unique_vars):
    """Turn filtered correlation rows into link records keyed by node index.

    Parameters
    ----------
    links_filtered : pandas.DataFrame
        Frame whose three columns are, in order, source name, target name
        and correlation value.
    unique_vars : pandas.DataFrame
        One-column frame (column label 0) of variable names; the row label of
        a name is used as that variable's node index.

    Returns
    -------
    (list of dict, list of dict)
        The name-based links ({'source', 'target', 'value'}) and the
        index-based records ({'value', 'source', 'target'}), in row order.
    """
    # Build the name -> node-index lookup once instead of scanning
    # unique_vars with a boolean mask twice for every link.
    node_index = dict(zip(unique_vars[0].to_list(), unique_vars.index.to_list()))

    templinks = [
        {'source': source, 'target': target, 'value': value}
        for source, target, value in links_filtered.to_numpy()
    ]
    records = [
        {"value": link['value'],
         "source": node_index[link['source']],
         "target": node_index[link['target']]}
        for link in templinks
    ]
    return templinks, records


census_templinks, census_links_list = _links_to_records(census_links_filtered, census_unique_vars)
cms_templinks, cms_links_list = _links_to_records(cms_links_filtered, cms_unique_vars)

In [216]:
# Same code -> position tables as defined in the earlier cell, repeated here
# so that this cell also defines them when run on its own.
census_translate = {
    code: position
    for position, code in enumerate([
        'P1_001N', 'P1_003N', 'P1_004N', 'P1_005N', 'P1_006N', 'P1_007N',
        'S1501_C01_015E', 'S1901_C01_012E',
        'S2701_C03_001E', 'S2703_C03_001E', 'S2704_C03_001E',
    ])
}

cms_translate = {
    metric: position
    for position, metric in enumerate([
        'avg_medicare_payment', 'emergency_wait_time', 'imaging_efficiency',
        'infection_ratio', 'mortality_30', 'revenue', 'revenue_per_discharge',
        'total_discharges',
    ])
}


#label colors and edges
def grouper(var):
    """Return the group label (a digit string) assigned to a variable name.

    Each label covers a fixed list of census codes and/or CMS metric names.
    The lists are checked in label order and the first one containing `var`
    wins. If `var` is in none of them the function returns None.

    Membership is tested with ``in`` on plain lists, i.e. by equality rather
    than hashing, so an unhashable value such as a one-element array that
    compares equal to a listed name is matched as well.
    """
    members_by_group = (
        ('0', ['S2701_C03_001E', 'S2703_C03_001E', 'S2704_C03_001E',  # Health Insurance
               'avg_medicare_payment']),
        ('1', ['P1_001N', 'P1_003N', 'P1_004N', 'P1_005N', 'P1_006N', 'P1_007N',  # Race/Ethnicity
               'emergency_wait_time']),
        ('2', ['S1901_C01_012E',  # Median Income
               'imaging_efficiency']),
        ('3', ['S1501_C01_015E',  # Education
               'infection_ratio']),
        ('4', ['mortality_30']),
        ('5', ['revenue']),
        ('6', ['revenue_per_discharge']),
        ('7', ['total_discharges']),
    )
    for label, members in members_by_group:
        if var in members:
            return label
    return None
        

# One node record per variable. The name string itself is passed to grouper
# (not the one-element array row that iterating over to_numpy() yields), so
# the group lookup compares plain strings.
census_nodes_list = [
    {'name': name, 'group': grouper(name)}
    for name in census_unique_vars[0]
]
cms_nodes_list = [
    {'name': name, 'group': grouper(name)}
    for name in cms_unique_vars[0]
]

In [217]:
# Human-readable labels keyed by census variable code. Renaming by code rather
# than by list position keeps labels attached to the right node if the set or
# order of columns changes, and makes the cell safe to re-run: names that are
# already labels (or unknown codes) are left as they are.
census_display_names = {
    'P1_001N': 'Total Population',
    'P1_003N': '% White (race)',
    'P1_004N': '% Black or AA (race)',
    'P1_005N': '% American Indian or Alaskan (race)',
    'P1_006N': '% Asian (race)',
    'P1_007N': '% Hawaiian or PI (race)',
    'S1501_C01_015E': 'Education (% BA,BS)',
    'S1901_C01_012E': 'Median Income',
    'S2701_C03_001E': 'Health Insurance (Total)',
    'S2703_C03_001E': 'Private Health Insurance',
    'S2704_C03_001E': 'Public Health Insurance',
}

for node in census_nodes_list:
    node['name'] = census_display_names.get(node['name'], node['name'])

In [218]:
census_nodes_list

[{'name': 'Total Population', 'group': '1'},
 {'name': '% White (race)', 'group': '1'},
 {'name': '% Black or AA (race)', 'group': '1'},
 {'name': '% American Indian or Alaskan (race)', 'group': '1'},
 {'name': '% Asian (race)', 'group': '1'},
 {'name': '% Hawaiian or PI (race)', 'group': '1'},
 {'name': 'Education (% BA,BS)', 'group': '3'},
 {'name': 'Median Income', 'group': '2'},
 {'name': 'Health Insurance (Total)', 'group': '0'},
 {'name': 'Private Health Insurance', 'group': '0'},
 {'name': 'Public Health Insurance', 'group': '0'}]

In [219]:
# Bundle the links and nodes of each graph under "links" and "nodes" keys.
census_json_prep = dict(links=census_links_list, nodes=census_nodes_list)
cms_json_prep = dict(links=cms_links_list, nodes=cms_nodes_list)

# Quick look at the top-level keys; the notebook echoes only the last expression.
census_json_prep.keys()
cms_json_prep.keys()

dict_keys(['links', 'nodes'])

In [220]:
# Serialize both graphs with the same formatting options.
dump_options = {'indent': 1, 'sort_keys': True}
census_json_dump = json.dumps(census_json_prep, **dump_options)
cms_json_dump = json.dumps(cms_json_prep, **dump_options)

In [221]:
# Write each serialized graph to its own file. The with-blocks close the
# handle even if the write raises, which the manual open()/close() pairs
# did not guarantee.
filename_out = 'census_force_directed_graph_data.json'
with open(filename_out, 'w') as json_out:
    json_out.write(census_json_dump)

filename_out = 'cms_force_directed_graph_data.json'
with open(filename_out, 'w') as json_out:
    json_out.write(cms_json_dump)