In [2]:
import pandas as pd

In [3]:
from tqdm.notebook import tqdm
import sqlite3

## We want to eliminate "accidental" referrals, so filter the hop teaming data so that the transaction_count is at least 50 and the average_day_wait is less than 50.

##  Connect Hop Teaming dataset to sql

In [255]:
db = sqlite3.connect('data/Hop_Teaming_2018.sqlite')

for chunk in tqdm(pd.read_csv('data/DocGraph_Hop_Teaming_2018.csv', chunksize = 10000)):
    chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]      # Clean up the column names
    chunk = chunk[ (chunk[ 'transaction_count' ] >= 50) & (chunk[ 'average_day_wait' ] < 50)]
    chunk.to_sql('Hop_Teaming_2018', db, if_exists = 'append', index = False)            # Append the chunk to Hop_Teaming_2018 table

0it [00:00, ?it/s]

##  Connect nppes dataset to sql

In [None]:
from collections import defaultdict
types = defaultdict(None, {'Provider Business Practice Location Address Postal Code':str}) #default is a special kind of dictionary,
                                                    #None tells read_csv to keep the types of other column types 
                                                    #but for every other column, use None


db = sqlite3.connect('data/Hop_Teaming_2018.sqlite')

for chunk in tqdm(pd.read_csv('data/npidata_pfile_20050523-20230212.csv', 
                              dtype = types,
                              usecols = ['NPI','Entity Type Code',
                                         'Provider Organization Name (Legal Business Name)',
                                         'Provider Last Name (Legal Name)',
                                         'Provider First Name',
                                         'Provider Middle Name',
                                         'Provider Name Prefix Text',
                                         'Provider Name Suffix Text',
                                         'Provider Credential Text',
                                         'Provider First Line Business Practice Location Address',
                                         'Provider Second Line Business Practice Location Address',
                                         'Provider Business Practice Location Address City Name',
                                         'Provider Business Practice Location Address State Name',
                                         'Provider Business Practice Location Address Postal Code',
                                         'Healthcare Provider Taxonomy Code_1',
                                         'Healthcare Provider Primary Taxonomy Switch_1',
                                         'Healthcare Provider Taxonomy Code_2',
                                         'Healthcare Provider Primary Taxonomy Switch_2',
                                         'Healthcare Provider Taxonomy Code_3',
                                         'Healthcare Provider Primary Taxonomy Switch_3',
                                         'Healthcare Provider Taxonomy Code_4',
                                         'Healthcare Provider Primary Taxonomy Switch_4',
                                         'Healthcare Provider Taxonomy Code_5',
                                         'Healthcare Provider Primary Taxonomy Switch_5',
                                         'Healthcare Provider Taxonomy Code_6',
                                         'Healthcare Provider Primary Taxonomy Switch_6',
                                         'Healthcare Provider Taxonomy Code_7',
                                         'Healthcare Provider Primary Taxonomy Switch_7',
                                         'Healthcare Provider Taxonomy Code_8',
                                         'Healthcare Provider Primary Taxonomy Switch_8',
                                         'Healthcare Provider Taxonomy Code_9',
                                         'Healthcare Provider Primary Taxonomy Switch_9',
                                         'Healthcare Provider Taxonomy Code_10',
                                         'Healthcare Provider Primary Taxonomy Switch_10',
                                         'Healthcare Provider Taxonomy Code_11',
                                         'Healthcare Provider Primary Taxonomy Switch_11',
                                         'Healthcare Provider Taxonomy Code_12',
                                         'Healthcare Provider Primary Taxonomy Switch_12',
                                         'Healthcare Provider Taxonomy Code_13',
                                         'Healthcare Provider Primary Taxonomy Switch_13',
                                         'Healthcare Provider Taxonomy Code_14',
                                         'Healthcare Provider Primary Taxonomy Switch_14',
                                         'Healthcare Provider Taxonomy Code_15',
                                         'Healthcare Provider Primary Taxonomy Switch_15',],
                              chunksize = 10000)):
  
    chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]      # Clean up the column names
    chunk[ 'primary_taxonomy' ] = ""
    for index, row in chunk.iterrows():         #code below create a list of taxonomy values IF each row has more than one 'Y'
        chunk.loc[index, 'primary_taxonomy'] = ([ row[f'healthcare_provider_taxonomy_code_{n}'] for n in range(1,16) \
            if row[ f'healthcare_provider_primary_taxonomy_switch_{n}' ] == 'Y' ] \
                    +[row[f'healthcare_provider_taxonomy_code_1']])[0] #some rows has no switch = 'Y', 
                                                                            #which results in an empty list
        if isinstance(row['provider_business_practice_location_address_postal_code'], str):
            chunk.loc[index, 'provider_business_practice_location_address_postal_code'] = \
                row['provider_business_practice_location_address_postal_code'][0:5]   # take first 5 characters from the left
        else: 
            chunk.loc[index, 'provider_business_practice_location_address_postal_code'] = ""
      
            
    chunk.drop( columns=[ f'healthcare_provider_taxonomy_code_{n}' for n in range(1,16) ], inplace = True )
    chunk.drop( columns=[ f'healthcare_provider_primary_taxonomy_switch_{n}' for n in range(1,16) ], inplace = True )
    chunk = chunk[ ~ chunk['primary_taxonomy'].isnull() ]
    chunk.to_sql('nppes', db, if_exists = 'append', index = False) 
    #print(chunk)
    #break
    
 

##  Connect Taxonomy dataset to sql

In [279]:
db = sqlite3.connect('data/Hop_Teaming_2018.sqlite')

for chunk in tqdm(pd.read_csv('data/nucc_taxonomy_230.csv', encoding = 'unicode_escape', chunksize = 10000)):
    chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]      # Clean up the column names
    chunk.to_sql('taxonomy', db, if_exists = 'append', index = False)            

0it [00:00, ?it/s]

In [282]:
query = "SELECT * FROM taxonomy LIMIT 5"

with sqlite3.connect('data/Hop_Teaming_2018.sqlite') as db: 
    taxonomy_sqlite = pd.read_sql(query, db)

In [283]:
taxonomy_sqlite

Unnamed: 0,code,grouping,classification,specialization,definition,notes,display_name,section
0,193200000X,Group,Multi-Specialty,,A business group of one or more individual pra...,[7/1/2003: new],Multi-Specialty Group,Individual
1,193400000X,Group,Single Specialty,,A business group of one or more individual pra...,[7/1/2003: new],Single Specialty Group,Individual
2,207K00000X,Allopathic & Osteopathic Physicians,Allergy & Immunology,,An allergist-immunologist is trained in evalua...,"Source: American Board of Medical Specialties,...",Allergy & Immunology Physician,Individual
3,207KA0200X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy,"A physician who specializes in the diagnosis, ...",Source: National Uniform Claim Committee,Allergy Physician,Individual
4,207KI0005X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Clinical & Laboratory Immunology,An allergy and immunology physician who specia...,"Source: National Uniform Claim Committee, 2022...",Clinical & Laboratory Immunology (Allergy & Im...,Individual


##  Connect ZIP CBSA dataset to sql

In [287]:
db = sqlite3.connect('data/Hop_Teaming_2018.sqlite')

zip_cbsa = pd.read_excel('data/ZIP_CBSA_122021.xlsx', index_col = None, header = 0, dtype={'zip': object})

# add table to database
zip_cbsa.to_sql('zip_cbsa', db, if_exists = 'append', index = False)

#create index
db.execute('CREATE INDEX zip ON zip_cbsa(zip)')

db.close()

In [288]:
query = "SELECT * FROM zip_cbsa LIMIT 5"

with sqlite3.connect('data/Hop_Teaming_2018.sqlite') as db: 
    zip_cbsa_sqlite = pd.read_sql(query, db)

In [289]:
zip_cbsa_sqlite

Unnamed: 0,zip,cbsa,usps_zip_pref_city,usps_zip_pref_state,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,683,41900,SAN GERMAN,PR,0.999842,1.0,1.0,0.999855
1,683,32420,SAN GERMAN,PR,0.000158,0.0,0.0,0.000145
2,923,41980,SAN JUAN,PR,1.0,1.0,1.0,1.0
3,1010,44140,BRIMFIELD,MA,0.976896,1.0,1.0,0.977816
4,1010,49340,BRIMFIELD,MA,0.023104,0.0,0.0,0.022184


## Connect Physician - Facility Affiliations dataset to sql

In [20]:
db = sqlite3.connect('Hop_Teaming_2018.sqlite')

for chunk in tqdm(pd.read_csv('data/Facility_Affiliation.csv', encoding = 'unicode_escape', chunksize = 10000)):
    chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]      # Clean up the column names
    chunk.to_sql('facility_affiliations', db, if_exists = 'append', index = False)  

0it [00:00, ?it/s]

In [24]:
query = "SELECT * FROM facility_affiliations LIMIT 5"

with sqlite3.connect('Hop_Teaming_2018.sqlite') as db: 
    facility_affiliations_sqlite = pd.read_sql(query, db)
    
facility_affiliations_sqlite

Unnamed: 0,npi,ind_pac_id,lst_nm,frst_nm,mid_nm,suff,facility_type,facility_afl_ccn,parent_ccn
0,1003000126,7517003643,ENKESHAFI,ARDALAN,,,Hospital,210003,
1,1003000126,7517003643,ENKESHAFI,ARDALAN,,,Hospital,210022,
2,1003000126,7517003643,ENKESHAFI,ARDALAN,,,Hospital,490063,
3,1003000126,7517003643,ENKESHAFI,ARDALAN,,,Hospital,490145,
4,1003000134,4284706367,CIBULL,THOMAS,L,,Hospital,140010,


## Connect TN General Hospital Info dataset to sql

In [22]:
db = sqlite3.connect('Hop_Teaming_2018.sqlite')

for chunk in tqdm(pd.read_csv('data/TN_Hospital_General_Info.csv', encoding = 'unicode_escape', chunksize = 10000)):
    chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]      # Clean up the column names
    chunk.to_sql('TN_Hospital_Info', db, if_exists = 'append', index = False)  

0it [00:00, ?it/s]

In [25]:
query = "SELECT * FROM TN_Hospital_Info LIMIT 5"

with sqlite3.connect('Hop_Teaming_2018.sqlite') as db: 
    TN_Hospital_Info_sqlite = pd.read_sql(query, db)
    
TN_Hospital_Info_sqlite

Unnamed: 0,facility_id,facility_name,address,city,state,zip_code,county_name,phone_number,hospital_type,hospital_ownership,...,count_of_readm_measures_better,count_of_readm_measures_no_different,count_of_readm_measures_worse,readm_group_footnote,pt_exp_group_measure_count,count_of_facility_pt_exp_measures,pt_exp_group_footnote,te_group_measure_count,count_of_facility_te_measures,te_group_footnote
0,440001,UNICOI COUNTY HOSPITAL,2030 TEMPLE HILL ROAD,ERWIN,TN,37650,UNICOI,(423) 743-3141,Acute Care Hospitals,Voluntary non-profit - Private,...,0,4,0,,8,Not Available,5.0,12,5,
1,440002,JACKSON-MADISON COUNTY GENERAL HOSPITAL,620 SKYLINE DRIVE,JACKSON,TN,38301,MADISON,(731) 541-5000,Acute Care Hospitals,Government - Hospital District or Authority,...,1,7,3,,8,8,,12,6,
2,440003,SUMNER REGIONAL MEDICAL CENTER,555 HARTSVILLE PIKE,GALLATIN,TN,37066,SUMNER,(615) 452-4210,Acute Care Hospitals,Proprietary,...,0,7,1,,8,8,,12,9,
3,440006,TRISTAR SKYLINE MEDICAL CENTER,3441 DICKERSON PIKE,NASHVILLE,TN,37207,DAVIDSON,(615) 769-2000,Acute Care Hospitals,Voluntary non-profit - Private,...,0,6,2,,8,8,,12,8,
4,440007,UNITY MEDICAL CENTER,481 INTERSTATE DRIVE,MANCHESTER,TN,37355,COFFEE,(931) 728-6354,Acute Care Hospitals,Voluntary non-profit - Private,...,1,5,0,,8,8,,12,4,


## Connect Louvain community detection result to sql for further analysis

In [28]:
db = sqlite3.connect('Hop_Teaming_2018.sqlite')

for chunk in tqdm(pd.read_csv('data/hop_neo4j_npi_providers.csv', encoding = 'unicode_escape', chunksize = 10000)):
    chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]      # Clean up the column names
    chunk.to_sql('louvain', db, if_exists = 'append', index = False)  

0it [00:00, ?it/s]

In [32]:
query = "SELECT * FROM louvain LIMIT 5"

with sqlite3.connect('Hop_Teaming_2018.sqlite') as db: 
    louvain_sqlite = pd.read_sql(query, db)
    
louvain_sqlite

Unnamed: 0,npi,communityid
0,1003013160,2983
1,1003013947,4727
2,1003019902,4418
3,1003028770,1642
4,1003031261,2983


In [30]:
# View tables in database
con = sqlite3.connect('Hop_Teaming_2018.sqlite') 
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[('nppes',), ('Hop_Teaming_2018',), ('taxonomy',), ('zip_cbsa',), ('Hop_major_TN_hospitals_res_filter',), ('hop_major_TN_hospitals',), ('hop_neo4j',), ('facility_affiliations',), ('TN_Hospital_Info',), ('louvain',)]
