In [None]:
# Root directory that holds every input CSV and the output folders.
# The trailing slash is required: paths below are built by plain string
# concatenation (BASE_DIR + "file.csv").
BASE_DIR = '/home/thanuja/Dropbox/coursera/Milestone1/data/'

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import *
from itertools import chain
from pyspark.sql import types as t
import numpy as np
from pyspark.sql.types import StructType,StructField, StringType,IntegerType
import pandas as pd

In [None]:
# PySpark initialization: create (or reuse) the session for this notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .appName('cms_physicians_analysis')
    .getOrCreate()
)
# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [None]:
# Taxonomy crosswalk provided by CMS; the first CSV row supplies the column names.
taxonomy_csv = BASE_DIR + "Medicare_Provider_and_Supplier_Taxonomy_Crosswalk_October_2021.csv"
hcp_taxonomies = spark.read.options(header='True').csv(taxonomy_csv)

# Swap the verbose CMS headers for the short names used by the rest of the notebook.
taxonomy_renames = [
    ("PROVIDER TAXONOMY DESCRIPTION:  TYPE, CLASSIFICATION, SPECIALIZATION", "detail_desc"),
    ("MEDICARE PROVIDER/SUPPLIER TYPE DESCRIPTION", "hl_desc"),
    ("MEDICARE SPECIALTY CODE", "sp_code"),
    ("PROVIDER TAXONOMY CODE", "tx_code"),
]
for old_name, new_name in taxonomy_renames:
    hcp_taxonomies = hcp_taxonomies.withColumnRenamed(old_name, new_name)

# Trim the taxonomy code so it can be used as an exact-match lookup key.
hcp_taxonomies = hcp_taxonomies.withColumn('tx_code', F.trim(F.col('tx_code')))
hcp_taxonomies.show(truncate=False)

In [None]:
# Create dictionaries keyed by taxonomy code, with the high-level and the
# detailed speciality descriptions as values.
#
# The three columns are collected together, one Row at a time. Collecting each
# column with its own collect_list() and zipping the lists is fragile:
# collect_list() leaves out nulls and does not promise a row order, so a single
# missing description would pair every following code with the wrong text.
taxonomy_rows = (
    hcp_taxonomies
    .select('tx_code', 'detail_desc', 'hl_desc')
    .where(F.col('tx_code').isNotNull())   # a null code cannot be a lookup key
    .collect()
)
tx_codes = [row['tx_code'] for row in taxonomy_rows]
detail_descs = [row['detail_desc'] for row in taxonomy_rows]
hl_descs = [row['hl_desc'] for row in taxonomy_rows]

# taxonomy code -> detailed description; codes without a description are left
# out, so looking them up simply finds nothing.
tax_detail_dict = {code: desc for code, desc in zip(tx_codes, detail_descs) if desc is not None}

# taxonomy code -> high-level description (same null handling as above).
tax_hl_dict = {code: desc for code, desc in zip(tx_codes, hl_descs) if desc is not None}

# This file maps specialities between the CMS taxonomy file and
# DAC_NationalDownloadableFile.csv.
cms_npi_map = pd.read_csv(BASE_DIR + "mapping_taxonomies.csv")

# CMS speciality -> speciality as listed in DAC_NationalDownloadableFile.csv.
cms_nat_dict = dict(zip(cms_npi_map.CMS_SPECIALITY, cms_npi_map.NAT_SPECIALITY))

In [None]:
# Physicians supplemental file (names, address, state, city, zip, taxonomy codes).
hcp_suppl_file_df = spark.read.options(header='True').csv(
    BASE_DIR + "Physician Supplement File for all Program Years/OP_PH_PRFL_SPLMTL_P06302021.csv")

# Shorten the source column names to the short names used in the rest of the notebook.
suppl_renames = {
    "Physician_Profile_ID": "ppid",
    "Physician_Profile_First_Name": "f_name",
    "Physician_Profile_Last_Name": "l_name",
    "Physician_Profile_Alternate_Middle_Name": "ma_name",
    "Physician_Profile_Middle_Name": "m_name",
    "Physician_Profile_Address_Line_1": "adr_ln_1",
    "Physician_Profile_Address_Line_2": "adr_ln_2",
    "Physician_Profile_City": "city",
    "Physician_Profile_State": "state",
    "Physician_Profile_Zipcode": "zip",
    "Physician_Profile_OPS_Taxonomy_1": "txcode_1",
    "Physician_Profile_OPS_Taxonomy_2": "txcode_2",
    "Physician_Profile_OPS_Taxonomy_3": "txcode_3",
}
for old_name, new_name in suppl_renames.items():
    hcp_suppl_file_df = hcp_suppl_file_df.withColumnRenamed(old_name, new_name)

# Trim all three taxonomy codes, not only the first one: each of them is used
# as an exact-match lookup key and is later joined/split on single spaces, so
# stray padding on txcode_2/txcode_3 would break both steps.
for code_col in ("txcode_1", "txcode_2", "txcode_3"):
    hcp_suppl_file_df = hcp_suppl_file_df.withColumn(code_col, F.trim(F.col(code_col)))

# Separate the 5-digit zip from the +4 part by replacing "-" with a space.
hcp_suppl_file_df = hcp_suppl_file_df.withColumn("zip", F.regexp_replace("zip", "-", " "))

# Add speciality descriptions to the supplemental file for each of the three
# taxonomy codes. Every Python dictionary is turned into a Spark map literal;
# create_map() takes a flat key, value, key, value, ... sequence, which is what
# chaining the (key, value) items yields.
mapping_expr1 = F.create_map([F.lit(entry) for entry in chain.from_iterable(tax_hl_dict.items())])
mapping_expr2 = F.create_map([F.lit(entry) for entry in chain.from_iterable(tax_detail_dict.items())])
mapping_expr3 = F.create_map([F.lit(entry) for entry in chain.from_iterable(cms_nat_dict.items())])

# NOTE(review): mapping_expr3 is built from cms_nat_dict, whose keys are
# CMS_SPECIALITY values, yet it is looked up with a taxonomy code just like the
# other two maps. Unless CMS_SPECIALITY actually holds taxonomy codes, the
# nat_speciality_* columns will not find a match — confirm against
# mapping_taxonomies.csv.
speciality_lookups = (
    ("cms_hl_speciality", mapping_expr1),
    ("cms_detail_speciality", mapping_expr2),
    ("nat_speciality", mapping_expr3),
)
for slot in ("1", "2", "3"):
    for prefix, lookup in speciality_lookups:
        hcp_suppl_file_df = hcp_suppl_file_df.withColumn(
            prefix + "_" + slot, lookup.getItem(F.col("txcode_" + slot)))


# Build the combined, space-separated columns; for taxonomies and middle names
# only the distinct tokens are kept.

# state + city + zip
suppl_location = F.concat_ws(" ", F.col('state'), F.col('city'), F.col('zip'))

# address line 1 + address line 2
suppl_street = F.concat_ws(" ", F.col('adr_ln_1'), F.col('adr_ln_2'))

# the three taxonomy codes, de-duplicated
suppl_codes_joined = F.concat_ws(" ", F.col('txcode_1'), F.col('txcode_2'), F.col('txcode_3'))
suppl_codes_unique = F.concat_ws(' ', F.array_distinct(F.split(suppl_codes_joined, ' ')))

# middle name, alternate middle name and the first character of each, de-duplicated
suppl_middle_joined = F.concat_ws(
    " ",
    F.col('m_name'),
    F.col('ma_name'),
    F.substring('m_name', 0, 1),
    F.substring('ma_name', 0, 1),
)
suppl_middle_unique = F.concat_ws(' ', F.array_distinct(F.split(suppl_middle_joined, ' ')))

hcp_suppl_file_df = (
    hcp_suppl_file_df
    .withColumn("state_city", suppl_location)
    .withColumn("address", suppl_street)
    .withColumn("taxonomies", suppl_codes_unique)
    .withColumn("middle_name", suppl_middle_unique)
)

suppl_preview_cols = ["ppid", "f_name", "l_name",
                      "address", "state_city", "taxonomies", "middle_name"]
hcp_suppl_file_df.select(*suppl_preview_cols).show(truncate=False)


In [None]:
def formatzip(zip_str):
    """Format a postal code string as a 5-digit zip, optionally followed by the +4 part.

    Rules, by input length:
      * ``None`` or empty  -> ``''``
      * exactly 5          -> returned unchanged
      * 1 to 4             -> left-padded with zeros to 5 characters
      * 6 to 9             -> left-padded with zeros to 9 characters, then split
                              into ``"ZZZZZ PPPP"``
      * more than 9        -> split into space-separated chunks of 5 characters

    Previously the chunking loop ran over the length measured *before* padding,
    so every 1-4 character input came back as ``"00000"`` and the actual digits
    were lost.

    Args:
        zip_str: postal code as a string, or None.

    Returns:
        The formatted postal code string ('' for None/empty input).
    """
    if not zip_str:
        return ''

    length = len(zip_str)
    if length == 5:
        return zip_str
    if length < 5:
        # Short value: treat as a 5-digit zip that lost its leading zeros.
        return zip_str.rjust(5, '0')

    # Restore leading zeros of a zip+4 (no-op when already 9 or longer), then
    # cut into groups of five so the zip and the +4 part are space-separated.
    padded = zip_str.rjust(9, '0')
    return " ".join(padded[start:start + 5] for start in range(0, len(padded), 5))

def concat_uniq(X):
    """Split ``X`` on single spaces, keep the distinct tokens, and join them back with spaces.

    Args:
        X: a string Column expression.

    Returns:
        A Column expression with the de-duplicated, space-joined tokens.
    """
    tokens = F.split(X, ' ')
    unique_tokens = F.array_distinct(tokens)
    return F.concat_ws(' ', unique_tokens)

# NPI file; the first CSV row supplies the column names.
npi_csv = BASE_DIR + 'npidata_pfile_20050523-20211212.csv'
npi_file_df = spark.read.options(header='True').csv(npi_csv)

# Rename the columns of interest to the same short names used for the supplemental file.
npi_renames = {
    "Provider First Name": "f_name",
    "Provider Last Name (Legal Name)": "l_name",
    "Provider Middle Name": "m_name",
    "Provider First Line Business Practice Location Address": "adr_ln_1",
    "Provider Second Line Business Practice Location Address": "adr_ln_2",
    "Provider Business Practice Location Address City Name": "city",
    "Provider Business Practice Location Address State Name": "state",
    "Provider Business Practice Location Address Postal Code": "zip",
    "Healthcare Provider Taxonomy Code_1": "txcode_1",
    "Healthcare Provider Taxonomy Code_2": "txcode_2",
    "Healthcare Provider Taxonomy Code_3": "txcode_3",
}
for source_name, short_name in npi_renames.items():
    npi_file_df = npi_file_df.withColumnRenamed(source_name, short_name)

# Expose formatzip as a string-returning Spark UDF and rewrite the zip column with it.
formatzip_udf = F.udf(formatzip, StringType())
npi_file_df = npi_file_df.withColumn("zip", formatzip_udf(F.col("zip")))

# Keep only the rows whose 'Entity Type Code' equals 1.
# NOTE(review): the original comment says "filter only physicians"; presumably
# code 1 marks individual providers in general rather than physicians only —
# confirm against the NPI file documentation.
is_entity_type_1 = npi_file_df['Entity Type Code'] == 1
npi_file_df = npi_file_df.where(is_entity_type_1)


# Collapse to one row per (NPI, first name, last name). For every output column
# the values of the group are gathered, joined with spaces, and reduced to
# their distinct tokens by concat_uniq.
npi_address_joined = F.concat_ws(
    ' ', F.collect_list('adr_ln_1'), F.collect_list('adr_ln_2'))
npi_location_joined = F.concat_ws(
    ' ', F.collect_list('state'), F.collect_list('city'), F.collect_list('zip'))
npi_codes_joined = F.concat_ws(
    ' ', F.collect_list('txcode_1'), F.collect_list('txcode_2'), F.collect_list('txcode_3'))
# middle names plus the first character of each middle name
npi_middle_joined = F.concat_ws(
    ' ', F.collect_list('m_name'), F.collect_list(F.substring('m_name', 0, 1)))

npi_file_df = npi_file_df.groupBy("NPI", "f_name", "l_name").agg(
    concat_uniq(npi_address_joined).alias('address'),
    concat_uniq(npi_location_joined).alias('state_city'),
    concat_uniq(npi_codes_joined).alias('taxonomies'),
    concat_uniq(npi_middle_joined).alias('middle_name'),
)

npi_preview_cols = ["NPI", "f_name", "l_name", "address", "state_city", "taxonomies", "middle_name"]
npi_file_df.select(*npi_preview_cols).show(truncate=False)


In [None]:
# Sort the supplemental records by first name, then last name, and keep only
# the identifier plus the combined columns.
suppl_output_cols = ["ppid", "f_name", "l_name",
                     "address", "state_city", "taxonomies", "middle_name"]
hcp_suppl_file = hcp_suppl_file_df.orderBy('f_name', 'l_name').select(*suppl_output_cols)
hcp_suppl_file.show(truncate=False)

In [None]:
# Sort the NPI records by first name, then last name, and keep only the
# identifier plus the combined columns.
npi_output_cols = ["NPI", "f_name", "l_name", "address", "state_city", "taxonomies", "middle_name"]
npi_file = npi_file_df.orderBy('f_name', 'l_name').select(*npi_output_cols)
npi_file.show(truncate=False)

In [None]:
# Write the sorted DataFrames as comma-separated CSV with a header row.
# coalesce(1) reduces each DataFrame to a single partition before writing.
# NOTE(review): no save mode is set, so Spark's default applies; re-running this
# cell when the target directories already exist is expected to fail — consider
# an explicit .mode(...) if re-runs should replace earlier output.
combined_out_dir = BASE_DIR + "data_processing/combined_out/"
csv_outputs = (
    (hcp_suppl_file, "hcp_suppl"),
    (npi_file, "hcp_npi"),
)
for sorted_df, folder_name in csv_outputs:
    sorted_df.coalesce(1).write.option("header", "true").csv(combined_out_dir + folder_name, sep=',')