In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark
# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"
# Start a SparkSession
import findspark
findspark.init()

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("healthcare_fraud").getOrCreate()

In [0]:
from pyspark import SparkFiles
# Load in user_data.csv from S3 into a DataFrame
url = "https://healthcare-fraud-project.s3.amazonaws.com/Opiod_drug_names.csv"
spark.sparkContext.addFile(url)
df = spark.read.option('header', 'true').csv(SparkFiles.get("Opiod_drug_names.csv"), inferSchema=True, sep=',', timestampFormat="mm/dd/yy")


In [0]:
df.count()


93

In [0]:
from pyspark import SparkFiles
# Load in user_data.csv from S3 into a DataFrame
url = "https://healthcare-fraud-project.s3.amazonaws.com/OP_DTL_GNRL_PGYR2016_P01172020.csv"
spark.sparkContext.addFile(url)
df = spark.read.option('header', 'true').csv(SparkFiles.get("OP_DTL_GNRL_PGYR2016_P01172020.csv"), inferSchema=True, sep=',', timestampFormat="mm/dd/yy")

In [0]:
col_dropped = ['Teaching_Hospital_CCN','Teaching_Hospital_ID','Teaching_Hospital_Name','Physician_Middle_Name','Physician_Name_Suffix','Recipient_Province','Recipient_Postal_Code','Recipient_Country','Physician_License_State_code2','Physician_License_State_code3','Physician_License_State_code4','Physician_License_State_code5','Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name','Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID','Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name','Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State','Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Country','Number_of_Payments_Included_in_Total_Amount','Date_of_Payment','Form_of_Payment_or_Transfer_of_Value','Nature_of_Payment_or_Transfer_of_Value','City_of_Travel','State_of_Travel','Country_of_Travel','Name_of_Third_Party_Entity_Receiving_Payment_or_Transfer_of_Value','Charity_Indicator','Third_Party_Equals_Covered_Recipient_Indicator','Contextual_Information','Associated_Drug_or_Biological_NDC_1','Associated_Drug_or_Biological_NDC_2','Covered_or_Noncovered_Indicator_3','Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_3','Product_Category_or_Therapeutic_Area_3','Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3','Associated_Drug_or_Biological_NDC_3','Covered_or_Noncovered_Indicator_4','Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_4','Product_Category_or_Therapeutic_Area_4','Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4','Associated_Drug_or_Biological_NDC_4','Covered_or_Noncovered_Indicator_5','Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_5','Product_Category_or_Therapeutic_Area_5','Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5','Associated_Drug_or_Biological_NDC_5']
df_smaller = df.select([column for column in df.columns if column not in col_dropped])

In [8]:
df_smaller.drop(['Change_Type','Covered_Recipient_Type'])

TypeError: ignored

In [0]:
df_tiny = df_smaller.na.drop()

In [0]:
df_tiny.show(10)

In [0]:
#payment data
df_tiny.count()


921635

In [0]:
url = "https://healthcare-fraud-project.s3.amazonaws.com/2004EXCL.csv"
spark.sparkContext.addFile(url)
exclusionList = spark.read.option('header', 'true').csv(SparkFiles.get("2004EXCL.csv"), inferSchema=True, sep=',', timestampFormat="mm/dd/yy")

In [0]:
#exclusion list
exclusionList.show()
exclusionList.count()

+---------+-----------+--------+--------------------+--------------------+--------------------+------+----------+--------+--------------------+---------------+-----+-----+--------+--------+--------+----------+--------+
| LASTNAME|  FIRSTNAME| MIDNAME|             BUSNAME|             GENERAL|           SPECIALTY|  UPIN|       NPI|     DOB|             ADDRESS|           CITY|STATE|  ZIP|EXCLTYPE|EXCLDATE|REINDATE|WAIVERDATE|WVRSTATE|
+---------+-----------+--------+--------------------+--------------------+--------------------+------+----------+--------+--------------------+---------------+-----+-----+--------+--------+--------+----------+--------+
|     null|       null|        |     ANATOMY RX, LLC|      OTHER BUSINESS|            PHARMACY|  null|1356713226|    null|         1544 PURDUE|    LOS ANGELES|   CA|90025|  1128a4|20200520|       0|         0|    null|
|     null|       null|        |CANARSIE A W A R ...|      OTHER BUSINESS|SUBSTANCE ABUSE REHA|  null|1497971741|    null|12

In [0]:
url = "https://healthcare-fraud-project.s3.amazonaws.com/Medicare_Provider_Utilization_and_Payment_Data__2016_Part_D_Prescriber.csv"
spark.sparkContext.addFile(url)
drugProvider = spark.read.option('header', 'true').csv(SparkFiles.get("Medicare_Provider_Utilization_and_Payment_Data__2016_Part_D_Prescriber.csv"), inferSchema=True, sep=',', timestampFormat="mm/dd/yy")

In [0]:
#drug provider data frame
drugProvider.show()
drugProvider.count()

+----------+----------------------------+-------------------------+-------------------+--------------------+---------------------+----------------+--------------------+--------------------+----------+-----------------+-----------------------+----------------+---------------+---------------+-----------------------------+----------------------+------------------+----------------------------+---------------------+--------------------+
|       npi|nppes_provider_last_org_name|nppes_provider_first_name|nppes_provider_city|nppes_provider_state|specialty_description|description_flag|           drug_name|        generic_name|bene_count|total_claim_count|total_30_day_fill_count|total_day_supply|total_drug_cost|bene_count_ge65|bene_count_ge65_suppress_flag|total_claim_count_ge65|ge65_suppress_flag|total_30_day_fill_count_ge65|total_day_supply_ge65|total_drug_cost_ge65|
+----------+----------------------------+-------------------------+-------------------+--------------------+--------------------