In [2]:
# HEALTHCARE PROJECT

In [3]:
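# One-time setup: uncomment the next line if PySpark is not installed in this kernel's environment.
# %pip install pyspark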

In [4]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("BaylorScott_Irving Data Processing")
    .getOrCreate()
)

# Location of the hospital's CSV file. The original local path is kept as the
# default; set the BAYLORSCOTT_IRVING_CSV environment variable to point the
# notebook at a copy elsewhere without editing this cell.
file_path = os.environ.get("BAYLORSCOTT_IRVING_CSV") or (
    r'C:\Users\tarun\Documents\Semester 4\Analytics Practicum\POC\BaylorScott_Irving.csv'
)

# Read the CSV: the first row is the header and Spark infers each column's type.
# NOTE(review): with inferSchema the 'code|1' column may be typed as a number,
# which would drop leading zeros if any billing codes have them — confirm, and
# pass an explicit schema if the codes must stay as strings.
data_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(file_path)
)

# Keep only the columns used by the later cells.
columns_to_include = [
    'description',
    'code|1',
    'payer_name',
    'plan_name',
    'standard_charge|gross',
    'standard_charge|max',
    'standard_charge|min',
]
df_subset = data_df.select(*columns_to_include)

# Preview the first rows of the selected columns.
df_subset.show()

+--------------------+--------+----------+----------+---------------------+-------------------+-------------------+
|         description|  code|1|payer_name| plan_name|standard_charge|gross|standard_charge|max|standard_charge|min|
+--------------------+--------+----------+----------+---------------------+-------------------+-------------------+
|polidocanoL 1 % (...|63600031|     Aetna|Commercial|                120.0|               90.0|               40.8|
|EPINEPHrine 0.1 m...|63600001|     Aetna|Commercial|                35.44|              26.58|               12.4|
|EPINEPHrine 0.1 m...|63600001|     Aetna|Commercial|                 63.1|              47.32|              22.08|
|EPINEPHrine 0.1 m...|63600001|     Aetna|Commercial|                50.02|              37.52|              17.51|
|ibutilide fumarat...|63600031|     Aetna|Commercial|              1342.35|            1006.76|             180.73|
|ibutilide fumarat...|63600031|     Aetna|Commercial|              1646.

In [5]:
# Source column name -> name used in the rest of the notebook.
column_renames = {
    'description': 'billing_description',
    'code|1': 'billing_code',
    'payer_name': 'insurance_provider',
    'plan_name': 'insurance_plan',
    'standard_charge|gross': 'gross_standard_charges',
    'standard_charge|max': 'max_standard_charges',
    'standard_charge|min': 'min_standard_charges',
}

# Apply the renames one at a time, in the order listed above; df_subset itself
# is left untouched because each call returns a new DataFrame.
df_renamed = df_subset
for old_name, new_name in column_renames.items():
    df_renamed = df_renamed.withColumnRenamed(old_name, new_name)

# Preview the first rows with the new column names.
df_renamed.show()

+--------------------+------------+------------------+--------------+----------------------+--------------------+--------------------+
| billing_description|billing_code|insurance_provider|insurance_plan|gross_standard_charges|max_standard_charges|min_standard_charges|
+--------------------+------------+------------------+--------------+----------------------+--------------------+--------------------+
|polidocanoL 1 % (...|    63600031|             Aetna|    Commercial|                 120.0|                90.0|                40.8|
|EPINEPHrine 0.1 m...|    63600001|             Aetna|    Commercial|                 35.44|               26.58|                12.4|
|EPINEPHrine 0.1 m...|    63600001|             Aetna|    Commercial|                  63.1|               47.32|               22.08|
|EPINEPHrine 0.1 m...|    63600001|             Aetna|    Commercial|                 50.02|               37.52|               17.51|
|ibutilide fumarat...|    63600031|             Aetna| 

In [6]:
# Tag every row with the hospital this file came from by adding a constant
# "hospital_name" column. The value is named once here so the comment and the
# literal cannot drift apart when this cell is reused for another hospital.
source_hospital = "BaylorScott_Irving"
df_final = df_renamed.withColumn("hospital_name", lit(source_hospital))

# Show the first few rows of the final DataFrame
df_final.show()

+--------------------+------------+------------------+--------------+----------------------+--------------------+--------------------+------------------+
| billing_description|billing_code|insurance_provider|insurance_plan|gross_standard_charges|max_standard_charges|min_standard_charges|     hospital_name|
+--------------------+------------+------------------+--------------+----------------------+--------------------+--------------------+------------------+
|polidocanoL 1 % (...|    63600031|             Aetna|    Commercial|                 120.0|                90.0|                40.8|BaylorScott_Irving|
|EPINEPHrine 0.1 m...|    63600001|             Aetna|    Commercial|                 35.44|               26.58|                12.4|BaylorScott_Irving|
|EPINEPHrine 0.1 m...|    63600001|             Aetna|    Commercial|                  63.1|               47.32|               22.08|BaylorScott_Irving|
|EPINEPHrine 0.1 m...|    63600001|             Aetna|    Commercial|       