In [3]:
from pyspark.sql import SparkSession
#Initialize Spark Session
spark = SparkSession.builder.appName("Health Insurance").getOrCreate()
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/18 10:36:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/18 10:36:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
import json
import yaml
from utils import dataframe_utils
from utils import request_utils
from urllib.parse import urlparse
import os


In [3]:
# Read config yaml file
with open("api_config.yaml") as f:
    config_data=yaml.safe_load(f)
# this is just for printing
json_data=json.dumps(config_data,indent=2)
print(json_data)

{
  "apis": {
    "health_insurance_rate_puf": {
      "description": "Public Use File API for Health Rate Data",
      "version": 1,
      "endpoint": "https://data.healthcare.gov/api/1/metastore/schemas/dataset/items/672d5f6a-b8a7-4ebe-87f6-67db641e192d",
      "method": "GET",
      "params": [],
      "response": {
        "format": "JSON"
      }
    }
  }
}


In [4]:
rate_api_url=config_data['apis']['health_insurance_rate_puf']['endpoint']

meta_data = request_utils.make_api_call(rate_api_url)
with open("meta_data_files/health_insurance_rate_puf.json",'w') as meta_file:
    json.dump(meta_data.json(),meta_file,indent=2)





In [13]:
# extract download link and format 
with open("meta_data_files/health_insurance_rate_puf.json",'r') as meta_file:
    meta_data=json.load(meta_file)
    format=meta_data['distribution'][0]['format']
    data_url=meta_data['distribution'][0]['downloadURL']
    print(data_url)
# make API call 
data_response = request_utils.make_api_call(data_url)

# Step 2: Extract filename from URL
parsed_url = urlparse(data_url)
filename = os.path.basename(parsed_url.path)  # gets 'Rate_PUF.csv'
# Optional: Verify it's a good response
if data_response.status_code == 200:
    with open(f"data/{filename}", "wb") as f:
        f.write(data_response.content)
    print("✅ File downloaded successfully.")
else:
    raise Exception(f"❌ Failed to download file. Status: {data_response.status_code}")








https://data.healthcare.gov/datafile/py2025/Rate_PUF.csv
✅ File downloaded successfully.


In [2]:
def dec(num):
    num-=1
print(dec(6))

None


In [None]:
# Step 3: Read using PySpark
read_file=dataframe_utils.read_data_spark(file_path=f"data/{filename}",file_format=format,spark=spark,header=True,inferSchema=True)
read_file.show(10)

[Stage 4:>                                                          (0 + 8) / 8]

+------------+---------+--------+----------+-------------------+-----------------+------------------+--------------+-------------+-------+-------------+--------------+---------------------+------+--------------------------------+---------------------------------+-----------------------------------------+---------------------+----------------------+------------------------------+
|BusinessYear|StateCode|IssuerId|SourceName|         ImportDate|RateEffectiveDate|RateExpirationDate|        PlanId| RatingAreaId|Tobacco|          Age|IndividualRate|IndividualTobaccoRate|Couple|PrimarySubscriberAndOneDependent|PrimarySubscriberAndTwoDependents|PrimarySubscriberAndThreeOrMoreDependents|CoupleAndOneDependent|CoupleAndTwoDependents|CoupleAndThreeOrMoreDependents|
+------------+---------+--------+----------+-------------------+-----------------+------------------+--------------+-------------+-------+-------------+--------------+---------------------+------+--------------------------------+-------

                                                                                

25/04/11 16:40:30 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 735273 ms exceeds timeout 120000 ms
25/04/11 16:40:30 WARN SparkContext: Killing executors is not supported by current scheduler.
25/04/11 16:40:30 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$

In [13]:
query={'age': 21, 'state': 'AK', 'tobacco': 'No Preference'}
filename = 'Rate_PUF.csv'
df= dataframe_utils.read_data_spark(file_path=f"data/{filename}",file_format="csv",spark=spark,header=True,inferSchema=True)
print(df.show())
df=df.filter(df["Age"].isNotNull())
if query['age']:
    df=df.filter(df['Age'] == query['age'])
if query['state']:
    df=df.filter(df['StateCode'] == query['state'])
if query['tobacco']:
    df=df.filter(df['Tobacco']== query['tobacco'])

print(df.limit(10).toPandas().to_string(index=False))

                                                                                

+------------+---------+--------+----------+-------------------+-----------------+------------------+--------------+-------------+-------------+-------------+--------------+---------------------+------+--------------------------------+---------------------------------+-----------------------------------------+---------------------+----------------------+------------------------------+
|BusinessYear|StateCode|IssuerId|SourceName|         ImportDate|RateEffectiveDate|RateExpirationDate|        PlanId| RatingAreaId|      Tobacco|          Age|IndividualRate|IndividualTobaccoRate|Couple|PrimarySubscriberAndOneDependent|PrimarySubscriberAndTwoDependents|PrimarySubscriberAndThreeOrMoreDependents|CoupleAndOneDependent|CoupleAndTwoDependents|CoupleAndThreeOrMoreDependents|
+------------+---------+--------+----------+-------------------+-----------------+------------------+--------------+-------------+-------------+-------------+--------------+---------------------+------+----------------------

In [22]:
df= dataframe_utils.read_data_spark(file_path=f"data/{filename}",file_format="csv",spark=spark,header=True,inferSchema=True)
df.createOrReplaceTempView("rates_table")
spark.sql("select distinct Tobacco from rates_table").show(truncate=False)

                                                                                

+-----------------------------+
|Tobacco                      |
+-----------------------------+
|No Preference                |
|Tobacco User/Non-Tobacco User|
|NULL                         |
+-----------------------------+

