In [0]:
from pyspark.sql import DataFrameWriter
import pandas as pd
from pyspark.sql.functions import *

In [0]:
# Enumerate the secret scopes visible to this workspace; the cell's value is echoed as output.
available_scopes = dbutils.secrets.listScopes()
available_scopes

[SecretScope(name='scope31dec')]

In [0]:
# Show the metadata (key names only, not values) of the secrets stored in the scope.
scope_secrets = dbutils.secrets.list("scope31dec")
scope_secrets

[SecretMetadata(key='blobkeysecret31dec')]

In [0]:
# Pull the storage-account key out of the secret scope so it never appears in the notebook,
# then register it with Spark under the account-key setting for healthcareblob31dec.
storage_account_key = dbutils.secrets.get(scope="scope31dec", key="blobkeysecret31dec")
spark.conf.set(
    "fs.azure.account.key.healthcareblob31dec.dfs.core.windows.net",
    storage_account_key,
)

In [0]:
# List the rawdata container to see which source files are available to load.
raw_container_listing = dbutils.fs.ls("abfss://rawdata@healthcareblob31dec.dfs.core.windows.net")
display(raw_container_listing)

path,name,size,modificationTime
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/Patient_records.csv,Patient_records.csv,5110,1704010961000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/disease.csv,disease.csv,1489,1704010960000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/group.csv,group.csv,4390,1704010960000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/hospital.csv,hospital.csv,1328,1704010960000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/subgroup.csv,subgroup.csv,561,1704010960000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/subscriber.csv,subscriber.csv,12061,1704010960000


In [0]:
# Load hospital.csv from the rawdata container: the first row supplies the
# column names and Spark infers each column's type from the data.
hospital_csv_path = "abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/hospital.csv"
hospital_data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(hospital_csv_path)
)

In [0]:
# Preview the first ten rows without truncating long cell values.
hospital_data.show(n=10, truncate=False)

+-----------+-----------------------------------------------------------------+----------+-----------+-------+
|Hospital_id|Hospital_name                                                    |city      |state      |country|
+-----------+-----------------------------------------------------------------+----------+-----------+-------+
|H1000      |All India Institute of Medical Sciences                          |New Delhi |NaN        |India  |
|H1001      |Medanta The Medicity                                             |Gurgaon   |Haryana    |India  |
|H1002      |The Christian Medical College                                    |Vellore   |Tamil Nadu |India  |
|H1003      |PGIMER - Postgraduate Institute of Medical Education and Research|Chandigarh|Haryana    |India  |
|H1004      |Apollo Hospital - Chennai                                        |Chennai   |Tamil Nadu |India  |
|H1005      |P. D. Hinduja National Hospital & Medical Research Centre        |Mumbai    |Maharashtra|India  |
|

In [0]:
# Print the column names and types of the hospital frame.
# NOTE(review): the output saved with this cell lists subgrp_* columns, which do
# not match hospital_data's columns (Hospital_id, Hospital_name, city, state,
# country) shown by the preceding cell — the output looks stale; re-run to refresh.
hospital_data.printSchema()

root
 |-- subgrp_sk: string (nullable = true)
 |-- subgrp_name: string (nullable = true)
 |-- monthly_premium: string (nullable = true)
 |-- subgrp_id: string (nullable = true)



In [0]:
# Count missing entries per column: a row is counted when its value is NaN or NULL.
# NOTE(review): isnan() is applied to every column regardless of its type;
# presumably this relies on Spark coercing string values — confirm that is intended.
missing_value_counts = [
    count(when(isnan(column_name) | col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in hospital_data.columns
]
hospital_data.select(missing_value_counts).show()

+-----------+-------------+----+-----+-------+
|Hospital_id|Hospital_name|city|state|country|
+-----------+-------------+----+-----+-------+
|          0|            0|   0|    4|      0|
+-----------+-------------+----+-----+-------+



In [0]:
# Look for fully duplicated rows: group on every column, count each group,
# and show only the groups that occur more than once.
row_occurrences = hospital_data.groupBy(hospital_data.columns).count()
row_occurrences.filter("count >1").show()

+-----------+-------------+----+-----+-------+-----+
|Hospital_id|Hospital_name|city|state|country|count|
+-----------+-------------+----+-----+-------+-----+
+-----------+-------------+----+-----+-------+-----+



In [0]:
#hospital_data = hospital_data.drop_duplicates() # not required, no duplicates in this dataset

In [0]:
# Missing values arrive as the literal text 'NaN'; convert them to real NULLs
# so the null-handling step that follows can act on them.
hospital_data = hospital_data.na.replace('NaN', None)

In [0]:
#hospital_data.show(5, False)

+-----------+-----------------------------------------------------------------+----------+----------+-------+
|Hospital_id|Hospital_name                                                    |city      |state     |country|
+-----------+-----------------------------------------------------------------+----------+----------+-------+
|H1000      |All India Institute of Medical Sciences                          |New Delhi |NULL      |India  |
|H1001      |Medanta The Medicity                                             |Gurgaon   |Haryana   |India  |
|H1002      |The Christian Medical College                                    |Vellore   |Tamil Nadu|India  |
|H1003      |PGIMER - Postgraduate Institute of Medical Education and Research|Chandigarh|Haryana   |India  |
|H1004      |Apollo Hospital - Chennai                                        |Chennai   |Tamil Nadu|India  |
+-----------+-----------------------------------------------------------------+----------+----------+-------+
only showing top 5 rows

In [0]:
# Replace NULLs in the state column with the placeholder "UT"; other columns are left alone.
state_default = {"state": "UT"}
hospital_data = hospital_data.na.fill(state_default)

In [0]:
#hospital_data.show(5, False)

+-----------+-----------------------------------------------------------------+----------+----------+-------+
|Hospital_id|Hospital_name                                                    |city      |state     |country|
+-----------+-----------------------------------------------------------------+----------+----------+-------+
|H1000      |All India Institute of Medical Sciences                          |New Delhi |UT        |India  |
|H1001      |Medanta The Medicity                                             |Gurgaon   |Haryana   |India  |
|H1002      |The Christian Medical College                                    |Vellore   |Tamil Nadu|India  |
|H1003      |PGIMER - Postgraduate Institute of Medical Education and Research|Chandigarh|Haryana   |India  |
|H1004      |Apollo Hospital - Chennai                                        |Chennai   |Tamil Nadu|India  |
+-----------+-----------------------------------------------------------------+----------+----------+-------+
only showing top 5 rows

In [0]:
#loading into staging data
#staging_path = "abfss://stagingdata@healthcareblob31dec.dfs.core.windows.net"
#hospital_data.write.format("csv").option("header","true").mode("append").option("path", hospital_data).save()

In [0]:
# Stage the cleaned hospital data as a single CSV with a fixed name.
# Spark writes a directory of part files rather than one named file, so the
# frame is coalesced to one partition, written to a scratch folder, and the
# single part file is then moved into the staging container.
output_container_path = "abfss://stagingdata@healthcareblob31dec.dfs.core.windows.net"
# Scratch folder for the intermediate write.
# NOTE(review): this is a relative path, so it presumably resolves against the
# workspace's default filesystem rather than the staging container — confirm.
output_blob_folder = "stagingdata/"
hospital_data.coalesce(1).write.mode("overwrite").option("header" , "true").format("com.databricks.spark.csv").save(output_blob_folder)

# Locate the part file Spark produced inside the scratch folder.
files = dbutils.fs.ls(output_blob_folder)
outputfile = [x for x in files if x.name.startswith("part-")]

# Fail with a descriptive error instead of a bare IndexError when the write
# produced no part file (for example, the listing ran against the wrong folder).
if not outputfile:
    raise FileNotFoundError(
        "No part- file found in %s after writing hospital_data" % output_blob_folder
    )

# Move (and thereby rename) the part file into the staging container; the
# call's return value is left as the cell's output.
dbutils.fs.mv(outputfile[0].path, "%s/hospital_data_stage.csv"% output_container_path)

True