In [0]:
from pyspark.sql import DataFrameWriter
import pandas as pd
from pyspark.sql.functions import *

In [0]:
# List the secret scopes available to this notebook, to find the one holding the storage key.
dbutils.secrets.listScopes()

[SecretScope(name='scope31dec')]

In [0]:
# Show the secret keys stored in the scope (metadata only; secret values are not returned).
dbutils.secrets.list("scope31dec")

[SecretMetadata(key='blobkeysecret31dec')]

In [0]:
# Register the storage-account access key in the Spark session config so that
# abfss:// paths on this account can be accessed. The key is pulled from the
# secret scope at run time rather than being written into the notebook.
account_key_conf = "fs.azure.account.key.healthcareblob31dec.dfs.core.windows.net"
spark.conf.set(
    account_key_conf,
    dbutils.secrets.get(scope="scope31dec", key="blobkeysecret31dec"),
)

In [0]:
# Browse the raw-data container to see which source files are available.
raw_container = "abfss://rawdata@healthcareblob31dec.dfs.core.windows.net"
display(dbutils.fs.ls(raw_container))

path,name,size,modificationTime
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/Patient_records.csv,Patient_records.csv,5110,1704010961000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/claims.json,claims.json,16385,1704027744000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/disease.csv,disease.csv,1489,1704010960000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/group.csv,group.csv,4390,1704010960000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/hospital.csv,hospital.csv,1328,1704010960000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/subgroup.csv,subgroup.csv,561,1704010960000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/subscriber.csv,subscriber.csv,12061,1704010960000


In [0]:
# Load the subscriber extract; the first row is the header and column types are inferred.
# NOTE(review): inferSchema types `Zip Code` as integer (see the printSchema output),
# which would drop any leading zeros -- confirm that is acceptable for this data.
subscriber_csv = "abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/subscriber.csv"
subscriber_data = spark.read.csv(
    path=subscriber_csv,
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the first 10 rows without truncating long cell values.
subscriber_data.show(n=10, truncate=False)

+----------+----------+-----------+-----------------+----------+------+--------------+-------+------------+--------+---------+--------+----------+----------+
|sub_id    |first_name|last_name  |Street           |Birth_date|Gender|Phone         |Country|City        |Zip Code|Subgrp_id|Elig_ind|eff_date  |term_date |
+----------+----------+-----------+-----------------+----------+------+--------------+-------+------------+--------+---------+--------+----------+----------+
|SUBID10000|Harbir    |Vishwakarma|Baria Marg       |1924-06-30|Female|+91 0112009318|India  |Rourkela    |767058  |S107     |Y       |1944-06-30|1954-01-14|
|SUBID10001|Brahmdev  |Sonkar     |Lala Marg        |1948-12-20|Female|+91 1727749552|India  |Tiruvottiyur|34639   |S105     |Y       |1968-12-20|1970-05-16|
|SUBID10002|Ujjawal   |Devi       |Mammen Zila      |1980-04-16|Male  |+91 8547451606|India  |Berhampur   |914455  |S106     |N       |2000-04-16|2008-05-04|
|SUBID10003|Ballari   |Mishra     |Sahni Zila       

In [0]:
# Print the inferred schema (column name, type, nullability) to verify what inferSchema chose.
subscriber_data.printSchema()

root
 |-- sub_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- Birth_date: date (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Phone: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zip Code: integer (nullable = true)
 |-- Subgrp_id: string (nullable = true)
 |-- Elig_ind: string (nullable = true)
 |-- eff_date: date (nullable = true)
 |-- term_date: date (nullable = true)



In [0]:
# Count null values per column.
# when(...) yields a non-null value only for null cells (no otherwise-branch),
# and count() ignores nulls, so each aggregate is that column's null tally.
null_counts = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in subscriber_data.columns
]
subscriber_data.select(null_counts).show()

+------+----------+---------+------+----------+------+-----+-------+----+--------+---------+--------+--------+---------+
|sub_id|first_name|last_name|Street|Birth_date|Gender|Phone|Country|City|Zip Code|Subgrp_id|Elig_ind|eff_date|term_date|
+------+----------+---------+------+----------+------+-----+-------+----+--------+---------+--------+--------+---------+
|     0|        27|        0|     0|         0|     0|    3|      0|   0|       0|        2|       4|       0|        0|
+------+----------+---------+------+----------+------+-----+-------+----+--------+---------+--------+--------+---------+



In [0]:
# Remove the Phone column from the working frame.
subscriber_data = subscriber_data.drop("Phone")

In [0]:
# Replace nulls with explicit defaults: a missing eligibility flag becomes "N"
# and a missing first name becomes the placeholder "Guest/NA".
null_defaults = {
    "Elig_ind": "N",
    "first_name": "Guest/NA",
}
subscriber_data = subscriber_data.fillna(null_defaults)

In [0]:
# Spot-check the first 6 rows after dropping Phone and filling defaults.
subscriber_data.show(n=6)

+----------+----------+-----------+------------+----------+------+-------+------------+--------+---------+--------+----------+----------+
|    sub_id|first_name|  last_name|      Street|Birth_date|Gender|Country|        City|Zip Code|Subgrp_id|Elig_ind|  eff_date| term_date|
+----------+----------+-----------+------------+----------+------+-------+------------+--------+---------+--------+----------+----------+
|SUBID10000|    Harbir|Vishwakarma|  Baria Marg|1924-06-30|Female|  India|    Rourkela|  767058|     S107|       Y|1944-06-30|1954-01-14|
|SUBID10001|  Brahmdev|     Sonkar|   Lala Marg|1948-12-20|Female|  India|Tiruvottiyur|   34639|     S105|       Y|1968-12-20|1970-05-16|
|SUBID10002|   Ujjawal|       Devi| Mammen Zila|1980-04-16|  Male|  India|   Berhampur|  914455|     S106|       N|2000-04-16|2008-05-04|
|SUBID10003|   Ballari|     Mishra|  Sahni Zila|1969-09-25|Female|  India|Bihar Sharif|   91481|     S104|       N|1989-09-25|1995-06-05|
|SUBID10004|   Devnath|  Srivastav

In [0]:
# Derive age in whole years: months elapsed since Birth_date divided by 12, cast to int.
# The value is computed against current_date(), so it depends on the day the cell is run.
months_since_birth = months_between(current_date(), col("Birth_date"))
subscriber_data = subscriber_data.withColumn(
    "Subscriber_Age",
    (months_since_birth / 12).cast("int"),
)

In [0]:
# Birth_date is no longer needed once Subscriber_Age has been derived from it.
subscriber_data = subscriber_data.drop("Birth_date")

In [0]:
# Preview 5 untruncated rows to confirm the new Subscriber_Age column.
subscriber_data.show(n=5, truncate=False)

+----------+----------+-----------+-----------+------+-------+------------+--------+---------+--------+----------+----------+--------------+
|sub_id    |first_name|last_name  |Street     |Gender|Country|City        |Zip Code|Subgrp_id|Elig_ind|eff_date  |term_date |Subscriber_Age|
+----------+----------+-----------+-----------+------+-------+------------+--------+---------+--------+----------+----------+--------------+
|SUBID10000|Harbir    |Vishwakarma|Baria Marg |Female|India  |Rourkela    |767058  |S107     |Y       |1944-06-30|1954-01-14|99            |
|SUBID10001|Brahmdev  |Sonkar     |Lala Marg  |Female|India  |Tiruvottiyur|34639   |S105     |Y       |1968-12-20|1970-05-16|75            |
|SUBID10002|Ujjawal   |Devi       |Mammen Zila|Male  |India  |Berhampur   |914455  |S106     |N       |2000-04-16|2008-05-04|43            |
|SUBID10003|Ballari   |Mishra     |Sahni Zila |Female|India  |Bihar Sharif|91481   |S104     |N       |1989-09-25|1995-06-05|54            |
|SUBID10004|D

In [0]:
# Inspect the rows whose Subgrp_id is still null (all columns, up to 5 rows).
missing_subgroup = col("Subgrp_id").isNull()
subscriber_data.filter(missing_subgroup).show(n=5)

+----------+----------+---------+---------+------+-------+--------+--------+---------+--------+----------+----------+--------------+
|    sub_id|first_name|last_name|   Street|Gender|Country|    City|Zip Code|Subgrp_id|Elig_ind|  eff_date| term_date|Subscriber_Age|
+----------+----------+---------+---------+------+-------+--------+--------+---------+--------+----------+----------+--------------+
|SUBID10022|   Prakash|      Rao|   Sachar|Female|  India|Kottayam|  180680|     NULL|       N|1943-09-15|1948-10-19|           100|
|SUBID10049|   Paridhi|    Yadav|Sant Path|Female|  India|Jabalpur|  883754|     NULL|       N|1979-03-27|1985-06-01|            64|
+----------+----------+---------+---------+------+-------+--------+--------+---------+--------+----------+----------+--------------+



In [0]:
# Export the cleaned subscriber frame as one predictably named CSV in the staging container.
#
# Spark writes a directory of part files rather than a single named file, so the
# frame is first written to a scratch folder and the lone part file is then moved
# to its final name in the staging container.
output_container_path = "abfss://stagingdata@healthcareblob31dec.dfs.core.windows.net"
# NOTE(review): this is a relative path, so it presumably resolves against the
# workspace's default filesystem rather than the staging container -- confirm.
output_blob_folder = "stagingdata/"

# coalesce(1) collapses the frame to a single partition so only one part file is
# produced. "com.databricks.spark.csv" is the legacy name of the CSV data source.
(
    subscriber_data.coalesce(1)
    .write.mode("overwrite")
    .option("header", "true")
    .format("com.databricks.spark.csv")
    .save(output_blob_folder)
)

files = dbutils.fs.ls(output_blob_folder)
outputfile = [x for x in files if x.name.startswith("part-")]

# Fail with a clear message instead of a bare IndexError if the write left no part file.
if not outputfile:
    raise FileNotFoundError(
        "No 'part-' file found in %s after writing subscriber_data" % output_blob_folder
    )

# Kept as the last expression so the cell still displays the result of the move.
dbutils.fs.mv(outputfile[0].path, "%s/subscriber_data_stage.csv" % output_container_path)

True