In [0]:
from pyspark.sql import DataFrameWriter
import pandas as pd
from pyspark.sql.functions import *

In [0]:
# Enumerate the secret scopes available to this notebook; the call returns
# scope metadata (names), not secret values.
dbutils.secrets.listScopes()

[SecretScope(name='scope31dec')]

In [0]:
# Show which secret keys exist inside the 'scope31dec' scope.
# Only key metadata is returned here; the secret values themselves are not displayed.
dbutils.secrets.list(
    scope="scope31dec",
)

[SecretMetadata(key='blobkeysecret31dec')]

In [0]:
# Register the access key for the 'healthcareblob31dec' storage account in the
# Spark session configuration so abfss:// paths on that account can be read.
# The key is fetched from the secret scope at run time rather than being
# hard-coded in the notebook.
spark.conf.set("fs.azure.account.key.healthcareblob31dec.dfs.core.windows.net",
    dbutils.secrets.get(scope="scope31dec", key="blobkeysecret31dec"))

In [0]:
# List the contents of the 'rawdata' container to confirm the storage
# configuration works and to see which source CSV files are available.
display(dbutils.fs.ls("abfss://rawdata@healthcareblob31dec.dfs.core.windows.net"))

path,name,size,modificationTime
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/Patient_records.csv,Patient_records.csv,5110,1704010961000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/disease.csv,disease.csv,1489,1704010960000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/group.csv,group.csv,4390,1704010960000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/hospital.csv,hospital.csv,1328,1704010960000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/subgroup.csv,subgroup.csv,561,1704010960000
abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/subscriber.csv,subscriber.csv,12061,1704010960000


In [0]:
# Load the raw patient records. The first row is treated as the header and
# column types are inferred from the data instead of being declared up front.
patient_data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("abfss://rawdata@healthcareblob31dec.dfs.core.windows.net/Patient_records.csv")
)

In [0]:
# Preview the first 10 rows without truncating long cell values.
# NOTE(review): the rendered output includes patient names and phone numbers;
# clear or redact the output before sharing this notebook.
patient_data.show(n=10, truncate=False)

+----------+------------+--------------+------------------+--------------+----------------+------------+-----------+
|Patient_id|Patient_name|patient_gender|patient_birth_date|patient_phone |disease_name    |city        |hospital_id|
+----------+------------+--------------+------------------+--------------+----------------+------------+-----------+
|187158    |Harbir      |Female        |1924-06-30        |+91 0112009318|Galactosemia    |Rourkela    |H1001      |
|112766    |Brahmdev    |Female        |1948-12-20        |+91 1727749552|Bladder cancer  |Tiruvottiyur|H1016      |
|199252    |Ujjawal     |Male          |1980-04-16        |+91 8547451606|Kidney cancer   |Berhampur   |H1009      |
|133424    |Ballari     |Female        |1969-09-25        |+91 0106026841|Suicide         |Bihar Sharif|H1017      |
|172579    |Devnath     |Female        |1946-05-01        |+91 1868774631|Food allergy    |Bidhannagar |H1019      |
|171320    |Atasi       |Male          |1967-10-02        |+91 9

In [0]:
# Print the inferred schema to verify the column types picked by inferSchema
# (for example, that patient_birth_date was parsed as a date).
patient_data.printSchema()

root
 |-- Patient_id: integer (nullable = true)
 |-- Patient_name: string (nullable = true)
 |-- patient_gender: string (nullable = true)
 |-- patient_birth_date: date (nullable = true)
 |-- patient_phone: string (nullable = true)
 |-- disease_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- hospital_id: string (nullable = true)



In [0]:
# Keep only the columns used further on (patient_phone is not carried forward)
# and collapse rows that are exact duplicates across the remaining columns.
patient_data = patient_data.select(
    [
        "Patient_id",
        "Patient_name",
        "patient_gender",
        "patient_birth_date",
        "disease_name",
        "city",
        "hospital_id",
    ]
).distinct()

In [0]:
# Replace missing patient names with the placeholder "Guest/NA";
# other columns are left untouched.
patient_data = patient_data.na.fill({"Patient_name": "Guest/NA"})

In [0]:
# Check for null values per column (the check below is currently commented out)
#patient_data.select([count(when(isnan(c) | col(c).isNull(), c)). alias(c) for c in patient_data.columns]).show()

In [0]:
# Inspect the first 20 rows after column selection and name back-filling.
# NOTE(review): the rendered output contains patient names; clear or redact
# the output before sharing this notebook.
patient_data.show(n=20)

+----------+------------+--------------+------------------+----------------+--------------------+-----------+
|Patient_id|Patient_name|patient_gender|patient_birth_date|    disease_name|                city|hospital_id|
+----------+------------+--------------+------------------+----------------+--------------------+-----------+
|    130339|       Aakar|        Female|        1925-03-05|Drug consumption|        Bihar Sharif|      H1000|
|    167340|    Guest/NA|        Female|        1981-01-25|    Galactosemia|Surendranagar Dud...|      H1003|
|    148137|       Umang|        Female|        1963-07-14|     Pet allergy|            Haridwar|      H1002|
|    149367|    Guest/NA|          Male|        1925-06-12|    Head banging|           Bangalore|      H1013|
|    146382|  Dharmadaas|          Male|        1964-04-29|         Anthrax|Bhalswa Jahangir Pur|      H1019|
|    184479|      Bandhu|          Male|        1996-10-15|  Pollen allergy|           Chinsurah|      H1010|
|    13342

In [0]:
# Derive the patient's age in fractional years: months elapsed between the
# birth date and today's date, divided by 12.
# NOTE(review): the following cell recomputes Patient_Age with an int cast,
# which makes this intermediate step redundant.
patient_data = patient_data.withColumn(
    "Patient_Age",
    months_between(current_date(), col("patient_birth_date")) / 12,
)

In [0]:
# Age in whole years: the fractional year count (months since birth / 12)
# is truncated by casting it to an integer. The value depends on the date
# on which the notebook is run because current_date() is used.
patient_data = patient_data.withColumn(
    "Patient_Age",
    (
        months_between(current_date(), col("patient_birth_date")) / 12
    ).cast("int"),
)

In [0]:
#patient_data.show(6)

In [0]:
# The birth date is no longer needed once Patient_Age has been derived from it.
patient_data = patient_data.drop("patient_birth_date")

In [0]:
#patient_data.show(6)

In [0]:
# Duplicate check: group on every column and list any combination of values
# that occurs more than once. An empty result means there are no duplicate rows.
(
    patient_data
    .groupBy(*patient_data.columns)
    .count()
    .filter("count >1")
    .show()
)

+----------+------------+--------------+------------+----+-----------+-----------+-----+
|Patient_id|Patient_name|patient_gender|disease_name|city|hospital_id|Patient_Age|count|
+----------+------------+--------------+------------+----+-----------+-----------+-----+
+----------+------------+--------------+------------+----+-----------+-----------+-----+



In [0]:
# Defensive de-duplication: the current data has no duplicate rows, but
# future loads may contain some, so exact duplicates are always removed.
patient_data = patient_data.distinct()

In [0]:
# Publish the cleaned patient data as one predictably named CSV file in the
# staging container. Spark writes a directory of part files, so the frame is
# first written to a scratch folder and the single part file is then moved
# (and renamed) to its final location.
output_container_path = "abfss://stagingdata@healthcareblob31dec.dfs.core.windows.net"
# Scratch folder for the Spark write. NOTE(review): this is a relative path,
# presumably resolved under the workspace's default filesystem - confirm.
output_blob_folder = "stagingdata/"

# coalesce(1) collapses the frame to one partition so the write yields a single
# part-* file. "csv" is the built-in CSV data source; the previously used
# "com.databricks.spark.csv" name is a legacy alias for it.
(
    patient_data.coalesce(1)
    .write.mode("overwrite")
    .option("header", "true")
    .format("csv")
    .save(output_blob_folder)
)

# Locate the data file among the commit/metadata files Spark writes next to it.
files = dbutils.fs.ls(output_blob_folder)
outputfile = [x for x in files if x.name.startswith("part-")]
if not outputfile:
    # Fail with an explicit message instead of a bare IndexError on outputfile[0].
    raise FileNotFoundError(
        "No part-* file found in %s after writing patient data" % output_blob_folder
    )

# Move the part file into the staging container under its final name.
dbutils.fs.mv(outputfile[0].path, "%s/patient_data_stage.csv" % output_container_path)

True