In [0]:
from pyspark.sql.functions import lit, cast, col
from pyspark.sql.types import *

In [0]:
# mount data
mount_point = "/mnt/adverse-events"

In [0]:
# unmount if needed
mount_point = mount_point
dbutils.fs.unmount(mount_point)

/mnt/adverse-events has been unmounted.


True

In [0]:
# connect to storage account and containers
configs = {"fs.azure.account.auth.type": "OAuth",
           "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
           "fs.azure.account.oauth2.client.id": "e99fd849-a6b3-49c5-b7a0-044016a9b700",
           "fs.azure.account.oauth2.client.secret": dbutils.secrets.get(scope = "key-vault-scope", key = "client-secret") ,
           "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/62a42a60-1f3b-41ee-92d8-d46ce7b9d9ef/oauth2/token/"}

# mount data
dbutils.fs.mount(
    source = "abfss://adverse-events@adverseeventsa.dfs.core.windows.net",
    mount_point = mount_point,
    extra_configs = configs
)

True

In [0]:
%fs
ls "/mnt/adverse-events/transformed-data/"

path,name,size,modificationTime
dbfs:/mnt/adverse-events/transformed-data/dim_action_taken.csv/,dim_action_taken.csv/,0,1730825044000
dbfs:/mnt/adverse-events/transformed-data/dim_administration_route.csv/,dim_administration_route.csv/,0,1730825045000
dbfs:/mnt/adverse-events/transformed-data/dim_country_codes.csv/,dim_country_codes.csv/,0,1730825046000
dbfs:/mnt/adverse-events/transformed-data/dim_drug_information_2015.csv/,dim_drug_information_2015.csv/,0,1730824646000
dbfs:/mnt/adverse-events/transformed-data/dim_drug_information_2016.csv/,dim_drug_information_2016.csv/,0,1730827026000
dbfs:/mnt/adverse-events/transformed-data/dim_drug_information_2017.csv/,dim_drug_information_2017.csv/,0,1730828794000
dbfs:/mnt/adverse-events/transformed-data/dim_drug_information_2018.csv/,dim_drug_information_2018.csv/,0,1730832360000
dbfs:/mnt/adverse-events/transformed-data/dim_drug_information_2019.csv/,dim_drug_information_2019.csv/,0,1730836627000
dbfs:/mnt/adverse-events/transformed-data/dim_drug_information_2020.csv/,dim_drug_information_2020.csv/,0,1730838976000
dbfs:/mnt/adverse-events/transformed-data/dim_drug_information_2021.csv/,dim_drug_information_2021.csv/,0,1730900480000


In [0]:
directory_path = "dbfs:/mnt/adverse-events/transformed-data/"
files = dbutils.fs.ls(directory_path)

In [0]:
def concat_file_years(files, table_name, output_df):
    for file in files:
        if table_name in file.name:
            df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(file.path)
            if 'metadata' in table_name:
                year = (file.name.split('.')[0]).split('_')[3]
            else:
                year = (file.name.split('.')[0]).split('_')[2]
        
            df = df.withColumn('year', lit(year).cast(IntegerType()))
            output_df = output_df.union(df)
    return output_df


In [0]:
# dim_patient
dim_patient_schema = StructType([
    StructField("patient_id", IntegerType(), True),
    StructField("patient_onset_age", DoubleType(), True),
    StructField("patient_age_group", StringType(), True),
    StructField("patient_sex", StringType(), True),
    StructField("patient_weight(kg)", DoubleType(), True),
    StructField("year", IntegerType(), True)
])

In [0]:

# Create an empty DataFrame with the defined schema
combined_df = spark.createDataFrame([], schema=dim_patient_schema)
dim_patient = concat_file_years(files, 'dim_patient', combined_df)
display(dim_patient)




patient_id,patient_onset_age,patient_age_group,patient_sex,patient_weight(kg),year
10219568,80.0,Elderly,Female,,2015
10444882,83.0,Elderly,Female,,2015
10521631,1.0,Child,Male,,2015
10595341,30.0,Adult,Female,42.6,2015
10686463,57.0,Adult,Female,,2015
10686464,49.0,Adult,Male,122.45,2015
10686465,50.0,Adult,Female,,2015
10686466,49.0,Adult,Male,,2015
10686467,69.0,Elderly,Female,,2015
10686468,63.0,Adult,Male,,2015


In [0]:
# fact_reaction
fact_reaction_schema = StructType([
    StructField("patient_id", IntegerType(), True),
    StructField("patient_reaction", StringType(), True),
    StructField("reaction_outcome", IntegerType(), True),
    StructField("reaction_severity", IntegerType(), True),
    StructField("action_taken", IntegerType(), True),
    StructField("medicinal_product", StringType(), True),
    StructField("start_dosage_date", DateType(), True),
    StructField("action_taken", DateType(), True),
    StructField("year", IntegerType(), True)
])

In [0]:
combined_df = spark.createDataFrame([], schema=fact_reaction_schema)
fact_reaction = concat_file_years(files, 'fact_reaction', combined_df)
display(fact_reaction)

patient_id,patient_reaction,reaction_outcome,reaction_severity,action_taken,medicinal_product,start_dosage_date,action_taken.1,year
10686488,knee arthroplasty,1.0,4,4.0,advair,,,2015
10686490,bladder cancer,3.0,6,,bacillus,2013-12-06,,2015
10686490,neuropathy peripheral,3.0,6,,bacillus,2013-12-06,,2015
10686490,vascular cauterisation,1.0,6,,bacillus,2013-12-06,,2015
10686490,neuralgia,3.0,6,,bacillus,2013-12-06,,2015
10686490,pruritus,1.0,6,,bacillus,2013-12-06,,2015
10686490,emergency care examination,1.0,6,,bacillus,2013-12-06,,2015
10686490,blood urine present,1.0,6,,bacillus,2013-12-06,,2015
10686493,myocardial infarction,5.0,1,,fluticasone,2011-08-15,,2015
10686495,monoparesis,1.0,6,4.0,advair,,,2015


In [0]:
# dim_patient
dim_report_metadata_schema = StructType([
    StructField("safety_report_id", IntegerType(), True),
    StructField("occurance_country", StringType(), True),
    StructField("date_report_received", DateType(), True),
    StructField("date_of_transmission", DateType(), True),
    StructField("reported_company", StringType(), True),
    StructField("year", IntegerType(), True)
])

In [0]:
combined_df = spark.createDataFrame([], schema=dim_report_metadata_schema)
dim_report_metadata = concat_file_years(files, 'dim_report_metadata', combined_df)
display(dim_report_metadata)

safety_report_id,occurance_country,date_report_received,date_of_transmission,reported_company,year
10219568,US,2015-07-05,2015-11-25,glaxosmithkline,2015
10444882,IL,2015-08-11,2015-11-25,actelion,2015
10521631,US,2015-01-15,2015-11-25,lundbeck,2015
10595341,,2015-03-09,2016-03-04,0199,2015
10686463,US,2015-01-01,2015-07-20,amgen,2015
10686464,US,2015-01-01,2015-07-20,amgen,2015
10686465,US,2015-01-01,2015-07-20,amgen,2015
10686466,US,2015-01-01,2015-07-20,amgen,2015
10686467,US,2015-01-01,2015-07-20,astrazeneca,2015
10686468,US,2015-01-01,2016-03-04,roche,2015


In [0]:
def concat_drug_info(files, table_name, output_df):
    for file in files:
        if table_name in file.name:
            df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(file.path)
            output_df = output_df.union(df)
    return output_df

In [0]:
# dim_patient
dim_drug_schema = StructType([
    StructField("medicinal_product", StringType(), True),
    StructField("indication", StringType(), True),
    StructField("administration_route", IntegerType(), True),
    StructField("dosage_form", StringType(), True)
])

In [0]:
combined_df = spark.createDataFrame([], schema=dim_drug_schema)
dim_drug_information = concat_drug_info(files, 'dim_drug_information', combined_df)
dim_drug_information = dim_drug_information.dropDuplicates(['medicinal_product'])
display(dim_drug_information)

medicinal_product,indication,administration_route,dosage_form
aaoluona,neoplasm malignant,41.0,injection
abasaglar,type 2 diabetes mellitus,58.0,
abataceptgenetical,rheumatoid arthritis,58.0,injection
abcid,gastrooesophageal reflux disease,48.0,
abella,contraception,48.0,tablet
abirateron,prostate cancer metastatic,48.0,tablet
abiraterona,prostate cancer metastatic,65.0,
abitrexate,lymphomatoid papulosis,48.0,
ablok,product used for unknown indication,65.0,
abrilada,ankylosing spondylitis,58.0,


In [0]:
fact_reaction.repartition(1).write.mode("overwrite").option("header", "true").csv(f"mnt/adverse-events/consumer-data/fact_reaction.csv")
dim_patient.repartition(1).write.mode("overwrite").option("header", "true").csv(f"mnt/adverse-events/consumer-data/dim_patient.csv")
dim_drug_information.repartition(1).write.mode("overwrite").option("header", "true").csv(f"mnt/adverse-events/consumer-data/dim_drug_information.csv")
dim_report_metadata.repartition(1).write.mode("overwrite").option("header", "true").csv(f"mnt/adverse-events/consumer-data/dim_report_metadata.csv")