In [0]:
# Notebook configuration: the storage account and container that hold the data,
# and the Databricks secret scope / key under which the SAS token is stored.
storage_account, container_name = 'stgbus', 'buscontain'
scope_name, secret_name = 'bus-scope', 'busas'

In [0]:
# Sanity check: show the secret scopes that dbutils reports.
available_scopes = dbutils.secrets.listScopes()
display(available_scopes)

name
bus-scope


In [0]:
# List the secrets stored in the configured scope. Uses the shared `scope_name`
# setting instead of repeating the literal, so this check always inspects the
# same scope the SAS token is later read from.
dbutils.secrets.list(scope_name)

[SecretMetadata(key='busas')]

In [0]:
# Select SAS as the authentication type for this storage account's DFS endpoint.
auth_type_key = f"fs.azure.account.auth.type.{storage_account}.dfs.core.windows.net"
spark.conf.set(auth_type_key, "SAS")

In [0]:
# Register the fixed-SAS-token provider class for this storage account.
provider_type_key = f"fs.azure.sas.token.provider.type.{storage_account}.dfs.core.windows.net"
provider_class = "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"
spark.conf.set(provider_type_key, provider_class)

In [0]:
# Set the SAS token for this storage account, read from the Databricks secret
# scope so the token never appears in the notebook source. `scope_name` and
# `secret_name` are already strings, so they are passed directly rather than
# being wrapped in single-placeholder f-strings.
spark.conf.set(
    f"fs.azure.sas.fixed.token.{storage_account}.dfs.core.windows.net",
    dbutils.secrets.get(scope=scope_name, key=secret_name),
)

In [0]:
# Read the business JSON file from the landing folder of the container.
business_json_path = f"abfs://{container_name}@{storage_account}.dfs.core.windows.net/landing/business.json"
businessDf = spark.read.json(business_json_path)

In [0]:
# Peek at the first five rows to eyeball the inferred columns.
businessDf.show(n=5)

+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|         city|               hours|is_open|  latitude|   longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|1616 Chapala St, ...|{NULL, NULL, NULL...|Pns2l4eNsfO8kk83d...|Doctors, Traditio...|Santa Barbara|                NULL|      0|34.4266787|-119.7111968|Abby Rappoport, L...|      93101|           7|  5.0|   CA|
|87 Grasso Plaza S...|{NULL, NULL, NULL...|mpf3x-BjTdTEA3yCZ...|Shipping Centers,...|       Affton|{8:0-18:30, 0:0-0...|      1| 38.551126|  -90.335695|    

In [0]:
# Expose the business DataFrame to Spark SQL under the view name `business`.
businessDf.createOrReplaceTempView('business')

In [0]:
# Number of business rows per state, largest group first.
state_counts_query = '''select state, count(state) as cnt from business group by state order by cnt desc'''
spark.sql(state_counts_query).show()

Validation 1: check that every state in the lookup file is present in the input file

In [0]:
# Read the state lookup CSV (first line is the header; column types are inferred).
state_lookup_path = f"abfs://{container_name}@{storage_account}.dfs.core.windows.net/lookups/lookup_state.csv"
stateDf = spark.read.csv(
    state_lookup_path,
    header=True,
    inferSchema=True,
)

In [0]:
# Expose the state lookup DataFrame to Spark SQL under the view name `states`.
stateDf.createOrReplaceTempView('states')

In [0]:
# Count the distinct states in `business` that also appear in the `states` lookup.
# The result is kept as the raw collect() output (a list of Rows); a later cell
# extracts the `cnt` value from it.
matched_states_query = '''
          select count(distinct b.state) as cnt
          from business b, states s 
          where s.state = b.state 
          '''
actual_count = spark.sql(matched_states_query).collect()

In [0]:
# Show the raw collect() result: a list holding a single Row with the `cnt` field.
print(actual_count)

[Row(cnt=14)]


In [0]:
 # Extract the scalar count from the collected rows. The isinstance guard keeps
 # this cell idempotent: once `actual_count` has been replaced by the number,
 # re-running the cell would otherwise fail when trying to subscript it.
 if isinstance(actual_count, list):
     actual_count = actual_count[0]["cnt"]
 print(actual_count)

14


In [0]:
# Expected number of matches: the distinct, non-null states in the lookup.
# The matched count it is compared against is a COUNT(DISTINCT ...) over an
# equality join, so counting raw lookup rows would cause a false failure if the
# lookup ever contained a duplicate or empty state.
expected_count = stateDf.select("state").dropna().distinct().count()
print(expected_count)

14


In [0]:
# Validation 1 passes only when the matched-state count equals the expected count.
validation_status1 = (expected_count == actual_count)
print(validation_status1)

True


Validation 2: the input file must contain at least 300 records to be processed further

In [0]:
# Minimum number of records required for validation 2.
# Note: this reuses (and overwrites) the `expected_count` name from validation 1.
expected_count: int = 300

In [0]:
# Total number of rows in the business data.
# Note: this reuses (and overwrites) the `actual_count` name from validation 1.
actual_count = businessDf.count()
print(actual_count)

150346


In [0]:
# Validation 2 passes when the record count reaches the required minimum.
validation_status2 = (actual_count >= expected_count)
print(validation_status2)

True


Validation 3: no column may have more than 200 missing values (the `attributes` and `hours` columns are exempt)

In [0]:
# Maximum number of missing (null) values tolerated per column in validation 3.
missing_count: int = 200

In [0]:
from pyspark.sql.functions import col, count, isnull, lit, when

# Validation 3: no required column may contain more than `missing_count` nulls.
#
# The total row count and every per-column null count are computed in a single
# aggregation (one Spark job) instead of one count() job per column. All
# columns are always reported, even after a failing one is found, so the
# printed summary is complete.

# Not required downstream and removed in the next phase, so exempt from the check.
excluded_columns = ['attributes', 'hours']

column_names = businessDf.columns
summary_row = businessDf.agg(
    count(lit(1)).alias("total_rows"),
    # count() skips nulls, so counting when(is-null, 1) yields the null count.
    *[
        count(when(isnull(col(name)), 1)).alias(f"null_count_{idx}")
        for idx, name in enumerate(column_names)
    ],
).collect()[0]

# Read the result by position so the aliases can never clash with data columns.
total_rows = summary_row[0]
null_counts = summary_row[1:]

offending_columns = []
checked_any = False
for column_name, n in zip(column_names, null_counts):
    # Guard against an empty frame, which would otherwise divide by zero.
    p = (n / total_rows) * 100 if total_rows else 0.0
    print(f"Missing values in {column_name}: {p:.2f}%",' and total count: ',n)
    if column_name in excluded_columns:
        continue
    checked_any = True
    if n > missing_count:
        offending_columns.append(column_name)

# Passes only if at least one column was checked and none exceeded the limit
# (matching the original outcome, which stayed False when nothing was checked).
validation_status3 = checked_any and not offending_columns
print(validation_status3)

Missing values in address: 0.00%  and total count:  0
Missing values in attributes: 9.14%  and total count:  13744
Missing values in business_id: 0.00%  and total count:  0
Missing values in categories: 0.07%  and total count:  103
Missing values in city: 0.00%  and total count:  0
Missing values in hours: 15.45%  and total count:  23223
Missing values in is_open: 0.00%  and total count:  0
Missing values in latitude: 0.00%  and total count:  0
Missing values in longitude: 0.00%  and total count:  0
Missing values in name: 0.00%  and total count:  0
Missing values in postal_code: 0.00%  and total count:  0
Missing values in review_count: 0.00%  and total count:  0
Missing values in stars: 0.00%  and total count:  0
Missing values in state: 0.00%  and total count:  0
True


In [0]:
# Overall result: "success" only when all three validations passed.
all_checks_passed = validation_status1 and validation_status2 and validation_status3
validation_status = "success" if all_checks_passed else "failure"
print(validation_status)

success


Write the validation status to a file, which ADF uses as the condition for running copyData

In [0]:
# Wrap the overall status in a one-row, one-column ("status") DataFrame.
status_rows = [(validation_status,)]
val_df = spark.createDataFrame(status_rows, schema=["status"])

In [0]:
# Display the status DataFrame before it is written out.
val_df.show()

+-------+
| status|
+-------+
|success|
+-------+



In [0]:
# Write the status as CSV with a header row, replacing any previous output.
# repartition(1) collapses the frame to a single partition before the write.
validation_output_path = f"abfs://{container_name}@{storage_account}.dfs.core.windows.net/validation/validation.csv"
(
    val_df.repartition(1)
    .write
    .mode("overwrite")
    .format("csv")
    .option("header", True)
    .save(validation_output_path)
)