In [1]:
import json
from datetime import datetime
from functools import reduce
from collections import defaultdict
from pyspark import SparkContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from awsglue.context import GlueContext

gc = GlueContext(SparkContext.getOrCreate())

ddf = gc.create_dynamic_frame_from_options("s3"\
                                          , {"paths": ["s3://wfercosta-spark/logs/DAILY_20220117.csv"]}\
                                          ,"csv"\
                                          ,{'withHeader':True})
        
df = ddf.toDF()

df = df.withColumn('date', F.to_date(df.date))
df = df.withColumn('timestamp', F.to_timestamp(df.timestamp))
df = df.withColumn('response_time', df.response_time.cast('int'))

df.show()

df.printSchema()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,,pyspark,idle,,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+------------+------------+-------+--------------------+------+-------------+-------------------+
|      date|     context|      family|version|            resource|status|response_time|          timestamp|
+----------+------------+------------+-------+--------------------+------+-------------+-------------------+
|2021-05-11|open-banking|    accounts|     v1|           /accounts|   200|          112|2022-01-17 13:50:00|
|2021-05-11|open-banking|    accounts|     v1|      /accounts/{id}|   200|          112|2022-01-17 13:50:00|
|2021-05-11|open-banking|    accounts|     v1|/accounts/{id}/tr...|   200|          112|2022-01-17 13:50:00|
|2021-05-11|open-banking|    accounts|     v1|/accounts/{id}/ba...|   200|          112|2022-01-17 13:50:00|
|2021-05-11|open-banking|       loans|     v1|              /loans|   200|          112|2022-01-17 13:50:00|
|2021-05-11|open-banking|       loans|     v1|         /loans/{id}|   200|          112|2022-01-17 13:50:00|
|2021-05-11|open-ba

In [2]:

df_latest = df

column_list = ['context', 'family', 'version', 'resource']

window = Window().partitionBy([F.col(x) for x in column_list]).orderBy([\
                                                                          F.col('family')
                                                                        , F.col('version')
                                                                        , F.col('resource')\
                                                                        , F.col('timestamp').desc()])

df_latest = df_latest.withColumn('row', F.row_number().over(window))\
    .filter(F.col('row') == 1)

df_latest.show()



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+------------+------------+-------+--------------------+------+-------------+-------------------+---+
|      date|     context|      family|version|            resource|status|response_time|          timestamp|row|
+----------+------------+------------+-------+--------------------+------+-------------+-------------------+---+
|2021-05-11|open-banking|    accounts|     v1|/accounts/{id}/tr...|   200|          112|2022-01-17 13:50:00|  1|
|2021-05-11|open-banking|    accounts|     v1|      /accounts/{id}|   200|          112|2022-01-17 13:50:00|  1|
|2021-05-11|open-banking|       loans|     v1|         /loans/{id}|   200|          112|2022-01-17 13:50:00|  1|
|      null|        null|        null|   null|                null|  null|         null|               null|  1|
|2021-05-11|open-banking|    accounts|     v1|           /accounts|   200|          112|2022-01-17 13:50:00|  1|
|2021-05-11|open-banking|       loans|     v1|              /loans|   200|          112|2022-01-

In [3]:


df_status = df_latest

is_unavailable = lambda status: (status >= 500) & (status < 600)

df_status = df_status.withColumn('unavailable', is_unavailable(df_status.status))
df_status = df_status.withColumn('resource', F.concat(F.lit('/'), df_status.context\
                                                      , F.lit('/'), df_status.family\
                                                      , F.lit('/'), df_status.version\
                                                      , df_status.resource))

df_status = df_status.select(['context', 'resource', 'unavailable'])

elements = df_status.toJSON().map(lambda j: json.loads(j)).collect()

print(json.dumps(elements, indent=2))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[
  {
    "context": "open-banking",
    "resource": "/open-banking/accounts/v1/accounts/{id}/transactions",
    "unavailable": false
  },
  {
    "context": "open-banking",
    "resource": "/open-banking/accounts/v1/accounts/{id}",
    "unavailable": false
  },
  {
    "context": "open-banking",
    "resource": "/open-banking/loans/v1/loans/{id}",
    "unavailable": false
  },
  {},
  {
    "context": "open-banking",
    "resource": "/open-banking/accounts/v1/accounts",
    "unavailable": false
  },
  {
    "context": "open-banking",
    "resource": "/open-banking/loans/v1/loans",
    "unavailable": false
  },
  {
    "context": "open-banking",
    "resource": "/open-banking/accounts/v1/accounts/{id}/balances",
    "unavailable": false
  },
  {
    "context": "open-banking",
    "resource": "/open-banking/credit-cards/v1/accounts",
    "unavailable": false
  },
  {
    "context": "open-banking",
    "resource": "/open-banking/credit-cards/v1/accounts/{id}/balances",
    "unavailable":

In [6]:

def group_by_context(acc, el):
    context = el['context']
    del el['context']
    acc[context].append(el)
    return acc

grouped = dict(reduce(group_by_context, elements, defaultdict(list)))

print(json.dumps(grouped, indent=2))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
'context'
Traceback (most recent call last):
  File "<stdin>", line 6, in group_by_context
KeyError: 'context'



In [5]:
def get_status(total, unavailables):
    diff =  total - unavailables
    
    if unavailables > 0:
        if diff == 0:
            return 'UNAVAILABLE'
        else:
            return 'PARTIAL_FAILURE'
    else:
        return 'OK'

def reduce_count_unavailables(acc, el):
    return acc + (1 if el['unavailable'] else 0)
    
    
statuses = list()

for key, values in grouped.items():
    
    count_endpoints = len(values)
    count_unavailables = reduce(reduce_count_unavailables, values, 0)
    
    status = get_status(count_endpoints, count_unavailables)
    
    unavailables_endpoints = list(map(lambda el: el['resource'], filter(lambda el: el['unavailable'], values)))
    
    document = {
        'context': key,
        'status': status,
        'unavailableEndpoints': unavailables_endpoints,
        'detectionTime': datetime.now().isoformat(),
        }
    
    statuses.append(document)
    

print(json.dumps(statuses, indent=2))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
name 'grouped' is not defined
Traceback (most recent call last):
NameError: name 'grouped' is not defined

