In [1]:
from pyspark import SparkContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from awsglue.context import GlueContext

gc = GlueContext(SparkContext.getOrCreate()) 
# df = gc.create_dynamic_frame_from_options(connection_type = "s3"\
#                                           , connection_options = {"paths": ["s3://wfercosta-spark/logs.csv"]}\
#                                           , format = "csv"\
#                                           , {'withHeader': True})

ddf = gc.create_dynamic_frame_from_options("s3"\
                                          , {"paths": ["s3://wfercosta-spark/DAILY_20211225.csv"]}\
                                          ,"csv"\
                                          ,{'withHeader':True})
        
df = ddf.toDF()
df = df.select(['date', 'context', 'family', 'version', 'resource'\
                , 'priority', 'status', 'response_time', 'timestamp'])

df = df.withColumn('date', F.to_date(df.date))
df = df.withColumn('timestamp', F.to_timestamp(df.timestamp))
df = df.withColumn('response_time', df.response_time.cast('int'))

df.show()
df.printSchema()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,,pyspark,idle,,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+------------+---------+-------+--------------------+----------+------+-------------+-------------------+
|      date|     context|   family|version|            resource|  priority|status|response_time|          timestamp|
+----------+------------+---------+-------+--------------------+----------+------+-------------+-------------------+
|2021-12-25|open-banking| accounts|     v1|/accounts/v1/acco...|    MEDIUM|   200|          500|2021-12-25 13:50:36|
|2021-12-25|open-banking| accounts|     v1|/accounts/v1/acco...|    MEDIUM|   500|          500|2021-12-25 13:51:36|
|2021-12-25|open-banking| accounts|     v1|/accounts/v1/acco...|    MEDIUM|   500|          500|2021-12-25 13:52:36|
|2021-12-25|open-banking| accounts|     v1|/accounts/v1/acco...|    MEDIUM|   500|          500|2021-12-25 13:53:36|
|2021-12-25|open-banking| accounts|     v1|/accounts/v1/acco...|    MEDIUM|   200|          500|2021-12-25 13:54:36|
|2021-12-25|open-banking| accounts|     v1|/accounts/v1/acco...|

In [2]:
#For each endpoint in day and by context
# date, context, family, version, resource, priority, total_downtime_sec, total_uptime_rate

df_aval = df

#Replicates the columns unneeded and replicates prev rows values on next
column_list = ['date', 'context', 'resource']

window = Window().partitionBy([F.col(x) for x in column_list]).orderBy([\
                                                                        F.col('resource')\
                                                                        , F.col('timestamp')])

df_aval = df_aval.withColumn('status_prev', F.lag('status').over(window))

#Filters intermediate row that is not a state transition
df_aval = df_aval.filter(df_aval.status_prev.isNull() \
                                | (df_aval.status_prev != df_aval.status))

df_aval = df_aval.withColumn("timestamp_prev", F.lag("timestamp").over(window))
df_aval = df_aval.withColumn("timestamp_prev", F.coalesce(df_aval.timestamp_prev, df_aval.timestamp)) 

df_aval = df_aval.filter(df_aval.status_prev.isNull() \
                         | ((df_aval.status >= 200) & (df_aval.status < 300)) \
                         | ((df_aval.status >= 400) & (df_aval.status < 500)))


# Calculates the downtime in seconds
df_aval = df_aval.withColumn('total_downtime_sec'\
                             , F.col('timestamp').cast('long') - F.col('timestamp_prev').cast('long'))

df_aval = df_aval.groupby(['date', 'context'\
                , 'family', 'version'\
                , 'resource', 'priority'])\
                    .agg(\
                         F.sum('total_downtime_sec').alias('total_downtime_sec'))


# Calculates the uptime rate 
calculate_uptime_rate = lambda downtime_sec: (((24 * 60 * 60) - downtime_sec)/(24 * 60 * 60))

df_aval = df_aval.withColumn('total_uptime_rate', calculate_uptime_rate(df_aval.total_downtime_sec))
df_aval = df_aval.withColumn('total_uptime_rate', F.round(df_aval.total_uptime_rate, 3))


df_aval.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+--------------+---------+-------+--------------------+----------+------------------+-----------------+
|      date|       context|   family|version|            resource|  priority|total_downtime_sec|total_uptime_rate|
+----------+--------------+---------+-------+--------------------+----------+------------------+-----------------+
|2021-12-26|  open-banking| accounts|     v1|/accounts/v1/acco...|    MEDIUM|                 0|              1.0|
|2021-12-25|open-insurance| accounts|     v1|/accounts/v1/acco...|    MEDIUM|               180|            0.998|
|2021-12-25|  open-banking|    admin|     v1|   /admin/v1/metrics|UNATTENDED|               180|            0.998|
|2021-12-25|  open-banking|discovery|     v1|/discovery/v1/status|      HIGH|               180|            0.998|
|2021-12-25|open-insurance|    admin|     v1|   /admin/v1/metrics|UNATTENDED|               180|            0.998|
|2021-12-25|open-insurance|discovery|     v1|/discovery/v1/status|      HIGH|   

In [48]:
# General in a day and context
# date, context, avg_tps, peak_tps, total_nr_rejections, total_nr_errors, total_uptime_rate, total_downtime_sec, total_scheduled_outage

#df.groupby('context', 'resource').agg(min(df.latency), max(df.latency), avg(df.latency)).show()

# df_final = df.groupby('contex', 'family', 'version', 'resource').agg()

# df_final.show()

df_gen = df

# Calculates the right time interval for each row
calculate_interval = lambda field: (F.round(field.cast('long') / 60) * 60.0)\
                        .cast("timestamp")

df_gen = df_gen.withColumn('timestamp_intvl_1_min', calculate_interval(df_gen.timestamp))


count_if = lambda condition: F.sum(F.when(condition, 1).otherwise(0))


df_gen = df_gen.groupby(['date', 'context', 'timestamp_intvl_1_min'])\
            .agg(\
                 F.count(F.lit(1)).alias('tpm')\
                 , count_if(F.col('status') == 429).alias('total_nr_rejections') \
                 , count_if(F.col('status') > 500).alias('total_nr_errors'))


df_gen = df_gen.withColumn('avg_tps', F.round(df_gen.tpm / 60, 3))


df_gen = df_gen.groupby(['date', 'context'])\
            .agg(\
                   F.round(F.avg('avg_tps'), 3).alias('avg_tps')\
                 , F.round(F.max('avg_tps'), 3).alias('peak_tps')\
                 , F.sum('total_nr_rejections').alias('total_nr_rejections')\
                 , F.sum('total_nr_errors').alias('total_nr_errors'))


df_gen_aval = df_aval.groupby(['date', 'context'])\
            .agg(\
                  F.sum('total_downtime_sec').alias('total_downtime_sec')
                , F.sum('total_uptime_rate').alias('total_uptime_rate'))


df_gen = df_gen.join(df_gen_aval, ['date', 'context'])

df_gen.show()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+--------------+-------+--------+-------------------+---------------+------------------+------------------+
|      date|       context|avg_tps|peak_tps|total_nr_rejections|total_nr_errors|total_downtime_sec| total_uptime_rate|
+----------+--------------+-------+--------+-------------------+---------------+------------------+------------------+
|2021-12-26|  open-banking|  0.017|   0.017|                  0|              0|                 0|               1.0|
|2021-12-25|  open-banking|   0.05|    0.05|                  0|              0|               540|2.9939999999999998|
|2021-12-25|open-insurance|   0.05|    0.05|                  0|              0|               540|2.9939999999999998|
+----------+--------------+-------+--------+-------------------+---------------+------------------+------------------+

In [4]:
# Gneral in a day and by Priority and context
# date, context, priority, total_nr_invocations, avg_response

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…