In [49]:
import json
import datetime
from pyspark import SparkContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from awsglue.context import GlueContext

# Glue context built on top of the active Spark context (created on demand).
gc = GlueContext(SparkContext.getOrCreate())

# Read the daily CSV export from S3; 'withHeader' asks Glue to treat the
# first line of the file as the header row.
ddf = gc.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={"paths": ["s3://wfercosta-spark/logs/DAILY_20211125.csv"]},
    format="csv",
    format_options={'withHeader': True},
)

# Columns consumed by the cells below.
log_columns = ['date', 'context', 'family', 'version', 'resource',
               'priority', 'status', 'response_time', 'timestamp']

# Convert to a Spark DataFrame, keep the needed columns and give the
# temporal / numeric ones an explicit type.
df = (
    ddf.toDF()
    .select(*log_columns)
    .withColumn('date', F.to_date(F.col('date')))
    .withColumn('timestamp', F.to_timestamp(F.col('timestamp')))
    .withColumn('response_time', F.col('response_time').cast('int'))
)

df.show()
df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+------------+--------+-------+--------------------+--------+------+-------------+-------------------+
|      date|     context|  family|version|            resource|priority|status|response_time|          timestamp|
+----------+------------+--------+-------+--------------------+--------+------+-------------+-------------------+
|2022-02-17|open-banking|accounts|     v1|/accounts/v1/acco...|  MEDIUM|   500|         1000|2022-02-17 13:50:36|
|2022-02-17|open-banking|accounts|     v1|/accounts/v1/acco...|  MEDIUM|   500|         1000|2022-02-17 13:51:36|
|2022-02-17|open-banking|accounts|     v1|/accounts/v1/acco...|  MEDIUM|   429|         1000|2022-02-17 13:52:36|
|2022-02-17|open-banking|accounts|     v1|/accounts/v1/acco...|  MEDIUM|   429|         1000|2022-02-17 13:53:36|
+----------+------------+--------+-------+--------------------+--------+------+-------------+-------------------+

root
 |-- date: date (nullable = true)
 |-- context: string (nullable = true)
 |-- fami

In [None]:
# For each endpoint (resource), per day and context.
# Target columns: date, context, family, version, resource, priority, total_downtime_sec, total_uptime_rate

df_aval = df

# Keys identifying one endpoint on one day. Also used as the window
# partition by the ERROR and MIXED sections further down.
column_list = ['date', 'context', 'resource']


def count_if(condition):
    """Return an aggregate expression counting the rows where `condition` holds."""
    return F.sum(F.when(condition, 1).otherwise(0))


# Classify every (date, context, resource) group by the outcomes it contains:
#   SUCCESS - every record has a status below 500
#   ERROR   - no record has a status below 500
#   MIXED   - anything in between
df_aval_class = (
    df_aval.groupby(column_list)
    .agg(
        F.count(F.lit(1)).alias('nr_total'),
        count_if(F.col('status') < 500).alias('nr_success'),
    )
    .withColumn(
        'distribution',
        F.when(F.col('nr_success') == F.col('nr_total'), 'SUCCESS')
        .when(F.col('nr_success') == 0, 'ERROR')
        .otherwise('MIXED'),
    )
    .select(*column_list, 'distribution')
)

# Attach the class to every log record.
# An inner join is used on purpose: the summary is derived from df_aval itself,
# so every group has matching records. A full outer join would additionally
# emit the summary rows of groups whose key contains a NULL (NULL keys never
# match in an equi-join) as standalone rows with no family / version /
# priority / timestamp, and those phantom rows would leak into the final result.
df_aval = df_aval.join(df_aval_class, column_list, 'inner')


# Process distribution ERROR
# Groups in which no record succeeded. Only the most recent record of each
# (date, context, resource) group is kept, and the downtime is the number of
# seconds between midnight of that record's day and its timestamp.

# Newest record first inside each group.
latest_first = Window.partitionBy(*column_list).orderBy(F.col('timestamp').desc())

# Seconds elapsed since the start of the record's day.
seconds_since_midnight = (
    F.col('timestamp').cast('long')
    - F.date_trunc('day', F.col('timestamp')).cast('long')
)

df_aval_error = (
    df_aval.filter(F.col('distribution') == 'ERROR')
    .withColumn('row', F.row_number().over(latest_first))
    .filter(F.col('row') == 1)
    .withColumn('total_downtime_sec', seconds_since_midnight)
    .withColumn('total_uptime_rate', F.lit(0))
    .drop('row', 'timestamp', 'response_time', 'distribution', 'status')
)

# Process distribution SUCCESS
# Groups in which every record succeeded: one row per endpoint with zero
# downtime and an uptime rate of 1.
endpoint_columns = ['date', 'context', 'family', 'version', 'resource', 'priority']

df_aval_success = (
    df_aval.filter(F.col('distribution') == 'SUCCESS')
    .select(*endpoint_columns)
    .distinct()
    .withColumn('total_uptime_rate', F.lit(1))
    .withColumn('total_downtime_sec', F.lit(0))
)

# Process distribution MIXED
# Groups containing both failed and successful records. Downtime is measured
# from the first record of an ERROR run to the first SUCCESS record after it.
# NOTE(review): an ERROR run that is still open at the last record of the group
# has no closing SUCCESS record, so it adds nothing to the downtime - confirm
# this is intended.

# Chronological order inside each (date, context, resource) group.
window = Window.partitionBy(*column_list).orderBy(F.col('timestamp'))

# True for the first record of a group (it has no predecessor).
is_first_record = F.col('state_prev').isNull()

df_aval_mixed = (
    df_aval.filter(F.col('distribution') == 'MIXED')
    .withColumn('state', F.when(F.col('status') >= 500, 'ERROR').otherwise('SUCCESS'))
    .withColumn('state_prev', F.lag('state').over(window))
    # Drop intermediate records: keep the first record and every state transition.
    .filter(is_first_record | (F.col('state_prev') != F.col('state')))
    # Evaluated on the surviving rows only, so "previous" now means the previous
    # transition; the first record falls back to its own timestamp.
    .withColumn(
        'timestamp_prev',
        F.coalesce(F.lag('timestamp').over(window), F.col('timestamp')),
    )
    # Only recoveries (and the first record, which contributes 0 s) count.
    .filter(is_first_record | (F.col('state') == 'SUCCESS'))
    # Downtime in seconds between the previous transition and this record.
    .withColumn(
        'total_downtime_sec',
        F.col('timestamp').cast('long') - F.col('timestamp_prev').cast('long'),
    )
    .groupby('date', 'context', 'family', 'version', 'resource', 'priority')
    .agg(F.sum('total_downtime_sec').alias('total_downtime_sec'))
)


SECONDS_PER_DAY = 24 * 60 * 60


def calculate_uptime_rate(downtime_sec):
    """Return the fraction of a 24-hour day that was not downtime."""
    return (SECONDS_PER_DAY - downtime_sec) / SECONDS_PER_DAY


# Uptime rate, rounded to three decimals.
df_aval_mixed = df_aval_mixed.withColumn(
    'total_uptime_rate',
    F.round(calculate_uptime_rate(F.col('total_downtime_sec')), 3),
)


# Common column layout for the three partial results; union() pairs columns
# by position, so all three frames are projected to the same order first.
selected_columns = ['date', 'context', 'family', 'version', 'resource',
                    'priority', 'total_downtime_sec', 'total_uptime_rate']

df_aval_mixed, df_aval_error, df_aval_success = (
    partial.select(selected_columns)
    for partial in (df_aval_mixed, df_aval_error, df_aval_success)
)

# Stack MIXED, ERROR and SUCCESS endpoints into the final availability frame.
df_aval = df_aval_mixed.union(df_aval_error).union(df_aval_success)

df_aval.show(truncate=False)
df_aval.printSchema()

# Dump the whole result as pretty-printed JSON (collects every row on the driver).
print(json.dumps(df_aval.toJSON().map(json.loads).collect(), indent=2))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# General figures per day and context.
# Target columns: date, context, avg_tps, peak_tps, total_nr_rejections, total_nr_errors,
#                 total_uptime_rate, total_downtime_sec, total_scheduled_outage
# NOTE(review): total_scheduled_outage is listed above but nothing in this cell computes it.

df_gen = df

# Maps a timestamp to its one-minute interval by rounding the epoch seconds to
# the nearest multiple of 60.
# NOTE(review): this rounds to the nearest minute rather than truncating, so a
# record at hh:mm:36 is assigned to the following minute - confirm intended.
calculate_interval = lambda field: (F.round(field.cast('long') / 60) * 60.0)\
                        .cast("timestamp")

df_gen = df_gen.withColumn('timestamp_intvl_1_min', calculate_interval(F.col('timestamp')))


# Aggregate expression counting the rows for which `condition` holds.
count_if = lambda condition: F.sum(F.when(condition, 1).otherwise(0))


# Per-minute volume: transactions, rejections (HTTP 429) and errors (status >= 500).
df_gen = df_gen.groupby(['date', 'context', 'timestamp_intvl_1_min']).agg(
    F.count(F.lit(1)).alias('tpm'),
    count_if(F.col('status') == 429).alias('total_nr_rejections'),
    count_if(F.col('status') >= 500).alias('total_nr_errors'),
)

# Transactions per second inside each minute. Kept unrounded here so that the
# average below is not computed from already-rounded values; rounding happens
# once, on the final figures.
df_gen = df_gen.withColumn('tps', F.col('tpm') / 60)

# Daily figures per context. Only minutes that contain at least one record
# exist as rows, so the average is taken over those minutes.
df_gen = df_gen.groupby(['date', 'context']).agg(
    F.round(F.avg('tps'), 3).alias('avg_tps'),
    F.round(F.max('tps'), 3).alias('peak_tps'),
    F.sum('total_nr_rejections').alias('total_nr_rejections'),
    F.sum('total_nr_errors').alias('total_nr_errors'),
)

# Availability per context, derived from the per-endpoint frame `df_aval`
# built in the previous cell. Downtime seconds are additive across endpoints;
# the uptime rate is a 0..1 fraction per endpoint and is therefore averaged -
# summing it would yield a value that grows with the number of endpoints.
df_gen_aval = df_aval.groupby(['date', 'context']).agg(
    F.sum('total_downtime_sec').alias('total_downtime_sec'),
    F.round(F.avg('total_uptime_rate'), 3).alias('total_uptime_rate'),
)

df_gen = df_gen.join(df_gen_aval, ['date', 'context'])


df_gen.show()
df_gen.printSchema()

# Dump the whole result as pretty-printed JSON (collects every row on the driver).
print(json.dumps(df_gen.toJSON().map(lambda j: json.loads(j)).collect(), indent=2))

In [None]:
# General figures per day, context and priority.
# Target columns: date, context, priority, total_nr_invocations, avg_response

df_gen_pri = df.groupby('date', 'context', 'priority').agg(
    F.count(F.lit(1)).alias('total_nr_invocations'),
    F.avg('response_time').alias('avg_response'),
)

df_gen_pri.show()
df_gen_pri.printSchema()

# Dump the whole result as pretty-printed JSON (collects every row on the driver).
print(json.dumps(df_gen_pri.toJSON().map(json.loads).collect(), indent=2))