In [83]:
import json
import re
from awsglue.context import GlueContext
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructField, StructType, StringType, LongType


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [84]:


# Pattern that recognises a single path segment holding an identifier, so the
# segment can be swapped for a placeholder. One capture group per identifier kind:
#   1. UUID-shaped value (8-4-4-4-12 lowercase alphanumerics)
#   2. URN (`urn:<nid>:<nss>`)
#   3. purely numeric id
# Each alternative ends with the lookahead `(?=/|$)` instead of consuming an
# optional trailing slash. This (a) leaves the slash available as the start of
# the next match, so adjacent identifiers such as `/123/456` are both replaced,
# and (b) requires the identifier to fill the whole segment, so `/3ds` is no
# longer rewritten to `/{param}ds`.
# NOTE(review): `A-z` spans ASCII 0x41-0x7A and therefore also admits
# `[ \ ] ^ _` and the backtick; it is kept as-is so that nothing that matched
# before stops matching - confirm whether `A-Z` was intended.
regex = (
    r'/([0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12})(?=/|$)'
    r'|/(urn:[a-z0-9A-z]*:[a-z0-9A-z]+)(?=/|$)'
    r'|/([0-9]+)(?=/|$)'
)

# Sample request paths covering each identifier kind (numeric id, UUID, URN),
# with and without trailing slashes or sub-resources, plus one path that
# contains no identifier at all.
examples = [
    '/open-banking/accounts/v1/accounts-id/123456789',
    '/open-banking/accounts/v1/accounts-id/123456789/a',
    '/open-banking/accounts/v1/accounts-uuid/92490b80-6e37-11ec-ae98-f1480fe51782',
    '/open-banking/accounts/v1/accounts-uuid/92490b80-6e37-11ec-ae98-f1480fe51782/',
    '/open-banking/accounts/v1/accounts-urn/urn:isbn:0451450523',
    '/open-banking/accounts/v1/accounts-urn/urn:isbn:0451450523/',
    '/open-banking/accounts/v1/accounts-urn/urn:isbn:0451450523/balances',
    '/open-banking/accounts/v1/accounts-urn-uuid/urn:isbn:0451450523/balances/92490b80-6e37-11ec-ae98-f1480fe51782',
    '/open-banking/accounts/v1/accounts-urn-uuid-id/urn:isbn:0451450523/balances/92490b80-6e37-11ec-ae98-f1480fe51782/transactions/1',
    '/open-banking/customers/v1/business/qualifications',
]


def replacer(match):
    """Substitution callback for ``re.sub`` that masks the captured identifier.

    The pattern is expected to wrap each identifier alternative in its own
    capture group. The first group that captured a non-empty value is replaced
    by the literal ``{param}``; everything else in the matched text (for
    example the surrounding slashes) is kept.

    Works for any number of capture groups rather than assuming exactly three,
    so the pattern can gain or lose alternatives without touching this function.

    Args:
        match: the ``re.Match`` object handed over by ``re.sub``.

    Returns:
        The matched text with the captured identifier replaced by ``{param}``,
        or the matched text unchanged when no group captured anything.
    """
    original = match.group(0)
    offset = match.start()
    for idx, captured in enumerate(match.groups(), start=1):
        if captured:
            # Splice by position so only the captured span is replaced, even if
            # the same text happens to occur elsewhere in the match.
            begin = match.start(idx) - offset
            end = match.end(idx) - offset
            return original[:begin] + '{param}' + original[end:]
    return original

# Mask the identifiers in every sample path, then leave the list as the last
# expression so the notebook displays it.
examples = list(map(lambda path: re.sub(regex, replacer, path), examples))

examples

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['/open-banking/accounts/v1/accounts-id/{param}', '/open-banking/accounts/v1/accounts-id/{param}/a', '/open-banking/accounts/v1/accounts-uuid/{param}', '/open-banking/accounts/v1/accounts-uuid/{param}/', '/open-banking/accounts/v1/accounts-urn/{param}', '/open-banking/accounts/v1/accounts-urn/{param}/', '/open-banking/accounts/v1/accounts-urn/{param}/balances', '/open-banking/accounts/v1/accounts-urn-uuid/{param}/balances/{param}', '/open-banking/accounts/v1/accounts-urn-uuid-id/{param}/balances/{param}/transactions/{param}', '/open-banking/customers/v1/business/qualifications']

In [None]:
# Bootstrap Spark and Glue.
# NOTE: `spark` holds the SparkContext (not the session), and `gc` (the
# GlueContext) is created here but not used anywhere else in this cell.
session = SparkSession.builder.appName("test").getOrCreate()
spark = session.sparkContext
gc = GlueContext(spark)

# Glob matching every gzipped log object under the prefix.
file = 's3://wfercosta-spark/logs/*.gz'

# Column names follow the CloudFront standard-log field list, in file order.
# Every column is read as a nullable string; no numeric casting is done here.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "date",
        "time",
        "x-edge-location",
        "sc-bytes",
        "c-ip",
        "cs-method",
        "cs(Host)",
        "cs-uri-stem",
        "sc-status",
        "cs(Referer)",
        "cs(User-Agent)",
        "cs-uri-query",
        "cs(Cookie)",
        "x-edge-result-type",
        "x-edge-request-id",
        "x-host-header",
        "cs-protocol",
        "cs-bytes",
        "time-taken",
        "x-forwarded-for",
        "ssl-protocol",
        "ssl-cipher",
        "x-edge-response-result-type",
        "cs-protocol-version",
        "fle-status",
        "fle-encrypted-fields",
        "c-port",
        "time-to-first-byte",
        "x-edge-detailed-result-type",
        "sc-content-type",
        "sc-content-len",
        "sc-range-start",
        "sc-range-end",
    )
])

df = (
    # CloudFront standard logs are tab-delimited (only the `#Fields:` header
    # line uses spaces). Reading them with sep=' ' leaves the whole record in
    # `date` and every other column null.
    # NOTE(review): assumes the objects are raw CloudFront logs - switch back
    # to ' ' if they were re-exported with space separators.
    session.read.csv(file, sep='\t', header=False, schema=schema)
    # Keep track of which log object each row came from.
    .withColumn("filename", F.input_file_name())
    # Drop the `#Version` / `#Fields` header lines, which land in `date`.
    .where(~F.col('date').startswith("#"))
    .select(['date', 'time', 'sc-status', 'cs-uri-stem', 'time-taken', 'filename'])
    .withColumnRenamed("sc-status", "status")
    .withColumnRenamed("cs-uri-stem", "resource")
    .withColumnRenamed("time-taken", "response_time")
    # Combine the separate date and time strings into one timestamp column.
    .withColumn("timestamp", F.to_timestamp(F.concat(F.col('date'), F.lit(' '), F.col('time'))))
    # `time` is now redundant; `date` is kept.
    .drop('time')
)

df.show()