## Transform Orders Data - String to JSON
1. Pre-process the JSON string to fix the data quality issues
2. Transform the JSON string into a JSON object
3. Write the transformed data to the silver schema

In [0]:
# Read the bronze orders table into a DataFrame and preview its rows.
bronze_orders_table = 'gizmobox.bronze.py_orders'
df_orders = spark.read.table(bronze_orders_table)

display(df_orders)

In [0]:
# 1. Pre-process the JSON string to fix the data quality issues

from pyspark.sql import functions as F

# Matches an order_date whose YYYY-MM-DD value is not wrapped in quotes and
# captures the date so it can be re-emitted as a quoted JSON string.
unquoted_order_date = '"order_date": (\\d{4}-\\d{2}-\\d{2})'
quoted_order_date = '"order_date": "$1"'

# Only the repaired string is kept; the result has a single column, "fixed_value".
fixed_value_col = F.regexp_replace(
    "value", unquoted_order_date, quoted_order_date
).alias("fixed_value")

df_fixed_orders = df_orders.select(fixed_value_col)

display(df_fixed_orders)

In [0]:
# 2. Transform the JSON string into a JSON object - start by inferring the schema

# Ask Spark to derive a schema (as a DDL string) from the repaired JSON text.
inferred_schema_col = F.schema_of_json(F.col("fixed_value")).alias("schema")
df_with_schema = df_fixed_orders.select(inferred_schema_col)

# A single row is displayed; presumably the schema shown here is what gets
# copied into the order_schema definition used for parsing - confirm.
display(df_with_schema.limit(1))

In [0]:
# DDL schema used to parse each order's JSON string. The adjacent string
# pieces below concatenate into one single-line DDL string; they are split
# one field per line purely for readability.
order_schema = (
    'STRUCT<'
    'customer_id: BIGINT, '
    # Each order carries a list of line items with nested product details.
    'items: ARRAY<STRUCT<'
    'category: STRING, '
    'details: STRUCT<brand: STRING, color: STRING>, '
    'item_id: BIGINT, '
    'name: STRING, '
    'price: BIGINT, '
    'quantity: BIGINT'
    '>>, '
    # Dates and timestamps are kept as strings at this stage.
    'order_date: STRING, '
    'order_id: BIGINT, '
    'order_status: STRING, '
    'payment_method: STRING, '
    'total_amount: BIGINT, '
    'transaction_timestamp: STRING'
    '>'
)

In [0]:
# Parse the repaired JSON text into a typed struct column using order_schema.
parsed_order_col = F.from_json("fixed_value", order_schema).alias("json_value")
df_json_orders = df_fixed_orders.select(parsed_order_col)

display(df_json_orders)

In [0]:
# 3. Write the transformed data to the silver schema

# Create the silver table from the parsed orders, replacing it if it already exists.
silver_orders_writer = df_json_orders.writeTo("gizmobox.silver.py_orders_json")
silver_orders_writer.createOrReplace()

In [0]:
%sql
-- Preview the silver orders table to check the parsed JSON rows.
SELECT *
FROM gizmobox.silver.py_orders_json