# Transform Orders Data - String to JSON
1. Pre-Process the JSON String to fix the Data Quality Issues (unquoted `order_date` values)
2. Transform JSON String to JSON Object  
3. Write Transformed Data to the Silver Schema

In [0]:
# Load the raw orders from the bronze table and preview them.
bronze_orders_table = 'gizmobox.bronze.py_orders'
orders_df = spark.table(bronze_orders_table)
display(orders_df)

### 1. Pre-Process the JSON String to fix the Data Quality Issues

In [0]:
# Import only the functions this notebook uses. A wildcard import of
# pyspark.sql.functions shadows Python builtins such as sum, min, max and round.
from pyspark.sql.functions import col, from_json, regexp_replace, schema_of_json

# The pattern targets order_date values written as a bare, unquoted date
# (e.g. 2024-01-31), which is not valid JSON. The replacement wraps the matched
# date in double quotes so it becomes a JSON string; `$1` is the captured date.
# Only the repaired string is kept, under the column name `fixed_value`.
fixed_orders_df = orders_df.select(
    regexp_replace(
        'value',
        r'"order_date": (\d{4}-\d{2}-\d{2})',
        '"order_date": "$1"',
    ).alias('fixed_value')
)
display(fixed_orders_df)

### 2. Transform JSON String to JSON Object  
- `schema_of_json` function: infers the schema of a JSON string
- `from_json` function: parses a JSON string into a struct using a given schema

In [0]:
# Derive a schema string from the repaired JSON column and show it for the
# first row only.
inferred_schema_col = schema_of_json(col("fixed_value")).alias('schema')
df_with_schema = fixed_orders_df.select(inferred_schema_col)
display(df_with_schema.limit(1))

In [0]:
# DDL schema used to parse the order JSON. It is written as adjacent string
# literals (one field per line) that concatenate to a single-line DDL string.
# NOTE(review): price and total_amount are BIGINT — confirm that the source
# data never carries fractional amounts.
orders_schema = (
    "STRUCT<"
    "customer_id: BIGINT, "
    "items: ARRAY<STRUCT<"
    "category: STRING, "
    "details: STRUCT<brand: STRING, color: STRING>, "
    "item_id: BIGINT, "
    "name: STRING, "
    "price: BIGINT, "
    "quantity: BIGINT"
    ">>, "
    "order_date: STRING, "
    "order_id: BIGINT, "
    "order_status: STRING, "
    "payment_method: STRING, "
    "total_amount: BIGINT, "
    "transaction_timestamp: STRING"
    ">"
)

In [0]:
# Parse each repaired JSON string into a single struct column named
# `json_value`, using the explicit orders_schema.
parsed_order_col = from_json(col('fixed_value'), orders_schema).alias('json_value')
json_orders_df = fixed_orders_df.select(parsed_order_col)
display(json_orders_df)

### 3. Write Transformed Data to the Silver Schema

In [0]:
# Persist the parsed orders to the silver schema, creating the table or
# replacing it if it already exists.
silver_orders_writer = json_orders_df.writeTo('gizmobox.silver.py_orders_json')
silver_orders_writer.createOrReplace()

In [0]:
%sql
-- Read back the silver table to verify the write.
SELECT *
FROM gizmobox.silver.py_orders_json;