# Transform Customers Data
1. Remove records with NULL customer_id
2. Remove exact duplicate records
3. Remove duplicate records per customer_id, keeping the record with the latest created_timestamp
4. CAST the columns to the correct Data type
5. Write Transformed data into Silver Schema

In [0]:
# Option 1: load the bronze customers table by running a SQL query.
df = spark.sql('select * from gizmobox.bronze.py_customers')
display(df)

In [0]:
# Option 2: load the same bronze table directly by name with spark.table.
df = spark.table('gizmobox.bronze.py_customers')
display(df)

In [0]:
# Option 3: load the same bronze table through spark.read.table.
# When the notebook runs top to bottom, the `df` assigned here is the one
# the transformation steps below operate on.
df = spark.read.table('gizmobox.bronze.py_customers')
display(df)

### 1. Remove records with NULL customer_id

In [0]:
# Keep only the rows that carry a customer_id.
# The same condition can be written as a SQL string: 'customer_id is not null'.
df_filtered = df.where(
    df['customer_id'].isNotNull()
)
display(df_filtered)

### 2. Remove exact duplicate records

In [0]:
# Option 1: drop rows that are exact duplicates using distinct().
df_distinct = df_filtered.distinct()
display(df_distinct)

In [0]:
# Option 2: the same de-duplication using dropDuplicates() with no column subset.
# This reassigns df_distinct from df_filtered, so it replaces the result of the previous cell.
df_distinct = df_filtered.dropDuplicates()
display(df_distinct)

### 3. Remove duplicate records per customer_id, keeping the record with the latest created_timestamp

In [0]:
from pyspark.sql import functions as F

# One row per customer_id holding the greatest created_timestamp seen for it.
# NOTE(review): created_timestamp is only cast to timestamp in step 4, so this
# max() compares the values as stored in bronze - confirm they sort chronologically.
df_max_timestamp = (
    df_distinct
    .groupBy("customer_id")
    .agg(F.max("created_timestamp").alias('max_created_timestamp'))
)
display(df_max_timestamp)

In [0]:
# Join each record to its customer's max timestamp and keep only the rows whose
# created_timestamp equals that max; the select drops the helper columns again.
# Rows that tie on the max timestamp for a customer are all kept.
df_distinct_customer = (
    df_distinct
    .join(
        df_max_timestamp,
        on=(
            (df_distinct.customer_id == df_max_timestamp.customer_id)
            & (df_distinct.created_timestamp == df_max_timestamp.max_created_timestamp)
        ),
        how='inner',
    )
    .select(df_distinct['*'])
)
display(df_distinct_customer)

### 4. CAST the columns to the correct Data type

In [0]:
# Project the customer columns in their final order, casting created_timestamp
# to timestamp and the two date columns to date; the rest pass through unchanged.
df_casted_customer = df_distinct_customer.select(
    df_distinct_customer['created_timestamp'].cast('timestamp'),
    df_distinct_customer['customer_id'],
    df_distinct_customer['customer_name'],
    df_distinct_customer['date_of_birth'].cast('date'),
    df_distinct_customer['email'],
    df_distinct_customer['member_since'].cast('date'),
    df_distinct_customer['telephone'],
)
display(df_casted_customer)

### 5. Write Transformed data into Silver Schema

In [0]:
# Persist the transformed customers as the silver table, creating it or
# replacing it if it already exists.
(
    df_casted_customer
    .writeTo("gizmobox.silver.py_customers")
    .createOrReplace()
)

In [0]:
%sql
select * from gizmobox.silver.py_customers;
-- Verification query for the silver table written in the previous cell.
-- Observed when this notebook was developed: 50 records (re-check if the bronze data changes).
-- The column data types should match the casts applied in step 4.