In [0]:
from pyspark.sql.functions import *
# Load the customer CSV, taking column names from the header row. No schema or
# inferSchema option is passed, so Spark's default column typing applies.
df = (
    spark.read.csv(
        '/Volumes/my_catalog/source/pyspark_vol/us_customer_data_2.csv',
        header=True,
    )
    # Collapse every run of whitespace in the email to one space, then trim the ends.
    .withColumn('email', trim(regexp_replace(col('email'), r'\s+', ' ')))
    # Convert registration_date to a date using to_date's default parsing.
    .withColumn('registration_date', to_date(col('registration_date')))
)
# Break the raw `phone` string into country_code / phone_no / ext, then drop the
# raw column and the scratch column used along the way.
df = (
    # Everything before the first 'x' is the main number.
    df.withColumn('first_part', split(col('phone'), 'x')[0])
    # The piece after the first 'x' is the extension; `when` without `otherwise`
    # leaves ext NULL for phones that contain no 'x'.
    .withColumn('ext', when(col('phone').contains('x'), split(col('phone'), 'x')[1]))
    # Keep digits only so the length logic below counts digits, not punctuation.
    .withColumn('first_part', regexp_replace(col('first_part'), r'\D', ''))
    # Digits in front of the last 10 are treated as the country code; numbers with
    # 10 digits or fewer get NULL.
    .withColumn(
        'country_code',
        expr(
            "CASE WHEN length(first_part) > 10 "
            "THEN substring(first_part, 1, length(first_part) - 10) "
            "ELSE NULL END"
        ),
    )
    # right() yields the last 10 digits, or the whole string when it is shorter.
    # Deriving a start position as length - 9 goes negative for numbers with fewer
    # than 9 digits, and Spark then counts that position from the end of the
    # string, silently truncating short numbers to their last few digits.
    .withColumn('phone_no', expr("right(first_part, 10)"))
    .drop('first_part', 'phone')
)

# Keep only the listed columns, in this order.
df = df.select(
    'customer_id',
    'name',
    'email',
    'country_code',
    'phone_no',
    'ext',
    'address',
    'registration_date',
    'loyalty_status',
)
# Print a one-row preview of the cleaned customer frame.
df.show(n=1)

+-----------+-------------+------------------+------------+--------+----+--------------------+-----------------+--------------+
|customer_id|         name|             email|country_code|phone_no| ext|             address|registration_date|loyalty_status|
+-----------+-------------+------------------+------------+--------+----+--------------------+-----------------+--------------+
|          1|Michelle Kidd|vayala@example.net|        NULL|    NULL|NULL|USNS Santiago, FP...|       2025-01-25|          Gold|
+-----------+-------------+------------------+------------+--------+----+--------------------+-----------------+--------------+
only showing top 1 row


In [0]:
# Load the transaction CSV (column names come from the header row) and give the
# two columns used numerically/temporally a proper type.
df2 = (
    spark.read.csv(
        '/Volumes/my_catalog/source/pyspark_vol/transaction_data_1.csv',
        header=True,
    )
    # Cast amount to double so it can be summed.
    .withColumn('amount', col('amount').cast('double'))
    # Convert transaction_date to a date using to_date's default parsing.
    .withColumn('transaction_date', to_date(col('transaction_date')))
)
# Print a one-row preview of the typed transaction frame.
df2.show(n=1)

+--------------+-----------+-------+----------------+----------------+--------------+--------------+
|transaction_id|customer_id| amount|transaction_date|product_category|payment_method|store_location|
+--------------+-----------+-------+----------------+----------------+--------------+--------------+
|             1|        565|2992.47|      2025-03-10|          Sports|    Debit Card|      New York|
+--------------+-----------+-------+----------------+----------------+--------------+--------------+
only showing top 1 row


In [0]:
# Inner-join customers to transactions on customer_id: the result has one row per
# matching (customer, transaction) pair, and rows without a match on either side
# are dropped. Joining on the column name keeps a single customer_id column.
df3 = df.join(df2, 'customer_id', 'inner')

# Render the joined frame in the notebook.
df3.display()

customer_id,name,email,country_code,phone_no,ext,address,registration_date,loyalty_status,transaction_id,amount,transaction_date,product_category,payment_method,store_location
1,Michelle Kidd,vayala@example.net,,,,"USNS Santiago, FPO AE 80872",2025-01-25,Gold,8,2652.57,2025-04-19,Home,Credit Card,New York
4,Kimberly Price,jessicaknight@example.com,1.0,9476334224.0,7930.0,"1631 Alexis Meadows, Lake Amanda, CA 75179",2024-12-08,Gold,427,1787.09,2025-04-28,Beauty,PayPal,Chicago
5,Matthew Phillips,qwilliams@example.com,1.0,8696505682.0,8385.0,"2274 Williams Heights Suite 895, Andersonhaven, OR 80565",2024-02-03,Gold,452,1983.52,2025-02-03,Sports,PayPal,Online
7,Louis Miller,patriciaelliott@example.org,1.0,4189314146.0,588.0,"02590 Marshall Well, Sheppardland, CT 88067",2024-09-30,Gold,857,697.57,2025-06-13,Beauty,Debit Card,Chicago
9,Matthew Harvey,bgaines@example.net,,7108544550.0,,"60123 Davis Ford, Mistychester, WA 69400",2024-10-31,Silver,300,918.34,2025-05-07,Electronics,PayPal,Miami
10,Rachel White,whitemichael@example.org,1.0,9207934515.0,302.0,"6182 Brown Mountain, South Gary, IL 29016",2024-02-25,Bronze,831,220.35,2025-05-21,Sports,PayPal,Los Angeles
12,Christine Delgado,yfarrell@example.net,,9322067195.0,8325.0,"PSC 4025, Box 2078, APO AA 61121",2024-04-18,Bronze,560,,2025-02-08,Beauty,Debit Card,Miami
13,Joseph Brennan,khernandez@example.com,,5493564430.0,,"8890 Keller Hills, North Paul, FM 64749",2024-04-28,Gold,967,631.85,2025-02-18,Home,Cash,Online
15,Laura Perez,sdavid@example.com,,5119002681.0,4189.0,"1518 Green Locks Suite 348, New Brent, FM 51849",2024-12-04,Gold,504,3194.72,2025-06-01,Beauty,Debit Card,Online
16,Erin Matthews,uhunter@example.net,,8892759024.0,61618.0,"67623 Morgan Hollow, Hillshire, IL 78665",2023-11-12,Bronze,635,492.19,2025-01-23,Clothing,Cash,Miami


In [0]:
# Register the joined customer/transaction frame as the temp view `customer`
# (replacing any existing temp view of that name) so it can be queried with SQL.
df3.createOrReplaceTempView('customer')

In [0]:
%sql
-- Total transaction amount per payment method, rounded to 2 decimals,
-- listed from highest to lowest revenue.
SELECT
  payment_method,
  ROUND(SUM(amount), 2) AS revenue
FROM customer
GROUP BY payment_method
ORDER BY revenue DESC;

payment_method,revenue
Debit Card,635175.45
Credit Card,595437.25
Cash,593727.99
PayPal,581581.93
