In [0]:
# Reuse the notebook-provided `spark` session when one is defined; otherwise
# build a session for this scaffold.
try:
    spark  # bare name lookup raises NameError when no session is defined
except NameError:
    from pyspark.sql import SparkSession
    session_builder = SparkSession.builder.appName("pyspark-scaffold-day1")
    spark = session_builder.getOrCreate()
    print("Created SparkSession. Spark version:", spark.version)
else:
    print("spark exists. Spark version:", spark.version)


In [0]:
# Read the sample customers CSV: first line is the header, column types are
# inferred from the data.
customers_csv_path = "/databricks-datasets/retail-org/customers/customers.csv"
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(customers_csv_path)
)

# Inspect the structure, peek at the first rows, then count every row.
df.printSchema()
df.show(5)
row_count = df.count()
print("Row count (sample):", row_count)


In [0]:
# List every column name of the customers DataFrame
column_names = df.columns
print(column_names)

# Summary statistics via describe() -- currently disabled
#df.describe().show()


In [0]:
# Preview a handful of identifying columns
preview_cols = ["customer_id", "customer_name", "city", "state"]
df.select(*preview_cols).show(5)

# Same data with the key column renamed: customer_id -> cust_id
df2 = df.withColumnRenamed("customer_id", "cust_id")
df2.select(["cust_id", "customer_name"]).show(5)


In [0]:
from pyspark.sql.functions import col, length, upper

# Add two derived columns in one chain:
#   name_length -- character length of customer_name
#   city_upper  -- city converted to upper case
df3 = (
    df
    .withColumn("name_length", length(col("customer_name")))
    .withColumn("city_upper", upper(col("city")))
)

derived_preview_cols = ["customer_name", "city", "city_upper", "name_length"]
df3.select(*derived_preview_cols).show(5)


In [0]:
# customers from California
is_california = col("state") == "CA"
df.where(is_california).show(5)

# customers with more than 100 units purchased
bought_over_100 = col("units_purchased") > 100
df.where(bought_over_100).select("customer_name", "units_purchased").show(5)


In [0]:
# Ten customers with the most units purchased, highest first
ranked_by_units = df.sort(col("units_purchased").desc())
ranked_by_units.select("customer_name", "units_purchased").show(10)


In [0]:
# distinct states
unique_states = df.select("state").dropDuplicates()
unique_states.show()

# count customers per state, largest groups first
customers_per_state = df.groupBy("state").count()
customers_per_state.sort(col("count").desc()).show(10)


In [0]:
# Sanity check that df is loaded: report its row count and column names
row_total = df.count()
column_list = df.columns
print("rows:", row_total)
print("columns:", column_list)


In [0]:
from pyspark.sql.functions import col

# check duplicates on join key: show customer_id values that occur more than once
id_counts = df.groupBy("customer_id").count()
id_counts.where(col("count") > 1).show(5)


In [0]:
# Build a small dimension table with one row per distinct loyalty_segment
# found in df, each paired with a generated description.
segment_rows = df.select("loyalty_segment").distinct().collect()
segments = [row["loyalty_segment"] for row in segment_rows]
dim_data = list(zip(segments, (f"Tier for {seg}" for seg in segments)))

loyalty_dim_columns = ["loyalty_segment", "loyalty_desc"]
loyalty_dim = spark.createDataFrame(dim_data, loyalty_dim_columns)
loyalty_dim.show()


In [0]:
# Inner join: only customers whose loyalty_segment has a row in loyalty_dim
inner_joined = df.join(loyalty_dim, "loyalty_segment", "inner")
inner_preview_cols = ["customer_id", "customer_name", "loyalty_segment", "loyalty_desc"]
inner_joined.select(*inner_preview_cols).show(5)
inner_joined


In [0]:
# Left join: every customer row is kept; loyalty_desc is null where the
# segment has no row in loyalty_dim
left_joined = df.join(loyalty_dim, "loyalty_segment", "left")
left_preview_cols = ["customer_id", "customer_name", "loyalty_segment", "loyalty_desc"]
left_joined.select(*left_preview_cols).show(5)


In [0]:
# Right join: every loyalty_dim row is kept, matched against customers
right_joined = df.join(loyalty_dim, "loyalty_segment", "right")
right_joined.show(5)


In [0]:
# Left semi join: rows of df that have a match in loyalty_dim; only df's
# columns are returned
semi = df.join(loyalty_dim, "loyalty_segment", "left_semi")
semi.show(5)


In [0]:
# Left anti join: rows of df that have NO match in loyalty_dim
anti = df.join(loyalty_dim, "loyalty_segment", "left_anti")
anti_preview_cols = ["customer_id", "customer_name", "loyalty_segment"]
anti.select(*anti_preview_cols).show(5)


In [0]:
from pyspark.sql.functions import broadcast

# Mark the small loyalty_dim for broadcast, then left-join customers to it
small_dim = broadcast(loyalty_dim)
bcast_joined = df.join(small_dim, "loyalty_segment", "left")
bcast_preview_cols = ["customer_id", "loyalty_segment", "loyalty_desc"]
bcast_joined.select(*bcast_preview_cols).show(5)

# Print the physical plan to confirm a broadcast join was chosen
bcast_joined.explain()


In [0]:
# Keep a single row per loyalty_segment in the dimension table
dedup_key = ["loyalty_segment"]
loyalty_dim_dedup = loyalty_dim.drop_duplicates(dedup_key)


In [0]:
#mini problem

#Create a small synthetic orders DataFrame by taking 300 customers from df and creating order_id and order_units (use monotonically_increasing_id() and random small ints).

#a) Find top 10 customers by total order_units (group orders, sum).
#b) Join the result with the customers df to get customer_name, state, and show top 10 with totals.

#c) Find customers who exist in customers but do NOT have any orders (use left_anti join) and show 5 of them.

from pyspark.sql.functions import monotonically_increasing_id, rand, floor, col
# Import Spark's sum under an alias so this cell does not shadow Python's
# built-in sum() for every cell that runs after it.
from pyspark.sql.functions import sum as spark_sum

ORDER_SEED = 42  # fixed seed so the synthetic order_units are reproducible

# build orders
# `orders` is evaluated by more than one action below. Ordering before
# limit() pins which 300 customers are sampled, and the seed pins their
# units, so parts (a)-(c) all work from the same synthetic orders.
sample_customers = df.select("customer_id").orderBy("customer_id").limit(300)
orders = (
    sample_customers
    .withColumn("order_id", monotonically_increasing_id())
    # floor(rand * 10) + 1 yields integers in 1..10
    .withColumn("order_units", (floor(rand(ORDER_SEED) * 10) + 1).cast("int"))
)

# a) aggregate orders
cust_orders = orders.groupBy("customer_id").agg(spark_sum("order_units").alias("total_units"))

# Highest totals first; customer_id breaks the (frequent) ties so the
# top 10 is the same on every run.
top_ordering = [col("total_units").desc(), col("customer_id").asc()]

# b) join with customers to get name/state
top10 = cust_orders.orderBy(*top_ordering).limit(10)
top10_with_info = (
    top10
    .join(df.select("customer_id", "customer_name", "state"), on="customer_id", how="left")
    # Row order is not guaranteed after a join, so re-apply the ranking
    # before displaying.
    .orderBy(*top_ordering)
)
top10_with_info.show()

# c) customers without orders
no_orders = df.join(orders.select("customer_id"), on="customer_id", how="left_anti")
no_orders.show(5)


In [0]:
# Region for each state that should appear in the state dimension. The keys
# are the single source of truth for which states are kept, so the filter
# below cannot drift out of sync with the mapping (which would raise KeyError).
region_map = {
    "CA": "West Coast",
    "NY": "East Coast",
    "TX": "South Central",
    "FL": "South East"
}

# Distinct state values present in df, collected to the driver.
state = [r["state"] for r in df.select("state").distinct().collect()]
# Keep only the states that have a region defined.
state1 = [s for s in state if s in region_map]

# (state, region_name) tuples used to build the dimension DataFrame.
dim_data = [(s, region_map[s]) for s in state1]

dim_data

In [0]:
# Use an explicit DDL schema rather than column names only: with names alone
# Spark infers the types from the rows, which fails when dim_data is empty
# (i.e. none of the mapped states occur in df).
state_dim = spark.createDataFrame(dim_data, "state string, region_name string")



In [0]:
def _preview_and_count(joined, preview_cols, count_label):
    """Show the first 5 rows of the given columns, then print the total row count."""
    joined.select(*preview_cols).show(5)
    print(count_label, joined.count())


# Inner join: customers whose state has a row in state_dim
inner_joined1 = df.join(state_dim, "state", "inner")
_preview_and_count(inner_joined1, ["customer_id", "region_name"], "Total inner join rows are:")

# Left join: every customer, region_name is null where the state is unmapped
left_joined1 = df.join(state_dim, "state", "left")
_preview_and_count(left_joined1, ["customer_id", "region_name"], "Total left join rows are:")

# Left anti join: customers whose state has no row in state_dim
left_anti_joined1 = df.join(state_dim, "state", "left_anti")
_preview_and_count(left_anti_joined1, ["customer_id", "state"], "Total left anti join rows are:")
