<a href="https://colab.research.google.com/github/vamshap/PySpark-Challenges/blob/main/Repeated_New_Customer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DateType
from pyspark.sql.functions import lag, when, col
from pyspark.sql.window import Window
from datetime import datetime

# Determine, for each order, whether the customer is new or repeated on that day

# Start (or reuse) the Spark session for this notebook
spark = SparkSession.builder.appName("CustomerOrders").getOrCreate()

# customer_orders layout: (column name, Spark type); every column is nullable
column_types = [
    ("order_id", IntegerType()),
    ("customer_id", IntegerType()),
    ("order_date", DateType()),
    ("order_amount", IntegerType()),
]
schema = StructType(
    [StructField(name, dtype, True) for name, dtype in column_types]
)

# Sample orders as (order_id, customer_id, order date string, order_amount);
# the date strings are parsed in a single pass below
raw_orders = [
    (1, 100, '2022-01-01', 2000),
    (2, 200, '2022-01-01', 2500),
    (3, 300, '2022-01-01', 2100),
    (4, 100, '2022-01-02', 2000),
    (5, 400, '2022-01-02', 2200),
    (6, 500, '2022-01-02', 2700),
    (7, 100, '2022-01-03', 3000),
    (8, 400, '2022-01-03', 1000),
    (9, 600, '2022-01-03', 3000),
]
data = [
    (order_id, customer_id, datetime.strptime(day, '%Y-%m-%d'), amount)
    for order_id, customer_id, day, amount in raw_orders
]

# Build the DataFrame from the rows and the explicit schema
customer_orders_df = spark.createDataFrame(data, schema)

# Display the input orders
customer_orders_df.show()

# Walk each customer's orders chronologically. order_id is added as a
# tie-breaker so lag() is deterministic when a customer has more than one
# order on the same order_date.
window_spec = Window.partitionBy("customer_id").orderBy("order_date", "order_id")

# PreviousCustomerField is the customer_id of the same customer's preceding
# order in the window; it is null on that customer's first order.
cte_flag_df = customer_orders_df.withColumn(
    "PreviousCustomerField",
    lag("customer_id").over(window_spec)
)

# A row with a preceding order for the same customer is a repeated customer;
# a row with no preceding order is a new customer.
# NOTE: if a customer places several orders on their first date, only the one
# with the lowest order_id is labelled "New Customer".
result_df = (
    cte_flag_df
    .withColumn(
        "New/RepeatedCustomer",
        when(col("PreviousCustomerField").isNotNull(), "Repeated Customer")
        .otherwise("New Customer"),
    )
    # Sort while order_id is still present, then project it away, instead of
    # ordering by a column that the select has already dropped.
    .orderBy("order_id")
    .select("customer_id", "order_date", "New/RepeatedCustomer")
)

# Show result
result_df.show()






+--------+-----------+----------+------------+
|order_id|customer_id|order_date|order_amount|
+--------+-----------+----------+------------+
|       1|        100|2022-01-01|        2000|
|       2|        200|2022-01-01|        2500|
|       3|        300|2022-01-01|        2100|
|       4|        100|2022-01-02|        2000|
|       5|        400|2022-01-02|        2200|
|       6|        500|2022-01-02|        2700|
|       7|        100|2022-01-03|        3000|
|       8|        400|2022-01-03|        1000|
|       9|        600|2022-01-03|        3000|
+--------+-----------+----------+------------+

+-----------+----------+--------------------+
|customer_id|order_date|New/RepeatedCustomer|
+-----------+----------+--------------------+
|        100|2022-01-01|        New Customer|
|        200|2022-01-01|        New Customer|
|        300|2022-01-01|        New Customer|
|        100|2022-01-02|   Repeated Customer|
|        400|2022-01-02|        New Customer|
|        500|2022-01