In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import to_date
from pyspark.sql import Window
import pyspark.sql.functions as F


# Sample order rows. Tuple layout: (customer_id, order_id, order_date, amount).
# order_date is an ISO-formatted string at this point.
data = [
    # customer 1
    (1, 101, "2023-01-10", 200),
    (1, 102, "2023-02-05", 150),
    # customer 2 -- rows are listed out of date order
    (2, 103, "2023-01-15", 300),
    (2, 104, "2023-01-10", 100),
    # customer 3
    (3, 105, "2023-03-01", 250),
]


In [0]:
# Explicit schema for the sample rows: three integer columns plus order_date,
# which is declared as a string here. Every field is nullable.
schema = (
    StructType()
    .add("customer_id", IntegerType(), True)
    .add("order_id", IntegerType(), True)
    .add("order_date", StringType(), True)
    .add("amount", IntegerType(), True)
)

# Build the DataFrame from the in-memory rows using that schema.
df = spark.createDataFrame(data, schema=schema)


In [0]:
# Replace the string order_date column with its parsed date value, then preview.
df = df.withColumn("order_date", F.to_date(F.col("order_date")))
df.show()

+-----------+--------+----------+------+
|customer_id|order_id|order_date|amount|
+-----------+--------+----------+------+
|          1|     101|2023-01-10|   200|
|          1|     102|2023-02-05|   150|
|          2|     103|2023-01-15|   300|
|          2|     104|2023-01-10|   100|
|          3|     105|2023-03-01|   250|
+-----------+--------+----------+------+



In [0]:
# First order per customer: number each customer's orders from earliest to
# latest and keep the row numbered 1.
#
# order_id is a secondary sort key. Ordering by order_date alone leaves
# row_number() free to pick any of the tied rows when a customer has several
# orders on the same date, so the result could differ between runs.
window = Window.partitionBy("customer_id").orderBy(
    F.col("order_date").asc(),
    F.col("order_id").asc(),
)

(
    df.withColumn("row_num", F.row_number().over(window))
    .filter(F.col("row_num") == 1)
    .drop("row_num")  # helper column only; leaves the original four columns
    .show()
)

+-----------+--------+----------+------+
|customer_id|order_id|order_date|amount|
+-----------+--------+----------+------+
|          1|     101|2023-01-10|   200|
|          2|     104|2023-01-10|   100|
|          3|     105|2023-03-01|   250|
+-----------+--------+----------+------+

