In [24]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window as W

In [25]:
spark = SparkSession.builder.appName("SparkSession").getOrCreate()

In [26]:
spark

In [27]:
data = [(10,'abc'),(20,'abc'),(None,'abc')]
schema = "id int, cd string"
df = spark.createDataFrame(data = data, schema = schema)
df.show()

+----+---+
|  id| cd|
+----+---+
|  10|abc|
|  20|abc|
|NULL|abc|
+----+---+



In [31]:
df.filter("id is null").show()

+----+---+
|  id| cd|
+----+---+
|NULL|abc|
+----+---+



In [35]:
df.filter(df.id.isNotNull()).show()

+---+---+
| id| cd|
+---+---+
| 10|abc|
| 20|abc|
+---+---+



In [39]:
df.createOrReplaceTempView("tbl")

In [41]:
sql = "select count(*),count(id),count(cd),count(distinct cd),sum(id) from tbl"
spark.sql(sql).show()

+--------+---------+---------+------------------+-------+
|count(1)|count(id)|count(cd)|count(DISTINCT cd)|sum(id)|
+--------+---------+---------+------------------+-------+
|       3|        2|        3|                 1|     30|
+--------+---------+---------+------------------+-------+



In [45]:
df.agg(F.count('*').alias('cnt'),
       F.count('id').alias('id_cnt'),
        F.count('id').alias('id_cnt'),
        F.countDistinct('cd').alias('dcd_cnt'),
        F.sum('id').alias('sum_cnt'),
      
      ).show()

+---+------+------+-------+-------+
|cnt|id_cnt|id_cnt|dcd_cnt|sum_cnt|
+---+------+------+-------+-------+
|  3|     2|     2|      1|     30|
+---+------+------+-------+-------+



In [48]:
sql = "SELECT count(id) from tbl group by cd having count(id) = 1"
spark.sql(sql).show()

+---------+
|count(id)|
+---------+
+---------+



In [54]:
df.groupBy("cd").agg(F.count('id').alias("cnt_id")).filter(F.col('cnt_id') == 1).show()

+---+------+
| cd|cnt_id|
+---+------+
+---+------+



In [55]:
dataset = [(10,'mango','2024-01-01',100),
(10,'orange','2024-01-02',120),
(11,'jeans','2024-01-03',200),
(11,'jeans','2024-01-03',250),
(11,'T-shirt','2024-01-04',200),
(12,'Banana','2024-01-04',50)]
data_schema = "id int, notes string,sales_date string,amount int"
dataframe = spark.createDataFrame(data = dataset, schema = data_schema)
dataframe.printSchema()

root
 |-- id: integer (nullable = true)
 |-- notes: string (nullable = true)
 |-- sales_date: string (nullable = true)
 |-- amount: integer (nullable = true)



In [56]:
dataframe.show()

+---+-------+----------+------+
| id|  notes|sales_date|amount|
+---+-------+----------+------+
| 10|  mango|2024-01-01|   100|
| 10| orange|2024-01-02|   120|
| 11|  jeans|2024-01-03|   200|
| 11|  jeans|2024-01-03|   250|
| 11|T-shirt|2024-01-04|   200|
| 12| Banana|2024-01-04|    50|
+---+-------+----------+------+



In [60]:
# Convert sales_date data types
from pyspark.sql.types import DateType
dataframe2 = dataframe.withColumn("sales_date",F.col('sales_date').cast(DateType()))
dataframe2.show()

+---+-------+----------+------+
| id|  notes|sales_date|amount|
+---+-------+----------+------+
| 10|  mango|2024-01-01|   100|
| 10| orange|2024-01-02|   120|
| 11|  jeans|2024-01-03|   200|
| 11|  jeans|2024-01-03|   250|
| 11|T-shirt|2024-01-04|   200|
| 12| Banana|2024-01-04|    50|
+---+-------+----------+------+



In [61]:
##  fetch the max sale amount for each date ID wise, also get sum of sales amount id wise --

In [75]:
dataframe2.select("id","notes","sales_date","amount").\
withColumn("DenseRank",F.dense_rank().over(W.partitionBy("id").orderBy(F.col("amount").desc()))).\
withColumn("Sum",F.sum('amount').over(W.partitionBy("id").orderBy(F.col("amount").asc()))).\
filter(F.col("DenseRank") ==1).show()


+---+------+----------+------+---------+---+
| id| notes|sales_date|amount|DenseRank|Sum|
+---+------+----------+------+---------+---+
| 10|orange|2024-01-02|   120|        1|220|
| 11| jeans|2024-01-03|   250|        1|650|
| 12|Banana|2024-01-04|    50|        1| 50|
+---+------+----------+------+---------+---+



In [87]:
agg_data = dataframe2.select("id","notes","sales_date","amount").groupBy("id").agg(
    F.sum("amount").alias("sum_amount"),
    F.max("amount").alias("max_amont"),
)


In [92]:
dataframe2.join(agg_data,(dataframe2.id == agg_data.id) & (dataframe2.amount == agg_data.max_amont),
                how="inner" ).show()

+---+------+----------+------+---+----------+---------+
| id| notes|sales_date|amount| id|sum_amount|max_amont|
+---+------+----------+------+---+----------+---------+
| 10|orange|2024-01-02|   120| 10|       220|      120|
| 11| jeans|2024-01-03|   250| 11|       650|      250|
| 12|Banana|2024-01-04|    50| 12|        50|       50|
+---+------+----------+------+---+----------+---------+



In [93]:
dataframe2.show()

+---+-------+----------+------+
| id|  notes|sales_date|amount|
+---+-------+----------+------+
| 10|  mango|2024-01-01|   100|
| 10| orange|2024-01-02|   120|
| 11|  jeans|2024-01-03|   200|
| 11|  jeans|2024-01-03|   250|
| 11|T-shirt|2024-01-04|   200|
| 12| Banana|2024-01-04|    50|
+---+-------+----------+------+



In [101]:
dataframe2.groupBy('id').agg(F.collect_list("notes").alias("notes")).withColumn("_str",F.concat_ws(",", F.col("notes"))).show()

+---+--------------------+-------------------+
| id|               notes|               _str|
+---+--------------------+-------------------+
| 10|     [mango, orange]|       mango,orange|
| 11|[jeans, jeans, T-...|jeans,jeans,T-shirt|
| 12|            [Banana]|             Banana|
+---+--------------------+-------------------+

