In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,expr,month,year,avg,max,count,when,rank
from pyspark.sql.window import Window

# Single Spark entry point shared by every cell below.
spark = (
    SparkSession.builder
    .appName("productorders")
    .getOrCreate()
)

# Sample orders: one tuple per order, fields in the same order as `columns`.
data = [
    (201, "yuva", "laptop", "electronics", 2, 45000, "2023-01-10"),
    (202, "theshi", "shirt", "clothing", 1, 2000, "2023-02-12"),
    (203, "kavya", "table", "furniture", 1, 12000, "2023-01-25"),
    (204, "priya", "smartphone", "electronics", 4, 30000, "2023-03-01"),
    (205, "mani", "chair", "furniture", 2, 4500, "2023-02-18"),
    (206, "harish", "bookshelf", "furniture", 5, 7000, "2023-01-05"),
    (207, "swathi", "novel", "books", 2, 500, "2023-01-20"),
    (208, "priya", "t-shirt", "clothing", 5, 1000, "2023-03-10"),
    (209, "ravi", "tablet", "electronics", 1, 25000, "2023-01-15"),
    (210, "theshi", "notebook", "books", 3, 300, "2023-03-03"),
    (211, "preetha", "sofa", "furniture", 1, 15000, "2023-02-22"),
    (212, "Ram", "jacket", "clothing", 2, 3500, "2023-01-30"),
]

columns = [
    "orderid", "customername", "product", "category",
    "quantity", "unitprice", "orderdate",
]

# Column types are inferred from the tuples, so orderdate stays a plain string.
df = spark.createDataFrame(data, schema=columns)

# Expose the same DataFrame under two view names: orders_local is queried by its
# bare name, orders_global is queried through the global_temp database.
df.createOrReplaceTempView("orders_local")
df.createOrReplaceGlobalTempView("orders_global")



PART A

In [8]:
# 1. Electronics orders with a quantity of at least 2.
electronics_bulk = spark.sql("select * from orders_local where category='electronics' and quantity>=2")
electronics_bulk.show()


+-------+------------+----------+-----------+--------+---------+----------+
|orderid|customername|   product|   category|quantity|unitprice| orderdate|
+-------+------------+----------+-----------+--------+---------+----------+
|    201|        yuva|    laptop|electronics|       2|    45000|2023-01-10|
|    204|       priya|smartphone|electronics|       4|    30000|2023-03-01|
+-------+------------+----------+-----------+--------+---------+----------+



In [9]:
# 2. Every order plus a derived totalamount column (quantity * unitprice).
orders_with_total = spark.sql("select *,quantity*unitprice as totalamount from orders_local")
orders_with_total.show()


+-------+------------+----------+-----------+--------+---------+----------+-----------+
|orderid|customername|   product|   category|quantity|unitprice| orderdate|totalamount|
+-------+------------+----------+-----------+--------+---------+----------+-----------+
|    201|        yuva|    laptop|electronics|       2|    45000|2023-01-10|      90000|
|    202|      theshi|     shirt|   clothing|       1|     2000|2023-02-12|       2000|
|    203|       kavya|     table|  furniture|       1|    12000|2023-01-25|      12000|
|    204|       priya|smartphone|electronics|       4|    30000|2023-03-01|     120000|
|    205|        mani|     chair|  furniture|       2|     4500|2023-02-18|       9000|
|    206|      harish| bookshelf|  furniture|       5|     7000|2023-01-05|      35000|
|    207|      swathi|     novel|      books|       2|      500|2023-01-20|       1000|
|    208|       priya|   t-shirt|   clothing|       5|     1000|2023-03-10|       5000|
|    209|        ravi|    tablet

In [10]:
# 3. Number of orders in each category.
orders_per_category = spark.sql("select category,count(*) as ordercount from orders_local group by category")
orders_per_category.show()

+-----------+----------+
|   category|ordercount|
+-----------+----------+
|  furniture|         4|
|electronics|         3|
|   clothing|         3|
|      books|         2|
+-----------+----------+



In [11]:
# 4. Orders whose orderdate string starts with '2023-01' (January 2023).
january_orders = spark.sql("select * from orders_local where orderdate like '2023-01%'")
january_orders.show()


+-------+------------+---------+-----------+--------+---------+----------+
|orderid|customername|  product|   category|quantity|unitprice| orderdate|
+-------+------------+---------+-----------+--------+---------+----------+
|    201|        yuva|   laptop|electronics|       2|    45000|2023-01-10|
|    203|       kavya|    table|  furniture|       1|    12000|2023-01-25|
|    206|      harish|bookshelf|  furniture|       5|     7000|2023-01-05|
|    207|      swathi|    novel|      books|       2|      500|2023-01-20|
|    209|        ravi|   tablet|electronics|       1|    25000|2023-01-15|
|    212|         Ram|   jacket|   clothing|       2|     3500|2023-01-30|
+-------+------------+---------+-----------+--------+---------+----------+



In [12]:
# 5. Average unit price within each category.
avg_price_by_category = spark.sql("select category,avg(unitprice) as avgprice from orders_local group by category")
avg_price_by_category.show()

+-----------+------------------+
|   category|          avgprice|
+-----------+------------------+
|  furniture|            9625.0|
|electronics|33333.333333333336|
|   clothing|2166.6666666666665|
|      books|             400.0|
+-----------+------------------+



In [13]:
# 6. The single order with the largest totalamount (sorted descending, first row only).
top_order = spark.sql("select *,quantity*unitprice as totalamount from orders_local order by totalamount desc limit 1")
top_order.show()

+-------+------------+----------+-----------+--------+---------+----------+-----------+
|orderid|customername|   product|   category|quantity|unitprice| orderdate|totalamount|
+-------+------------+----------+-----------+--------+---------+----------+-----------+
|    204|       priya|smartphone|electronics|       4|    30000|2023-03-01|     120000|
+-------+------------+----------+-----------+--------+---------+----------+-----------+



In [16]:
# 7. Drop the orders_local view. The call is left as the last expression so the
# boolean it returns is displayed as the cell output.
local_view_name = "orders_local"
spark.catalog.dropTempView(local_view_name)


True

In [17]:
# Querying orders_local after the drop is expected to fail; catch the error and
# print it rather than letting the cell raise.
try:
    dropped_view_rows = spark.sql("select * from orders_local")
    dropped_view_rows.show()
except Exception as err:
    print("error:", err)

error: [TABLE_OR_VIEW_NOT_FOUND] The table or view `orders_local` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [orders_local], [], false



PART B

In [18]:
# 1. Furniture orders whose quantity * unitprice exceeds 10000, read from the global view.
big_furniture_orders = spark.sql("select *,quantity*unitprice as totalamount from global_temp.orders_global where category='furniture' and quantity*unitprice>10000")
big_furniture_orders.show()

+-------+------------+---------+---------+--------+---------+----------+-----------+
|orderid|customername|  product| category|quantity|unitprice| orderdate|totalamount|
+-------+------------+---------+---------+--------+---------+----------+-----------+
|    203|       kavya|    table|furniture|       1|    12000|2023-01-25|      12000|
|    206|      harish|bookshelf|furniture|       5|     7000|2023-01-05|      35000|
|    211|     preetha|     sofa|furniture|       1|    15000|2023-02-22|      15000|
+-------+------------+---------+---------+--------+---------+----------+-----------+



In [19]:
# 2. Add discountflag: 'yes' when quantity is greater than 3, otherwise 'no'.
orders_flagged = spark.sql("select *,case when quantity>3 then 'yes' else 'no' end as discountflag from global_temp.orders_global")
orders_flagged.show()

+-------+------------+----------+-----------+--------+---------+----------+------------+
|orderid|customername|   product|   category|quantity|unitprice| orderdate|discountflag|
+-------+------------+----------+-----------+--------+---------+----------+------------+
|    201|        yuva|    laptop|electronics|       2|    45000|2023-01-10|          no|
|    202|      theshi|     shirt|   clothing|       1|     2000|2023-02-12|          no|
|    203|       kavya|     table|  furniture|       1|    12000|2023-01-25|          no|
|    204|       priya|smartphone|electronics|       4|    30000|2023-03-01|         yes|
|    205|        mani|     chair|  furniture|       2|     4500|2023-02-18|          no|
|    206|      harish| bookshelf|  furniture|       5|     7000|2023-01-05|         yes|
|    207|      swathi|     novel|      books|       2|      500|2023-01-20|          no|
|    208|       priya|   t-shirt|   clothing|       5|     1000|2023-03-10|         yes|
|    209|        ravi

In [20]:
# 3. Customers who ordered from more than one distinct category.
multi_category_customers = spark.sql("select customername,count(distinct category) as categorycount from global_temp.orders_global group by customername having categorycount>1")
multi_category_customers.show()

+------------+-------------+
|customername|categorycount|
+------------+-------------+
|       priya|            2|
|      theshi|            2|
+------------+-------------+



In [21]:
# 4. Order count per month, where the month key is the first 7 characters of orderdate (yyyy-MM).
orders_per_month = spark.sql("select substring(orderdate,1,7) as month,count(*) as ordercount from global_temp.orders_global group by month")
orders_per_month.show()

+-------+----------+
|  month|ordercount|
+-------+----------+
|2023-03|         3|
|2023-02|         3|
|2023-01|         6|
+-------+----------+



In [23]:
#5
from pyspark.sql.functions import rank, sum as _sum
from pyspark.sql.window import Window

# Total units sold per product, aggregated from the global view.
products_df = spark.sql("""
    select product, sum(quantity) as totalsold
    from global_temp.orders_global
    group by product
""")

# Order by the column under its real name (`totalsold`). The previous
# `TotalSold` spelling only resolved because Spark SQL is case-insensitive by
# default and would fail with spark.sql.caseSensitive=true.
# The window has no partitionBy on purpose: the rank is global across all
# products. Spark evaluates such a window in a single partition, which is fine
# for this small per-product aggregate.
windowSpec = Window.orderBy(col("totalsold").desc())

# rank() gives tied products the same rank and leaves gaps afterwards (1, 1, 3, ...).
ranked = products_df.withColumn("Rank", rank().over(windowSpec))
ranked.show()

+----------+---------+----+
|   product|totalsold|Rank|
+----------+---------+----+
| bookshelf|        5|   1|
|   t-shirt|        5|   1|
|smartphone|        4|   3|
|  notebook|        3|   4|
|    laptop|        2|   5|
|     chair|        2|   5|
|     novel|        2|   5|
|    jacket|        2|   5|
|     shirt|        1|   9|
|     table|        1|   9|
|    tablet|        1|   9|
|      sofa|        1|   9|
+----------+---------+----+



In [28]:
# Show that the global temp view is visible from a *different* session.
# SparkSession.builder...getOrCreate() would simply return the already-running
# `spark` session, so it proves nothing about cross-session visibility.
# spark.newSession() creates a separate session (its own temp views and SQL
# configuration) on top of the same SparkContext.
newspark = spark.newSession()
newspark.sql("select * from global_temp.orders_global where category='books'").show()


+-------+------------+--------+--------+--------+---------+----------+
|orderid|customername| product|category|quantity|unitprice| orderdate|
+-------+------------+--------+--------+--------+---------+----------+
|    207|      swathi|   novel|   books|       2|      500|2023-01-20|
|    210|      theshi|notebook|   books|       3|      300|2023-03-03|
+-------+------------+--------+--------+--------+---------+----------+



BONUS CHALLENGES

In [32]:
# Bonus 1: register the books-only rows as their own global temp view.
books_query = "select * from global_temp.orders_global where category='books'"
booksdf = spark.sql(books_query)
booksdf.createOrReplaceGlobalTempView("books_global")


In [35]:
#2
# Most-purchased product in each category.
# `mostpurchased` is built inside this cell so it runs on a fresh kernel; it was
# previously expected to exist from a cell that is no longer in the notebook.
mostpurchased = spark.sql("""
    select category, product, sum(quantity) as totalqty
    from global_temp.orders_global
    group by category, product
""")

# Rank products inside each category by units sold, highest first.
windowspec = Window.partitionBy("category").orderBy(col("totalqty").desc())

# Keep only the top-ranked product(s) per category; rank() keeps every product
# tied for first place. The helper column is dropped before display.
(
    mostpurchased
    .withColumn("rank", rank().over(windowspec))
    .filter("rank=1")
    .drop("rank")
    .show()
)

+-----------+----------+--------+
|   category|   product|totalqty|
+-----------+----------+--------+
|      books|  notebook|       3|
|   clothing|   t-shirt|       5|
|electronics|smartphone|       4|
|  furniture| bookshelf|       5|
+-----------+----------+--------+



In [38]:
# Bonus 3: keep every order except clothing, expose the result as a temp view,
# then list the categories that remain.
non_clothing_query = "select * from global_temp.orders_global where category!='clothing'"
filtereddf = spark.sql(non_clothing_query)
filtereddf.createOrReplaceTempView("filtered_orders")

remaining_categories = spark.sql("select distinct category from filtered_orders")
remaining_categories.show()


+-----------+
|   category|
+-----------+
|  furniture|
|electronics|
|      books|
+-----------+

