In [1]:
from pyspark.sql import SparkSession
import getpass
# Build (or reuse) the Hive-enabled Spark session used by every cell below.
username = getpass.getuser()
spark = (
    SparkSession.builder
    # '0' asks Spark to bind the UI to any available port.
    .config('spark.ui.port', '0')
    # Keep the SQL warehouse under the current user's own directory.
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse')
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Wildcard import: brings count, countDistinct, sum, avg, etc. into the notebook
# namespace. NOTE: this replaces Python built-ins of the same name (e.g. `sum`)
# with the Spark column functions for the rest of the notebook.
from pyspark.sql.functions import *

In [3]:
# Load the order CSV: the first line supplies the column names and the
# column types are inferred from the data.
orders_df = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("/public/trendytech/datasets/order_data.csv")
)

In [4]:
# Preview the loaded data; show()'s defaults are spelled out here
# (first 20 rows, long cell values truncated).
orders_df.show(n=20, truncate=True)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     2.95|     14688|United Kingdom|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.25|     14688|United Kingdom|
|   536378|   85071B|RED CHARLIE+LOLA ...|      96|01-12-2010 9.37|     0.38|     14688|United Kingdom|
|   536378|    21931|JUMBO STORAGE BAG...|      10|01-12-2010 9.

### Find the total number of rows, number of unique invoices, total quantity, and average unit price
#### Programmatic way (column-object notation)

In [5]:
# Whole-table aggregates built from column-function objects, one per line.
# `count`, `countDistinct`, `sum` and `avg` are the pyspark.sql.functions
# versions from the wildcard import, not Python built-ins.
orders_df.select(
    count("*").alias("row_count"),
    countDistinct("InvoiceNo").alias("unique_invoice"),
    sum("quantity").alias("total_quantity"),
    avg("unitprice").alias("avg_price"),
).show()

+---------+--------------+--------------+----------------+
|row_count|unique_invoice|total_quantity|       avg_price|
+---------+--------------+--------------+----------------+
|   541782|         25858|       5175855|4.61156532332193|
+---------+--------------+--------------+----------------+



#### SQL-expression way (`selectExpr` with SQL expression strings)

In [6]:
# Same whole-table aggregates, written as SQL expression strings
# (one expression per output column).
orders_df.selectExpr(
    "count (*) as row_count",
    "count(Distinct (InvoiceNo)) as unique_invoice",
    "sum(quantity) as total_quantity",
    "avg(unitprice) as avg_price",
).show()

+---------+--------------+--------------+-----------------+
|row_count|unique_invoice|total_quantity|        avg_price|
+---------+--------------+--------------+-----------------+
|   541782|         25858|       5175855|4.611565323321932|
+---------+--------------+--------------+-----------------+



### Spark SQL way (query a temporary view)

In [7]:
# Register orders_df as a temporary view named "orders" (replacing any existing
# view with that name) so it can be referenced by name in spark.sql() queries.
orders_df.createOrReplaceTempView("orders")

In [8]:
# Same whole-table aggregates as a plain SQL query against the "orders" view.
# Adjacent string literals are concatenated by Python into a single query string.
spark.sql(
    "select count(*) as row_count, "
    "count(Distinct(InvoiceNo)) as unique_invoice, "
    "sum(quantity) as total_quantity, "
    "avg(unitprice) as avg_price "
    "from orders"
).show()

+---------+--------------+--------------+-----------------+
|row_count|unique_invoice|total_quantity|        avg_price|
+---------+--------------+--------------+-----------------+
|   541782|         25858|       5175855|4.611565323321928|
+---------+--------------+--------------+-----------------+

