In [2]:
import findspark 
findspark.init()

In [3]:
from pyspark.conf import SparkConf
config = SparkConf()
config.setMaster("spark://192.168.11.71:7077").setAppName("ConfigurationApp")
config.set("spark.executor.memory", "4g")
config.set("spark.executor.cores", 2)
config.set("spark.cores.max", 2)
config.set("spark.driver.memory", "4g")


from pyspark.sql import SparkSession
spark = SparkSession.builder.config(conf=config).getOrCreate()

#from pyspark import SparkContext
#sc = SparkContext(conf=config)

In [5]:
from pyspark.sql.types import DoubleType,StructType, StringType, IntegerType, DateType
import pyspark.sql.functions as F

schema = StructType() \
         .add("InvoiceNo", StringType(), True) \
         .add("StockCode", StringType(), True) \
         .add("Description", StringType(), True) \
         .add("Quantity", IntegerType(), True) \
         .add("InvoiceDate", DateType(), True) \
         .add("UnitPrice", DoubleType(), True) \
         .add("CustomerID", IntegerType(), True) \
         .add("Country", StringType(), True)

In [27]:

dataSet = spark.read.format("csv") \
                .option("header", True) \
                .schema(schema) \
                .option("dateFormat", "MM/dd/yyyy HH:mm")\
                .load("hdfs://192.168.93.128:9000/ecommerce/data.csv")

dataSet.show(2)

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
only showing top 2 rows



In [10]:
# df.filter("InvoiceNo is NULL").count()
#df.filter("Quantity is NULL").count()
# df.filter("UnitPrice is NULL").count()
#df.filter("CustomerID is NULL").count()

135080

In [35]:

ecommerceDF = dataSet\
       .drop( F.col("StockCode"))\
       .drop( F.col("CustomerID"))\
       .drop( F.col("Country"))\
       .drop(F.col("Description"))\
       .filter("Quantity IS NOT NULL")\
       .filter("UnitPrice IS NOT NULL")\
       .filter("InvoiceNo IS NOT NULL") \
       .withColumn("Amount", F.col("Quantity") * F.col("UnitPrice"))\
       .drop("UnitPrice") 
        
             
ecommerceDF.show(2)

ecommerceDF.explain(True)

+---------+--------+-----------+------------------+
|InvoiceNo|Quantity|InvoiceDate|            Amount|
+---------+--------+-----------+------------------+
|   536365|       6| 2010-12-01|15.299999999999999|
|   536365|       6| 2010-12-01|             20.34|
+---------+--------+-----------+------------------+
only showing top 2 rows

== Parsed Logical Plan ==
Project [InvoiceNo#357, Quantity#360, InvoiceDate#361, Amount#701]
+- Project [InvoiceNo#357, Quantity#360, InvoiceDate#361, UnitPrice#362, (cast(Quantity#360 as double) * UnitPrice#362) AS Amount#701]
   +- Filter isnotnull(InvoiceNo#357)
      +- Filter (isnotnull(Quantity#360) && isnotnull(UnitPrice#362))
         +- Project [InvoiceNo#357, Quantity#360, InvoiceDate#361, UnitPrice#362]
            +- Project [InvoiceNo#357, Description#359, Quantity#360, InvoiceDate#361, UnitPrice#362]
               +- Project [InvoiceNo#357, Description#359, Quantity#360, InvoiceDate#361, UnitPrice#362, Country#364]
                  +- Proj

In [45]:
# groupBy with invoice id, aggregate the sum of (Quantity * UnitPrice)

ecommerceDF = dataSet\
       .select("InvoiceNo", "Quantity", "UnitPrice")\
       .filter("Quantity IS NOT NULL")\
       .filter("UnitPrice IS NOT NULL")\
       .filter("InvoiceNo IS NOT NULL") \
       .withColumn("Amount", F.col("Quantity") * F.col("UnitPrice"))\
       .drop("UnitPrice")\
       .drop("Quantity") \
        

ecommerceDF = dataSet\
       .select("InvoiceNo", "Quantity", "UnitPrice")\
       .filter("Quantity IS NOT NULL")\
       .filter("UnitPrice IS NOT NULL")\
       .filter("InvoiceNo IS NOT NULL") \
       .withColumn("Amount", F.col("Quantity") * F.col("UnitPrice"))\
       .select("InvoiceNo", "Amount")
             
     
aggInvoiceAmountDF = ecommerceDF.groupBy("InvoiceNo").agg(F.sum("Amount"))
aggInvoiceAmountDF.show(10)

+---------+------------------+
|InvoiceNo|       sum(Amount)|
+---------+------------------+
|   536596|             38.09|
|   536938|1680.8799999999999|
|   537252|26.349999999999998|
|   537691|            310.57|
|   538041|               0.0|
|   538184|458.91999999999985|
|   538517|320.28000000000003|
|   538879| 338.9799999999999|
|   539275|403.79999999999995|
|   539630|             751.0|
+---------+------------------+
only showing top 10 rows



In [42]:
# groupBy invoiceId to find how many unique invoices or distinct 
# InvoiceNo|Quantity|InvoiceDate|Amount
# .collect()
invoiceDf = dataSet\
       .select("InvoiceNo")\
       .filter("InvoiceNo IS NOT NULL")\
       .distinct()
        
invoiceDf.show()
invoiceDf.explain(True)
print("Count ",invoiceDf.count())

+---------+
|InvoiceNo|
+---------+
|   536596|
|   536938|
|   537252|
|   537691|
|   538041|
|   538184|
|   538517|
|   538879|
|   539275|
|   539630|
|   540499|
|   540540|
|  C540850|
|   540976|
|   541432|
|   541518|
|   541783|
|   542026|
|   542375|
|  C542604|
+---------+
only showing top 20 rows

== Parsed Logical Plan ==
Deduplicate [InvoiceNo#357]
+- Filter isnotnull(InvoiceNo#357)
   +- Project [InvoiceNo#357]
      +- Relation[InvoiceNo#357,StockCode#358,Description#359,Quantity#360,InvoiceDate#361,UnitPrice#362,CustomerID#363,Country#364] csv

== Analyzed Logical Plan ==
InvoiceNo: string
Deduplicate [InvoiceNo#357]
+- Filter isnotnull(InvoiceNo#357)
   +- Project [InvoiceNo#357]
      +- Relation[InvoiceNo#357,StockCode#358,Description#359,Quantity#360,InvoiceDate#361,UnitPrice#362,CustomerID#363,Country#364] csv

== Optimized Logical Plan ==
Aggregate [InvoiceNo#357], [InvoiceNo#357]
+- Project [InvoiceNo#357]
   +- Filter isnotnull(InvoiceNo#357)
      +- Relati

In [4]:
spark.stop()