In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

In [2]:
# Get the running SparkSession, or start one, under the app name "Spark Example".
scSpark = (
    SparkSession.builder
    .appName("Spark Example")
    .getOrCreate()
)

In [3]:
# Load every CSV matching the "df*.csv" pattern into one DataFrame; the first
# line of each file is treated as the header row.
df_merged = scSpark.read.option("header", True).csv("df*.csv")

In [4]:
# Preview: take the first 5 rows as a new DataFrame and print them as a text table.
df_merged.limit(5).show()

+-----------+----------+-----------+
|customer_id|trans_date|tran_amount|
+-----------+----------+-----------+
|     CS5276|2014-09-02|         62|
|     CS4304|2014-04-03|         94|
|     CS5412|2011-11-07|         59|
|     CS3310|2012-01-02|         36|
|     CS4032|2012-05-15|         42|
+-----------+----------+-----------+



In [5]:
# Replace "tran_amount" with an integer-typed version of itself so it can be
# aggregated numerically. Re-running this cell is harmless: the cast is idempotent.
df_merged = df_merged.withColumn(
    colName="tran_amount",
    col=df_merged.tran_amount.cast(IntegerType()),
)

In [6]:
# Total transaction amount per calendar date, using the DataFrame API.
# The resulting aggregate column is named "sum(tran_amount)".
transactions_per_day = (
    df_merged
    .groupBy("trans_date")
    .agg({"tran_amount": "sum"})
)
transactions_per_day.show()

+----------+----------------+
|trans_date|sum(tran_amount)|
+----------+----------------+
|2014-05-27|             232|
|2013-03-14|              97|
|2014-12-13|             247|
|2014-02-22|              50|
|2012-10-21|             362|
|2014-07-14|             169|
|2012-03-04|             218|
|2015-02-27|             242|
|2014-12-11|             230|
|2015-02-28|             357|
|2015-02-26|             221|
|2012-03-09|             205|
|2011-07-16|             153|
|2012-01-12|              66|
|2013-12-28|             319|
|2015-02-08|             298|
|2014-11-01|             173|
|2014-03-17|             438|
|2013-11-08|              89|
|2012-05-30|             392|
+----------+----------------+
only showing top 20 rows



In [7]:
# Expose the DataFrame to Spark SQL as a temporary view called "sales"
# (replacing any existing view with that name).
df_merged.createOrReplaceTempView(name="sales")

In [8]:
# Sanity check: read everything back through the "sales" view with plain SQL.
output = scSpark.sql(sqlQuery="SELECT * from sales")
output.show()

+-----------+----------+-----------+
|customer_id|trans_date|tran_amount|
+-----------+----------+-----------+
|     CS5276|2014-09-02|         62|
|     CS4304|2014-04-03|         94|
|     CS5412|2011-11-07|         59|
|     CS3310|2012-01-02|         36|
|     CS4032|2012-05-15|         42|
|     CS5240|2014-11-01|        104|
|     CS5561|2011-11-14|         36|
|     CS4543|2012-04-05|         81|
|     CS4375|2013-08-30|         36|
|     CS6051|2013-12-06|         85|
|     CS2881|2014-05-06|         94|
|     CS2376|2012-01-25|         63|
|     CS1505|2012-08-10|         77|
|     CS1575|2013-07-16|         36|
|     CS5461|2014-05-01|         73|
|     CS2573|2012-03-03|         40|
|     CS1996|2012-04-27|         47|
|     CS2987|2013-12-14|         42|
|     CS4294|2014-11-20|         50|
|     CS2891|2012-08-04|         83|
+-----------+----------+-----------+
only showing top 20 rows



In [9]:
# Per-day total of tran_amount, expressed in SQL against the "sales" temp view.
# This rebinds `transactions_per_day`, previously set by the DataFrame-API
# aggregation; the aggregate column here is aliased to "total".
#
# The query text deliberately has no trailing ";": a single statement passed to
# sql() does not need one, and older Spark parsers reject it.
transactions_per_day = scSpark.sql(
    "SELECT trans_date, SUM(tran_amount) as total FROM sales GROUP BY trans_date"
)
transactions_per_day.show()

+----------+-----+
|trans_date|total|
+----------+-----+
|2014-05-27|  232|
|2013-03-14|   97|
|2014-12-13|  247|
|2014-02-22|   50|
|2012-10-21|  362|
|2014-07-14|  169|
|2012-03-04|  218|
|2015-02-27|  242|
|2014-12-11|  230|
|2015-02-28|  357|
|2015-02-26|  221|
|2012-03-09|  205|
|2011-07-16|  153|
|2012-01-12|   66|
|2013-12-28|  319|
|2015-02-08|  298|
|2014-11-01|  173|
|2014-03-17|  438|
|2013-11-08|   89|
|2012-05-30|  392|
+----------+-----+
only showing top 20 rows

