In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")
print (spark.sparkContext)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/10 17:24:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


<SparkContext master=local[*] appName=pyspark-shell>


In [2]:
from pyspark.sql.functions import to_date, date_trunc, minute, unix_timestamp, from_unixtime, when, col, min, max, lit

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, LongType, TimestampType

sellsSchema = StructType([
    StructField("InvoiceNo", LongType(), True),
    StructField("StockCode", LongType(), True),
    StructField("Description", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("InvoiceDate", TimestampType(), True),
    StructField("UnitPrice", DoubleType(), True),
    StructField("CustomerID", LongType(), True),
    StructField("Country", StringType(), True)
])

In [4]:
sellsRaw = spark.read.option("header", "true").option("timestampFormat","mm/d/yyyy h:mm").csv("./Data/datautf8.csv", schema=sellsSchema)
sellsRaw.show()

                                                                                

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|     NULL|WHITE HANGING HEA...|       6|2010-01-01 08:26:00|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-01-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|     NULL|CREAM CUPID HEART...|       8|2010-01-01 08:26:00|     2.75|     17850|United Kingdom|
|   536365|     NULL|KNITTED UNION FLA...|       6|2010-01-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|     NULL|RED WOOLLY HOTTIE...|       6|2010-01-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-01-01 08:26:00|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS S

In [5]:
#  Total No Of Sells Records
sellsRaw.count()

                                                                                

541909

In [6]:
# Total No Of Sells Day Wise

sellsWithDate = sellsRaw.withColumn("InvoiceDateOnly",to_date(sellsRaw.InvoiceDate))

In [7]:
# sellsWithDate.show()
sellsWithDate.groupBy("InvoiceDateOnly").count().show()



+---------------+-----+
|InvoiceDateOnly|count|
+---------------+-----+
|     2011-01-30|15415|
|     2011-01-29|11925|
|     2011-01-23|16842|
|     2011-01-25|16232|
|     2011-01-03|12293|
|     2011-01-01|14423|
|     2011-01-14|15998|
|     2010-01-09| 2891|
|     2010-01-19|  522|
|     2011-01-04|19617|
|     2011-01-28|17265|
|     2011-01-24|16888|
|     2011-01-15|14341|
|     2010-01-06| 3878|
|     2010-01-10| 2758|
|     2011-01-06|20304|
|     2011-01-16|13813|
|     2010-01-07| 2963|
|     2010-01-01| 3108|
|     2011-01-05|19379|
+---------------+-----+
only showing top 20 rows



                                                                                

In [8]:
# Total No Of Sell Hourly Discretized
# If Timestamps minute is <=30 then round it to hour
# If Timestamps minute is > 30 then round it to hour + 1

SellsWithHourlyDescreteTimestamp = sellsRaw\
    .withColumn("minutes", minute(sellsRaw.InvoiceDate)) \
    .withColumn("unixInvoiceDate", unix_timestamp(sellsRaw.InvoiceDate)) \
    .withColumn("DescreteUnixInvoiceDate", when(col("minutes") <= 30, col("unixInvoiceDate")-60*col("minutes")).otherwise(col("unixInvoiceDate")+(60-col("minutes"))*60)) \
    .withColumn("DescreteInvoiceDate", from_unixtime(col("DescreteUnixInvoiceDate"))) \
    .drop("minutes", "unixInvoiceDate", "DescreteUnixInvoiceDate")

hourlyDescreteSells = SellsWithHourlyDescreteTimestamp.groupBy(SellsWithHourlyDescreteTimestamp.DescreteInvoiceDate)
hourlyDescreteSellsCounts = hourlyDescreteSells.count()

In [9]:
# As above timestamp has so many empty timeslots in between will add those to create continues time series
minDescreteInvoiceDate, maxDescreteInvoiceDate = hourlyDescreteSellsCounts.agg(min(hourlyDescreteSellsCounts.DescreteInvoiceDate), max(hourlyDescreteSellsCounts.DescreteInvoiceDate)).first()
print(minDescreteInvoiceDate)
print (maxDescreteInvoiceDate)
from datetime import datetime
unixminDescreteInvoiceDate = datetime.strptime(minDescreteInvoiceDate, "%Y-%m-%d %H:%M:%S").strftime('%s')
unixmaxDescreteInvoiceDate = datetime.strptime(maxDescreteInvoiceDate, "%Y-%m-%d %H:%M:%S").strftime('%s')


CompleteTimeSeries = spark.range(unixminDescreteInvoiceDate, unixmaxDescreteInvoiceDate, 60*60).select(col("id").cast("timestamp").alias("TimeSeries"), lit(0).alias("count"))
CompleteTimeSeries.show()

                                                                                

2010-01-01 00:00:00
2011-01-31 20:00:00
+-------------------+-----+
|         TimeSeries|count|
+-------------------+-----+
|2010-01-01 00:00:00|    0|
|2010-01-01 01:00:00|    0|
|2010-01-01 02:00:00|    0|
|2010-01-01 03:00:00|    0|
|2010-01-01 04:00:00|    0|
|2010-01-01 05:00:00|    0|
|2010-01-01 06:00:00|    0|
|2010-01-01 07:00:00|    0|
|2010-01-01 08:00:00|    0|
|2010-01-01 09:00:00|    0|
|2010-01-01 10:00:00|    0|
|2010-01-01 11:00:00|    0|
|2010-01-01 12:00:00|    0|
|2010-01-01 13:00:00|    0|
|2010-01-01 14:00:00|    0|
|2010-01-01 15:00:00|    0|
|2010-01-01 16:00:00|    0|
|2010-01-01 17:00:00|    0|
|2010-01-01 18:00:00|    0|
|2010-01-01 19:00:00|    0|
+-------------------+-----+
only showing top 20 rows



In [None]:
CompleteTimeseriesSellsCount = CompleteTimeSeries \
    .join(hourlyDescreteSellsCounts, CompleteTimeSeries.TimeSeries == hourlyDescreteSellsCounts.DescreteInvoiceDate, how="left_outer") \
    .select("TimeSeries", (when(hourlyDescreteSellsCounts.DescreteInvoiceDate.isNull(), CompleteTimeSeries["count"]).otherwise(hourlyDescreteSellsCounts["count"])).alias("count")) \
    .orderBy("TimeSeries")

CompleteTimeseriesSellsCount.show()

