## Initialize a Spark Session

In [1]:
import findspark
findspark.init()
import pyspark
# Creating a SparkSession in Python
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local")\
          .appName("Spark Streaming Demonstration")\
          .config("spark.some.config.option", "some-value")\
          .getOrCreate()
# keep the size of shuffles small
spark.conf.set("spark.sql.shuffle.partitions", "2") 

## Task 1: Discover a method to simulate a stream by utilizing data sourced from files


1. Define input path

In [2]:
# input path in local filesystem
inputPath = "./"

2. Get schema from input files

In [3]:
staticInputDF = (
  spark
    .read
    .csv(inputPath)
)

schema = staticInputDF.schema

In [4]:
schema

StructType([StructField('_c0', StringType(), True), StructField('_c1', StringType(), True), StructField('_c2', StringType(), True), StructField('_c3', StringType(), True), StructField('_c4', StringType(), True), StructField('_c5', StringType(), True), StructField('_c6', StringType(), True), StructField('_c7', StringType(), True), StructField('_c8', StringType(), True), StructField('_c9', StringType(), True), StructField('_c10', StringType(), True), StructField('_c11', StringType(), True), StructField('_c12', StringType(), True), StructField('_c13', StringType(), True), StructField('_c14', StringType(), True), StructField('_c15', StringType(), True), StructField('_c16', StringType(), True), StructField('_c17', StringType(), True), StructField('_c18', StringType(), True), StructField('_c19', StringType(), True)])

3. Tạo `streaming_df` và lấy ra các cột `Action` và `Time`

In [5]:
import pyspark.sql.functions as f

streaming_df = (
  spark
    .readStream
    .schema(schema)          # Set the schema of the csv data
#     .option("maxFilesPerTrigger", 1)
    .csv(inputPath)
)

streaming_df = streaming_df.select(f.col('_c0').alias('Action'), f.col('_c3').alias('Time'))

4. Create `streaming_df_count` for counting trips by drop-off datetime 

In [6]:
from pyspark.sql.functions import *      # for window() function
streaming_df_count = (
  streaming_df
    .groupBy(window(streaming_df.Time, "1 hour"))    
    .count()
)

print('is process Counting streaming?', streaming_df_count.isStreaming)

is process Counting streaming? True


 ## Task 2: Create query that aggregates the number of trips by dropoff datetime for each hour.

In [9]:
# This query stores the aggregation results in memory then visualize it
query = (
  streaming_df_count
    .writeStream
    .format("memory")         # console or memory(= store in-memory table) 
    .queryName("counts")      # counts = name of the in-memory table
    .outputMode("complete")   
    .option("truncate", "false")
    .start()
)
query.awaitTermination(600)

query.stop()

result = spark.sql('select * from counts order by window').show(truncate=False)

+------------------------------------------+-----+
|window                                    |count|
+------------------------------------------+-----+
|{2015-12-01 00:00:00, 2015-12-01 01:00:00}|1645 |
|{2015-12-01 01:00:00, 2015-12-01 02:00:00}|5780 |
|{2015-12-01 02:00:00, 2015-12-01 03:00:00}|3605 |
|{2015-12-01 03:00:00, 2015-12-01 04:00:00}|2426 |
|{2015-12-01 04:00:00, 2015-12-01 05:00:00}|2505 |
|{2015-12-01 05:00:00, 2015-12-01 06:00:00}|3858 |
|{2015-12-01 06:00:00, 2015-12-01 07:00:00}|10258|
|{2015-12-01 07:00:00, 2015-12-01 08:00:00}|19007|
|{2015-12-01 08:00:00, 2015-12-01 09:00:00}|7321 |
+------------------------------------------+-----+



6. Create folders and files to store each intervals 

In [10]:
import os

count = 360000
for row in result.collect():
#     print(row)
    newpath = r'./output-' + str(count) 
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    with open(newpath + f"/output-{count}.txt", "w") as file:
        file.write(str(row['count']))
    count += 360000

AttributeError: 'NoneType' object has no attribute 'collect'