In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType
from time import sleep

In [2]:
# Build the Spark configuration in one fluent chain: cluster master, application
# name, then the driver/executor resource settings.
spark_conf = (
    SparkConf()
    .setMaster("spark://master:7077")
    .setAppName("Lab7_3")
    .setAll([
        ("spark.driver.memory", "2g"),
        ("spark.executor.cores", "1"),
        ("spark.driver.cores", "1"),
    ])
)

# The SparkSession is the entry point to the Spark SQL engine; reuse an existing
# session if one is already running, otherwise create it from the config above.
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

In [3]:
# Required whenever gs:// paths are used: register the GCS connector classes
# with the Hadoop file-system configuration of the running Spark context.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
for _fs_key, _fs_impl in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    hadoop_conf.set(_fs_key, _fs_impl)

# Cloud Storage bucket that the BigQuery connector uses for its temporary
# export data.
bucket = "delabs-temp"
spark.conf.set("temporaryGcsBucket", bucket)

In [4]:
# Explicit schema for the streamed JSON activity records. Every field is
# declared nullable; the (name, type) pairs below keep the original field order.
data_schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("Arrival_Time", LongType),
        ("Creation_Time", LongType),
        ("Device", StringType),
        ("Index", LongType),
        ("Model", StringType),
        ("User", StringType),
        ("gt", StringType),
        ("x", DoubleType),
        ("y", DoubleType),
        ("z", DoubleType),
    )
])

In [5]:
# Source: stream the JSON files under ../data/activity using the explicit
# schema, picking up at most one new file per trigger.
sdf = (
    spark.readStream
    .schema(data_schema)
    .option("maxFilesPerTrigger", 1)
    .json("../data/activity")
)

In [6]:
# Transformation: running count of records per value of the "gt" column.
activity_counts = (
    sdf
    .groupBy("gt")
    .count()
)

In [7]:
def batch_function(df, batch_id):
    """Write one micro-batch to BigQuery using the batch (non-streaming) writer.

    Intended as a ``foreachBatch`` callback: it receives the micro-batch as a
    regular DataFrame, so the ordinary ``write`` / ``save`` API can be used.

    Args:
        df: DataFrame holding the current micro-batch.
        batch_id: Identifier of the micro-batch (not used here).
    """
    # Configure the writer step by step, then trigger the save. The target
    # table is replaced on every call because the mode is "overwrite".
    writer = df.write.format("bigquery")
    writer = writer.option("table", "delabs.lab7.activitycounts")
    writer = writer.mode("overwrite")
    writer.save()

In [8]:
# Sink: start the streaming query that sends the counts to a BigQuery table.
#   - "complete" output mode
#   - processing-time trigger with a two-second micro-batch interval
#   - each micro-batch is handed to batch_function, which performs the write
# The temporary GCS bucket configured earlier must be a bucket you own.
activity_query = (
    activity_counts.writeStream
    .outputMode("complete")
    .trigger(processingTime="2 seconds")
    .foreachBatch(batch_function)
    .start()
)

In [9]:
# Block on the streaming query and always release its resources afterwards.
try:
    # Returns only once the query has terminated; raises if the query failed.
    activity_query.awaitTermination()
except KeyboardInterrupt:
    # Interrupting the kernel is the expected way to end the stream, so the
    # interrupt is absorbed here instead of surfacing as a traceback.
    pass
finally:
    # Clean up on every exit path (interrupt, normal termination, or a query
    # failure) so a failed run does not leave the query and session alive.
    activity_query.stop()
    # Stop the spark context
    spark.stop()

    print("Stopped the streaming query and the spark context")

Stoped the streaming query and the spark context


In [10]:
# Stop the Spark session and its underlying Spark context. The streaming cell
# above may already have stopped it; this cell makes sure the session is
# released at the end of the notebook in any case.
spark.stop()