# End to End Pure Streaming Data-Pipeline for Apartment Maintenance Table Using Spark Structured Streaming on Databricks

###### Description: In this notebook we read apt_maintenance state rows from incoming CSV files into a streaming DataFrame, transform (clean, cast, rename) the data, and add/update the latest state in a Databricks Delta table
###### Objective: (incoming csv files) --> "apt_maintenance_streamingDF" --> "results_df" --> "apt_maintenance_data"

In [0]:
import requests
import json
import optimus as op
import phonenumbers 
import re
import datetime
import time

from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import unix_timestamp, from_unixtime
from pyspark.sql import functions as F
from pyspark.sql.window import Window as W
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit
from pyspark.sql.functions import rank, col

In [0]:
# Row layout of the incoming apartment-maintenance CSV files.
# Each .add(name, type, nullable) call declares one column, in file order.
apt_maintenance_schema = (
    StructType()
    .add("Maintenance_id", IntegerType(), False)
    .add("Apartment_number", IntegerType(), True)
    .add("Mdate", TimestampType(), False)
    .add("Issue_reported", StringType(), True)
    .add("Contractor_id", IntegerType(), True)
    .add("Resolution", StringType(), True)
    .add("Status", StringType(), True)
    .add("Charges_incurred", StringType(), True)
    .add("event_time", TimestampType(), True)
)

# Same layout without the Maintenance_id key column; this is the element type
# of the array returned by the zip UDF.
apt_maintenance_udf_schema = (
    StructType()
    .add("Apartment_number", IntegerType(), True)
    .add("Mdate", TimestampType(), False)
    .add("Issue_reported", StringType(), True)
    .add("Contractor_id", IntegerType(), True)
    .add("Resolution", StringType(), True)
    .add("Status", StringType(), True)
    .add("Charges_incurred", StringType(), True)
    .add("event_time", TimestampType(), True)
)

###### Description: Get apt_maintenance csv files as a streaming "apt_maintenance_streamingDF" and process it on the fly and get transformed stream "apt_maintenance_df"
###### Objective: (incoming csv files) --> "apt_maintenance_streamingDF" --> "apt_maintenance_df"

In [0]:
# Build the apt_maintenance streaming DataFrame from the incoming CSV files.

# The stream starts here: the directory below is the file source.
apt_maintenance_Path = "/FileStore/apartment/apartment_maintenance/inprogress/"
apt_maintenance_streamingDF = (
    spark.readStream
    .schema(apt_maintenance_schema)
    .options(maxFilesPerTrigger="1", header="true", multiLine="true")
    .csv(apt_maintenance_Path)
)
# Drop invalid rows, i.e. rows that carry no Maintenance_id.
apt_maintenance_df = apt_maintenance_streamingDF.select("*").filter("Maintenance_id IS NOT NULL")
# Wrap the filtered stream in an Optimus DataFrameTransformer.
# NOTE(review): nothing below reads the transformer's result back, so
# apt_maintenance_df presumably stays the un-transformed filtered stream --
# confirm whether the transformed DataFrame was meant to be reassigned.
transformer = op.DataFrameTransformer(apt_maintenance_df)
# Replace NA values with 0.0 (requested for every column via "*").
transformer.replace_na(0.0, columns="*")
# Clear accents (requested for every column via '*').
transformer.clear_accents(columns='*')
# Special-character removal is left disabled; the column names in the line
# below do not appear in apt_maintenance_schema.
# transformer.remove_special_chars(columns=['apt_maintenance_name', 'Address_line_1', 'City', 'Post_code', 'Region'])

##### This function zips the per-column lists collected for each Maintenance_id into a single column holding an array of row structs

In [0]:
def my_fun(Apartment_number, Mdate, Issue_reported, Contractor_id, Resolution, Status, Charges_incurred, event_time):
  """Zip eight parallel per-column sequences into a list of row tuples.

  Each argument is a sequence holding the values of one column; element i of
  every sequence is combined into the i-th output tuple, in argument order.

  Returns:
    A list of 8-tuples. As with the built-in zip, the result is truncated to
    the length of the shortest input; empty inputs give an empty list.
  """
  # Materialise the zip: on Python 3 a bare zip() is a lazy, one-shot iterator,
  # whereas an array-typed UDF result should be a concrete list.
  return list(zip(Apartment_number, Mdate, Issue_reported, Contractor_id, Resolution, Status, Charges_incurred, event_time))

# Register my_fun as a Spark UDF whose result is an array of structs laid out
# like apt_maintenance_udf_schema.
udf_Fun = F.udf(my_fun, returnType=ArrayType(apt_maintenance_udf_schema))

In [0]:
# For every Maintenance_id, gather all of its state rows together with the
# newest event_time seen for that id, then emit one output row per state row:
#   Maintenance_id | data (struct of the row's columns) | latest_event_time
#
# Each input row is collected as ONE struct rather than as eight independent
# per-column lists: collect_list skips null values, so per-column lists can end
# up with different lengths whenever a field (e.g. Resolution) is null, and
# zipping them back together would then mix values from different rows.
# A struct is itself non-null even if some of its fields are null, so every row
# stays intact.
intermediate_df = (
    apt_maintenance_df.withWatermark("event_time", "10 seconds")
    .groupBy("Maintenance_id")
    .agg(
        F.collect_list(
            F.struct(
                "Apartment_number",
                "Mdate",
                "Issue_reported",
                "Contractor_id",
                "Resolution",
                "Status",
                "Charges_incurred",
                "event_time"
            )
        ).alias("rows"),
        F.max("event_time").alias("latest_event_time")
    )
    # One output row per collected state row; the struct keeps the field
    # names, so downstream code can still address them as data.<column>.
    .select("Maintenance_id", F.explode("rows").alias("data"), "latest_event_time")
)

##### Filter the data where event_time is latest

In [0]:
# Flatten the `data` struct back into top-level columns and keep, per
# Maintenance_id, only the row(s) whose event_time equals that id's latest
# event_time; the result is ordered by Maintenance_id.
results_df = (
    intermediate_df
    .select(
        ["Maintenance_id"]
        + ["data." + field_name for field_name in (
            "Apartment_number",
            "Mdate",
            "Issue_reported",
            "Contractor_id",
            "Resolution",
            "Status",
            "Charges_incurred",
            "event_time",
        )]
        + ["latest_event_time"]
    )
    .filter(F.col("data.event_time") == F.col("latest_event_time"))
    .sort("Maintenance_id")
)

##### Display final result
###### This result shows the latest state for each unique Maintenance_id

In [0]:
# Render the latest-state streaming result in the notebook.
display(results_df)

##### Below cells are optional if external functionality or storage is needed

###### Write the stream to a Databricks Delta table for storage

In [0]:
# Start a streaming write of results_df to a Delta table path. The query runs
# in "complete" output mode, with mergeSchema enabled and its own checkpoint
# directory; the returned handle is kept in streaming_query.
streaming_query = (
    results_df.writeStream
    .format("delta")
    .outputMode("complete")
    .options(
        mergeSchema="true",
        checkpointLocation="/delta/apartment/apartment_maintenance/_checkpoints/streaming-agg",
    )
    .start(path="/delta/apartment/apartment_maintenance_data")
)

#### Read the Delta Table as a Static or Streaming DataFrame
#### This dataframe will always be Up-To-Date

In [0]:
# Batch (static) read of the Delta table written by the streaming query,
# ordered by Maintenance_id.
# NOTE(review): this load presumably fails if the streaming write has not yet
# committed a first batch to this path -- confirm on a fresh Run All.
apt_maintenance_data = spark.read.format("delta").load("/delta/apartment/apartment_maintenance_data").orderBy("Maintenance_id")

In [0]:
# Render the static read of the Delta table in the notebook.
display(apt_maintenance_data)

### Do Some Live Streaming Graphs

In [0]:
# Open the same Delta table path as a streaming source.
# NOTE(review): this path is written with outputMode("complete") earlier in the
# notebook; a Delta streaming source may reject non-append changes to its
# source table -- confirm this stream runs as-is before relying on it.
apt_maintenance_data_stream = spark.readStream.format("delta").load("/delta/apartment/apartment_maintenance_data")

In [0]:
# Live view: number of rows per Status value in the streamed Delta table.
display(apt_maintenance_data_stream.groupBy("Status").count())