# End to End Pure Streaming Data-Pipeline for Landlord Table Using Spark Structured Streaming on Databricks

###### Description: In this notebook we read landlord state rows from incoming csv files into a streaming dataframe, transform (clean, cast, rename) the data, and add/update the latest state in a Databricks Delta table
###### Objective: (incoming csv files) --> "landlord_streamingDF" --> "results_df" --> "landlord_data"

In [0]:
import requests
import json
import optimus as op
import phonenumbers 
import re
import datetime
import time

from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import unix_timestamp, from_unixtime
from pyspark.sql import functions as F
from pyspark.sql.window import Window as W
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit
from pyspark.sql.functions import rank, col

In [0]:
# Fields carried for every landlord row besides its id, in source-column order.
_landlord_payload_fields = [
    StructField("Password", StringType(), True),
    StructField("Landlord_name", StringType(), False),
    StructField("Address_line_1", StringType(), False),
    StructField("City", StringType(), False),
    StructField("Post_code", StringType(), True),
    StructField("Region", StringType(), True),
    StructField("event_time", TimestampType(), True),
]

# Schema applied to the incoming landlord CSV files: the id followed by the payload fields.
landlord_schema = StructType(
    [StructField("Landlord_id", IntegerType(), False)] + _landlord_payload_fields
)

# Element schema returned by the zip UDF: the payload fields only (no Landlord_id).
landlord_udf_schema = StructType(list(_landlord_payload_fields))

###### Description: Get landlord csv files as a streaming "landlord_streamingDF" and process it on the fly and get transformed stream "landlord_df"
###### Objective: (incoming csv files) --> "landlord_streamingDF" --> "landlord_df"

In [0]:
# Get the landlord streaming DataFrame from csv files

# Streaming starts here by reading the input files from the landing directory.
landlord_Path = "/FileStore/apartment/landlord/inprogress/"
# The reader applies landlord_schema explicitly, treats the first line of each file as a
# header, allows multi-line records, and limits each trigger to one file (maxFilesPerTrigger=1).
landlord_streamingDF = (
  spark
    .readStream
    .schema(landlord_schema)
    .option("maxFilesPerTrigger", "1")
    .option("header", "true")
    .option("multiLine", "true")
    .csv(landlord_Path)
)
# Clear invalid rows: keep only rows that have a Landlord_id
landlord_df = landlord_streamingDF.select("*").where("Landlord_id IS NOT NULL")
# Instantiation of the Optimus DataFrameTransformer class on the filtered stream.
# NOTE(review): landlord_df is never reassigned from `transformer` below, so the later cells
# appear to use the un-transformed landlord_df — confirm whether the cleaned frame must be
# retrieved from the transformer.
transformer = op.DataFrameTransformer(landlord_df)
# Replace NA with 0.0 in all columns (columns="*")
transformer.replace_na(0.0, columns="*")
# Clear accents: applied to all columns (columns='*'), not only the name column
transformer.clear_accents(columns='*')
# Remove special characters from the address-related columns listed below (not from every column)
transformer.remove_special_chars(columns=['Address_line_1', 'City', 'Post_code', 'Region'])

##### This function zips the collected per-column lists into a single column holding one struct per row

In [0]:
def my_fun(Password, Landlord_name, Address_line_1, City, Post_code, Region, event_time):
  """Zip the per-column value lists of one landlord into a list of row tuples.

  Each argument is the list of values collected for that column; the i-th tuple of the
  result combines the i-th element of every list, in the field order of
  landlord_udf_schema. As with the built-in zip, the result is truncated to the
  shortest input list.

  Returns a concrete list (not a lazy zip iterator) so the value matches the
  ArrayType return type declared for the UDF on Python 3 as well as Python 2.
  """
  return list(zip(Password, Landlord_name, Address_line_1, City, Post_code, Region, event_time))

# Spark UDF wrapper: returns an array of structs shaped like landlord_udf_schema.
udf_Fun = udf(my_fun, ArrayType(landlord_udf_schema))

In [0]:
# Show the column names available on the landlord stream (reference for the aggregation below).
# The previous bare tuple of undefined names raised NameError when the notebook was run top to bottom.
landlord_df.columns

In [0]:
# Pack every row's state columns into one struct BEFORE collecting.
# collect_list skips null values, so collecting nullable columns (Password, Post_code,
# Region) one by one and zipping the lists afterwards can pair values that belong to
# different rows. Collecting whole structs keeps each row intact and needs no Python UDF.
landlord_state_struct = F.struct(
    "Password",
    "Landlord_name",
    "Address_line_1",
    "City",
    "Post_code",
    "Region",
    "event_time")

# Per Landlord_id: gather all row structs and the newest event_time, then explode back to
# one output row per collected struct (column "data") alongside "latest_event_time".
intermediate_df = (
    landlord_df
    .withWatermark("event_time", "10 seconds")
    .groupBy("Landlord_id")
    .agg(F.collect_list(landlord_state_struct).alias("data"),
         F.max("event_time").alias("latest_event_time"))
    .select("Landlord_id",
            F.explode("data").alias("data"),
            "latest_event_time"))

##### Filter the data where event_time is latest

In [0]:
# Columns of the final result: the landlord id, the flattened fields of the "data" struct,
# and the newest event_time seen for that landlord.
latest_state_columns = [
    "Landlord_id",
    "data.Password",
    "data.Landlord_name",
    "data.Address_line_1",
    "data.City",
    "data.Post_code",
    "data.Region",
    "data.event_time",
    "latest_event_time",
]

# Keep only the row whose event_time equals the landlord's latest event_time, sorted by id.
latest_state_df = (
    intermediate_df
    .select(*latest_state_columns)
    .where("data.event_time=latest_event_time")
)
results_df = latest_state_df.orderBy("Landlord_id")

##### Display final result
###### This result shows the latest state of each unique Landlord_id

In [0]:
# Render the streaming result (rows filtered to the latest event_time per Landlord_id) in the notebook.
display(results_df)

Landlord_id,Password,Landlord_name,Address_line_1,City,Post_code,Region,event_time,latest_event_time
28,Thq6P8m,Laurence Pickthorne,08574 Dennis Drive,Salt Lake City,84130,Utah,2018-05-08T11:40:12.000+0000,2018-05-08T11:40:12.000+0000
31,SUfXFDT1ZL,Alfy Swatheridge,61360 7th Hill,Shreveport,71130,Louisiana,2018-05-07T18:20:09.000+0000,2018-05-07T18:20:09.000+0000
33,5gdi4FdXOUR,Cesaro Brunker,3 Summit Place,Spartanburg,29305,South Carolina,2018-01-20T22:42:38.000+0000,2018-01-20T22:42:38.000+0000
59,2E58omBiH,Ambrosius Ridgewell,64 Shelley Junction,Jeffersonville,47134,Indiana,2017-05-14T23:13:02.000+0000,2017-05-14T23:13:02.000+0000
60,7hOECUg7,Gusty Garrould,202 American Crossing,Salem,97306,Oregon,2017-12-26T02:31:22.000+0000,2017-12-26T02:31:22.000+0000
88,RhTHkvl,Marci Schettini,90 Dwight Center,El Paso,79911,Texas,2016-11-30T20:10:32.000+0000,2016-11-30T20:10:32.000+0000
124,CzLHSynx4rQZ,Collen Blackstock,4 Graedel Junction,El Paso,88563,Texas,2018-06-05T08:46:57.000+0000,2018-06-05T08:46:57.000+0000
182,ssTScp,Northrop Haxbie,7057 Morning Pass,Milwaukee,53234,Wisconsin,2017-12-26T02:02:49.000+0000,2017-12-26T02:02:49.000+0000
191,JyOw4TYA,Marcella Brahms,5810 Quincy Parkway,Portland,97206,Oregon,2018-06-12T10:37:22.000+0000,2018-06-12T10:37:22.000+0000
193,nUHC1pem,Honoria Etoile,38 Glacier Hill Circle,Oklahoma City,73114,Oklahoma,2017-04-25T22:28:53.000+0000,2017-04-25T22:28:53.000+0000


##### Below cells are optional if external functionality or storage is needed

###### Write the stream to a Databricks Delta table for storage

In [0]:
# Configure the Delta sink: complete output mode, schema merging enabled, and a
# dedicated checkpoint directory for this aggregation.
landlord_delta_writer = (
    results_df.writeStream
    .format("delta")
    .outputMode("complete")
    .option("mergeSchema", "true")
    .option("checkpointLocation", "/delta/apartment/landlord/_checkpoints/streaming-agg")
)

# Start the query; the handle is kept so the stream can be inspected or stopped later.
streaming_query = landlord_delta_writer.start("/delta/apartment/landlord_data")

#### Read the Delta Table as a Static or Streaming DataFrame
#### This dataframe will always be Up-To-Date

In [0]:
# Batch (static) read of the Delta table written by the streaming query, sorted by landlord id.
landlord_data = (
    spark.read
    .format("delta")
    .load("/delta/apartment/landlord_data")
    .orderBy("Landlord_id")
)

In [0]:
# Render the static read of the Delta table, ordered by Landlord_id.
display(landlord_data)

Landlord_id,Password,Landlord_name,Address_line_1,City,Post_code,Region,event_time,latest_event_time
28,Thq6P8m,Laurence Pickthorne,08574 Dennis Drive,Salt Lake City,84130,Utah,2018-05-08T11:40:12.000+0000,2018-05-08T11:40:12.000+0000
31,SUfXFDT1ZL,Alfy Swatheridge,61360 7th Hill,Shreveport,71130,Louisiana,2018-05-07T18:20:09.000+0000,2018-05-07T18:20:09.000+0000
33,5gdi4FdXOUR,Cesaro Brunker,3 Summit Place,Spartanburg,29305,South Carolina,2018-01-20T22:42:38.000+0000,2018-01-20T22:42:38.000+0000
59,2E58omBiH,Ambrosius Ridgewell,64 Shelley Junction,Jeffersonville,47134,Indiana,2017-05-14T23:13:02.000+0000,2017-05-14T23:13:02.000+0000
60,7hOECUg7,Gusty Garrould,202 American Crossing,Salem,97306,Oregon,2017-12-26T02:31:22.000+0000,2017-12-26T02:31:22.000+0000
88,RhTHkvl,Marci Schettini,90 Dwight Center,El Paso,79911,Texas,2016-11-30T20:10:32.000+0000,2016-11-30T20:10:32.000+0000
124,CzLHSynx4rQZ,Collen Blackstock,4 Graedel Junction,El Paso,88563,Texas,2018-06-05T08:46:57.000+0000,2018-06-05T08:46:57.000+0000
182,ssTScp,Northrop Haxbie,7057 Morning Pass,Milwaukee,53234,Wisconsin,2017-12-26T02:02:49.000+0000,2017-12-26T02:02:49.000+0000
191,JyOw4TYA,Marcella Brahms,5810 Quincy Parkway,Portland,97206,Oregon,2018-06-12T10:37:22.000+0000,2018-06-12T10:37:22.000+0000
193,nUHC1pem,Honoria Etoile,38 Glacier Hill Circle,Oklahoma City,73114,Oklahoma,2017-04-25T22:28:53.000+0000,2017-04-25T22:28:53.000+0000


### Do Some Live Streaming Graphs

In [0]:
# Open the same Delta table as a streaming source for the live graphs below.
landlord_data_stream = (
    spark.readStream
    .format("delta")
    .load("/delta/apartment/landlord_data")
)

In [0]:
# Live count of landlord rows per Region, rendered as a streaming display.
landlords_per_region = landlord_data_stream.groupBy("Region").count()
display(landlords_per_region)

Region,count
Utah,1
Ohio,1
Oregon,2
Texas,4
Pennsylvania,2
Nevada,1
Washington,2
Illinois,1
Oklahoma,1
Delaware,1
