# End to End Pure Streaming Data-Pipeline for Payment Table Using Spark Structured Streaming on Databricks

###### Description: In this notebook we read payment state rows from incoming csv files into a streaming dataframe, transform (clean, cast, rename) the data, and add/update the latest state in a Databricks Delta table
###### Objective: (incoming csv files) --> "payment_streamingDF" --> "results_df" --> "payment_data"

In [0]:
import requests
import json
import optimus as op
import phonenumbers 
import re
import datetime
import time

from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import unix_timestamp, from_unixtime
from pyspark.sql import functions as F
from pyspark.sql.window import Window as W
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit
from pyspark.sql.functions import rank, col

In [0]:
# Column layout of the incoming payment CSV files.
# Pay_amount is kept as a string (the displayed values carry a currency symbol, e.g. "$4566.21").
payment_schema = (
    StructType()
    .add("Payment_id", IntegerType(), False)
    .add("Pay_date", TimestampType(), True)
    .add("Pay_amount", StringType(), False)
    .add("Method", StringType(), True)
    .add("Rent_id", IntegerType(), True)
    .add("event_time", TimestampType(), True)
)

# Same layout without the Payment_id key: the struct type used as the
# element type of the array returned by the zipping UDF.
payment_udf_schema = (
    StructType()
    .add("Pay_date", TimestampType(), True)
    .add("Pay_amount", StringType(), False)
    .add("Method", StringType(), True)
    .add("Rent_id", IntegerType(), True)
    .add("event_time", TimestampType(), True)
)

###### Description: Get payment csv files as a streaming "payment_streamingDF" and process it on the fly and get transformed stream "payment_df"
###### Objective: (incoming csv files) --> "payment_streamingDF" --> "payment_df"

In [0]:
# Build the payment streaming DataFrame from the incoming csv files.

# Streaming starts here: csv files under this directory are the input,
# consumed at most one file per trigger.
payment_Path = "/FileStore/apartment/payment/inprogress/"
payment_streamingDF = (
    spark.readStream
    .schema(payment_schema)
    .options(maxFilesPerTrigger="1", header="true", multiLine="true")
    .csv(payment_Path)
)
# Drop invalid rows, i.e. rows without a payment id.
payment_df = payment_streamingDF.where("Payment_id IS NOT NULL")
# Wrap the filtered stream in an Optimus DataFrameTransformer.
transformer = op.DataFrameTransformer(payment_df)
# Replace NA values with 0.0 in every column.
transformer.replace_na(0.0, columns="*")
# Clear accents in every column (columns='*').
transformer.clear_accents(columns="*")
# NOTE(review): the following cells read `payment_df`, not `transformer`;
# confirm whether the cleaned frame was meant to feed the rest of the pipeline.
# Remove special characters (currently disabled):
# transformer.remove_special_chars(columns=['payment_name', 'Address_line_1', 'City', 'Post_code', 'Region'])

##### This function zips the collected per-column lists back together into a single array column of row structs

In [0]:
def my_fun(Pay_date, Pay_amount, Method, Rent_id, event_time):
  """Zip five parallel per-column lists into a list of row tuples.

  Each argument is a list holding the values of one column; element ``i`` of
  the result is ``(Pay_date[i], Pay_amount[i], Method[i], Rent_id[i],
  event_time[i])``.

  Returns:
    A concrete ``list`` of 5-tuples. The result is materialised because on
    Python 3 ``zip`` yields a lazy, single-use iterator, whereas this function
    is registered as a UDF returning an array.

  Note:
    ``zip`` stops at the shortest input, so lists of unequal length are
    silently truncated to the shortest one.
  """
  return list(zip(Pay_date, Pay_amount, Method, Rent_id, event_time))

# Register my_fun as a Spark UDF whose declared return type is an array of
# payment_udf_schema structs.
udf_Fun = udf(f=my_fun, returnType=ArrayType(elementType=payment_udf_schema))

In [0]:
# For every Payment_id, gather all of its state rows together with the most
# recent event_time seen for that id, then emit one output row per state row:
#   Payment_id | data (struct of the five payment columns) | latest_event_time
#
# Each row is collected as ONE struct rather than as five independent
# per-column lists. collect_list skips NULL values, so separate lists could end
# up with different lengths/positions whenever a nullable column (Pay_date,
# Method, Rent_id) is NULL, and zipping them back together would then pair
# values that belong to different rows. A struct is never NULL itself, so each
# row stays intact and no Python UDF round-trip is required.
intermediate_df = (
    payment_df.withWatermark("event_time", "10 seconds")
    .groupBy("Payment_id")
    .agg(
        F.collect_list(
            F.struct("Pay_date", "Pay_amount", "Method", "Rent_id", "event_time")
        ).alias("rows"),
        F.max("event_time").alias("latest_event_time"),
    )
    # Explode the collected structs back into one row per payment state.
    .select("Payment_id", F.explode("rows").alias("data"), "latest_event_time")
)

##### Filter the data where event_time is latest

In [0]:
# Keep, for each Payment_id, only the state row whose event_time equals that
# id's latest event_time, flatten the `data` struct into top-level columns and
# order the result by Payment_id.
results_df = (
    intermediate_df
    .where(F.col("data.event_time") == F.col("latest_event_time"))
    .select(
        "Payment_id",
        "data.Pay_date",
        "data.Pay_amount",
        "data.Method",
        "data.Rent_id",
        "data.event_time",
        "latest_event_time",
    )
    .orderBy("Payment_id")
)

##### Display final result
###### This result shows the latest state of all the unique payment_id

In [0]:
# Render the streaming result (latest state per Payment_id) in the notebook.
display(results_df)

Payment_id,Pay_date,Pay_amount,Method,Rent_id,event_time,latest_event_time
1,2017-02-21T22:33:58.000+0000,$4566.21,Cash,993,2018-11-19T09:09:03.000+0000,2018-11-19T09:09:03.000+0000
2,2017-07-19T17:34:02.000+0000,$261.34,Cash,67,2017-03-26T07:07:51.000+0000,2017-03-26T07:07:51.000+0000
3,2017-05-23T04:08:41.000+0000,$425.66,Debit card,743,2016-11-27T19:48:41.000+0000,2016-11-27T19:48:41.000+0000
4,2017-03-21T12:12:55.000+0000,$1947.94,Cash,250,2017-02-06T09:41:52.000+0000,2017-02-06T09:41:52.000+0000
5,2017-03-07T09:22:09.000+0000,$4770.52,Debit card,344,2018-07-30T00:15:49.000+0000,2018-07-30T00:15:49.000+0000
6,2017-10-25T11:32:09.000+0000,$831.23,Cash,858,2017-03-30T10:57:08.000+0000,2017-03-30T10:57:08.000+0000
7,2016-11-28T04:46:21.000+0000,$2401.36,Cheque,253,2018-05-09T06:45:35.000+0000,2018-05-09T06:45:35.000+0000
8,2017-02-24T20:27:04.000+0000,$1291.14,Cash,892,2018-01-28T04:39:07.000+0000,2018-01-28T04:39:07.000+0000
9,2017-04-10T04:16:00.000+0000,$5159.82,Cash,384,2017-12-30T18:51:37.000+0000,2017-12-30T18:51:37.000+0000
10,2017-03-20T06:26:09.000+0000,$1102.48,Cheque,715,2018-08-17T22:48:54.000+0000,2018-08-17T22:48:54.000+0000


##### Below cells are optional if external functionality or storage is needed

###### Write the stream to a Databricks Delta table for storage

In [0]:
# Start a streaming write of the latest-state rows into a Delta table.
# Output mode is "complete"; progress is tracked in the checkpoint directory.
streaming_query = (
    results_df.writeStream
    .format("delta")
    .outputMode("complete")
    .options(
        mergeSchema="true",
        checkpointLocation="/delta/apartment/payment/_checkpoints/streaming-agg",
    )
    .start("/delta/apartment/payment_data")
)

#### Read the Delta Table as a Static or Streaming DataFrame
#### This dataframe will always be Up-To-Date

In [0]:
# Static (batch) read of the Delta table written by the streaming query,
# ordered by Payment_id.
payment_data = (
    spark.read
    .load("/delta/apartment/payment_data", format="delta")
    .orderBy("Payment_id")
)

In [0]:
# Show the current contents of the Delta table, ordered by Payment_id.
display(payment_data)

Payment_id,Pay_date,Pay_amount,Method,Rent_id,event_time,latest_event_time
1,2017-02-21T22:33:58.000+0000,$4566.21,Cash,993,2018-11-19T09:09:03.000+0000,2018-11-19T09:09:03.000+0000
2,2017-07-19T17:34:02.000+0000,$261.34,Cash,67,2017-03-26T07:07:51.000+0000,2017-03-26T07:07:51.000+0000
3,2017-05-23T04:08:41.000+0000,$425.66,Debit card,743,2016-11-27T19:48:41.000+0000,2016-11-27T19:48:41.000+0000
4,2017-03-21T12:12:55.000+0000,$1947.94,Cash,250,2017-02-06T09:41:52.000+0000,2017-02-06T09:41:52.000+0000
5,2017-03-07T09:22:09.000+0000,$4770.52,Debit card,344,2018-07-30T00:15:49.000+0000,2018-07-30T00:15:49.000+0000
6,2017-10-25T11:32:09.000+0000,$831.23,Cash,858,2017-03-30T10:57:08.000+0000,2017-03-30T10:57:08.000+0000
7,2016-11-28T04:46:21.000+0000,$2401.36,Cheque,253,2018-05-09T06:45:35.000+0000,2018-05-09T06:45:35.000+0000
8,2017-02-24T20:27:04.000+0000,$1291.14,Cash,892,2018-01-28T04:39:07.000+0000,2018-01-28T04:39:07.000+0000
9,2017-04-10T04:16:00.000+0000,$5159.82,Cash,384,2017-12-30T18:51:37.000+0000,2017-12-30T18:51:37.000+0000
10,2017-03-20T06:26:09.000+0000,$1102.48,Cheque,715,2018-08-17T22:48:54.000+0000,2018-08-17T22:48:54.000+0000


### Do Some Live Streaming Graphs

In [0]:
# Open the same Delta table path as a streaming source for the live chart below.
# NOTE(review): this table is written with outputMode("complete") by the
# streaming query above — confirm the Delta streaming source handles those
# rewrites as intended (no failures or re-emitted rows).
payment_data_stream = spark.readStream.format("delta").load("/delta/apartment/payment_data")

In [0]:
# Live count of rows per payment Method, computed over the streaming Delta source.
display(payment_data_stream.groupBy("Method").count())

Method,count
Cheque,13
Cash,25
Debit card,8
Credit card,4
