In [0]:
import os
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

In [0]:
# Peek at the trade output root to see what has been written there.
trade_root = '/dbfs/output_dir/trade'
for entry in os.listdir(trade_root):
    print(entry)

In [0]:
# Show the files under the 2020-08-05 trade directory.
trade_day_dir = '/dbfs/output_dir/trade/date=2020-08-05'
for entry in os.listdir(trade_day_dir):
    print(entry)

In [0]:
# Show the files under the 2020-08-06 trade directory.
trade_day_dir = '/dbfs/output_dir/trade/date=2020-08-06'
for entry in os.listdir(trade_day_dir):
    print(entry)

In [0]:
# Show the files under the 2020-08-06 quote directory.
quote_day_dir = '/dbfs/output_dir/quote/date=2020-08-06'
for entry in os.listdir(quote_day_dir):
    print(entry)

In [0]:
# Show the files under the 2020-08-05 trade directory.
# NOTE(review): this lists the same directory as an earlier cell; possibly
# quote/date=2020-08-05 was intended here. Confirm or remove.
trade_day_dir = '/dbfs/output_dir/trade/date=2020-08-05'
for entry in os.listdir(trade_day_dir):
    print(entry)

In [0]:
# 4.1 Read Parquet files
# For reference, the load step wrote the data with:
#   corrected.write.mode("append").parquet("c:/sb/equity-market-data-analysis/output_dir/trade/date={}".format(date))
# Each dataset is loaded, exposed to Spark SQL as a temp view, and previewed.
tradedf = spark.read.format("parquet").load("/output_dir/trade")
tradedf.createOrReplaceTempView("trades")
tradedf.show(n=5)

quotedf = spark.read.format("parquet").load("/output_dir/quote")
quotedf.createOrReplaceTempView("quotes")
quotedf.show(n=5)

In [0]:
# 4.2 Create trade staging table
# 4.2.1 Use Spark to read the rows of the "trades" view whose trade_dt is
#       '2020-08-06' (the date being processed in this notebook).
query = """
    select trade_dt, symbol, exchange, event_tm, event_seq_nb, trade_pr from trades
    where trade_dt = '2020-08-06'
    """
df = spark.sql(query)
# 4.2.2 Register the selected rows as the temporary view tmp_trade_moving_avg1
df.createOrReplaceTempView("tmp_trade_moving_avg1")

In [0]:
# Preview the first five rows of whatever `df` currently holds
# (the 2020-08-06 trade selection when cells are run top to bottom).
df.show(5)

In [0]:
# 4.2.3 Calculate the 30-minute moving average from tmp_trade_moving_avg1.
# For each trade row, mov_avg_pr is the average of trade_pr over rows of the same
# symbol whose event_tm (cast to timestamp) lies in the 30 minutes up to and
# including the current row.
# NOTE(review): the window partitions by symbol only, not (symbol, exchange),
# so trades from different exchanges are averaged together. Confirm this is intended.
query = """
    SELECT trade_dt, symbol, exchange, event_tm, event_seq_nb, trade_pr,
    avg(trade_pr) OVER(PARTITION BY symbol ORDER BY CAST(event_tm AS timestamp) 
        RANGE BETWEEN INTERVAL 30 MINUTES PRECEDING AND CURRENT ROW) as mov_avg_pr
    FROM tmp_trade_moving_avg1
    """
mov_avg_df = spark.sql(query)

In [0]:
# Preview the first five rows, including the computed mov_avg_pr column.
mov_avg_df.show(5)

In [0]:
# 4.2.4 Stage the moving-average result.
# The Hive-table write is commented out below; the result is instead registered
# as the temporary view tmp_trade_moving_avg, so nothing is persisted here.

#mov_avg_df.write.saveAsTable("tmp_trade_moving_avg", mode='overwrite')
mov_avg_df.createOrReplaceTempView("tmp_trade_moving_avg")

In [0]:
# 4.3 Create staging table for the prior day's last trade
# 4.3.1 Derive the previous calendar day (no weekend/holiday handling) from the
#       date being processed.
date = datetime.strptime('2020-08-06', '%Y-%m-%d')
print(date)
prev_date_str = (date - timedelta(days=1)).date().isoformat()
print(prev_date_str)

# 4.3.2 Select the trade rows whose trade_dt equals prev_date_str.
# `query` keeps the unformatted template; the date is substituted at call time.
query = """
    select trade_dt, symbol, exchange, event_tm, event_seq_nb, trade_pr from trades
    where trade_dt = '{}'
    """
prior_day_sql = query.format(prev_date_str)
df = spark.sql(prior_day_sql)
df.show(n=5)

In [0]:
# 4.3.3 Register the current `df` (the prior-day trade selection when cells are
#       run top to bottom) as the temporary view tmp_last_trade1.
df.createOrReplaceTempView("tmp_last_trade1")

In [0]:
# 4.3.4 Calculate the last trade price per (symbol, exchange) from tmp_last_trade1.
# Rows are ranked inside each (symbol, exchange) group by event_tm, newest first,
# and only the top-ranked row is kept, so each group yields exactly one close_pr.
# Ranking within the group is what ties the "latest time" to its own symbol and
# exchange; matching on event_tm alone would let a trade in one group match the
# latest timestamp of a different group.
# NOTE(review): event_seq_nb DESC is used as a tie-breaker for equal event_tm,
# assuming a higher sequence number means a later event. Confirm.
query = """
    SELECT symbol, exchange, trade_pr AS close_pr
    FROM (
        SELECT symbol, exchange, trade_pr,
               ROW_NUMBER() OVER (
                   PARTITION BY symbol, exchange
                   ORDER BY event_tm DESC, event_seq_nb DESC) AS rn
        FROM tmp_last_trade1
    ) ranked
    WHERE rn = 1
    """

In [0]:
# Run the SQL currently held in `query`; this relies on the preceding cell having
# been executed last so that `query` is the last-trade-price statement.
last_pr_df = spark.sql(query)

In [0]:
# Preview the first five rows of the last-trade-price result.
last_pr_df.show(5)

In [0]:
# Register the last-trade-price result as the temporary view tmp_last_trade.
last_pr_df.createOrReplaceTempView("tmp_last_trade")

In [0]:
# 4.4.1.2 Create a Spark temp view that unions trades and quotes into one schema.
# Trade rows (from tmp_trade_moving_avg) carry trade_pr and mov_avg_pr and get NULL
# for the quote-only columns; quote rows (from quotes) carry the quote columns and
# get NULL for trade_pr and mov_avg_pr.
# NOTE(review): UNION (not UNION ALL) removes fully identical rows. Confirm that
# de-duplication is wanted here.
# NOTE(review): the column list has bid_size / ask_pr / ask_size but no bid_pr.
# Verify against the quotes schema.
quote_union = spark.sql("""
    SELECT trade_dt, symbol, exchange, event_tm, event_seq_nb, NULL as arrival_tm, NULL as bid_size, 
        NULL as ask_pr, NULL as ask_size, NULL as latest, trade_pr, mov_avg_pr 
    FROM tmp_trade_moving_avg
    UNION 
    SELECT trade_dt, symbol, exchange, event_tm, event_seq_nb, arrival_tm, bid_size, ask_pr, ask_size,
        latest, NULL as trade_pr, NULL as mov_avg_pr 
    FROM quotes
    """)
quote_union.createOrReplaceTempView("quote_union")
quote_union.show(10)

In [0]:
# 4.4 Populate the latest trade_pr and mov_avg_pr
# For every row of quote_union, carry forward the most recent non-null trade_pr and
# mov_avg_pr within the same (symbol, exchange), ordered by event_tm. Quote rows
# have NULL in both columns, so last(..., true) (ignore nulls) picks up the values
# of the latest trade at or before the quote's event_tm.
# The window is scoped to (symbol, exchange) so a row can only receive values from
# its own group, and ignoring nulls avoids returning NULL when the newest row in a
# group is a quote.
# Output columns are unchanged: the quote_union columns (minus the raw trade_pr)
# plus latest_trade_pr and mov_avg_pr. Rows with no earlier trade in their group
# get NULL for both.
quote_union_update = spark.sql("""
SELECT trade_dt, symbol, exchange, event_tm, event_seq_nb, arrival_tm, bid_size, ask_pr,
        ask_size, latest,
        last(trade_pr, true) OVER (
            PARTITION BY symbol, exchange
            ORDER BY event_tm
            RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS latest_trade_pr,
        last(mov_avg_pr, true) OVER (
            PARTITION BY symbol, exchange
            ORDER BY event_tm
            RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS mov_avg_pr
FROM quote_union
    """)
quote_union_update.createOrReplaceTempView("quote_union_update")
quote_union_update.show(10)

In [0]:
# 4.4.1.4 Keep only the quote records.
# Trade rows were given NULL ask_size in the union, so a non-null ask_size is used
# as the marker for a quote row.
quote_only_sql = """
SELECT *
FROM quote_union_update
WHERE ask_size IS NOT NULL
"""
quote_update = spark.sql(quote_only_sql)
quote_update.createOrReplaceTempView("quote_update")
quote_update.show(n=5)

In [0]:
# 4.4.2 Join with the tmp_last_trade view to attach the prior day's close price.
# LEFT JOIN keeps every quote row even when no prior-day close exists for its
# (symbol, exchange).
prior_close_join_sql = """
SELECT * 
FROM quote_update 
LEFT JOIN tmp_last_trade
    USING(symbol, exchange)
"""
quote_final = spark.sql(prior_close_join_sql)
quote_final.show(n=5)

In [0]:
# 4.4.3 Write the final dataframe into Azure Blob storage at the corresponding partition
# Earlier local-storage target used when running as a plain Jupyter notebook:
#quote_final.write.parquet("c:/sb/equity-market-data-analysis/output_dir/quote-trade-analytical/date=2020-08-06")

storageAccountName = 'equitymarketblob'
ContainerName = 'bcontainer1'
# The storage account key is read from the AZURE_STORAGE_ACCOUNT_KEY environment
# variable instead of being embedded in the notebook; a missing variable raises
# KeyError before any write is attempted. A Databricks secret scope is an equally
# valid source. Any key that was previously committed in this notebook should be
# treated as exposed and rotated.
storageAccountAccessKey = os.environ['AZURE_STORAGE_ACCOUNT_KEY']

spark.conf.set(f'fs.azure.account.key.{storageAccountName}.blob.core.windows.net', storageAccountAccessKey)
path = 'wasbs://{}@{}.blob.core.windows.net/output_dir/quote-trade-analytical/date=2020-08-06/'.format(ContainerName, storageAccountName)
# Write the analytical result (quote_final). `df` at this point holds the
# prior-day trade selection and is not the output of this pipeline.
# NOTE(review): with the default save mode the write fails if `path` already
# exists; choose an explicit mode if this cell must be re-runnable.
quote_final.write.parquet(path)