In [20]:
# !pip install findspark

In [21]:
# Make the local Spark installation importable before `pyspark` is imported
# in the next cell.
import findspark
findspark.init()


In [22]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import from_json
from pyspark.sql.types import *

# Build the Spark session shared by every cell of this notebook
# (getOrCreate reuses an already-running session if there is one).
spark = (
    SparkSession.builder
    .appName('pre_process_media_data')
    .getOrCreate()
)

# Pin the SQL session time zone to UTC for the epoch <-> timestamp
# conversions performed further down.
spark.conf.set("spark.sql.session.timeZone", "UTC")

# Switch to True only when the dim-time CSV has to be (re)generated.
create_dim_time_csv = False


## Create Dim Time

In [23]:
# One time create a dim time csv file
# ------------------------------------------------
import pandas as pd  

# from pyspark.sql.functions import from_unixtime, col
from datetime import datetime

if create_dim_time_csv:
    # Epoch-second bounds of the minute grid.
    start_timestamp = 1640995200  # 2022-01-01 00:00:00 UTC
    # NOTE(review): 1683014400 is 2023-05-02 08:00:00 UTC. The previous comment
    # claimed 2023-06-30 23:59:59 (which would be 1688169599). The value is kept
    # unchanged here; confirm which end of range is actually intended.
    end_timestamp = 1683014400

    # One epoch value per minute; the end bound is included.
    rows = list(range(start_timestamp, end_timestamp + 60, 60))

    # Single-column frame. The column mapping is passed inline so the builtin
    # `dict` is no longer shadowed for the rest of the notebook.
    dim_time_pdf = pd.DataFrame({'epoch': rows})

    # Name the index column so the CSV header is "index,epoch". Without it the
    # header starts with an empty field and Spark's CSVHeaderChecker warns that
    # it does not conform to the (index, epoch) schema used when reading it back.
    dim_time_pdf.to_csv('./data/STG/datetime_data/dim_time.csv', index_label='index')
else:
    print("Already created dim time CSV file")
    

Already created dim time CSV file


In [24]:
# Load the dim-time CSV and derive the calendar attributes of every epoch
# ------------------------------------------------
schema = StructType([
    StructField("index", IntegerType()),
    StructField("epoch", IntegerType()),
])

time_df = (
    spark.read
    .option("header", True)
    .csv("./data/STG/datetime_data/dim_time.csv", schema=schema)
    # Timestamp of the epoch (from_unixtime yields a string column).
    .withColumn("ts_", F.from_unixtime("epoch"))
    # Calendar date.
    .withColumn("date_", F.to_date("ts_"))
    # Year and month numbers.
    .withColumn("year_", F.year("ts_"))
    .withColumn("month_", F.month("ts_"))
    # Day of the week, formatted with the 'E' pattern.
    .withColumn("day_of_week", F.date_format("ts_", 'E'))
    # Day of the month.
    .withColumn("day_of_month", F.dayofmonth("ts_"))
    # Time of day as HH:mm:ss.
    .withColumn("time_", F.date_format('ts_', 'HH:mm:ss'))
    # Hour and minute numbers.
    .withColumn("hour_", F.hour("ts_"))
    .withColumn("minute_", F.minute("ts_"))
)

# Expose the dimension to the spark.sql queries below.
time_df.createOrReplaceTempView("time_df")

time_df.printSchema()


root
 |-- index: integer (nullable = true)
 |-- epoch: integer (nullable = true)
 |-- ts_: string (nullable = true)
 |-- date_: date (nullable = true)
 |-- year_: integer (nullable = true)
 |-- month_: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- time_: string (nullable = true)
 |-- hour_: integer (nullable = true)
 |-- minute_: integer (nullable = true)



## Load Media Data

In [25]:
# Schema of the news CSV files: every field is read as a plain string.
media_csv_columns = [
    "title",
    "url",
    "time_published",
    "authors",
    "summary",
    "banner_image",
    "source",
    "category_within_source",
    "source_domain",
    "topics",
    "overall_sentiment_score",
    "overall_sentiment_label",
    "ticker_sentiment",
]
schema = StructType([StructField(column_name, StringType()) for column_name in media_csv_columns])

# Read all news files; records may span several lines and use a doubled
# quote character as the escape inside quoted fields.
df = (
    spark.read
    .option("header", True)
    .option("multiline", True)
    .option("quote", "\"")
    .option("escape", "\"")
    .csv("./data/alpha_vantage/news_data/*/*", schema=schema)
    # Only these three fields are used downstream.
    .select(F.col("url"), F.col("time_published"), F.col("ticker_sentiment"))
    # Remember which file each record was read from.
    .withColumn("filename", F.input_file_name())
)

# Ticker taken from the 10th '/'-separated segment of the file path.
# NOTE(review): index 9 depends on how deep the data directory sits in the
# absolute path reported by input_file_name(); verify it after moving the project.
df = df.withColumn('file_ticker', F.split(F.col('filename'), '/').getItem(9))

df.printSchema()

df.createOrReplaceTempView("df")



root
 |-- url: string (nullable = true)
 |-- time_published: string (nullable = true)
 |-- ticker_sentiment: string (nullable = true)
 |-- filename: string (nullable = false)
 |-- file_ticker: string (nullable = true)





# Check how many files we have per ticker


In [26]:
# Rank the tickers by the number of rows read for them (count(*) counts rows
# of `df`, i.e. CSV records, not files). ROWNUM = 1 is the ticker with the
# most rows. `file_ticker` is used as a tie-breaker so that the ranking, and
# therefore any ROWNUM cut-off applied later, is deterministic between runs.
files_per_ticker_df = spark.sql("""
select  ROW_NUMBER() OVER (ORDER BY count(*) desc, file_ticker) AS ROWNUM,
        file_ticker, count(*)
from df
group by file_ticker
order by ROWNUM
""")

# Result of a previous run, kept for reference:
# +------+-----------+--------+
# |ROWNUM|file_ticker|count(1)|
# +------+-----------+--------+
# |1     |TSLA       |10227   |
# |2     |JPM        |9954    |
# |3     |AAPL       |9491    |
# |4     |NFLX       |8994    |
# |5     |BAC        |8634    |
# |6     |MSFT       |8617    |
# |7     |WFC        |7666    |
# |8     |META       |7402    |
# |9     |WMT        |7018    |
# |10    |AMZN       |6792    |
# |11    |PFE        |5708    |
# |12    |NVDA       |5684    |
# |13    |XOM        |3814    |
# |14    |NKE        |3670    |
# |15    |KO         |3376    |
# |16    |JNJ        |3171    |
# |17    |VZ         |2875    |
# |18    |MA         |2611    |
# |19    |COST       |2560    |
# |20    |DIS        |2472    |
# |21    |ABBV       |2430    |
# |22    |CVX        |2378    |
# |23    |HD         |2216    |
# |24    |PEP        |2180    |
# |25    |PG         |1812    |
# |26    |MCD        |1757    |
# |27    |CSCO       |1724    |
# |28    |UNH        |1681    |
# |29    |AVGO       |1649    |
# |30    |ACN        |1557    |
# |31    |V          |1375    |
# |32    |ADBE       |963     |
# |33    |ABT        |932     |
# |34    |TMO        |760     |
# |35    |PM         |722     |
# |36    |LLY        |703     |
# |37    |DHR        |443     |
# |38    |TXN        |434     |
# |39    |LIN        |213     |
# +------+-----------+--------+

# Expose the ranking to the spark.sql queries below.
files_per_ticker_df.createOrReplaceTempView('files_per_ticker_df')

# Parse ticker_sentiment JSON data


In [27]:
# Shape of the `ticker_sentiment` JSON: an array of per-ticker objects whose
# fields are all parsed as strings.
schema = ArrayType(
    StructType([StructField("ticker", StringType()),
                StructField("relevance_score", StringType()),
                StructField("ticker_sentiment_score", StringType()),
                StructField("ticker_sentiment_label", StringType())]))

# Parse the JSON, emit one row per array element and keep only the ticker.
# The relevance/sentiment fields were previously materialised as columns and
# then dropped by the final select, so they are no longer derived here; the
# resulting schema (url, time_published, ticker) is unchanged.
# NOTE: this cell replaces `df` and drops `ticker_sentiment`, so it cannot be
# re-run without re-running the load cell first.
df = (
    df
    .withColumn("ticker_sentiment_item",
                F.explode(from_json(F.col("ticker_sentiment"), schema)))
    .select(F.col("url"),
            F.col("time_published"),
            F.col("ticker_sentiment_item").getItem("ticker").alias("ticker"))
)

df.printSchema()
df.createOrReplaceTempView('df')

root
 |-- url: string (nullable = true)
 |-- time_published: string (nullable = true)
 |-- ticker: string (nullable = true)



# Aggregate Ticker Media Events Count 

In [28]:
# Count, per article, how many of the top-ranked tickers it mentions.
#   top_tickers : rows of files_per_ticker_df with ROWNUM < 36 (ranks 1..35).
#   s1          : one row per (url, ticker) pair, carrying the earliest
#                 time_published seen for that pair.
# The inner join keeps only the (url, ticker) pairs whose ticker is in
# top_tickers, so articles that mention none of them do not appear in the result.
ticker_count_per_article_df = spark.sql("""
with top_tickers
as 
(
    select * 
    from files_per_ticker_df
    where ROWNUM<36
),
s1
as
(
    select url,
           ticker,
           min(time_published) as time_published     
    from   df
    group by 1,2
)

select s1.url,
       s1.time_published,
       count(s1.ticker) as ticker_count_per_article
from   s1 join top_tickers
       on s1.ticker = top_tickers.file_ticker 
group by 1,2

""")

# Expose the result to the spark.sql queries below.
ticker_count_per_article_df.createOrReplaceTempView("ticker_count_per_article_df")

ticker_count_per_article_df.printSchema()

root
 |-- url: string (nullable = true)
 |-- time_published: string (nullable = true)
 |-- ticker_count_per_article: long (nullable = false)



In [29]:
# Cut the raw `time_published` string into its fixed-position parts.
# The original slice syntax `col[start:length]` is PySpark shorthand for
# Column.substr(start, length); it is spelled out here with the same arguments
# because it does not behave like a Python slice.
# NOTE(review): the offsets assume a "yyyyMMdd?HHmmss"-shaped value (one
# separator character at position 9) — confirm against the source data.
published_raw = F.col("time_published")
year_part = published_raw.substr(0, 4)
month_part = published_raw.substr(5, 2)
day_part = published_raw.substr(7, 2)
hour_part = published_raw.substr(10, 2)
minute_part = published_raw.substr(12, 2)
second_part = published_raw.substr(14, 2)

# Publication date ("year-month-day") and time ("hour:minute:second") strings.
published_date = F.concat_ws('-', year_part, month_part, day_part)
published_time = F.concat_ws(':', hour_part, minute_part, second_part)

# Publication timestamp parsed from "<date> <time>", and its epoch seconds.
published_ts = F.to_timestamp(F.concat_ws(' ', published_date, published_time))
published_epoch = F.unix_timestamp(published_ts)

# Keep the article key, the derived publication fields and the ticker count.
# NOTE: `time_published` is dropped here, so this cell cannot be re-run
# without re-running the previous one first.
ticker_count_per_article_df = ticker_count_per_article_df.select(
    F.col("url"),
    published_date.alias("t_article_published_date"),
    published_time.alias("t_article_published_time"),
    published_ts.alias("t_article_published_ts"),
    published_epoch.alias("t_article_published_epoch"),
    F.col("ticker_count_per_article"),
)

ticker_count_per_article_df.createOrReplaceTempView("ticker_count_per_article_df")

ticker_count_per_article_df.printSchema()


root
 |-- url: string (nullable = true)
 |-- t_article_published_date: string (nullable = false)
 |-- t_article_published_time: string (nullable = false)
 |-- t_article_published_ts: timestamp (nullable = true)
 |-- t_article_published_epoch: long (nullable = true)
 |-- ticker_count_per_article: long (nullable = false)



In [30]:
# Derive the 1-minute bucket key of every article: the publication epoch is
# divided by 60, cast to int and multiplied by 60 again, i.e. truncated DOWN
# to the start of its minute (despite the `t_round_up_min_epoch` column name).
# Only the 1-minute key is produced in this cell.

before_agg_df = spark.sql("""

select url,
       cast((t_article_published_epoch/60) as int)*60                 as  t_round_up_min_epoch,
       ticker_count_per_article          
from ticker_count_per_article_df

""")

# Expose the result to the spark.sql queries below.
before_agg_df.createOrReplaceTempView("before_agg_df")

In [31]:
# Inspect the column types of the per-article minute keys.
before_agg_df.printSchema()

root
 |-- url: string (nullable = true)
 |-- t_round_up_min_epoch: integer (nullable = true)
 |-- ticker_count_per_article: long (nullable = false)



In [32]:
# Aggregate to 1-minute resolution: one row per minute key (`epoch_key`)
# holding the sum of ticker_count_per_article over all articles in that minute.
# Minutes without any article produce no row here.

agg_1min_data = spark.sql("""

select t_round_up_min_epoch          as epoch_key, 
       sum(ticker_count_per_article) as snp_ticker_count_epoch
from before_agg_df
group by 1

""")

agg_1min_data.printSchema()

# Summary statistics of both columns (this triggers execution of the plan).
agg_1min_data.summary().show()

# Expose the result to the spark.sql queries below.
agg_1min_data.createOrReplaceTempView("agg_1min_data")


root
 |-- epoch_key: integer (nullable = true)
 |-- snp_ticker_count_epoch: long (nullable = true)

23/08/19 17:17:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:17:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.




23/08/19 17:18:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.




23/08/19 17:18:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


[Stage 57:>                                                       (0 + 10) / 11]

23/08/19 17:18:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


                                                                                

+-------+--------------------+----------------------+
|summary|           epoch_key|snp_ticker_count_epoch|
+-------+--------------------+----------------------+
|  count|               75246|                 75246|
|   mean|1.6615252373271668E9|    2.0736916248039763|
| stddev|   8741929.383566512|      2.16896227901843|
|    min|          1646121600|                     1|
|    25%|          1653915600|                     1|
|    50%|          1661408760|                     1|
|    75%|          1668697980|                     2|
|    max|          1677444660|                    86|
+-------+--------------------+----------------------+



# Join 1 min Agg Media Data With Dim Time

In [33]:
# agg_1min_data.count()
# 75,246

In [34]:
# Left-join the aggregated media counts onto the time dimension so that every
# row of time_df is kept; rows without a matching epoch_key get a count of 0
# through coalesce.
before_win_df = spark.sql("""
select t.*,
       coalesce(a.snp_ticker_count_epoch,0) as snp_media_events_count
from   time_df as t
       left join
       agg_1min_data as a
       on t.epoch = a.epoch_key
""")

# Expose the result as a temp view and show its schema.
before_win_df.createOrReplaceTempView('before_win_df')
before_win_df.printSchema()

root
 |-- index: integer (nullable = true)
 |-- epoch: integer (nullable = true)
 |-- ts_: string (nullable = true)
 |-- date_: date (nullable = true)
 |-- year_: integer (nullable = true)
 |-- month_: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- time_: string (nullable = true)
 |-- hour_: integer (nullable = true)
 |-- minute_: integer (nullable = true)
 |-- snp_media_events_count: long (nullable = false)



In [35]:

# Persist the joined frame as parquet (at most 10 partitions), replacing any
# previous output; the window-function cells below read it back from disk.
before_win_df.coalesce(10).write.mode("overwrite").parquet("./data/STG/media_aggregated_data/before_window_functions")


23/08/19 17:18:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , epoch
 Schema: index, epoch
Expected: index but found: 
CSV file: file:///Users/ybatash/Workspace/tau.ac.il/tau-final-project/data/STG/datetime_data/dim_time.csv


[Stage 81:>   (0 + 3) / 3][Stage 82:>  (7 + 7) / 67][Stage 83:>  (0 + 0) / 67]

23/08/19 17:18:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


[Stage 83:=====>                                                  (6 + 10) / 67]

23/08/19 17:18:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


[Stage 85:>                                                       (0 + 10) / 11]

23/08/19 17:18:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


[Stage 85:>               (0 + 10) / 11][Stage 87:>                 (0 + 0) / 1]

23/08/19 17:18:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


                                                                                

In [36]:
# Remove all cached tables from Spark's in-memory cache before the next stage.
spark.catalog.clearCache()

# Create Different Minute Windows Aggregate Functions

In [37]:
# Read back the parquet part files written by the "before window functions"
# step, so the window computations start from the persisted data.
bwf_df = spark.read.parquet("./data/STG/media_aggregated_data/before_window_functions/part-*")
bwf_df.printSchema()

root
 |-- index: integer (nullable = true)
 |-- epoch: integer (nullable = true)
 |-- ts_: string (nullable = true)
 |-- date_: date (nullable = true)
 |-- year_: integer (nullable = true)
 |-- month_: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- time_: string (nullable = true)
 |-- hour_: integer (nullable = true)
 |-- minute_: integer (nullable = true)
 |-- snp_media_events_count: long (nullable = true)



In [38]:

# Rolling statistics of snp_media_events_count over the rows that precede each
# epoch (the current row is excluded: every frame ends at -1).
# The frames are row-based, so "last N minutes" holds only if bwf_df has
# exactly one row per minute — NOTE(review): confirm against dim_time.
# The window has no partitioning, so Spark processes it in a single partition
# (this is the source of the WindowExec warnings).
window_spec = Window.orderBy(F.col("epoch"))

# (column-name suffix, number of preceding rows in the frame)
minute_windows = [
    ("15min", 15),
    ("30min", 30),
    ("45min", 45),
    ("1hour", 60),
    ("3hour", 180),
]

# (column-name tag, aggregate function)
window_aggregates = [
    ("avg", F.avg),
    ("max", F.max),
    ("stddev", F.stddev),
    ("sum", F.sum),
]

# One expression per (window, aggregate) pair, in the same order as the
# original hand-written column list: avg, max, stddev, sum for each window.
minute_feature_columns = [
    aggregate("snp_media_events_count")
    .over(window_spec.rowsBetween(-preceding_rows, -1))
    .alias(f"m_{tag}_snp_media_events_count_last_{suffix}")
    for suffix, preceding_rows in minute_windows
    for tag, aggregate in window_aggregates
]

# The previously computed dense-rank column `rnk` was never selected and is
# therefore no longer derived.
min_results_df = bwf_df.select("epoch", *minute_feature_columns)


In [39]:

# Persist the minute-window features as parquet, replacing any previous output.
min_results_df.write.mode("overwrite").parquet("./data/STG/media_aggregated_data/minute_windows_aggregate_function")


23/08/19 17:18:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 1

                                                                                

#  Create Different Day-Interval Windows Aggregate Functions


In [40]:
# Expose the reloaded frame to the spark.sql query in the next cell.
bwf_df.createOrReplaceTempView("bwf_df")

In [41]:
# Roll the per-minute counts up to one row per calendar date (`date_`),
# summing snp_media_events_count over all minutes of that date.
snp_media_events_per_day_df = spark.sql("""

select  date_,
        sum(snp_media_events_count) as snp_media_events_count
from bwf_df
group by date_

""")


# df.filter(F.col("date_") == "2022-03-28").filter(F.col("snp_media_events_count")!=0).show(100000,False)

In [42]:
# Rolling day-level features over the per-day event totals.
#
# For every date_ in snp_media_events_per_day_df this produces, for each
# look-back size N in (3, 7, 14, 30, 60), the avg / max / stddev / sum of
# snp_media_events_count over the N rows that precede the current row when
# ordered by date_ (the current row itself is excluded: frame is [-N, -1]).
#
# Notes:
# * All features are built in ONE select instead of a long withColumn chain;
#   every withColumn adds another projection to the plan, and the Spark docs
#   recommend a single select when adding many columns.
# * The previously computed dense_rank column ("rnk") was never part of the
#   selected output, so it is no longer calculated.
# * The window has no partitionBy, so Spark moves all rows to one partition
#   (this is the WindowExec warning seen when the frame is written).
# * NOTE(review): rowsBetween counts rows, not calendar days. "last N days"
#   therefore assumes exactly one row per consecutive date_ - TODO confirm
#   the input has no missing dates.
window_spec = Window.orderBy(F.col("date_"))

# Look-back sizes, expressed in rows of the per-day frame.
lookback_days = (3, 7, 14, 30, 60)

# (name fragment, aggregate function) pairs, in the order the output columns
# must appear for each look-back size.
day_aggregates = (
    ("avg", F.avg),
    ("max", F.max),
    ("stddev", F.stddev),
    ("sum", F.sum),
)

days_results_df = snp_media_events_per_day_df.select(
    "date_",
    *[
        agg_fn("snp_media_events_count")
        .over(window_spec.rowsBetween(-n_days, -1))
        .alias(f"m_{agg_name}_snp_media_events_count_last_{n_days}days")
        for n_days in lookback_days
        for agg_name, agg_fn in day_aggregates
    ]
)

In [43]:

# Persist the day-window aggregates as parquet in the staging area,
# replacing whatever a previous run left at this path.
(
    days_results_df
    .write
    .mode("overwrite")
    .parquet("./data/STG/media_aggregated_data/days_windows_aggregate_function")
)


23/08/19 17:18:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 17:18:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/19 1

# Create Final Media File

In [44]:

# Reload both staged aggregate sets from the parquet files written above.
min_df = spark.read.parquet(
    "./data/STG/media_aggregated_data/minute_windows_aggregate_function/*.parquet"
)
days_df = spark.read.parquet(
    "./data/STG/media_aggregated_data/days_windows_aggregate_function/*.parquet"
)

# Register each frame as a temp view under its own variable name and print
# its schema (minute-level frame first, then day-level).
for _view_name, _frame in (("min_df", min_df), ("days_df", days_df)):
    _frame.createOrReplaceTempView(_view_name)
    _frame.printSchema()

root
 |-- epoch: integer (nullable = true)
 |-- m_avg_snp_media_events_count_last_15min: double (nullable = true)
 |-- m_max_snp_media_events_count_last_15min: long (nullable = true)
 |-- m_stddev_snp_media_events_count_last_15min: double (nullable = true)
 |-- m_sum_snp_media_events_count_last_15min: long (nullable = true)
 |-- m_avg_snp_media_events_count_last_30min: double (nullable = true)
 |-- m_max_snp_media_events_count_last_30min: long (nullable = true)
 |-- m_stddev_snp_media_events_count_last_30min: double (nullable = true)
 |-- m_sum_snp_media_events_count_last_30min: long (nullable = true)
 |-- m_avg_snp_media_events_count_last_45min: double (nullable = true)
 |-- m_max_snp_media_events_count_last_45min: long (nullable = true)
 |-- m_stddev_snp_media_events_count_last_45min: double (nullable = true)
 |-- m_sum_snp_media_events_count_last_45min: long (nullable = true)
 |-- m_avg_snp_media_events_count_last_1hour: double (nullable = true)
 |-- m_max_snp_media_events_count_las

In [45]:

# Assemble the final media table: bwf_df enriched with the minute-window
# features (joined on epoch) and the day-window features (joined on date_).

# Columns taken through unchanged, in output order.
media_base_columns = [
    "epoch",
    "ts_",
    "date_",
    "year_",
    "month_",
    "day_of_week",
    "day_of_month",
    "time_",
    "hour_",
    "minute_",
    "snp_media_events_count",
]

# Window suffixes in output order: minute/hour windows first, then day windows.
media_window_suffixes = [
    "15min", "30min", "45min", "1hour", "3hour",
    "3days", "7days", "14days", "30days", "60days",
]

# Statistics available for every window, in output order.
media_window_stats = ["avg", "max", "stddev", "sum"]

# e.g. "m_avg_snp_media_events_count_last_15min"
media_feature_columns = [
    f"m_{stat}_snp_media_events_count_last_{suffix}"
    for suffix in media_window_suffixes
    for stat in media_window_stats
]

media_joined_df = bwf_df.join(min_df, "epoch").join(days_df, "date_")
media_df = media_joined_df.select(*(media_base_columns + media_feature_columns))

media_df.printSchema()


root
 |-- epoch: integer (nullable = true)
 |-- ts_: string (nullable = true)
 |-- date_: date (nullable = true)
 |-- year_: integer (nullable = true)
 |-- month_: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- time_: string (nullable = true)
 |-- hour_: integer (nullable = true)
 |-- minute_: integer (nullable = true)
 |-- snp_media_events_count: long (nullable = true)
 |-- m_avg_snp_media_events_count_last_15min: double (nullable = true)
 |-- m_max_snp_media_events_count_last_15min: long (nullable = true)
 |-- m_stddev_snp_media_events_count_last_15min: double (nullable = true)
 |-- m_sum_snp_media_events_count_last_15min: long (nullable = true)
 |-- m_avg_snp_media_events_count_last_30min: double (nullable = true)
 |-- m_max_snp_media_events_count_last_30min: long (nullable = true)
 |-- m_stddev_snp_media_events_count_last_30min: double (nullable = true)
 |-- m_sum_snp_media_events_count_last_30min: long (nullab

In [46]:
# Row count of the joined table, shown as the cell output. The joins above
# are inner joins, so this is a quick check of how many rows survived them.
media_df.count()

700321

In [47]:

# Write the final media table as parquet to the DWH area,
# replacing whatever a previous run left at this path.
(
    media_df
    .write
    .mode("overwrite")
    .parquet("./data/DWH/dim_media")
)


                                                                                

# The End