In [1]:
# !pip install findspark

In [2]:
import findspark

# Initialise findspark before any `pyspark` import further down the notebook.
# NOTE(review): presumably this relies on findspark locating the local Spark
# installation (e.g. through SPARK_HOME) — confirm for each environment.
findspark.init()


In [3]:
# Session-length constants. Every derived value below is expressed in seconds.
DATA_INTERVAL = 5  # size of one price bar, in minutes
MIN = 60           # minutes per hour
SEC = 60           # seconds per minute

_seconds_per_hour = MIN * SEC
_seconds_per_bar = DATA_INTERVAL * SEC

# Nominal length of each part of the trading day: 5.5 h, 6.5 h and 4 h.
_pre_trade_len = 5.5 * _seconds_per_hour
_trade_len = 6.5 * _seconds_per_hour
_post_trade_len = 4 * _seconds_per_hour

# Durations (`*_d`): nominal length minus one bar, i.e. the time elapsed
# between the first and the last bar of a part.
pre_trade_d = _pre_trade_len - _seconds_per_bar
trade_d = _trade_len - _seconds_per_bar
post_trade_d = _post_trade_len - _seconds_per_bar

# Whole day, first bar to last bar: the three parts plus the two one-bar gaps
# that separate consecutive parts.
total_trade_d = pre_trade_d + trade_d + post_trade_d + (2 * _seconds_per_bar)

# Upper bounds (`*_ub`): cumulative seconds since the first bar of the day at
# which each part ends.
pre_trade_ub = _pre_trade_len
trade_ub = pre_trade_ub + _trade_len
post_trade_ub = trade_ub + _post_trade_len

for _label, _seconds in (
    ("pre_trade_d", pre_trade_d),
    ("trade_d", trade_d),
    ("post_trade_d", post_trade_d),
):
    print(f"{_label} - {_seconds}")


pre_trade_d - 19500.0
trade_d - 23100.0
post_trade_d - 14100


In [4]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Create (or reuse) the Spark session for this notebook.
# The driver/executor JVMs are pinned to GMT and the SQL session to UTC, so
# the timestamp functions used later do not depend on the machine's local
# time zone.
_session_options = {
    'spark.driver.extraJavaOptions': '-Duser.timezone=GMT',
    'spark.executor.extraJavaOptions': '-Duser.timezone=GMT',
    'spark.sql.session.timeZone': 'UTC',
}

_session_builder = SparkSession.builder.appName('pre_process_5min_data')
for _option_key, _option_value in _session_options.items():
    _session_builder = _session_builder.config(_option_key, _option_value)

spark = _session_builder.getOrCreate()


:: loading settings :: url = jar:file:/Users/ybatash/.sdkman/candidates/spark/3.5.0/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/ybatash/.ivy2/cache
The jars for the packages stored in: /Users/ybatash/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b0f90b74-fa24-4e93-b82c-85257a7beb48;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.0.0 in central
	found org.apache.kafka#kafka-clients;2.4.1 in local-m2-cache
	found com.github.luben#zstd-jni;1.4.4-3 in local-m2-cache
	found org.lz4#lz4-java;1.7.1 in local-m2-cache
	found org.xerial.snappy#snappy-java;1.1.7.5 in local-m2-cache
	found org.slf4j#slf4j-api;1.7.30 in local-m2-cache
	found org.spark-project.spark#unused;1.0.0 in local-m2-cache
	found org.apache.commons#commons-pool2;2.6.2 in local-m2-cache
:: resolution report :: resolve 251ms :: artifacts dl 12ms
	:: modules in use:
	com.github.luben#zstd-jni;1.4.4-3 from local-m2-cache in [d

In [5]:
from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType, DoubleType

# Explicit schema for the 5-minute bars: a timestamp, four double price
# columns and an integer volume.
schema = StructType(
    [StructField("time", TimestampType())]
    + [StructField(price_name, DoubleType()) for price_name in ("open", "high", "low", "close")]
    + [StructField("volume", IntegerType())]
)

# Read 5 min csv files (each file carries a header row).
df = spark.read.csv(
    "./data/alpha_vantage/SPY/interval=5min/*",
    schema=schema,
    header=True,
)

df.printSchema()

# Expose the frame to Spark SQL under the name `df`.
df.createOrReplaceTempView("df")

root
 |-- time: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)



## Adding Time Features


In [6]:
# Codes stored in the `t_trade_part` column.
PRE_TRADE, TRADE, POST_TRADE = 0, 1, 2
UNKNOWN = -1

# All bars that share a calendar date.
_same_day = Window.partitionBy("t_trade_date")

# Column handles reused below (resolved lazily, once the columns exist).
_bar_epoch = F.col("t_trade_epoch")
_since_day_open = F.col("t_time_diff_between_trade_and_open_in_sec")

df = (
    df
    # Bar timestamp as epoch seconds.
    .withColumn("t_trade_epoch", F.unix_timestamp("time"))
    # Calendar date, hour and minute of the bar.
    .withColumn("t_trade_date", F.to_date("time"))
    .withColumn("t_trade_hour", F.hour("time"))
    .withColumn("t_trade_minute", F.minute("time"))
    # Day of week as returned by Spark's `dayofweek` (1 = Sunday ... 7 = Saturday).
    .withColumn("t_trade_day_of_week", F.dayofweek("time"))
    # e - epoch, h - hour.
    # First bar of the date.
    .withColumn("t_trade_open_e", F.min("t_trade_epoch").over(_same_day))
    .withColumn("t_trade_open_h", F.hour(F.min("time").over(_same_day)))
    # Last bar of the date.
    .withColumn("t_trade_close_e", F.max("t_trade_epoch").over(_same_day))
    .withColumn("t_trade_close_h", F.hour(F.max("time").over(_same_day)))
    # Seconds elapsed between the first bar of the date and this bar.
    .withColumn("t_time_diff_between_trade_and_open_in_sec", _bar_epoch - F.col("t_trade_open_e"))
    # Part of the day the bar belongs to (pre trade / trade / post trade),
    # decided by comparing the elapsed seconds with the cumulative bounds.
    # NOTE(review): the elapsed time is measured from the first bar present
    # for the date, so this presumably assumes every date starts at the
    # pre-trade open — confirm for dates with missing early bars.
    .withColumn(
        "t_trade_part",
        F.when(_since_day_open < pre_trade_ub, PRE_TRADE)
        .when(_since_day_open < trade_ub, TRADE)
        .when(_since_day_open <= post_trade_ub, POST_TRADE)
        .otherwise(UNKNOWN),
    )
)

# All bars that share both the date and the trade part.
column_list = ["t_trade_date", "t_trade_part"]
win_spec = Window.partitionBy(*column_list)

_since_part_open = F.col("t_time_diff_between_trade_and_trade_part_open_in_sec")
_bar_part = F.col("t_trade_part")

df = (
    df
    # First bar of the (date, trade part) group.
    .withColumn("t_trade_part_open_e", F.min("t_trade_epoch").over(win_spec))
    .withColumn("t_trade_part_open_h", F.hour(F.min("time").over(win_spec)))
    # Seconds elapsed between the first bar of the trade part and this bar.
    .withColumn(
        "t_time_diff_between_trade_and_trade_part_open_in_sec",
        _bar_epoch - F.col("t_trade_part_open_e"),
    )
    # Elapsed share (in percent) of the current trade part; UNKNOWN when the
    # part itself is unknown.
    .withColumn(
        "t_precent_of_time_from_start_of_trade_pase",
        F.when(_bar_part == PRE_TRADE, _since_part_open / pre_trade_d * 100)
        .when(_bar_part == TRADE, _since_part_open / trade_d * 100)
        .when(_bar_part == POST_TRADE, _since_part_open / post_trade_d * 100)
        .otherwise(UNKNOWN),
    )
    # Elapsed share (in percent) of the whole trading day.
    .withColumn(
        "t_precent_of_time_from_start_of_trade_day",
        _since_day_open / total_trade_d * 100,
    )
)

# Refresh the SQL view so it includes the new columns.
df.createOrReplaceTempView("df")

df.printSchema()

root
 |-- time: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)
 |-- t_trade_epoch: long (nullable = true)
 |-- t_trade_date: date (nullable = true)
 |-- t_trade_hour: integer (nullable = true)
 |-- t_trade_minute: integer (nullable = true)
 |-- t_trade_day_of_week: integer (nullable = true)
 |-- t_trade_open_e: long (nullable = true)
 |-- t_trade_open_h: integer (nullable = true)
 |-- t_trade_close_e: long (nullable = true)
 |-- t_trade_close_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_open_in_sec: long (nullable = true)
 |-- t_trade_part: integer (nullable = false)
 |-- t_trade_part_open_e: long (nullable = true)
 |-- t_trade_part_open_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_trade_part_open_in_sec: long (nullable = true)
 |-- t_precent_of_time_from_start_of_trade_pase: double (nul

## Adding Price Features


In [7]:
# Adding Open Price Features
#
# For each bar, summarise the `open` value of the bars that precede it on the
# same date (the current bar is excluded). With 5-minute bars, 3 / 5 / 10
# preceding ranks correspond to the last 15 / 25 / 50 minutes.

_open_by_epoch = Window.partitionBy("t_trade_date").orderBy("t_trade_epoch")
_open_by_rank = Window.partitionBy("t_trade_date").orderBy("rnk")

_open_lookbacks = (("15min", 3), ("25min", 5), ("50min", 10))
_open_stats = (("min", F.min), ("avg", F.avg), ("max", F.max), ("stddev", F.stddev))

# Position of the bar within its date; the look-back frames are expressed on it.
df = df.withColumn("rnk", F.dense_rank().over(_open_by_epoch))

for _open_label, _open_n_bars in _open_lookbacks:
    _open_frame = _open_by_rank.rangeBetween(-_open_n_bars, -1)
    for _open_stat, _open_agg in _open_stats:
        df = df.withColumn(
            f"p_{_open_stat}_open_price_for_last_{_open_label}",
            _open_agg("open").over(_open_frame),
        )


In [8]:
# Adding high Price Features
#
# For each bar, summarise the `high` value of the bars that precede it on the
# same date (the current bar is excluded). With 5-minute bars, 3 / 5 / 10
# preceding ranks correspond to the last 15 / 25 / 50 minutes.

_high_by_epoch = Window.partitionBy("t_trade_date").orderBy("t_trade_epoch")
_high_by_rank = Window.partitionBy("t_trade_date").orderBy("rnk")

_high_lookbacks = (("15min", 3), ("25min", 5), ("50min", 10))
_high_stats = (("min", F.min), ("avg", F.avg), ("max", F.max), ("stddev", F.stddev))

# Position of the bar within its date; the look-back frames are expressed on it.
df = df.withColumn("rnk", F.dense_rank().over(_high_by_epoch))

for _high_label, _high_n_bars in _high_lookbacks:
    _high_frame = _high_by_rank.rangeBetween(-_high_n_bars, -1)
    for _high_stat, _high_agg in _high_stats:
        df = df.withColumn(
            f"p_{_high_stat}_high_price_for_last_{_high_label}",
            _high_agg("high").over(_high_frame),
        )

# df.createOrReplaceTempView("df")

# df.printSchema()

In [9]:
# Adding low Price Features
#
# For each bar, summarise the `low` value of the bars that precede it on the
# same date (the current bar is excluded). With 5-minute bars, 3 / 5 / 10
# preceding ranks correspond to the last 15 / 25 / 50 minutes.

_low_by_epoch = Window.partitionBy("t_trade_date").orderBy("t_trade_epoch")
_low_by_rank = Window.partitionBy("t_trade_date").orderBy("rnk")

_low_lookbacks = (("15min", 3), ("25min", 5), ("50min", 10))
_low_stats = (("min", F.min), ("avg", F.avg), ("max", F.max), ("stddev", F.stddev))

# Position of the bar within its date; the look-back frames are expressed on it.
df = df.withColumn("rnk", F.dense_rank().over(_low_by_epoch))

for _low_label, _low_n_bars in _low_lookbacks:
    _low_frame = _low_by_rank.rangeBetween(-_low_n_bars, -1)
    for _low_stat, _low_agg in _low_stats:
        df = df.withColumn(
            f"p_{_low_stat}_low_price_for_last_{_low_label}",
            _low_agg("low").over(_low_frame),
        )


# df.createOrReplaceTempView("df")

# df.printSchema()

In [10]:
# Adding close Price Features
#
# For each bar, summarise the `close` value of the bars that precede it on the
# same date (the current bar is excluded). With 5-minute bars, 3 / 5 / 10
# preceding ranks correspond to the last 15 / 25 / 50 minutes.

_close_by_epoch = Window.partitionBy("t_trade_date").orderBy("t_trade_epoch")
_close_by_rank = Window.partitionBy("t_trade_date").orderBy("rnk")

_close_lookbacks = (("15min", 3), ("25min", 5), ("50min", 10))
_close_stats = (("min", F.min), ("avg", F.avg), ("max", F.max), ("stddev", F.stddev))

# Position of the bar within its date; the look-back frames are expressed on it.
df = df.withColumn("rnk", F.dense_rank().over(_close_by_epoch))

for _close_label, _close_n_bars in _close_lookbacks:
    _close_frame = _close_by_rank.rangeBetween(-_close_n_bars, -1)
    for _close_stat, _close_agg in _close_stats:
        df = df.withColumn(
            f"p_{_close_stat}_close_price_for_last_{_close_label}",
            _close_agg("close").over(_close_frame),
        )


In [11]:
# Adding volume Features
#
# For each bar, summarise the `volume` of the bars that precede it on the
# same date (the current bar is excluded). With 5-minute bars, 3 / 5 / 10
# preceding ranks correspond to the last 15 / 25 / 50 minutes.
# The generated column names keep the `_price_` infix used by the other
# feature families.

_volume_by_epoch = Window.partitionBy("t_trade_date").orderBy("t_trade_epoch")
_volume_by_rank = Window.partitionBy("t_trade_date").orderBy("rnk")

_volume_lookbacks = (("15min", 3), ("25min", 5), ("50min", 10))
_volume_stats = (("min", F.min), ("avg", F.avg), ("max", F.max), ("stddev", F.stddev))

# Position of the bar within its date; the look-back frames are expressed on it.
df = df.withColumn("rnk", F.dense_rank().over(_volume_by_epoch))

for _volume_label, _volume_n_bars in _volume_lookbacks:
    _volume_frame = _volume_by_rank.rangeBetween(-_volume_n_bars, -1)
    for _volume_stat, _volume_agg in _volume_stats:
        df = df.withColumn(
            f"p_{_volume_stat}_volume_price_for_last_{_volume_label}",
            _volume_agg("volume").over(_volume_frame),
        )


# Fill nulls in the price features with 0

In [12]:
# Names of every rolling feature column (they all carry the "p_" prefix).
p_prefix_column_list = [name for name in df.columns if name.startswith("p_")]

# Replace nulls with 0 in those columns only; all other columns are left as
# they are.
df = df.fillna(0, subset=p_prefix_column_list)


In [14]:
# # Check if we have Nulls
# # ----------------------------
# df.select([F.count(F.when(F.isnan(c), c)).alias(c) for c in p_prefix_column_list]).show()


In [15]:

# Re-register the SQL view so that `df` in Spark SQL points at the current
# frame, including the columns added since the view was last registered.
df.createOrReplaceTempView("df")

# Print the resulting schema for inspection.
df.printSchema()

root
 |-- time: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)
 |-- t_trade_epoch: long (nullable = true)
 |-- t_trade_date: date (nullable = true)
 |-- t_trade_hour: integer (nullable = true)
 |-- t_trade_minute: integer (nullable = true)
 |-- t_trade_day_of_week: integer (nullable = true)
 |-- t_trade_open_e: long (nullable = true)
 |-- t_trade_open_h: integer (nullable = true)
 |-- t_trade_close_e: long (nullable = true)
 |-- t_trade_close_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_open_in_sec: long (nullable = true)
 |-- t_trade_part: integer (nullable = false)
 |-- t_trade_part_open_e: long (nullable = true)
 |-- t_trade_part_open_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_trade_part_open_in_sec: long (nullable = true)
 |-- t_precent_of_time_from_start_of_trade_pase: double (nul

# Adding media data points

In [16]:

# Load the media dimension from its snappy-compressed parquet files.
media_df = spark.read.parquet("./data/DWH/dim_media/*.snappy.parquet")

# Print the media schema for inspection.
media_df.printSchema()

# Summary statistics over every media column. This triggers a Spark job and
# prints a very wide table.
media_df.summary().show()

# Expose the frame to Spark SQL under the name `media_df`.
media_df.createOrReplaceTempView("media_df")

root
 |-- epoch: integer (nullable = true)
 |-- ts_: string (nullable = true)
 |-- date_: date (nullable = true)
 |-- year_: integer (nullable = true)
 |-- month_: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- time_: string (nullable = true)
 |-- hour_: integer (nullable = true)
 |-- minute_: integer (nullable = true)
 |-- snp_media_events_count: long (nullable = true)
 |-- m_avg_snp_media_events_count_last_15min: double (nullable = true)
 |-- m_max_snp_media_events_count_last_15min: long (nullable = true)
 |-- m_stddev_snp_media_events_count_last_15min: double (nullable = true)
 |-- m_sum_snp_media_events_count_last_15min: long (nullable = true)
 |-- m_avg_snp_media_events_count_last_30min: double (nullable = true)
 |-- m_max_snp_media_events_count_last_30min: long (nullable = true)
 |-- m_stddev_snp_media_events_count_last_30min: double (nullable = true)
 |-- m_sum_snp_media_events_count_last_30min: long (nullab

[Stage 9:>                                                          (0 + 1) / 1]

+-------+--------------------+-------------------+------------------+------------------+-----------+------------------+--------+-----------------+------------------+----------------------+---------------------------------------+---------------------------------------+------------------------------------------+---------------------------------------+---------------------------------------+---------------------------------------+------------------------------------------+---------------------------------------+---------------------------------------+---------------------------------------+------------------------------------------+---------------------------------------+---------------------------------------+---------------------------------------+------------------------------------------+---------------------------------------+---------------------------------------+---------------------------------------+------------------------------------------+-------------------------------------

                                                                                

In [18]:
# Join both data frames: the price/time features in `df` with the media
# features in `media_df`, matched on the bar's epoch second.
# This is an inner join, so bars without a matching media row are dropped.
# NOTE(review): confirm that dropping unmatched bars is intended.

# Raw bar columns followed by the time features, in output order.
_base_columns = [
    "time",
    "open",
    "high",
    "low",
    "close",
    "volume",
    "t_trade_epoch",
    "t_trade_date",
    "t_trade_hour",
    "t_trade_minute",
    "t_trade_day_of_week",
    "t_trade_open_e",
    "t_trade_open_h",
    "t_trade_close_e",
    "t_trade_close_h",
    "t_time_diff_between_trade_and_open_in_sec",
    "t_trade_part",
    "t_trade_part_open_e",
    "t_trade_part_open_h",
    "t_time_diff_between_trade_and_trade_part_open_in_sec",
    "t_precent_of_time_from_start_of_trade_pase",
    "t_precent_of_time_from_start_of_trade_day",
]

# Rolling features: field -> look-back -> statistic (60 columns).
_rolling_columns = [
    f"p_{_stat}_{_field}_price_for_last_{_span}"
    for _field in ("open", "high", "low", "close", "volume")
    for _span in ("15min", "25min", "50min")
    for _stat in ("min", "avg", "max", "stddev")
]

# Media features: the raw event count, then look-back -> statistic (1 + 40 columns).
_media_columns = ["snp_media_events_count"] + [
    f"m_{_stat}_snp_media_events_count_last_{_span}"
    for _span in (
        "15min", "30min", "45min", "1hour", "3hour",
        "3days", "7days", "14days", "30days", "60days",
    )
    for _stat in ("avg", "max", "stddev", "sum")
]

# The helper `rnk` column and the media calendar columns are not selected.
# The expression is wrapped in parentheses rather than continued with
# backslashes, so no trailing `\` can splice in the line that follows.
features_5min = (
    df.join(media_df, df["t_trade_epoch"] == media_df["epoch"])
    .select(*_base_columns, *_rolling_columns, *_media_columns)
)

In [19]:
# Print the schema of the joined feature set for inspection.
features_5min.printSchema()

root
 |-- time: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)
 |-- t_trade_epoch: long (nullable = true)
 |-- t_trade_date: date (nullable = true)
 |-- t_trade_hour: integer (nullable = true)
 |-- t_trade_minute: integer (nullable = true)
 |-- t_trade_day_of_week: integer (nullable = true)
 |-- t_trade_open_e: long (nullable = true)
 |-- t_trade_open_h: integer (nullable = true)
 |-- t_trade_close_e: long (nullable = true)
 |-- t_trade_close_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_open_in_sec: long (nullable = true)
 |-- t_trade_part: integer (nullable = false)
 |-- t_trade_part_open_e: long (nullable = true)
 |-- t_trade_part_open_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_trade_part_open_in_sec: long (nullable = true)
 |-- t_precent_of_time_from_start_of_trade_pase: double (nul

In [20]:

# Export the joined features as CSV with a header row, ordered by bar epoch.
# `coalesce(1)` collapses the result to a single partition (one part file);
# "overwrite" replaces any previous export in the target directory.
_features_by_epoch = features_5min.orderBy("t_trade_epoch")
(
    _features_by_epoch
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("./data/DWH/new_features_5min_csv")
)



                                                                                

In [15]:
# spark.sql(f"""
#  select --time, 
#         --t_trade_part,
#         --t_precent_of_time_from_start_of_trade_pase,
#         --t_precent_of_time_from_start_of_trade_day,     
#         t_trade_epoch,
#         rnk,
#         open,
#         p_min_open_price_for_last_15min,
#         p_avg_open_price_for_last_15min,
#         p_max_open_price_for_last_15min,
#         p_stddev_open_price_for_last_15min,
#         p_min_open_price_for_last_25min,
#         p_avg_open_price_for_last_25min,
#         p_max_open_price_for_last_25min,
#         p_stddev_open_price_for_last_25min,
#         p_min_open_price_for_last_50min,
#         p_avg_open_price_for_last_50min,
#         p_max_open_price_for_last_50min,
#         p_stddev_open_price_for_last_50min
       
#  from df
#  where t_trade_date = '2021-12-14'
#  order by time desc

#  """).show(10000,False)

In [16]:
# spark.sql(
# """
# select 
# time,
# t_trade_date,
# open,
# p_avg_open_price_for_last_3min,
# p_avg_open_price_for_last_5min,
# p_avg_open_price_for_last_10min,
# p_min_open_price_for_last_3min,
# p_min_open_price_for_last_5min,
# p_min_open_price_for_last_10min,
# p_max_open_price_for_last_3min,
# p_max_open_price_for_last_5min,
# p_max_open_price_for_last_10min
# from df
# """
# ).show(100, False)




# spark.sql(
# """
# select 
# t_trade_date,
# time,
# high,
# p_min_high_price_for_last_3min,
# p_min_high_price_for_last_5min,
# p_min_high_price_for_last_10min,
# p_max_high_price_for_last_3min,
# p_max_high_price_for_last_5min,
# p_max_high_price_for_last_10min,
# p_avg_high_price_for_last_3min,
# p_avg_high_price_for_last_5min,
# p_avg_high_price_for_last_10min,
# p_stddev_high_price_for_last_3min,
# p_stddev_high_price_for_last_5min,
# p_stddev_high_price_for_last_10min
# from df
# """
# ).show(100, False)

In [17]:
# spark.sql(f"""
# select time, 
#        t_trade_epoch,
#        t_trade_open_e,
#        t_trade_close_e,
#        t_trade_part_open_e,

#        t_trade_part,
#        case
#            when t_trade_part = 'pre trade' then {pre_trade_d}
#            when t_trade_part = 'trade' then {trade_d}
#            when t_trade_part = 'post trade' then {post_trade_d}
#            else -1
#        end as part_dur,
           
#        t_time_diff_between_trade_and_trade_part_open_in_sec,
#        t_precent_of_time_from_start_of_trade_pase,
#        t_precent_of_time_from_start_of_trade_day
# from df
# where t_trade_date = '2021-12-14'
# order by time desc

# """).show(10000,False)

# spark.sql("""
# select *
# from df2
# where t_trade_open_h == 4
# and t_trade_epoch = t_trade_open_e


# """).show(100,False)


# Test t_trade_open_e t_trade_close_e

# spark.sql("""
# select count(*) rows_c, 
#        sum(case when t_trade_open_h == 4 then 1 else 0 end) open_h,
#        sum(case when t_trade_close_h == 20 then 1 else 0 end) close_h
# from df

# """).show(100,False)

# +------+------+-------+
# |rows_c|open_h|close_h|
# +------+------+-------+
# |247708|247708|247212 |
# +------+------+-------+

# spark.sql("""
# select t_trade_open_h, t_trade_close_h, t_trade_date
# from df
# where t_trade_close_h <> 20
# group by 1,2,3
# """).show(100,False)

# +--------------+---------------+------------+
# |t_trade_open_h|t_trade_close_h|t_trade_date|
# +--------------+---------------+------------+
# |4             |17             |2022-11-25  |
# +--------------+---------------+------------+

# spark.sql("""
# select count(distinct t_trade_date) as date_c
# from df
# """).show(100,False)
# +------+
# |date_c|
# +------+
# |288   |
# +------+


# spark.sql("select time from df").show(100,False)


In [9]:
# Test t_trade_part

# spark.sql("""

# select t_trade_date,
        
#         --time, 

#        t_trade_hour,
#        t_trade_part
# from df
# group by 1,2,3
# """).show(10000,False)

# spark.sql("""

# select count(*)
# from df
# where t_trade_part='Unknown'

# """).show(100,False)

# +--------+
# |count(1)|
# +--------+
# |0       |
# +--------+


In [20]:
# # Test t_trade_part_open_e and t_trade_part_open_h


# df.select(F.col("time"),
#        F.col("t_trade_part"),
#        F.col("t_trade_part_open_e"),
#        F.col("t_precent_of_time_from_start_of_trade_day"), 
#        F.col("t_time_diff_between_trade_and_trade_part_open_in_sec"),           
#        F.col("t_precent_of_time_from_start_of_trade_pase")).show(10000,False)

# df.filter(df["t_trade_date"]=='2021-12-13')\
# .select(F.col("time"),
#        F.col("t_trade_epoch"),
#        F.col("t_trade_part_open_e"),           
#        F.col("t_time_diff_between_trade_and_open_in_sec")).show(10000,False)




# spark.sql("""

# select t_trade_date,
        
#        time, 
#        t_trade_part,
#        t_trade_part_open_e
# from df

# """).show(10000,False)