In [1]:
# !pip install findspark

In [2]:
# Runs before the pyspark imports further down.
# NOTE(review): presumably this makes the local Spark installation importable
# for the `import pyspark` that follows — confirm against the findspark docs.
import findspark

findspark.init()


In [5]:
# Session-length constants. Everything derived below is in seconds.
DATA_INTERVAL = 1  # width of one bar, in minutes
MIN = 60
SEC = 60  # MIN * SEC (= 3600) scales the 5.5 / 6.5 / 4 factors below (presumably hours)

# Cumulative upper bounds of the three trading parts, as offsets in seconds.
pre_trade_ub = 5.5 * (MIN * SEC)
trade_ub = pre_trade_ub + 6.5 * (MIN * SEC)
post_trade_ub = trade_ub + 4 * (MIN * SEC)

# Length of each part minus one bar interval.
pre_trade_d = pre_trade_ub - DATA_INTERVAL * SEC
trade_d = 6.5 * (MIN * SEC) - DATA_INTERVAL * SEC
post_trade_d = 4 * (MIN * SEC) - DATA_INTERVAL * SEC

# The three part lengths plus two bar intervals.
total_trade_d = pre_trade_d + trade_d + post_trade_d + 2 * DATA_INTERVAL * SEC

# Echo the per-part durations, one per line.
print("\n".join(
    f"{label} - {seconds}"
    for label, seconds in (
        ("pre_trade_d", pre_trade_d),
        ("trade_d", trade_d),
        ("post_trade_d", post_trade_d),
    )
))


pre_trade_d - 19740.0
trade_d - 23340.0
post_trade_d - 14340


In [6]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Spark session used by the cells below (named after this pre-processing job).
spark = (
    SparkSession.builder
    .appName('pre_process_1min_data')
    .getOrCreate()
)


In [7]:
from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType, DoubleType

# Column layout applied to the CSV files: a timestamp, four double-typed
# price columns and an integer volume.
schema = StructType(
    [StructField("time", TimestampType())]
    + [StructField(price_col, DoubleType()) for price_col in ("open", "high", "low", "close")]
    + [StructField("volume", IntegerType())]
)


# Load every file matched by the glob, treating the first line of each as a header.
df = (
    spark.read
    .option("header", True)
    .csv("./data/alpha_vantage/SPY/interval=1min/*", schema=schema)
)

df.printSchema()

# Expose the frame to spark.sql(...) under the name "df".
df.createOrReplaceTempView("df")

root
 |-- time: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)



## Adding Time Features


In [8]:
# All rows that share a calendar date.
day_win = Window.partitionBy("t_trade_date")
# Seconds elapsed since the first row of the same date (column is added below).
secs_since_day_open = F.col("t_time_diff_between_trade_and_open_in_sec")

df = (
    df
    # Row timestamp as epoch seconds, plus its date / hour / minute components.
    .withColumn("t_trade_epoch", F.unix_timestamp("time"))
    .withColumn("t_trade_date", F.to_date("time"))
    .withColumn("t_trade_hour", F.hour("time"))
    .withColumn("t_trade_minute", F.minute("time"))
    # Day of the week, formatted with the 'E' pattern.
    .withColumn("t_trade_day_of_week", F.date_format("time", 'E'))
    # Earliest row of the date: epoch (e) and hour (h).
    .withColumn("t_trade_open_e", F.min("t_trade_epoch").over(day_win))
    .withColumn("t_trade_open_h", F.hour(F.min("time").over(day_win)))
    # Latest row of the date: epoch (e) and hour (h).
    .withColumn("t_trade_close_e", F.max("t_trade_epoch").over(day_win))
    .withColumn("t_trade_close_h", F.hour(F.max("time").over(day_win)))
    # Seconds between this row and the first row of its date.
    .withColumn(
        "t_time_diff_between_trade_and_open_in_sec",
        F.col("t_trade_epoch") - F.col("t_trade_open_e"),
    )
    # Trading part of the row. The bounds are offsets from the date's first
    # row, not wall-clock times.
    .withColumn(
        "t_trade_part",
        F.when(secs_since_day_open < pre_trade_ub, "pre trade")
        .when(secs_since_day_open < trade_ub, "trade")
        .when(secs_since_day_open <= post_trade_ub, "post trade")
        .otherwise("Unknown"),
    )
)

# All rows that share both a date and a trading part.
column_list = ["t_trade_date", "t_trade_part"]
win_spec = Window.partitionBy(*column_list)

trade_part = F.col("t_trade_part")
secs_since_part_open = F.col("t_time_diff_between_trade_and_trade_part_open_in_sec")

df = (
    df
    # Earliest row of the (date, trading part) group: epoch (e) and hour (h).
    .withColumn("t_trade_part_open_e", F.min("t_trade_epoch").over(win_spec))
    .withColumn("t_trade_part_open_h", F.hour(F.min("time").over(win_spec)))
    # Seconds between this row and the first row of its trading part.
    .withColumn(
        "t_time_diff_between_trade_and_trade_part_open_in_sec",
        F.col("t_trade_epoch") - F.col("t_trade_part_open_e"),
    )
    # Elapsed share (in percent) of the current trading part, relative to that
    # part's duration constant; -1 when the part is none of the three known ones.
    .withColumn(
        "t_precent_of_time_from_start_of_trade_pase",
        F.when(trade_part == "pre trade", secs_since_part_open / pre_trade_d * 100)
        .when(trade_part == "trade", secs_since_part_open / trade_d * 100)
        .when(trade_part == "post trade", secs_since_part_open / post_trade_d * 100)
        .otherwise(-1),
    )
    # Elapsed share (in percent) of the whole day, relative to total_trade_d.
    .withColumn(
        "t_precent_of_time_from_start_of_trade_day",
        secs_since_day_open / total_trade_d * 100,
    )
)


df.createOrReplaceTempView("df")

df.printSchema()

root
 |-- time: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)
 |-- t_trade_epoch: long (nullable = true)
 |-- t_trade_date: date (nullable = true)
 |-- t_trade_hour: integer (nullable = true)
 |-- t_trade_minute: integer (nullable = true)
 |-- t_trade_day_of_week: string (nullable = true)
 |-- t_trade_open_e: long (nullable = true)
 |-- t_trade_open_h: integer (nullable = true)
 |-- t_trade_close_e: long (nullable = true)
 |-- t_trade_close_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_open_in_sec: long (nullable = true)
 |-- t_trade_part: string (nullable = false)
 |-- t_trade_part_open_e: long (nullable = true)
 |-- t_trade_part_open_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_trade_part_open_in_sec: long (nullable = true)
 |-- t_precent_of_time_from_start_of_trade_pase: double (nulla

In [9]:
# spark.sql(f"""
# select time, 
#        t_trade_epoch,
#        t_trade_open_e,
#        t_trade_close_e,
#        t_trade_part_open_e,

#        t_trade_part,
#        case
#            when t_trade_part = 'pre trade' then {pre_trade_d}
#            when t_trade_part = 'trade' then {trade_d}
#            when t_trade_part = 'post trade' then {post_trade_d}
#            else -1
#        end as part_dur,
           
#        t_time_diff_between_trade_and_trade_part_open_in_sec,
#        t_precent_of_time_from_start_of_trade_pase,
#        t_precent_of_time_from_start_of_trade_day
# from df
# where t_trade_date = '2021-12-14'
# order by time desc

# """).show(10000,False)
# Test t_trade_open_e t_trade_close_e

# spark.sql("""
# select count(*) rows_c, 
#        sum(case when t_trade_open_h == 4 then 1 else 0 end) open_h,
#        sum(case when t_trade_close_h == 20 then 1 else 0 end) close_h
# from df

# """).show(100,False)

# +------+------+-------+
# |rows_c|open_h|close_h|
# +------+------+-------+
# |247708|247708|247212 |
# +------+------+-------+

# spark.sql("""
# select t_trade_open_h, t_trade_close_h, t_trade_date
# from df
# where t_trade_close_h <> 20
# group by 1,2,3
# """).show(100,False)

# +--------------+---------------+------------+
# |t_trade_open_h|t_trade_close_h|t_trade_date|
# +--------------+---------------+------------+
# |4             |17             |2022-11-25  |
# +--------------+---------------+------------+

# spark.sql("""
# select count(distinct t_trade_date) as date_c
# from df
# """).show(100,False)
# +------+
# |date_c|
# +------+
# |288   |
# +------+


                                                                                

+-------------------+-------------+--------------+---------------+-------------------+------------+--------+----------------------------------------------------+------------------------------------------+-----------------------------------------+
|time               |t_trade_epoch|t_trade_open_e|t_trade_close_e|t_trade_part_open_e|t_trade_part|part_dur|t_time_diff_between_trade_and_trade_part_open_in_sec|t_precent_of_time_from_start_of_trade_pase|t_precent_of_time_from_start_of_trade_day|
+-------------------+-------------+--------------+---------------+-------------------+------------+--------+----------------------------------------------------+------------------------------------------+-----------------------------------------+
|2021-12-14 20:00:00|1639504800   |1639447260    |1639504800     |1639490460         |post trade  |14340.0 |14340                                               |100.0                                     |100.0                                    |
|2021-12-14 

In [7]:
# Test t_trade_part

# spark.sql("""

# select t_trade_date,
        
#         --time, 

#        t_trade_hour,
#        t_trade_part
# from df
# group by 1,2,3
# """).show(10000,False)

# spark.sql("""

# select count(*)
# from df
# where t_trade_part='Unknown'

# """).show(100,False)

# +--------+
# |count(1)|
# +--------+
# |0       |
# +--------+


In [8]:
# # Test t_trade_part_open_e and t_trade_part_open_h


# df.select(F.col("time"),
#        F.col("t_trade_part"),
#        F.col("t_trade_part_open_e"),
#        F.col("t_precent_of_time_from_start_of_trade_day"), 
#        F.col("t_time_diff_between_trade_and_trade_part_open_in_sec"),           
#        F.col("t_precent_of_time_from_start_of_trade_pase")).show(10000,False)

# spark.sql("""

# select t_trade_date,
        
#        time, 
#        t_trade_part,
#        t_trade_part_open_e
# from df

# """).show(10000,False)

## Adding Price Features


In [9]:
# Adding Open Price Features
#
# For each row, aggregate `open` (min / max / avg / stddev) over the rows of the
# same trading date whose `rnk` lies 1..n positions before the current row
# (the current row itself is excluded), for n = 3, 5 and 10.
# NOTE(review): `rnk` is a dense rank over t_trade_epoch, so each frame counts
# preceding distinct timestamps rather than wall-clock minutes; the two only
# coincide when no 1-minute bars are missing within the date — confirm intended.

# Position of each row within its trading date, ordered by timestamp.
df = df.withColumn(
    "rnk",
    F.dense_rank().over(Window.partitionBy("t_trade_date").orderBy("t_trade_epoch")),
)

# Columns are added in the same order as before: statistic first, then lookback.
for stat_name, stat_fn in (("min", F.min), ("max", F.max), ("avg", F.avg), ("stddev", F.stddev)):
    for lookback in (3, 5, 10):
        df = df.withColumn(
            f"p_{stat_name}_open_price_for_last_{lookback}min",
            stat_fn("open").over(
                Window.partitionBy("t_trade_date").orderBy("rnk").rangeBetween(-lookback, -1)
            ),
        )


In [11]:
# Adding high Price Features
#
# For each row, aggregate `high` (min / max / avg / stddev) over the rows of the
# same trading date whose `rnk` lies 1..n positions before the current row
# (the current row itself is excluded), for n = 3, 5 and 10.
# NOTE(review): `rnk` is a dense rank over t_trade_epoch, so each frame counts
# preceding distinct timestamps rather than wall-clock minutes; the two only
# coincide when no 1-minute bars are missing within the date — confirm intended.

# Position of each row within its trading date, ordered by timestamp.
df = df.withColumn(
    "rnk",
    F.dense_rank().over(Window.partitionBy("t_trade_date").orderBy("t_trade_epoch")),
)

# Columns are added in the same order as before: statistic first, then lookback.
for stat_name, stat_fn in (("min", F.min), ("max", F.max), ("avg", F.avg), ("stddev", F.stddev)):
    for lookback in (3, 5, 10):
        df = df.withColumn(
            f"p_{stat_name}_high_price_for_last_{lookback}min",
            stat_fn("high").over(
                Window.partitionBy("t_trade_date").orderBy("rnk").rangeBetween(-lookback, -1)
            ),
        )



# df.createOrReplaceTempView("df")

# df.printSchema()

In [13]:
# Adding low Price Features
#
# For each row, aggregate `low` (min / max / avg / stddev) over the rows of the
# same trading date whose `rnk` lies 1..n positions before the current row
# (the current row itself is excluded), for n = 3, 5 and 10.
# NOTE(review): `rnk` is a dense rank over t_trade_epoch, so each frame counts
# preceding distinct timestamps rather than wall-clock minutes; the two only
# coincide when no 1-minute bars are missing within the date — confirm intended.

# Position of each row within its trading date, ordered by timestamp.
df = df.withColumn(
    "rnk",
    F.dense_rank().over(Window.partitionBy("t_trade_date").orderBy("t_trade_epoch")),
)

# Columns are added in the same order as before: statistic first, then lookback.
for stat_name, stat_fn in (("min", F.min), ("max", F.max), ("avg", F.avg), ("stddev", F.stddev)):
    for lookback in (3, 5, 10):
        df = df.withColumn(
            f"p_{stat_name}_low_price_for_last_{lookback}min",
            stat_fn("low").over(
                Window.partitionBy("t_trade_date").orderBy("rnk").rangeBetween(-lookback, -1)
            ),
        )


# df.createOrReplaceTempView("df")

# df.printSchema()

In [15]:
# Adding close Price Features
#
# For each row, aggregate `close` (min / max / avg / stddev) over the rows of the
# same trading date whose `rnk` lies 1..n positions before the current row
# (the current row itself is excluded), for n = 3, 5 and 10.
# NOTE(review): `rnk` is a dense rank over t_trade_epoch, so each frame counts
# preceding distinct timestamps rather than wall-clock minutes; the two only
# coincide when no 1-minute bars are missing within the date — confirm intended.

# Position of each row within its trading date, ordered by timestamp.
df = df.withColumn(
    "rnk",
    F.dense_rank().over(Window.partitionBy("t_trade_date").orderBy("t_trade_epoch")),
)

# Columns are added in the same order as before: statistic first, then lookback.
for stat_name, stat_fn in (("min", F.min), ("max", F.max), ("avg", F.avg), ("stddev", F.stddev)):
    for lookback in (3, 5, 10):
        df = df.withColumn(
            f"p_{stat_name}_close_price_for_last_{lookback}min",
            stat_fn("close").over(
                Window.partitionBy("t_trade_date").orderBy("rnk").rangeBetween(-lookback, -1)
            ),
        )


In [17]:
# Adding volume Features (column names keep the historical "_price_" infix)
#
# For each row, aggregate `volume` (min / max / avg / stddev) over the rows of the
# same trading date whose `rnk` lies 1..n positions before the current row
# (the current row itself is excluded), for n = 3, 5 and 10.
# NOTE(review): `rnk` is a dense rank over t_trade_epoch, so each frame counts
# preceding distinct timestamps rather than wall-clock minutes; the two only
# coincide when no 1-minute bars are missing within the date — confirm intended.

# Position of each row within its trading date, ordered by timestamp.
df = df.withColumn(
    "rnk",
    F.dense_rank().over(Window.partitionBy("t_trade_date").orderBy("t_trade_epoch")),
)

# Columns are added in the same order as before: statistic first, then lookback.
for stat_name, stat_fn in (("min", F.min), ("max", F.max), ("avg", F.avg), ("stddev", F.stddev)):
    for lookback in (3, 5, 10):
        df = df.withColumn(
            f"p_{stat_name}_volume_price_for_last_{lookback}min",
            stat_fn("volume").over(
                Window.partitionBy("t_trade_date").orderBy("rnk").rangeBetween(-lookback, -1)
            ),
        )


In [18]:

# Replace the temp view "df" with the current frame so that spark.sql(...)
# queries see the columns added in the cells above.
df.createOrReplaceTempView("df")

# Print the resulting column list as a quick check.
df.printSchema()

root
 |-- time: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)
 |-- t_trade_epoch: long (nullable = true)
 |-- t_trade_date: date (nullable = true)
 |-- t_trade_hour: integer (nullable = true)
 |-- t_trade_minute: integer (nullable = true)
 |-- t_trade_day_of_week: string (nullable = true)
 |-- t_trade_open_e: long (nullable = true)
 |-- t_trade_open_h: integer (nullable = true)
 |-- t_trade_close_e: long (nullable = true)
 |-- t_trade_close_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_open_in_sec: long (nullable = true)
 |-- t_trade_part: string (nullable = false)
 |-- t_trade_part_open_e: long (nullable = true)
 |-- t_trade_part_open_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_trade_part_open_in_sec: long (nullable = true)
 |-- t_precent_of_time_from_start_of_trade_pase: double (nulla

In [19]:
# spark.sql(
# """
# select 
# time,
# t_trade_date,
# open,
# p_avg_open_price_for_last_3min,
# p_avg_open_price_for_last_5min,
# p_avg_open_price_for_last_10min,
# p_min_open_price_for_last_3min,
# p_min_open_price_for_last_5min,
# p_min_open_price_for_last_10min,
# p_max_open_price_for_last_3min,
# p_max_open_price_for_last_5min,
# p_max_open_price_for_last_10min
# from df
# """
# ).show(100, False)




# spark.sql(
# """
# select 
# t_trade_date,
# time,
# high,
# p_min_high_price_for_last_3min,
# p_min_high_price_for_last_5min,
# p_min_high_price_for_last_10min,
# p_max_high_price_for_last_3min,
# p_max_high_price_for_last_5min,
# p_max_high_price_for_last_10min,
# p_avg_high_price_for_last_3min,
# p_avg_high_price_for_last_5min,
# p_avg_high_price_for_last_10min,
# p_stddev_high_price_for_last_3min,
# p_stddev_high_price_for_last_5min,
# p_stddev_high_price_for_last_10min
# from df
# """
# ).show(100, False)