In [1]:
# !pip install findspark

In [1]:
import findspark

# Must run before the `import pyspark` cell below; findspark is expected to locate
# the local Spark installation and expose it to this kernel (see findspark docs).
findspark.init()


In [2]:
# Session-length constants. Every derived value below is in seconds.
DATA_INTERVAL = 1  # bar size, in minutes
MIN = 60           # minutes per hour
SEC = 60           # seconds per minute

HOUR_IN_SEC = MIN * SEC
INTERVAL_IN_SEC = DATA_INTERVAL * SEC

# Session durations measured from the first bar to the last bar of the session,
# hence one bar interval shorter than the nominal 5.5h / 6.5h / 4h length.
pre_trade_d = 5.5 * HOUR_IN_SEC - INTERVAL_IN_SEC
trade_d = 6.5 * HOUR_IN_SEC - INTERVAL_IN_SEC
post_trade_d = 4 * HOUR_IN_SEC - INTERVAL_IN_SEC

# Whole-day duration: the three sessions plus the two bar intervals between them.
total_trade_d = pre_trade_d + trade_d + post_trade_d + 2 * INTERVAL_IN_SEC

# Cumulative upper bounds (seconds since the first bar of the day) used to
# classify a bar as pre-trade / trade / post-trade.
pre_trade_ub = 5.5 * HOUR_IN_SEC
trade_ub = pre_trade_ub + 6.5 * HOUR_IN_SEC
post_trade_ub = trade_ub + 4 * HOUR_IN_SEC

print(
    f"pre_trade_d - {pre_trade_d}",
    f"trade_d - {trade_d}",
    f"post_trade_d - {post_trade_d}",
    sep="\n",
)


pre_trade_d - 19740.0
trade_d - 23340.0
post_trade_d - 14340


In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Build (or reuse) the Spark session. The driver/executor JVMs are pinned to GMT
# and the SQL session time zone to UTC so timestamp functions do not depend on
# the local machine's time zone.
spark = (
    SparkSession.builder
    .appName('pre_process_1min_data')
    .config('spark.driver.extraJavaOptions', '-Duser.timezone=GMT')
    .config('spark.executor.extraJavaOptions', '-Duser.timezone=GMT')
    .config('spark.sql.session.timeZone', 'UTC')
    .getOrCreate()
)


:: loading settings :: url = jar:file:/Users/ybatash/.sdkman/candidates/spark/3.5.0/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/ybatash/.ivy2/cache
The jars for the packages stored in: /Users/ybatash/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5fc06da9-858b-469d-b3ef-54a3de47740c;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.0.0 in central
	found org.apache.kafka#kafka-clients;2.4.1 in local-m2-cache
	found com.github.luben#zstd-jni;1.4.4-3 in local-m2-cache
	found org.lz4#lz4-java;1.7.1 in local-m2-cache
	found org.xerial.snappy#snappy-java;1.1.7.5 in local-m2-cache
	found org.slf4j#slf4j-api;1.7.30 in local-m2-cache
	found org.spark-project.spark#unused;1.0.0 in local-m2-cache
	found org.apache.commons#commons-pool2;2.6.2 in local-m2-cache
:: resolution report :: resolve 243ms :: artifacts dl 14ms
	:: modules in use:
	com.github.luben#zstd-jni;1.4.4-3 from local-m2-cache in [d

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType, DoubleType

# Explicit schema for the 1-minute OHLCV CSV files: a timestamp, four double
# price columns and an integer volume.
price_fields = [StructField(name, DoubleType()) for name in ("open", "high", "low", "close")]
schema = StructType(
    [StructField("time", TimestampType())]
    + price_fields
    + [StructField("volume", IntegerType())]
)

# Load every file under the 1-minute SPY directory; the first line of each file
# is a header row.
df = spark.read.csv(
    "./data/alpha_vantage/SPY/interval=1min/*",
    schema=schema,
    header=True,
)

df.printSchema()

# Expose the raw bars to Spark SQL under the name "df".
df.createOrReplaceTempView("df")

root
 |-- time: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)



## Adding Time Features


In [5]:
# Integer codes stored in the `t_trade_part` column.
PRE_TRADE = 0
TRADE = 1
POST_TRADE = 2
UNKNOWN = -1

# All per-day aggregates below share this partitioning.
day_win = Window.partitionBy("t_trade_date")

df = (
    df
    # Bar timestamp as epoch seconds.
    .withColumn("t_trade_epoch", F.unix_timestamp("time"))
    # Calendar date, hour and minute of the bar.
    .withColumn("t_trade_date", F.to_date("time"))
    .withColumn("t_trade_hour", F.hour("time"))
    .withColumn("t_trade_minute", F.minute("time"))
    # Day of week (Spark numbering; presumably 1 = Sunday ... 7 = Saturday - confirm in the pyspark docs).
    .withColumn("t_trade_day_of_week", F.dayofweek("time"))
    # First bar of the day: epoch ("_e") and hour ("_h").
    .withColumn("t_trade_open_e", F.min("t_trade_epoch").over(day_win))
    .withColumn("t_trade_open_h", F.hour(F.min("time").over(day_win)))
    # Last bar of the day: epoch ("_e") and hour ("_h").
    .withColumn("t_trade_close_e", F.max("t_trade_epoch").over(day_win))
    .withColumn("t_trade_close_h", F.hour(F.max("time").over(day_win)))
    # Seconds elapsed between the day's first bar and this bar.
    .withColumn(
        "t_time_diff_between_trade_and_open_in_sec",
        F.col("t_trade_epoch") - F.col("t_trade_open_e"),
    )
)

# Classify each bar by how far it is from the day's first bar, using the
# cumulative session bounds computed in the variables cell.
secs_since_open = F.col("t_time_diff_between_trade_and_open_in_sec")
trade_part = (
    F.when(secs_since_open < pre_trade_ub, PRE_TRADE)
    .when(secs_since_open < trade_ub, TRADE)
    .when(secs_since_open <= post_trade_ub, POST_TRADE)
    .otherwise(UNKNOWN)
)
df = df.withColumn("t_trade_part", trade_part)

# Window over one session (pre-trade / trade / post-trade) of one day.
column_list = ["t_trade_date", "t_trade_part"]
win_spec = Window.partitionBy([F.col(name) for name in column_list])

df = (
    df
    # First bar of the session: epoch ("_e") and hour ("_h").
    .withColumn("t_trade_part_open_e", F.min("t_trade_epoch").over(win_spec))
    .withColumn("t_trade_part_open_h", F.hour(F.min("time").over(win_spec)))
    # Seconds elapsed between the session's first bar and this bar.
    .withColumn(
        "t_time_diff_between_trade_and_trade_part_open_in_sec",
        F.col("t_trade_epoch") - F.col("t_trade_part_open_e"),
    )
)

# Progress through the current session, in percent of that session's duration.
# Bars whose session is UNKNOWN get the UNKNOWN sentinel (-1) instead of a percentage.
secs_since_part_open = F.col("t_time_diff_between_trade_and_trade_part_open_in_sec")
part = F.col("t_trade_part")
df = df.withColumn(
    "t_precent_of_time_from_start_of_trade_pase",
    F.when(part == PRE_TRADE, secs_since_part_open / pre_trade_d * 100)
    .when(part == TRADE, secs_since_part_open / trade_d * 100)
    .when(part == POST_TRADE, secs_since_part_open / post_trade_d * 100)
    .otherwise(UNKNOWN),
)

# Progress through the whole trading day, in percent of the total day duration.
df = df.withColumn(
    "t_precent_of_time_from_start_of_trade_day",
    secs_since_open / total_trade_d * 100,
)

# Refresh the SQL view so it exposes the new time columns, then show the schema.
df.createOrReplaceTempView("df")

df.printSchema()

root
 |-- time: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)
 |-- t_trade_epoch: long (nullable = true)
 |-- t_trade_date: date (nullable = true)
 |-- t_trade_hour: integer (nullable = true)
 |-- t_trade_minute: integer (nullable = true)
 |-- t_trade_day_of_week: integer (nullable = true)
 |-- t_trade_open_e: long (nullable = true)
 |-- t_trade_open_h: integer (nullable = true)
 |-- t_trade_close_e: long (nullable = true)
 |-- t_trade_close_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_open_in_sec: long (nullable = true)
 |-- t_trade_part: integer (nullable = false)
 |-- t_trade_part_open_e: long (nullable = true)
 |-- t_trade_part_open_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_trade_part_open_in_sec: long (nullable = true)
 |-- t_precent_of_time_from_start_of_trade_pase: double (nul

## Adding Price Features


In [6]:
# Adding Open Price Features
#
# For each look-back N in (3, 5, 10) add min / avg / max / stddev of `open` over
# the N bars that precede the current bar on the same date. Column names follow
# the pattern p_<agg>_open_price_for_last_<N>min.

day_partition = Window.partitionBy("t_trade_date")

# Consecutive 1-based position of each bar within its day, ordered by time.
df = df.withColumn("rnk", F.dense_rank().over(day_partition.orderBy("t_trade_epoch")))

for lookback in (3, 5, 10):
    # Frame = rows whose rank is 1..lookback below the current row, so the
    # current bar is excluded. This counts bars, not wall-clock minutes: if bars
    # are missing, the frame reaches further back in time.
    trailing = day_partition.orderBy("rnk").rangeBetween(-lookback, -1)
    for agg_name, agg_fn in (("min", F.min), ("avg", F.avg), ("max", F.max), ("stddev", F.stddev)):
        df = df.withColumn(
            f"p_{agg_name}_open_price_for_last_{lookback}min",
            agg_fn("open").over(trailing),
        )


In [7]:
# Adding high Price Features
#
# For each look-back N in (3, 5, 10) add min / avg / max / stddev of `high` over
# the N bars that precede the current bar on the same date. Column names follow
# the pattern p_<agg>_high_price_for_last_<N>min.

day_partition = Window.partitionBy("t_trade_date")

# Consecutive 1-based position of each bar within its day, ordered by time.
df = df.withColumn("rnk", F.dense_rank().over(day_partition.orderBy("t_trade_epoch")))

for lookback in (3, 5, 10):
    # Frame = rows whose rank is 1..lookback below the current row, so the
    # current bar is excluded. This counts bars, not wall-clock minutes.
    trailing = day_partition.orderBy("rnk").rangeBetween(-lookback, -1)
    for agg_name, agg_fn in (("min", F.min), ("avg", F.avg), ("max", F.max), ("stddev", F.stddev)):
        df = df.withColumn(
            f"p_{agg_name}_high_price_for_last_{lookback}min",
            agg_fn("high").over(trailing),
        )

# df.createOrReplaceTempView("df")

# df.printSchema()

In [8]:
# Adding low Price Features
#
# For each look-back N in (3, 5, 10) add min / avg / max / stddev of `low` over
# the N bars that precede the current bar on the same date. Column names follow
# the pattern p_<agg>_low_price_for_last_<N>min.

day_partition = Window.partitionBy("t_trade_date")

# Consecutive 1-based position of each bar within its day, ordered by time.
df = df.withColumn("rnk", F.dense_rank().over(day_partition.orderBy("t_trade_epoch")))

for lookback in (3, 5, 10):
    # Frame = rows whose rank is 1..lookback below the current row, so the
    # current bar is excluded. This counts bars, not wall-clock minutes.
    trailing = day_partition.orderBy("rnk").rangeBetween(-lookback, -1)
    for agg_name, agg_fn in (("min", F.min), ("avg", F.avg), ("max", F.max), ("stddev", F.stddev)):
        df = df.withColumn(
            f"p_{agg_name}_low_price_for_last_{lookback}min",
            agg_fn("low").over(trailing),
        )


# df.createOrReplaceTempView("df")

# df.printSchema()

In [9]:
# Adding close Price Features
#
# For each look-back N in (3, 5, 10) add min / avg / max / stddev of `close` over
# the N bars that precede the current bar on the same date. Column names follow
# the pattern p_<agg>_close_price_for_last_<N>min.

day_partition = Window.partitionBy("t_trade_date")

# Consecutive 1-based position of each bar within its day, ordered by time.
df = df.withColumn("rnk", F.dense_rank().over(day_partition.orderBy("t_trade_epoch")))

for lookback in (3, 5, 10):
    # Frame = rows whose rank is 1..lookback below the current row, so the
    # current bar is excluded. This counts bars, not wall-clock minutes.
    trailing = day_partition.orderBy("rnk").rangeBetween(-lookback, -1)
    for agg_name, agg_fn in (("min", F.min), ("avg", F.avg), ("max", F.max), ("stddev", F.stddev)):
        df = df.withColumn(
            f"p_{agg_name}_close_price_for_last_{lookback}min",
            agg_fn("close").over(trailing),
        )


In [10]:
# Adding volume Features
#
# For each look-back N in (3, 5, 10) add min / avg / max / stddev of `volume`
# over the N bars that precede the current bar on the same date. The column
# names keep the existing p_<agg>_volume_price_for_last_<N>min pattern (the
# "_price" part is historical; these are volume statistics).

day_partition = Window.partitionBy("t_trade_date")

# Consecutive 1-based position of each bar within its day, ordered by time.
df = df.withColumn("rnk", F.dense_rank().over(day_partition.orderBy("t_trade_epoch")))

for lookback in (3, 5, 10):
    # Frame = rows whose rank is 1..lookback below the current row, so the
    # current bar is excluded. This counts bars, not wall-clock minutes.
    trailing = day_partition.orderBy("rnk").rangeBetween(-lookback, -1)
    for agg_name, agg_fn in (("min", F.min), ("avg", F.avg), ("max", F.max), ("stddev", F.stddev)):
        df = df.withColumn(
            f"p_{agg_name}_volume_price_for_last_{lookback}min",
            agg_fn("volume").over(trailing),
        )


# Fill Null with 0

In [11]:
# Names of all rolling price/volume feature columns (everything prefixed "p_").
p_prefix_column_list = [name for name in df.columns if name.startswith("p_")]

# The rolling frames only look at preceding bars of the same day, so the first
# bar(s) of each day have an empty frame and null aggregates. Replace those
# nulls with 0 in the "p_" columns only; all other columns are left untouched.
# NOTE(review): 0 is not a realistic price/volume statistic - confirm downstream
# consumers treat it as "missing" rather than as a real value.
df = df.na.fill(value=0, subset=p_prefix_column_list)


In [12]:
# # Check if we have Nulls
# # ----------------------------

# df.select([F.count(F.when(F.isnan(c), c)).alias(c) for c in p_prefix_column_list]).show()


In [13]:

# Re-register the "df" temp view so it points at the frame that now includes
# the null-filled price feature columns.
df.createOrReplaceTempView("df")

# Print the resulting schema for a visual check.
df.printSchema()

root
 |-- time: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)
 |-- t_trade_epoch: long (nullable = true)
 |-- t_trade_date: date (nullable = true)
 |-- t_trade_hour: integer (nullable = true)
 |-- t_trade_minute: integer (nullable = true)
 |-- t_trade_day_of_week: integer (nullable = true)
 |-- t_trade_open_e: long (nullable = true)
 |-- t_trade_open_h: integer (nullable = true)
 |-- t_trade_close_e: long (nullable = true)
 |-- t_trade_close_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_open_in_sec: long (nullable = true)
 |-- t_trade_part: integer (nullable = false)
 |-- t_trade_part_open_e: long (nullable = true)
 |-- t_trade_part_open_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_trade_part_open_in_sec: long (nullable = true)
 |-- t_precent_of_time_from_start_of_trade_pase: double (nul

24/08/25 06:32:01 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


# Adding media data points

In [14]:

# Load the media dimension table from its snappy-compressed parquet files.
media_df = spark.read.parquet("./data/DWH/dim_media/*.snappy.parquet")

# Show the column names and types that were read.
media_df.printSchema()

# Summary statistics for every column; this triggers a Spark job over the data
# and prints a very wide table.
media_df.summary().show()

# Expose the media table to Spark SQL under the name "media_df".
media_df.createOrReplaceTempView("media_df")

                                                                                

root
 |-- epoch: integer (nullable = true)
 |-- ts_: string (nullable = true)
 |-- date_: date (nullable = true)
 |-- year_: integer (nullable = true)
 |-- month_: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- time_: string (nullable = true)
 |-- hour_: integer (nullable = true)
 |-- minute_: integer (nullable = true)
 |-- snp_media_events_count: long (nullable = true)
 |-- m_avg_snp_media_events_count_last_15min: double (nullable = true)
 |-- m_max_snp_media_events_count_last_15min: long (nullable = true)
 |-- m_stddev_snp_media_events_count_last_15min: double (nullable = true)
 |-- m_sum_snp_media_events_count_last_15min: long (nullable = true)
 |-- m_avg_snp_media_events_count_last_30min: double (nullable = true)
 |-- m_max_snp_media_events_count_last_30min: long (nullable = true)
 |-- m_stddev_snp_media_events_count_last_30min: double (nullable = true)
 |-- m_sum_snp_media_events_count_last_30min: long (nullab

[Stage 1:>                                                          (0 + 9) / 9]

CodeCache: size=131072Kb used=40756Kb max_used=42270Kb free=90315Kb
 bounds [0x0000000104298000, 0x0000000106c58000, 0x000000010c298000]
 total_blobs=13297 nmethods=12302 adapters=908
 compilation: disabled (not enough contiguous free space left)


                                                                                

+-------+--------------------+-------------------+------------------+------------------+-----------+------------------+--------+-----------------+------------------+----------------------+---------------------------------------+---------------------------------------+------------------------------------------+---------------------------------------+---------------------------------------+---------------------------------------+------------------------------------------+---------------------------------------+---------------------------------------+---------------------------------------+------------------------------------------+---------------------------------------+---------------------------------------+---------------------------------------+------------------------------------------+---------------------------------------+---------------------------------------+---------------------------------------+------------------------------------------+-------------------------------------

In [15]:
# Join both data frames
#
# Combine the per-bar SPY features with the media features that share the same
# epoch second, and keep only the feature columns (helper columns such as `rnk`
# and the media calendar columns are dropped).
#
# NOTE(review): this is an inner join, so bars without a matching media row are
# dropped - confirm that is intended. `t_trade_epoch` is derived from `time`
# under the UTC session time zone; confirm `media_df.epoch` uses the same convention.

# Raw OHLCV columns.
base_columns = ["time", "open", "high", "low", "close", "volume"]

# Time features.
time_columns = [
    "t_trade_epoch",
    "t_trade_date",
    "t_trade_hour",
    "t_trade_minute",
    "t_trade_day_of_week",
    "t_trade_open_e",
    "t_trade_open_h",
    "t_trade_close_e",
    "t_trade_close_h",
    "t_time_diff_between_trade_and_open_in_sec",
    "t_trade_part",
    "t_trade_part_open_e",
    "t_trade_part_open_h",
    "t_time_diff_between_trade_and_trade_part_open_in_sec",
    "t_precent_of_time_from_start_of_trade_pase",
    "t_precent_of_time_from_start_of_trade_day",
]

# Rolling price/volume features: p_<agg>_<source>_price_for_last_<N>min,
# ordered by source column, then look-back, then aggregate.
price_columns = [
    f"p_{agg}_{source}_price_for_last_{minutes}min"
    for source in ("open", "high", "low", "close", "volume")
    for minutes in (3, 5, 10)
    for agg in ("min", "avg", "max", "stddev")
]

# Media features: the raw event count followed by
# m_<agg>_snp_media_events_count_last_<span>, ordered by span, then aggregate.
media_columns = ["snp_media_events_count"] + [
    f"m_{agg}_snp_media_events_count_last_{span}"
    for span in (
        "15min", "30min", "45min", "1hour", "3hour",
        "3days", "7days", "14days", "30days", "60days",
    )
    for agg in ("avg", "max", "stddev", "sum")
]

features_1min = (
    df.join(media_df, df["t_trade_epoch"] == media_df["epoch"], "inner")
    .select(*base_columns, *time_columns, *price_columns, *media_columns)
)

In [16]:
# features_1min.write.mode("overwrite").parquet("./data/DWH/features_1min")

# features_1min.write.mode("overwrite").csv("./data/DWH/features_1min_csv")

In [17]:
# Print the schema of the joined feature table for a visual check before export.
features_1min.printSchema()

root
 |-- time: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)
 |-- t_trade_epoch: long (nullable = true)
 |-- t_trade_date: date (nullable = true)
 |-- t_trade_hour: integer (nullable = true)
 |-- t_trade_minute: integer (nullable = true)
 |-- t_trade_day_of_week: integer (nullable = true)
 |-- t_trade_open_e: long (nullable = true)
 |-- t_trade_open_h: integer (nullable = true)
 |-- t_trade_close_e: long (nullable = true)
 |-- t_trade_close_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_open_in_sec: long (nullable = true)
 |-- t_trade_part: integer (nullable = false)
 |-- t_trade_part_open_e: long (nullable = true)
 |-- t_trade_part_open_h: integer (nullable = true)
 |-- t_time_diff_between_trade_and_trade_part_open_in_sec: long (nullable = true)
 |-- t_precent_of_time_from_start_of_trade_pase: double (nul

In [18]:

# Export the feature table as one CSV part file with a header row, sorted by bar
# epoch; any previous export at this path is replaced.
# NOTE(review): the sort happens before coalesce(1) - confirm the single output
# file is in fact globally ordered by t_trade_epoch.
(
    features_1min
    .orderBy("t_trade_epoch")
    .coalesce(1)
    .write
    .mode("overwrite")
    .csv(path="./data/DWH/new_features_1min_csv", header=True)
)
# df.orderBy("").coalesce(1).write.csv("address")
# df.repartition(1).write.csv("address")


                                                                                

In [21]:
!head -n 5 ./data/DWH/new_features_1min_csv/part-00000-011801cc-ded0-4036-b0f4-aeef100934dd-c000.csv


time,open,high,low,close,volume,t_trade_epoch,t_trade_date,t_trade_hour,t_trade_minute,t_trade_day_of_week,t_trade_open_e,t_trade_open_h,t_trade_close_e,t_trade_close_h,t_time_diff_between_trade_and_open_in_sec,t_trade_part,t_trade_part_open_e,t_trade_part_open_h,t_time_diff_between_trade_and_trade_part_open_in_sec,t_precent_of_time_from_start_of_trade_pase,t_precent_of_time_from_start_of_trade_day,p_min_open_price_for_last_3min,p_avg_open_price_for_last_3min,p_max_open_price_for_last_3min,p_stddev_open_price_for_last_3min,p_min_open_price_for_last_5min,p_avg_open_price_for_last_5min,p_max_open_price_for_last_5min,p_stddev_open_price_for_last_5min,p_min_open_price_for_last_10min,p_avg_open_price_for_last_10min,p_max_open_price_for_last_10min,p_stddev_open_price_for_last_10min,p_min_high_price_for_last_3min,p_avg_high_price_for_last_3min,p_max_high_price_for_last_3min,p_stddev_high_price_for_last_3min,p_min_high_price_for_last_5min,p_avg_high_price_for_last_5min,p_max_high_price_for_las