In [2]:
# !pip install findspark

In [1]:
import findspark

findspark.init()


In [13]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder \
          .appName('pre_process_1min_data') \
          .getOrCreate()


In [34]:
from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType, DoubleType

schema = StructType([
    StructField("trade_time", TimestampType()),
    StructField("open", DoubleType()),
    StructField("high", DoubleType()),
    StructField("low", DoubleType()),
    StructField("close", DoubleType()),
    StructField("volume", IntegerType())    
])


# Read csv files
df = spark.read.option("header",True).csv("./data/alpha_vantage/interval=1min/*", schema=schema)

df.printSchema()


root
 |-- trade_time: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)



In [36]:
# Calculate avg_open_price_for_last_3*interval -> avg_open_price_for_last_3min
# interval = 1min

# Step 1 - convert trade time to -> trade_epoch
df = df.withColumn("trade_epoch",F.unix_timestamp("trade_time"))

# df.printSchema()
# df.show(100,False)


In [40]:

df.withColumn("rnk",F.dense_rank().over(Window.partitionBy().orderBy("trade_epoch")))\
.withColumn("avg_open_price_for_last_3min",F.avg("open").over(Window.partitionBy().orderBy("rnk").rangeBetween(-3*60,-1)))\
.withColumn("avg_open_price_for_last_5min",F.avg("open").over(Window.partitionBy().orderBy("rnk").rangeBetween(-5*60,-1)))\
.show(100, False)



23/02/05 21:33:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/02/05 21:33:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/02/05 21:33:22 WARN CSVDataSource: CSV header does not conform to the schema.
 Header: time, open, high, low, close, volume
 Schema: trade_time, open, high, low, close, volume
Expected: trade_time but found: time
CSV file: file:///Users/ybatash/Workspace/tau.ac.il/tau-final-project/data/alpha_vantage/interval=1min/interval_1min_year1month9.csv
23/02/05 21:33:22 WARN CSVDataSource: CSV header does not conform to the schema.
 Header: time, open, high, low, close, volume
 Schema: trade_time, open, high, low, close, volume
Expected: trade_time but found: time
CSV file: file:///Users/ybatash/Workspace/tau.ac.il/tau-final-project/data/alpha_vantage/interval=1min/in

+-------------------+------------------+------------------+------------------+------------------+------+-----------+---+----------------------------+----------------------------+
|trade_time         |open              |high              |low               |close             |volume|trade_epoch|rnk|avg_open_price_for_last_3min|avg_open_price_for_last_5min|
+-------------------+------------------+------------------+------------------+------------------+------+-----------+---+----------------------------+----------------------------+
|2021-12-13 04:01:00|462.3392138451222 |462.652976592838  |462.3392138451222 |462.5451206483107 |2225  |1639360860 |1  |null                        |null                        |
|2021-12-13 04:02:00|462.55492573417683|462.67258676457027|462.55492573417683|462.6627816787041 |1078  |1639360920 |2  |462.3392138451222           |462.3392138451222           |
|2021-12-13 04:03:00|462.6137562493735 |462.62356133523963|462.6137562493735 |462.62356133523963|1422  |1

                                                                                

In [8]:
#  |-- avg_open_price_for_last_3*interval e.g - 3min,  15min, 45min
#  |-- avg_open_price_for_last_5*interval e.g - 5min,  25min, 75min 
#  |-- avg_open_price_for_last_10*interva e.g - 10min, 50min, 150min

root
 |-- time: string (nullable = true)
 |-- open: string (nullable = true)
 |-- high: string (nullable = true)
 |-- low: string (nullable = true)
 |-- close: string (nullable = true)
 |-- volume: string (nullable = true)
 |-- date: date (nullable = true)



In [27]:
df.withColumn("rnk",F.dense_rank().over(Window.partitionBy().orderBy("time")))\
  .withColumn("last_week_avg",F.avg("open").over(Window.partitionBy().orderBy("rnk").rangeBetween(-7,-1)))\
  .withColumn("last_2_week_avg",F.avg("open").over(Window.partitionBy().orderBy("rnk").rangeBetween(-14,-1))              
).show()
              
              

23/02/04 21:11:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/02/04 21:11:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/02/04 21:11:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+------------------+------------------+------------------+------------------+------+---+------------------+------------------+
|               time|              open|              high|               low|             close|volume|rnk|     last_week_avg|   last_2_week_avg|
+-------------------+------------------+------------------+------------------+------------------+------+---+------------------+------------------+
|2021-12-13 04:01:00| 462.3392138451222|  462.652976592838| 462.3392138451222| 462.5451206483107|  2225|  1|              null|              null|
|2021-12-13 04:02:00|462.55492573417683|462.67258676457027|462.55492573417683| 462.6627816787041|  1078|  2| 462.3392138451222| 462.3392138451222|
|2021-12-13 04:03:00| 462.6137562493735|462.62356133523963| 462.6137562493735|462.62356133523963|  1422|  3| 462.4470697896495| 462.4470697896495|
|2021-12-13 04:04:00|  462.652976592838| 462.7608325373653|  462.652976592838| 462.7608325373653|   462|  4| 462.50263

[Stage 19:>                                                         (0 + 1) / 1]                                                                                

In [19]:
week_window = Window.orderBy("Week_tmp").rangeBetween(-7, -1)

In [None]:
df_grouped = df.groupBy(["Store_ID", "Month"]).agg(F.sum("Sales").alias("average"))

In [15]:
res_df = spark.sql("""
select time,
       open,
       high,
       low,
       close,
       volume,
       as avg_price_for_last_50_days
       
from DF_1m
where time = 'time'
group by 1
limit 100
""").show(100,False)

+----+--------+
|time|count(1)|
+----+--------+
+----+--------+



