In [None]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
import scala.util.{Failure, Success, Try}
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
import com.mongodb.spark._
import com.mongodb.spark.config._
import org.bson._

import java.time.LocalDateTime
import java.time.LocalDate
import java.time.format.DateTimeFormatter

In [None]:
// Create (or reuse, via getOrCreate) the SparkSession used by every cell below.
// The app is named "Frequency"; spark.cores.max and spark.executor.cores are set to 6
// and spark.executor.memory to "36g".
val spark = {
  val namedBuilder = SparkSession.builder().appName("Frequency")
  val tunedBuilder = namedBuilder
    .config("spark.cores.max", 6)
    .config("spark.executor.cores", 6)
    .config("spark.executor.memory", "36g")
  tunedBuilder.getOrCreate()
}

In [None]:
/** Formatter for compact 14-character timestamps (pattern `yyyyMMddHHmmss`). */
def dateTimeFormatter = DateTimeFormatter.ofPattern("yyyyMMddHHmmss")

/** The current local date-time rendered with [[dateTimeFormatter]]. */
def appName: String = {
  val now = LocalDateTime.now()
  dateTimeFormatter.format(now)
}

/**
 * The local date `minusDays` days before now, as an 8-character `yyyyMMdd` string.
 *
 * @param minusDays number of days to subtract from the current local date-time
 * @return the first eight characters (the date part) of the formatted timestamp
 */
def previousDay(minusDays: Long) = {
  val shifted = LocalDateTime.now().minusDays(minusDays)
  val stamp = dateTimeFormatter.format(shifted)
  // Keep only the leading yyyyMMdd portion of yyyyMMddHHmmss.
  stamp.substring(0, 8)
}

In [None]:
// val df = (
//     spark.read.format("mongodb")
//     .option("spark.mongodb.read.connection.uri", mongoUrl)
//     .option("spark.mongodb.write.connection.uri", mongoUrl)
//     .option("database", "coreEngine")
//     .option("collection", "Price").load()
//     )

In [None]:
// MongoDB connection string shared by the read and write configurations.
val mongoUrl = "mongodb+srv://xxxxxxxxxxxxxxxxxxxxxx/"

// Read settings for the `Price` collection of the `coreEngine` database.
val priceReadConfig = {
  val readOptions = Map(
    "spark.mongodb.input.uri" -> mongoUrl,
    "spark.mongodb.input.database" -> "coreEngine",
    "spark.mongodb.input.collection" -> "Price"
  )
  ReadConfig(readOptions)
}

// Write settings for the same collection, with the batch size set to 8000.
val priceWriteConfig = {
  val writeOptions = Map(
    "spark.mongodb.output.uri" -> mongoUrl,
    "spark.mongodb.output.database" -> "coreEngine",
    "spark.mongodb.output.collection" -> "Price",
    "spark.mongodb.output.maxBatchSize" -> "8000"
  )
  WriteConfig(writeOptions)
}

In [None]:
// $match stage keeping documents whose `updateDate` string is >= the yyyyMMdd date
// 2555 days (about seven years) before today.
val priceMatchPipe = "{ $match: { updateDate: { $gte: '%s' } } }".format(previousDay(2555))

// RDD over the `Price` collection as described by priceReadConfig.
val rdd = {
  val sparkCtx = spark.sparkContext
  MongoSpark.load(sparkCtx, priceReadConfig)
}

// Same RDD with the $match stage attached as its aggregation pipeline.
// Despite the `Df` suffix this is the value returned by withPipeline; toDF() is called on it later.
val priceDf = {
  val matchStage = Document.parse(priceMatchPipe)
  rdd.withPipeline(Seq(matchStage))
}

In [None]:
// Read settings for the `PriceYF` collection of the `coreEngine` database.
val priceYFReadConfig = {
  val readOptions = Map(
    "spark.mongodb.input.uri" -> mongoUrl,
    "spark.mongodb.input.database" -> "coreEngine",
    "spark.mongodb.input.collection" -> "PriceYF"
  )
  ReadConfig(readOptions)
}

// Write settings for the same collection, with the batch size set to 8000.
val priceYFWriteConfig = {
  val writeOptions = Map(
    "spark.mongodb.output.uri" -> mongoUrl,
    "spark.mongodb.output.database" -> "coreEngine",
    "spark.mongodb.output.collection" -> "PriceYF",
    "spark.mongodb.output.maxBatchSize" -> "8000"
  )
  WriteConfig(writeOptions)
}

In [None]:
// $match stage keeping documents whose `updateDate` string is >= the yyyyMMdd date
// 2555 days (about seven years) before today.
val priceYFMatchPipe = "{ $match: { updateDate: { $gte: '%s' } } }".format(previousDay(2555))

// RDD over the `PriceYF` collection as described by priceYFReadConfig.
// Note: this re-binds the name `rdd` that an earlier cell also defines.
val rdd = {
  val sparkCtx = spark.sparkContext
  MongoSpark.load(sparkCtx, priceYFReadConfig)
}

// Same RDD with the $match stage attached as its aggregation pipeline.
val priceYFDf = {
  val matchStage = Document.parse(priceYFMatchPipe)
  rdd.withPipeline(Seq(matchStage))
}

In [None]:
// Price rows as a DataFrame, with the `_id` column removed.
val pdf = {
  val priceFrame = priceDf.toDF()
  priceFrame.drop("_id")
}

// From PriceYF keep only the two join keys and the adjusted price.
val pyfdf = {
  val priceYFFrame = priceYFDf.toDF()
  priceYFFrame.select("updateDate", "adjPrice", "stockCode")
}

In [None]:
// Left-join the Price rows with the PriceYF adjusted prices on (stockCode, updateDate),
// then overwrite `closingPrice` with the adjusted price wherever the join found one.
val mergeDf = (
    pdf.join(pyfdf, Seq("stockCode", "updateDate"), "left")
    // coalesce picks the first non-null value: adjPrice when present, else the original
    // closingPrice. The earlier `when(adjPrice.isNull, adjPrice)` form was inverted: it wrote
    // null into closingPrice for unmatched rows and never used adjPrice for matched ones.
    .withColumn("closingPrice", coalesce(col("adjPrice"), col("closingPrice")))
    )

In [None]:
// Print the schema of the joined DataFrame to inspect its column names and types.
mergeDf.printSchema()

In [None]:
// Per-stock window in ascending updateDate order; basis for lag/lead and trailing aggregates.
val window = Window.partitionBy(col("stockCode")).orderBy(col("updateDate"))
// Per-stock window with the newest updateDate first. It is not used while building `hf`
// (no rank column is added here).
val rankWindow = Window.partitionBy(col("stockCode")).orderBy(col("updateDate").desc)

// Per-stock, per-day feature table derived from mergeDf.
// Rows with a null closingPrice are dropped before any feature is computed.
val hf = {
  // Trailing frames: the current row plus the previous 4 / 21 rows of the same stock.
  val trailing5 = window.rowsBetween(-4, 0)
  val trailing22 = window.rowsBetween(-21, 0)

  mergeDf
    .where(col("closingPrice").isNotNull)
    // market flag: 0 when classify is "KOSDAQ", 1 for everything else.
    .withColumn("market", when(col("classify") === "KOSDAQ", 0).otherwise(1))
    .select(
      "stockCode", "stockFullName", "market", "closingPrice", "prepare", "openPrice",
      "highPrice", "lowPrice", "tradingVolume", "transactionAmount", "updateDate")
    // Previous / next row's closing price within the same stock (null at the edges).
    .withColumn("closingPricePrev", lag(col("closingPrice"), 1).over(window))
    .withColumn("closingPriceNext", lead(col("closingPrice"), 1).over(window))
    .withColumn("closingPriceDiff", col("closingPrice") - col("closingPricePrev"))
    // Log return relative to the previous row.
    .withColumn("closingPriceReturn", log(col("closingPrice") / col("closingPricePrev")))
    // 1 when the price fell by at least 10% of the previous close.
    .withColumn("closingPriceEvt", when(col("closingPriceDiff") <= (col("closingPricePrev") * -0.1), 1).otherwise(0))
    .withColumn("closingPrice5Min", min(col("closingPriceDiff")).over(trailing5))
    .withColumn("closingPrice5Max", max(col("closingPriceDiff")).over(trailing5))
    // 1 when both the largest and the smallest diff in the trailing frame (cast to long)
    // are negative, i.e. every diff in the frame is a decline.
    .withColumn("closingPrice5", when((col("closingPrice5Max").cast("long") < 0) && (col("closingPrice5Min").cast(LongType) < 0), 1).otherwise(0))
    .withColumn("tradingVolumeLog", log(col("tradingVolume")))
    .withColumn("closingPriceReturn5days", mean(col("closingPriceReturn")).over(trailing5))
    .withColumn("closingPriceReturn22days", mean(col("closingPriceReturn")).over(trailing22))
    .withColumn("tradingVolumeVolatility5days", variance(col("tradingVolume")).over(trailing5))
    .withColumn("tradingVolumeVolatility22days", variance(col("tradingVolume")).over(trailing22))
    // Log of the trailing variance columns. These previously took log(tradingVolume),
    // which merely duplicated tradingVolumeLog.
    .withColumn("tradingVolumeVolatility5daysLog", log(col("tradingVolumeVolatility5days")))
    .withColumn("tradingVolumeVolatility22daysLog", log(col("tradingVolumeVolatility22days")))
    // Combined flag: single-day drop event OR all-declines-in-frame event.
    .withColumn("event", when((col("closingPriceEvt") === 1) || (col("closingPrice5") === 1), 1).otherwise(0))
    .select(
      "stockCode", "stockFullName", "market", "updateDate", "closingPrice", "tradingVolume",
      "transactionAmount", "closingPricePrev", "closingPriceNext", "closingPriceDiff",
      "closingPriceReturn", "closingPrice5Min", "closingPrice5Max", "closingPrice5",
      "tradingVolumeLog", "closingPriceReturn5days", "closingPriceReturn22days",
      "tradingVolumeVolatility5days", "tradingVolumeVolatility22days",
      "tradingVolumeVolatility5daysLog", "tradingVolumeVolatility22daysLog", "event")
}

In [None]:
// Disabled cell: the code below is wrapped in a triple-quoted string literal, so the cell
// only evaluates to a String and the write is never executed. The text describes an
// upsert of one (updateDate, stockCode) row of `hf` into the `PriceFeatures` collection.
"""
(
    hf.where(col("updateDate") === "20230428" and col("stockCode") === "005930")
    .na.fill(0)
    .write.format("mongodb")
    .mode("append")
    .option("upsertDocument", "true")
    .option("idFieldList", "updateDate,stockCode")
    .option("spark.mongodb.read.connection.uri", mongoUrl)
    .option("spark.mongodb.write.connection.uri", mongoUrl)
    .option("database", "coreEngine")
    .option("collection", "PriceFeatures")
    .save()
    )
"""

In [None]:
// Disabled cell: the code below is wrapped in a triple-quoted string literal, so the cell
// only evaluates to a String and the write is never executed. The text describes appending
// all of `hf` (nulls filled with 0) to the `PriceFeatures` collection; the `//` lines inside
// are part of the string, not real comments.
"""
(
    hf
//     .where(col("rank") === 1)
//     .drop("rank")
//     .orderBy(col("stockCode"), col("updateDate"))
    .na.fill(0)
    .write.format("mongodb")
    .mode("append")
    .option("spark.mongodb.read.connection.uri", mongoUrl)
    .option("spark.mongodb.write.connection.uri", mongoUrl)
    .option("database", "coreEngine")
    .option("collection", "PriceFeatures")
    .save()
    )
    """

In [None]:
/**
 * Typed row model for the feature table: one field per column selected at the end of `hf`,
 * in the same order. It is the target type of `result.as[HighFrequencyModel]`.
 *
 * None of the fields is an `Option`, so rows are expected to be free of nulls by the time
 * they are decoded (the notebook applies `na.fill(0)` before the conversion).
 *
 * NOTE(review): the `Int` / `Long` / `Double` choices assume the underlying DataFrame columns
 * have compatible types (e.g. that `closingPrice` is integral) — confirm against
 * `mergeDf.printSchema()` output.
 */
case class HighFrequencyModel(
    stockCode: String,
    stockFullName: String,
    // 0 for KOSDAQ, 1 otherwise (as assigned when `hf` is built).
    market: Int,
    updateDate: String,
    closingPrice: Int,
    tradingVolume: Int,
    transactionAmount: Long,
    closingPricePrev: Int,
    closingPriceNext: Int,
    closingPriceDiff: Int,
    closingPriceReturn: Double,
    closingPrice5Min: Int,
    closingPrice5Max: Int,
    closingPrice5: Int,
    tradingVolumeLog: Double,
    closingPriceReturn5days: Double,
    closingPriceReturn22days: Double,
    tradingVolumeVolatility5days: Double,
    tradingVolumeVolatility22days: Double,
    tradingVolumeVolatility5daysLog: Double,
    tradingVolumeVolatility22daysLog: Double,
    // 1 when either drop indicator fired, else 0.
    event: Int
)

In [None]:
// Latest feature row per stock, ordered by stockCode, with numeric nulls replaced by 0.
// `hf` does not contain a "rank" column (its final select omits one), so filtering on
// col("rank") directly fails at analysis time. The rank is therefore computed here:
// rankWindow orders each stock's rows newest-first, so row_number() == 1 is the most
// recent updateDate.
val result = (
    hf
    .withColumn("rank", row_number().over(rankWindow))
    .where(col("rank") === 1)
    .drop("rank")
    .orderBy("stockCode")
    .na.fill(0)
    )

In [None]:
// Display a preview of `result`; this triggers execution of the whole pipeline.
result.show()

In [None]:
// Dataset.as[T] requires an implicit Encoder[T]. Bring the session's encoders into scope
// here so the cell does not depend on the kernel having pre-imported them.
import spark.implicits._

// Decode every row into HighFrequencyModel and pull the result to the driver.
result.as[HighFrequencyModel].collect()