In [None]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
import scala.util.{Failure, Success, Try}
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession

import java.time.LocalDateTime
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import java.time.temporal.ChronoUnit.DAYS

In [None]:
/** Formatter for the compact `yyyyMMddHHmmss` timestamp layout; a new instance is built on every call. */
def dateTimeFormatter = DateTimeFormatter.ofPattern("yyyyMMddHHmmss")

/** The current local date-time rendered with [[dateTimeFormatter]] (re-evaluated on every access). */
def appName: String = dateTimeFormatter.format(LocalDateTime.now())

/**
 * The local date `minusDays` days before now, as the first eight characters
 * (`yyyyMMdd`) of the timestamp produced by [[dateTimeFormatter]].
 *
 * @param minusDays number of days to subtract from the current local date-time
 */
def previousDay(minusDays: Long) = {
    val shifted = LocalDateTime.now().minusDays(minusDays)
    // The formatted value always has at least 14 characters, so the first 8 are the date part.
    dateTimeFormatter.format(shifted).take(8)
}

In [None]:
// Application name: the local date-time at evaluation, formatted as yyyyMMddHHmmss.
val appName = {
    val stampFormat = DateTimeFormatter.ofPattern("yyyyMMddHHmmss")
    LocalDateTime.now().format(stampFormat)
}

// Session used by every cell below: named after `appName`, with explicit
// core and executor-memory settings. getOrCreate() reuses an already-running session.
val spark = {
    val named = SparkSession.builder().appName(appName)
    val sized = (
        named
        .config("spark.cores.max", 8)
        .config("spark.executor.cores", 8)
        .config("spark.executor.memory", "36g")
    )
    sized.getOrCreate()
}

In [None]:
// Connection string passed as both the read and the write URI of every MongoDB source below.
// NOTE(review): the URI is inlined in the notebook; if the real value carries credentials,
// consider loading it from the environment or a secret store instead — confirm with the owner.
val mongoUrl: String = "mongodb+srv://xxxxxxxxxxxxxxxxxxxxxx/"

In [None]:
// Loads coreEngine.ReportRiskPremiumWithPbrPerStress, removes the `_id` column and
// exposes `riskPremium` under the name `FSRiskPremium`.
val fsDf = {
    val readerOptions = Map(
        "spark.mongodb.read.connection.uri" -> mongoUrl,
        "spark.mongodb.write.connection.uri" -> mongoUrl,
        "database" -> "coreEngine",
        "collection" -> "ReportRiskPremiumWithPbrPerStress"
    )
    val loaded = spark.read.format("mongodb").options(readerOptions).load()
    loaded.drop("_id").withColumnRenamed("riskPremium", "FSRiskPremium")
}

In [None]:
// Loads coreEngine.NewsRiskPremium, removes `_id`, `riskParagraph` and `wholeParagraph`,
// and exposes `riskPremium` under the name `NSRiskPremium`.
val nsDf = {
    val readerOptions = Map(
        "spark.mongodb.read.connection.uri" -> mongoUrl,
        "spark.mongodb.write.connection.uri" -> mongoUrl,
        "database" -> "coreEngine",
        "collection" -> "NewsRiskPremium"
    )
    val loaded = spark.read.format("mongodb").options(readerOptions).load()
    loaded.drop("_id", "riskParagraph", "wholeParagraph").withColumnRenamed("riskPremium", "NSRiskPremium")
}

In [None]:
// Loads coreEngine.PriceRiskPremium, asking MongoDB (via the aggregation pipeline option)
// only for documents whose `updateDate` is >= '20230401'. `_id` and `stockFullName` are
// removed and `riskPremium` is exposed under the name `HFRiskPremium`.
val hfDf = {
    val readerOptions = Map(
        "spark.mongodb.read.connection.uri" -> mongoUrl,
        "spark.mongodb.write.connection.uri" -> mongoUrl,
        "database" -> "coreEngine",
        "aggregation.pipeline" -> "{ $match: { updateDate: { $gte: '20230401' } } }",
        "collection" -> "PriceRiskPremium"
    )
    val loaded = spark.read.format("mongodb").options(readerOptions).load()
    loaded.drop("_id", "stockFullName").withColumnRenamed("riskPremium", "HFRiskPremium")
}

In [None]:
// Loads coreEngine.IndexComposition for documents whose `updateDate` is >= '20230401',
// keeps `updateDate`, `rank`, `index` and `isuSrtCd`, and renames `isuSrtCd` to `stockCode`.
val indexDf = {
    val readerOptions = Map(
        "spark.mongodb.read.connection.uri" -> mongoUrl,
        "spark.mongodb.write.connection.uri" -> mongoUrl,
        "database" -> "coreEngine",
        "collection" -> "IndexComposition",
        "aggregation.pipeline" -> "{ $match: { updateDate: { $gte: '20230401' } } }"
    )
    val loaded = spark.read.format("mongodb").options(readerOptions).load()
    loaded.select("updateDate", "rank", "index", "isuSrtCd").withColumnRenamed("isuSrtCd", "stockCode")
}

In [None]:
// Loads coreEngine.Price for documents whose `updateDate` is >= '20230401' and keeps
// `stockCode`, `updateDate` and `classify`. `classify` is then recoded:
// "KOSPI" becomes "Y", every other value (including null) becomes "K".
val prDf = {
    val readerOptions = Map(
        "spark.mongodb.read.connection.uri" -> mongoUrl,
        "spark.mongodb.write.connection.uri" -> mongoUrl,
        "database" -> "coreEngine",
        "collection" -> "Price",
        "aggregation.pipeline" -> "{ $match: { updateDate: { $gte: '20230401' } } }"
    )
    val marketFlag = when(col("classify") === "KOSPI", "Y").otherwise("K")
    val loaded = spark.read.format("mongodb").options(readerOptions).load()
    loaded.select("stockCode", "updateDate", "classify").withColumn("classify", marketFlag)
}

In [None]:
// Rows of `indexDf` whose `index` equals `indexName`, with `rank` renamed to `rankColumn`
// and the `index` discriminator column removed. Resulting columns: updateDate, <rankColumn>, stockCode.
// No separate drop("rank") is needed: after the rename no column called `rank` remains,
// and Dataset.drop is a no-op for names that are not in the schema.
def indexMembers(indexName: String, rankColumn: String) =
    indexDf.where(col("index") === indexName).withColumnRenamed("rank", rankColumn).drop("index")

val kosdaq150Df = indexMembers("kosdaq_150", "kosdaq150")
val kospi200Df = indexMembers("kospi_200", "kospi200")
val krx100Df = indexMembers("krx_100", "krx100")
val krx300Df = indexMembers("krx_300", "krx300")

In [None]:
// fsDf.printSchema()

In [None]:
// Per-stock window ordered by updateDate; `ff` covers every row from the start of the
// partition up to and including the current row.
val partition = Window.partitionBy("stockCode").orderBy("updateDate")
val ff = partition.rowsBetween(Window.unboundedPreceding, Window.currentRow)

// hfDf left-joined with fsDf and nsDf on (stockCode, updateDate). Each listed column is then
// replaced by its most recent non-null value within `ff` (a forward fill per stock), and
// rows whose FSPctRank is still null afterwards are discarded.
val rpdf = {
    // Columns to forward-fill, in the order the fills are applied.
    val carriedForward = Seq(
        "stockCode", "updateDate", "HFRiskPremium", "warningSignal", "FSPctRank", "RegUpdateDate",
        "TF1", "TF2", "TF3", "TT", "VaRTF1", "VaRTF2", "VaRTF3",
        "balanceRateLoan", "basicReturn", "bsnsYear", "corpCls", "corpCode", "event",
        "expectedLossFN1", "expectedLossFN2", "expectedLossFN3", "expectedProfit", "expectedRisk",
        "fixed_pbr", "fixed_per", "grade", "hfrp", "loanAvailable",
        "lockAlert30", "lockAlert60", "lockWarn30", "lockWarn60",
        "marketCap", "pbr", "per", "plbtEvent", "predict", "priceEvent", "profitLoss",
        "quarter", "r", "r_s", "rank", "rceptNo",
        "reGrade", "reLoanAvailable", "rePredict", "reRank",
        "recoveryFN1", "recoveryFN2", "recoveryFN3", "reprtCode",
        "FSRiskPremium", "stockName", "stress", "stressAlert", "stressWarn", "threshold",
        "documentRiskPremium", "NSRiskPremium"
    )

    val joinKeys = Seq("stockCode", "updateDate")
    val joined = hfDf.join(fsDf, joinKeys, "left").join(nsDf, joinKeys, "left")

    // NOTE(review): Spark's withColumn documentation recommends a single select when many
    // columns are added or replaced; the sequential withColumn calls are kept so the
    // behavior stays exactly as before.
    val filled = carriedForward.foldLeft(joined) { (df, name) =>
        df.withColumn(name, last(col(name), ignoreNulls = true).over(ff))
    }

    filled.where(col("FSPctRank").isNotNull)
}

In [None]:
// Loads coreEngine.KospiVolatilityCalcurate (collection name spelled as stored) without `_id`.
val vixDf = {
    val readerOptions = Map(
        "spark.mongodb.read.connection.uri" -> mongoUrl,
        "spark.mongodb.write.connection.uri" -> mongoUrl,
        "database" -> "coreEngine",
        "collection" -> "KospiVolatilityCalcurate"
    )
    spark.read.format("mongodb").options(readerOptions).load().drop("_id")
}

In [None]:
// Final per-(stockCode, updateDate) risk-premium table.
// Steps, in order: fill numeric nulls in rpdf with 0, attach vixDf by updateDate, add the
// constant-zero BD/FN premiums, rank each component premium per updateDate, combine the
// premiums, derive scores and loan flags, attach index ranks and the market flag from prDf,
// then project the output columns.
val riskPremiumDf = {
    val perDate = Window.partitionBy("updateDate")
    val perStock = Window.partitionBy("stockCode").orderBy("updateDate")
    val stockDateKeys = Seq("stockCode", "updateDate")

    // percent_rank of <prefix>RiskPremium among the rows sharing an updateDate.
    def pctRankOf(prefix: String) = percent_rank().over(perDate.orderBy(prefix + "RiskPremium"))
    // (1 - <prefix>PctRank) * 10000, cast to an integer.
    def scoreOf(prefix: String) = ((lit(1) - col(prefix + "PctRank")) * 10000).cast(IntegerType)
    // Struct {riskPremium, score} built from <prefix>RiskPremium and <prefix>Score.
    def componentOf(prefix: String) =
        struct(col(prefix + "RiskPremium").as("riskPremium"), col(prefix + "Score").as("score"))

    val base = (
        rpdf
        .na.fill(0)
        .join(vixDf, Seq("updateDate"), "left")
        .withColumn("BDRiskPremium", lit(0))
        .withColumn("FNRiskPremium", lit(0))
    )

    val ranked = Seq("HF", "BD", "FN", "NS").foldLeft(base) { (df, prefix) =>
        df.withColumn(prefix + "PctRank", pctRankOf(prefix))
    }

    // FSRiskPremium plus each other component scaled by 0.0001, summed left to right.
    val combinedPremium = Seq("HF", "BD", "FN", "NS").foldLeft(col("FSRiskPremium")) { (acc, prefix) =>
        acc + (col(prefix + "RiskPremium") * lit(0.0001))
    }

    val withPremium = (
        ranked
        .withColumn("riskPremium", combinedPremium)
        .withColumn("prevRiskPremium", lag(col("riskPremium"), 1).over(perStock))
        .withColumn("diffRiskPremium", col("riskPremium") - col("prevRiskPremium"))
        // The overall score is derived from FSPctRank, exactly like FSScore below.
        .withColumn("score", scoreOf("FS"))
    )

    val scored = Seq("FS", "HF", "BD", "FN", "NS").foldLeft(withPremium) { (df, prefix) =>
        df.withColumn(prefix + "Score", scoreOf(prefix))
    }

    // Number of rows per updateDate whose grade is A, B or C.
    val gradedCount = sum(when(col("grade").isin("A", "B", "C"), 1).otherwise(0)).over(perDate)

    val withLoanFlags = (
        scored
        .withColumn("ReRank", col("reRank"))
        .withColumn("loanThreshold", gradedCount * col("volatilityThreshold"))
        .withColumn("creditLoanAvailable", when(col("reRank") < col("loanThreshold"), 1).otherwise(0))
    )

    val withIndexRanks = Seq(kosdaq150Df, kospi200Df, krx100Df, krx300Df).foldLeft(withLoanFlags) { (df, indexRanks) =>
        df.join(indexRanks, stockDateKeys, "left")
    }

    val componentNames = Seq("fs" -> "FS", "hf" -> "HF", "bd" -> "BD", "fn" -> "FN", "ns" -> "NS")
    val withComponents = componentNames.foldLeft(withIndexRanks) { case (df, (field, prefix)) =>
        df.withColumn(field, componentOf(prefix))
    }

    val nested = (
        withComponents
        .withColumn("detail", struct(componentNames.map { case (field, _) => col(field) }: _*))
        .withColumn("rank", struct(col("ReRank"), col("kosdaq150"), col("kospi200"), col("krx100"), col("krx300")))
    )

    // Replace the forward-filled corpCls with the market flag (`classify`) from prDf.
    val withMarket = (
        nested
        .join(prDf, Seq("updateDate", "stockCode"), "left")
        .drop("corpCls")
        .withColumnRenamed("classify", "corpCls")
    )

    withMarket.select(
        "corpCls", "stockCode", "updateDate", "grade", "reGrade", "loanThreshold", "stockName", "riskPremium",
        "prevRiskPremium", "diffRiskPremium", "score", "detail", "rank", "reRank", "loanAvailable", "reLoanAvailable", "creditLoanAvailable",
        "VaRTF1", "VaRTF2", "VaRTF3", "basicReturn", "expectedProfit", "expectedRisk", "profitLoss",
        "fixed_pbr", "fixed_per", "lockAlert30", "lockAlert60", "lockWarn30", "lockWarn60", "pbr", "per", "plbtEvent", "stressAlert", "stressWarn",
        "priceEvent", "stress", "balanceRateLoan"
    )
}
                

In [None]:
// Disabled write step: the pipeline below sits inside a triple-quoted string literal, so it
// is NOT executed; the cell only evaluates to that string.
// If re-enabled, it would append the riskPremiumDf rows with updateDate > "20230630" to
// coreEngine.RiskPremium_New, with the upsertDocument option on and idFieldList set to
// updateDate,stockCode.
"""
(
    riskPremiumDf
    .where(col("updateDate") > "20230630")
    .write.format("mongodb")
    .mode("append")
    .option("upsertDocument", "true")
    .option("idFieldList", "updateDate,stockCode")
    .option("spark.mongodb.read.connection.uri", mongoUrl)
    .option("spark.mongodb.write.connection.uri", mongoUrl)
    .option("database", "coreEngine")
    .option("collection", "RiskPremium_New")
    .save()
    )
"""

In [None]:
// Stop the SparkSession held in `spark`; the DataFrames defined above cannot be evaluated afterwards.
spark.stop()