In [None]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
import scala.util.{Failure, Success, Try}
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession

import java.time.LocalDateTime
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import java.time.temporal.ChronoUnit.DAYS

In [None]:
/** Formatter for compact `yyyyMMddHHmmss` timestamps; a fresh instance is built on each call. */
def dateTimeFormatter: DateTimeFormatter = DateTimeFormatter.ofPattern("yyyyMMddHHmmss")

/** The current local date-time rendered as `yyyyMMddHHmmss`. */
def appName: String = dateTimeFormatter.format(LocalDateTime.now())

/**
 * The local date `minusDays` days before now.
 *
 * @param minusDays number of days to subtract from the current local date-time
 * @return the first eight characters (`yyyyMMdd`) of the compact timestamp
 */
def previousDay(minusDays: Long): String = {
    val shifted = LocalDateTime.now().minusDays(minusDays)
    dateTimeFormatter.format(shifted).take(8)
}

In [None]:
// Spark application name: the launch time rendered as yyyyMMddHHmmss.
val appName = DateTimeFormatter.ofPattern("yyyyMMddHHmmss").format(LocalDateTime.now())

// Obtains (or reuses) the session, setting spark.cores.max and spark.executor.cores
// to 8 and spark.executor.memory to 36g.
val spark = {
    val named = SparkSession.builder().appName(appName)
    val sized = named
        .config("spark.cores.max", 8)
        .config("spark.executor.cores", 8)
        .config("spark.executor.memory", "36g")
    sized.getOrCreate()
}

In [None]:
// MongoDB connection string passed to the connector for the reads in this notebook.
// NOTE(review): the URI is hard-coded here; consider loading it from configuration.
val mongoUrl: String = "mongodb+srv://xxxxxxxxxxxxxxxxxxxxxx/"

// Aggregation `$match` stage keeping documents whose `updateDate` is >= '20180101'.
// `updateDate` is compared as a string literal here.
val matchPipe: String = "{ $match: { updateDate: { $gte: '20180101' } } }"

In [None]:
// Loads coreEngine.ReportRiskPremiumCalcurate (restricted by matchPipe), exposes its
// riskPremium as FSRiskPremium, and removes stocks whose name matches an excluded pattern.
val fsDf = {
    val readOptions = Map(
        "spark.mongodb.read.connection.uri" -> mongoUrl,
        "spark.mongodb.write.connection.uri" -> mongoUrl,
        "database" -> "coreEngine",
        "collection" -> "ReportRiskPremiumCalcurate",
        "aggregation.pipeline" -> matchPipe
    )

    // SQL LIKE patterns of stock names to exclude (mostly REIT / bank / insurance /
    // securities / finance keywords, plus a few individual names).
    val excludedNamePatterns = Seq(
        "%리츠%",
        "%베트남개발1%",
        "%상상인%",
        "%은행%",
        "%보험%",
        "%증권%",
        "%화재%",
        "%케이프%",
        "%CNH%",
        "%금융%",
        "%신한지주%",
        "%리드코프%"
    )

    // A row is kept only when its stockName matches none of the patterns; the clauses
    // are AND-ed left to right.
    val nameIsAllowed = excludedNamePatterns
        .map(pattern => !col("stockName").like(pattern))
        .reduce(_ and _)

    spark.read.format("mongodb")
        .options(readOptions)
        .load()
        .drop("_id", "bsnsYear", "quarter")
        .withColumnRenamed("riskPremium", "FSRiskPremium")
        .where(nameIsAllowed)
}

In [None]:
// Loads coreEngine.NewsRiskPremium in full (no aggregation pipeline), drops the id and
// paragraph columns, and exposes riskPremium as NSRiskPremium.
val nsDf = {
    val readOptions = Map(
        "spark.mongodb.read.connection.uri" -> mongoUrl,
        "spark.mongodb.write.connection.uri" -> mongoUrl,
        "database" -> "coreEngine",
        "collection" -> "NewsRiskPremium"
    )

    val loaded = spark.read.format("mongodb").options(readOptions).load()

    loaded
        .drop("_id", "riskParagraph", "wholeParagraph")
        .withColumnRenamed("riskPremium", "NSRiskPremium")
}

In [None]:
// Loads coreEngine.Price (restricted by matchPipe) and keeps only the market
// classification per stock and date, recoded to "Y" for KOSPI and "K" for anything else.
val prDf = {
    val readOptions = Map(
        "spark.mongodb.read.connection.uri" -> mongoUrl,
        "spark.mongodb.write.connection.uri" -> mongoUrl,
        "database" -> "coreEngine",
        "collection" -> "Price",
        "aggregation.pipeline" -> matchPipe
    )

    val marketFlag = when(col("classify") === "KOSPI", "Y").otherwise("K")

    spark.read.format("mongodb")
        .options(readOptions)
        .load()
        .select("stockCode", "updateDate", "classify")
        .withColumn("classify", marketFlag)
}

In [None]:
// Loads coreEngine.PriceRiskPremium (restricted by matchPipe), drops the id and full-name
// columns, and exposes riskPremium as HFRiskPremium.
val hfDf = {
    val readOptions = Map(
        "spark.mongodb.read.connection.uri" -> mongoUrl,
        "spark.mongodb.write.connection.uri" -> mongoUrl,
        "database" -> "coreEngine",
        "collection" -> "PriceRiskPremium",
        "aggregation.pipeline" -> matchPipe
    )

    val loaded = spark.read.format("mongodb").options(readOptions).load()

    loaded
        .drop("_id", "stockFullName")
        .withColumnRenamed("riskPremium", "HFRiskPremium")
}

In [None]:
// Loads coreEngine.IndexComposition (restricted by matchPipe) and keeps the date, rank and
// index name, with the isuSrtCd column exposed as stockCode.
val indexDf = {
    val readOptions = Map(
        "spark.mongodb.read.connection.uri" -> mongoUrl,
        "spark.mongodb.write.connection.uri" -> mongoUrl,
        "database" -> "coreEngine",
        "collection" -> "IndexComposition",
        "aggregation.pipeline" -> matchPipe
    )

    val loaded = spark.read.format("mongodb").options(readOptions).load()

    loaded
        .select("updateDate", "rank", "index", "isuSrtCd")
        .withColumnRenamed("isuSrtCd", "stockCode")
}




In [None]:
// Loads coreEngine.KospiVolatilityCalcurate in full (no aggregation pipeline) and drops
// the Mongo id column.
val vixDf = {
    val readOptions = Map(
        "spark.mongodb.read.connection.uri" -> mongoUrl,
        "spark.mongodb.write.connection.uri" -> mongoUrl,
        "database" -> "coreEngine",
        "collection" -> "KospiVolatilityCalcurate"
    )

    val loaded = spark.read.format("mongodb").options(readOptions).load()

    loaded.drop("_id")
}

In [None]:
// Extracts the rows of one index from indexDf and exposes its `rank` under `rankColumn`.
// The result keeps updateDate, stockCode and the renamed rank column; the `index` column
// is dropped. No separate drop of "rank" is needed because the rename already removes
// that name from the schema.
def indexRankDf(indexName: String, rankColumn: String) = {
    indexDf
        .where(col("index") === indexName)
        .withColumnRenamed("rank", rankColumn)
        .drop("index")
}

// One rank frame per index, later left-joined on (stockCode, updateDate).
val kosdaq150Df = indexRankDf("kosdaq_150", "kosdaq150")
val kospi200Df = indexRankDf("kospi_200", "kospi200")
val krx100Df = indexRankDf("krx_100", "krx100")
val krx300Df = indexRankDf("krx_300", "krx300")

In [None]:
// Loads coreEngine.CreditLoanRate in full, keeps the loan balance rate per stock and date,
// casts the rate to double and left-pads stockCode with "0" to six characters.
val crDf = {
    val readOptions = Map(
        "spark.mongodb.read.connection.uri" -> mongoUrl,
        "spark.mongodb.write.connection.uri" -> mongoUrl,
        "database" -> "coreEngine",
        "collection" -> "CreditLoanRate"
    )

    val rateAsDouble = col("balanceRateLoan").cast(DoubleType)
    val paddedStockCode = lpad(col("stockCode"), 6, "0")

    spark.read.format("mongodb")
        .options(readOptions)
        .load()
        .select("stockCode", "updateDate", "balanceRateLoan")
        .withColumn("balanceRateLoan", rateAsDouble)
        .withColumn("stockCode", paddedStockCode)
}



In [None]:

// Printed schema kept for reference; apart from updateDate these are the columns that are
// forward-filled below (presumably the schema of fsDf after its rename — confirm).
// root
//  |-- FSPctRank: double (nullable = true)
//  |-- VaRTF1: double (nullable = true)
//  |-- VaRTF2: double (nullable = true)
//  |-- basicReturn: double (nullable = true)
//  |-- corpCls: string (nullable = true)
//  |-- corpCode: string (nullable = true)
//  |-- event: long (nullable = true)
//  |-- expectedProfit: double (nullable = true)
//  |-- expectedRisk: double (nullable = true)
//  |-- grade: string (nullable = true)
//  |-- loanAvailable: integer (nullable = true)
//  |-- predict: long (nullable = true)
//  |-- profitLoss: double (nullable = true)
//  |-- rceptNo: string (nullable = true)
//  |-- reprtCode: string (nullable = true)
//  |-- FSRiskPremium: double (nullable = true)
//  |-- stockCode: string (nullable = true)
//  |-- stockName: string (nullable = true)
//  |-- threshold: double (nullable = true)
//  |-- updateDate: string (nullable = true)

// Forward-fill window: per stock, ordered by updateDate, covering every row from the start
// of the partition up to and including the current row.
val ff = Window.partitionBy("stockCode").orderBy("updateDate").rowsBetween(Window.unboundedPreceding, Window.currentRow)

// Left-joins the FS, NS, volatility and price frames onto hfDf (every hfDf row is kept),
// adds the BD/FN placeholder premiums, and carries the last non-null FS value forward per
// stock. Rows with no FSPctRank at or before their date are discarded.
val riskPremiumDfTemp = {
    // Columns whose nulls are replaced by the most recent non-null value of the same stock.
    // stockCode (the partition key), BDRiskPremium / FNRiskPremium (never null) and corpCls
    // (dropped below) are intentionally absent: filling them cannot change the result.
    val forwardFilledColumns = Seq(
        "FSPctRank",
        "VaRTF1",
        "VaRTF2",
        "basicReturn",
        "corpCode",
        "event",
        "expectedProfit",
        "expectedRisk",
        "grade",
        "loanAvailable",
        "predict",
        "profitLoss",
        "rceptNo",
        "reprtCode",
        "FSRiskPremium",
        "stockName",
        "threshold"
    )

    val joined = hfDf
        .join(fsDf, Seq("stockCode", "updateDate"), "left")
        .join(nsDf, Seq("stockCode", "updateDate"), "left")
        .join(vixDf, Seq("updateDate"), "left")
        .join(prDf, Seq("stockCode", "updateDate"), "left")
        // BD premium is a constant 0 and FN premium a small random value in [0, 0.001).
        // NOTE(review): rand() is called without a seed, so FNRiskPremium is not
        // reproducible across runs.
        .withColumn("BDRiskPremium", lit(0))
        .withColumn("FNRiskPremium", rand() * 0.001)

    forwardFilledColumns
        // last(..., true) ignores nulls, so each row receives the latest non-null value.
        .foldLeft(joined)((df, name) => df.withColumn(name, last(col(name), true).over(ff)))
        .where(col("FSPctRank").isNotNull)
        // The joined corpCls column is replaced by the "Y"/"K" flag coming from prDf.
        .drop("corpCls")
        .withColumnRenamed("classify", "corpCls")
}

In [None]:
// Builds the final risk-premium frame from riskPremiumDfTemp: per-date percent ranks,
// grade, combined premium and its day-over-day change, integer scores, loan flags, index
// ranks, nested detail/rank structs and the forward-filled credit loan balance rate.
val riskPremiumDf = {
    val byDate = Window.partitionBy("updateDate")
    val byStockOverTime = Window.partitionBy("stockCode").orderBy("updateDate")
    val joinKeys = Seq("stockCode", "updateDate")
    val tieBreakWeight = lit(0.0001)

    // Percent rank of `source` within each updateDate, ascending.
    def dailyPctRank(source: String) = percent_rank().over(byDate.orderBy(source))

    // Maps a percent rank in [0, 1] to an integer score (1 - rank) * 10000.
    def invertedScore(rankColumn: String) = ((lit(1) - col(rankColumn)) * 10000).cast(IntegerType)

    // Struct pairing one component's premium and score as (riskPremium, score).
    def premiumAndScore(premiumColumn: String, scoreColumn: String) = {
        struct(col(premiumColumn).as("riskPremium"), col(scoreColumn).as("score"))
    }

    // Grade precedence: A / B / C from the FS percent rank, then "F" when predict >= 1,
    // otherwise "D".
    // NOTE(review): the rank-based grades take priority over "F"; confirm that is intended.
    val gradeColumn = when(col("FSPctRank") <= 0.10, lit("A"))
        .when(col("FSPctRank") <= 0.20, lit("B"))
        .when(col("FSPctRank") <= 0.40, lit("C"))
        .when(col("predict") >= 1, lit("F"))
        .otherwise(lit("D"))

    // FS rank dominates; the other four ranks each contribute with weight 0.0001.
    val combinedPremium = col("FSPctRank") +
        (col("BDPctRank") * tieBreakWeight) +
        (col("HFPctRank") * tieBreakWeight) +
        (col("FNPctRank") * tieBreakWeight) +
        (col("NSPctRank") * tieBreakWeight)

    // 1 for grades A, B or C, else 0; summed per date to count the loan-eligible grades.
    val isLoanGrade = when(col("grade") === "A" or col("grade") === "B" or col("grade") === "C", 1).otherwise(0)

    val ranked = riskPremiumDfTemp
        .withColumn("FSPctRank", dailyPctRank("FSRiskPremium"))
        .withColumn("HFPctRank", dailyPctRank("HFRiskPremium"))
        .withColumn("BDPctRank", dailyPctRank("BDRiskPremium"))
        .withColumn("FNPctRank", dailyPctRank("FNRiskPremium"))
        .withColumn("NSPctRank", dailyPctRank("NSRiskPremium"))

    // The previous-day premium is taken before rows without a stockName are removed.
    val graded = ranked
        .withColumn("grade", gradeColumn)
        .withColumn("riskPremium", combinedPremium)
        .withColumn("prevRiskPremium", lag(col("riskPremium"), 1).over(byStockOverTime))
        .withColumn("diffRiskPremium", col("riskPremium") - col("prevRiskPremium"))
        .where(col("stockName").isNotNull)

    val scored = graded
        .withColumn("score", invertedScore("FSPctRank"))
        .withColumn("FSScore", invertedScore("FSPctRank"))
        .withColumn("HFScore", invertedScore("HFPctRank"))
        .withColumn("BDScore", invertedScore("BDPctRank"))
        .withColumn("FNScore", invertedScore("FNPctRank"))
        .withColumn("NSScore", invertedScore("NSPctRank"))
        .withColumn("ReRank", row_number().over(byDate.orderBy(col("FSRiskPremium"), col("VaRTF1"))))
        .withColumn("loanThreshold", sum(isLoanGrade).over(byDate) * col("volatilityThreshold"))
        .withColumn("creditLoanAvailable", when(col("score") < col("loanThreshold"), 1).otherwise(0))

    // Attach each index's rank column; stocks outside an index get null for that column.
    val withIndexRanks = Seq(kosdaq150Df, kospi200Df, krx100Df, krx300Df)
        .foldLeft(scored)((df, indexRanks) => df.join(indexRanks, joinKeys, "left"))

    withIndexRanks
        .withColumn("fs", premiumAndScore("FSRiskPremium", "FSScore"))
        .withColumn("hf", premiumAndScore("HFRiskPremium", "HFScore"))
        .withColumn("bd", premiumAndScore("BDRiskPremium", "BDScore"))
        .withColumn("fn", premiumAndScore("FNRiskPremium", "FNScore"))
        .withColumn("ns", premiumAndScore("NSRiskPremium", "NSScore"))
        .withColumn("detail", struct(col("fs"), col("hf"), col("bd"), col("fn"), col("ns")))
        .withColumn("rank", struct(col("ReRank"), col("kosdaq150"), col("kospi200"), col("krx100"), col("krx300")))
        .select("corpCls", "stockCode", "updateDate", "grade", "loanThreshold", "stockName", "riskPremium", "prevRiskPremium", "diffRiskPremium", "score", "detail", "rank", "loanAvailable", "creditLoanAvailable", "VaRTF1", "VaRTF2", "basicReturn", "expectedProfit", "expectedRisk", "profitLoss")
        .join(crDf, joinKeys, "left")
        // Carry the last known balance rate forward per stock using the ff window
        // defined alongside riskPremiumDfTemp.
        .withColumn("balanceRateLoan", last(col("balanceRateLoan"), true).over(ff))
        .na.fill(0)
}

//     .withColumn("score", col("ReRank"))

In [None]:
"""
(
    riskPremiumDf
    .write.format("mongodb")
    .mode("append")
    .option("upsertDocument", "true")
    .option("idFieldList", "updateDate,stockCode")
    .option("spark.mongodb.read.connection.uri", mongoUrl)
    .option("spark.mongodb.write.connection.uri", mongoUrl)
    .option("database", "coreEngine")
    .option("collection", "RiskPremium")
    .save()
    )
"""