In [None]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
import scala.util.{Failure, Success, Try}
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession

In [None]:
// Spark session for the "Beta90D" application. Resource settings are pinned
// here: spark.cores.max = 4, spark.executor.cores = 4, spark.executor.memory = 36g.
val spark = {
    val namedBuilder = SparkSession.builder().appName("Beta90D")
    val sizedBuilder = namedBuilder
        .config("spark.cores.max", 4)
        .config("spark.executor.cores", 4)
        .config("spark.executor.memory", "36g")
    // getOrCreate() hands back an existing session when one is already active.
    sizedBuilder.getOrCreate()
}

// Options map with header = "true"; presumably meant for CSV readers, but it is
// not referenced by any of the visible cells -- TODO confirm it is still needed.
val csv_options = Map("header" -> "true")

In [None]:
// MongoDB connection string used by every read (and the disabled write) below.
val mongoUrl = "mongodb+srv://xxxxxxxxxxxxxxxxxxxxxx/"

// Per-stock closing prices loaded from the coreEngine.Price collection.
//   - the aggregation pipeline keeps documents with updateDate >= '20230101'
//   - closingPrice is cast to an integer
//   - market is derived from classify: 0 for "KOSDAQ", 1 for anything else
val stockPrice = {
    val priceReader = spark.read.format("mongodb")
        .option("spark.mongodb.read.connection.uri", mongoUrl)
        .option("spark.mongodb.write.connection.uri", mongoUrl)
        .option("database", "coreEngine")
        .option("aggregation.pipeline", "{ $match: { updateDate: { $gte: '20230101'} } }")
        .option("collection", "Price")
    val rawPrices = priceReader.load()

    val closingPriceAsInt = col("closingPrice").cast(IntegerType)
    val marketFlag = when(col("classify") === "KOSDAQ", 0).otherwise(1)

    rawPrices
        .withColumn("closingPrice", closingPriceAsInt)
        .withColumn("market", marketFlag)
        .select("stockCode", "stockFullName", "market", "closingPrice", "updateDate")
}

In [None]:
// Index closing levels loaded from the coreEngine.IndexPrice collection.
//   - only rows whose idxNm is one of the two index names listed below are kept
//   - clsprcIdx is cast to a float
//   - market is re-coded to 0 when the stored code is "03" and 1 otherwise,
//     the same 0/1 column stockPrice exposes, so the two can be joined on it
val indexPrice = {
    val indexDocs = spark.read.format("mongodb")
        .option("spark.mongodb.read.connection.uri", mongoUrl)
        .option("spark.mongodb.write.connection.uri", mongoUrl)
        .option("database", "coreEngine")
        .option("collection", "IndexPrice")
        .load()

    val trackedIndexNames = Seq("코스닥지수", "코스피")
    val closeAsFloat = col("clsprcIdx").cast(FloatType)
    val marketFlag = when(col("market") === "03", 0).otherwise(1)

    indexDocs
        .where(col("idxNm").isin(trackedIndexNames: _*))
        .withColumn("clsprcIdx", closeAsFloat)
        .withColumn("market", marketFlag)
        .select("clsprcIdx", "market", "updateDate")
}

In [None]:
// Rows of one stock in date order. updateDate is a yyyyMMdd string (it is parsed
// with that pattern below), so ordering by the string is chronological.
val partition = Window.partitionBy("stockCode").orderBy("updateDate")
// Trailing window: the current row plus the 90 rows before it (up to 91 rows).
val w_90 = partition.rowsBetween(-90, 0)

// Rolling beta of every stock against its market index.
//
// Steps:
//   1. Join index rows to stock rows on (updateDate, market). This is a left
//      join from the index side, so index rows without a matching stock keep a
//      null stockCode; their stock return is null and they fall out at the
//      final `beta IS NOT NULL` filter.
//   2. Derive calendar columns from updateDate (weekOfDay holds dayofweek()).
//   3. Compute simple returns (current - previous) / previous for the index and
//      the stock, where "previous" is the preceding row of the same stock.
//   4. beta = cov(stock return, index return) / var(index return) over w_90.
//   5. Keep rows dated 20230630 or later that have a non-null beta.
//
// The result is cached.
val beta = (
    indexPrice.join(stockPrice, Seq("updateDate", "market"), "left")
    .withColumn("updateDateTimestamp", to_timestamp(col("updateDate"), "yyyyMMdd"))
    .withColumn("weekOfYear", weekofyear(col("updateDateTimestamp")))
    .withColumn("weekOfDay", dayofweek(col("updateDateTimestamp")))
    .withColumn("year", year(col("updateDateTimestamp")))
    .withColumn("clsprcIdxLag", lag(col("clsprcIdx"), 1).over(partition))
    .withColumn("closingPriceLag", lag(col("closingPrice"), 1).over(partition))
    .withColumn("clsprcIdxReturn", (col("clsprcIdx") - col("clsprcIdxLag")) / col("clsprcIdxLag"))
    .withColumn("closingPriceReturn", (col("closingPrice") - col("closingPriceLag")) / col("closingPriceLag"))
    // The first row of each partition has no lag, hence no return; drop it
    // before the rolling statistics are computed.
    .where(col("clsprcIdxReturn").isNotNull)
    .withColumn("idxVariance", var_samp(col("clsprcIdxReturn")).over(w_90))
    // Sample covariance, so numerator and denominator share the same (n - 1)
    // normalisation. Dividing a population covariance by a sample variance
    // would scale every beta by (n - 1) / n.
    .withColumn("idxStockVariance", covar_samp(col("closingPriceReturn"), col("clsprcIdxReturn")).over(w_90))
    .withColumn("beta", col("idxStockVariance") / col("idxVariance"))
    .select("updateDate", "market", "stockCode", "stockFullName", "year", "weekOfYear", "weekOfDay", "clsprcIdx", "closingPrice", "beta")
    .where(col("updateDate") >= "20230630")
    .where(col("beta").isNotNull)
    ).cache()

In [None]:
// Disabled write step. The whole statement sits inside a triple-quoted string
// literal, so it is not executed. If the quotes are removed it would take the
// rows of `beta` with updateDate after "20230630" and write them to the
// coreEngine.PriceBetaCalcurate collection in append mode, with
// upsertDocument = true and (updateDate, stockCode) as the id field list.
"""
(
    beta.where(col("updateDate") > "20230630")
    .write.format("mongodb")
    .mode("append")
    .option("upsertDocument", "true")
    .option("idFieldList", "updateDate,stockCode")
    .option("spark.mongodb.read.connection.uri", mongoUrl)
    .option("spark.mongodb.write.connection.uri", mongoUrl)
    .option("database", "coreEngine")
    .option("collection", "PriceBetaCalcurate")
    .save()
    )
    """

In [None]:
// Shut down the Spark session created at the top of the notebook. DataFrames
// built from it (including the cached `beta`) cannot be evaluated afterwards.
spark.stop()