In [None]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
import scala.util.{Failure, Success, Try}
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession

import java.time.LocalDateTime
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import java.time.temporal.ChronoUnit.DAYS

In [None]:
// Application name is the launch timestamp formatted as yyyyMMddHHmmss.
val appName = DateTimeFormatter.ofPattern("yyyyMMddHHmmss").format(LocalDateTime.now())

// Build (or reuse, via getOrCreate) the session with the resource settings below.
val spark = {
    val sessionBuilder = SparkSession.builder().appName(appName)
    sessionBuilder
        .config("spark.cores.max", 8)
        .config("spark.executor.cores", 8)
        .config("spark.executor.memory", "36g")
        .getOrCreate()
}

In [None]:
// MongoDB connection string passed to both the read and write connector options
// in the cells below. NOTE(review): the host/credential part looks redacted here —
// confirm the real value is supplied from outside the notebook.
val mongoUrl: String = "mongodb+srv://xxxxxxxxxxxxxxxxxxxxxx/"

In [None]:
// Load the coreEngine.News collection through the "mongodb" data source and
// discard the `_id` column.
// Original note: createdAt 20230508 (presumably an example of the createdAt
// value format — confirm against the collection).
val df = {
    val newsSourceOptions = Map(
        "spark.mongodb.read.connection.uri" -> mongoUrl,
        "spark.mongodb.write.connection.uri" -> mongoUrl,
        "database" -> "coreEngine",
        "collection" -> "News"
    )
    val rawNews = spark.read.format("mongodb").options(newsSourceOptions).load()
    rawNews.drop("_id")
}

In [None]:
// Print the schema of the loaded News DataFrame so the nested fields used by the
// aggregation cell (stockCode, sentimentContent, newsId, createdAt) can be checked.
df.printSchema()

In [None]:
// Daily negative-news "risk premium" per stock code.
//
// `stockCode` and `sentimentContent` are both exploded, so every output count is a
// count of (stock code, sentimentContent element) rows. The names suggest one
// sentimentContent element per paragraph — TODO confirm against the News schema.
//
// Output columns:
//   updateDate          - createdAt, renamed
//   stockCode           - a single exploded stock code
//   wholeParagraph      - rows for this (createdAt, stockCode), any label
//   riskParagraph       - rows for this (createdAt, stockCode) labelled "negative"
//   documentRiskPremium - riskParagraph / wholeParagraph
//   riskPremium         - riskParagraph / sum of wholeParagraph over the emitted
//                         stocks of the same createdAt
//
// Only (createdAt, stockCode) pairs with at least one "negative" row are emitted.
val ns = {
    val perStockDay = Window.partitionBy("createdAt", "stockCode")
    val perDay = Window.partitionBy("createdAt")
    val isNegative = col("label") === "negative"

    df
    .withColumn("stockCode", explode(col("stockCode")))
    .withColumn("sentimentContent", explode(col("sentimentContent")))
    .withColumn("label", col("sentimentContent.sentiment.label"))
    .withColumn("wholeParagraph", count("stockCode").over(perStockDay))
    // Count negative rows directly per (createdAt, stockCode). Counting per newsId and
    // then de-duplicating without newsId merges articles that happen to have the same
    // negative count, which undercounts riskParagraph.
    .withColumn("riskParagraph", count(when(isNegative, col("stockCode"))).over(perStockDay))
    .where(isNegative)
    .select("createdAt", "stockCode", "wholeParagraph", "riskParagraph")
    // Collapse to one row per (createdAt, stockCode); both counts are constant within it.
    .distinct()
    .withColumn("documentRiskPremium", col("riskParagraph") / col("wholeParagraph"))
    // The denominator only covers stocks that survived the negative filter above.
    .withColumn("riskPremium", col("riskParagraph") / sum("wholeParagraph").over(perDay))
    .withColumnRenamed("createdAt", "updateDate")
    .orderBy("updateDate", "stockCode")
}

In [None]:
// Display a sample of the computed risk-premium rows for a visual check.
ns.show()

In [None]:
// Disabled write step. The code below sits inside a triple-quoted String literal,
// so it is not executed; the cell only evaluates to that String.
// If the quotes are removed, the enclosed code fills nulls in `ns` with 0 and appends
// to coreEngine.NewsRiskPremium with upsertDocument=true and
// idFieldList=updateDate,stockCode (presumably upserting on those two fields —
// confirm against the connector's write options).
"""
(
    ns
    .na.fill(0)
    .write.format("mongodb")
    .mode("append")
    .option("upsertDocument", "true")
    .option("idFieldList", "updateDate,stockCode")
    .option("spark.mongodb.read.connection.uri", mongoUrl)
    .option("spark.mongodb.write.connection.uri", mongoUrl)
    .option("database", "coreEngine")
    .option("collection", "NewsRiskPremium")
    .save()
    )
"""