In [None]:
# Notebook-scoped dependencies, pinned to exact versions and installed in the
# order listed.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably it is
# the SparkContext injected by the kernel — confirm.
_PINNED_PACKAGES = (
    "pybind11==2.10.3",
    "numpy==1.19.0",
    "Pillow==8.2",
    "Cython==0.29.33",
    "scipy==1.2.0",
    "pythran==0.12.1",
    "pandas==1.0.0",
    "matplotlib==3.3.0",
    "lifelines==0.27.4",
    "s3fs==0.4.2",
    "boto3==1.26.59",
    "joblib==1.2.0",
)

for _requirement in _PINNED_PACKAGES:
    sc.install_pypi_package(_requirement)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from itertools import groupby
from operator import itemgetter

from lifelines import CoxPHFitter
from scipy.optimize import minimize

import pandas as pd
import boto3
import os
import pickle
import tempfile
import joblib
import io
import numpy as np
import matplotlib.pyplot as plt
import sys

# MongoDB connection URI used by every Spark read/write in this notebook.
# The MONGO_URL environment variable takes precedence so that credentials do
# not have to live in the notebook source; the original literal is kept only
# as a backward-compatible fallback when the variable is not set.
MongoUrl = os.environ.get("MONGO_URL", "mongodb+srv://xxxxxxxxxxxxxxxxxxxxxx/")

In [None]:
# Build the Spark session for this notebook, or reuse one that already exists.
# NOTE(review): if a session already exists, getOrCreate() returns it and the
# core/memory settings below may not take effect — confirm on the target cluster.
spark = (
    SparkSession.builder
    .appName("FinancialSheets_ML_Training")
    .config("spark.cores.max", 6)
    .config("spark.executor.cores", 6)
    .config("spark.executor.memory", "36g")
    .getOrCreate()
)

In [None]:
# Reader for the coreEngine.ReportRiskPremium collection.
_risk_premium_reader = (
    spark.read.format("mongodb")
    .option("spark.mongodb.read.connection.uri", MongoUrl)
    .option("spark.mongodb.write.connection.uri", MongoUrl)
    .option("database", "coreEngine")
    .option("collection", "ReportRiskPremium")
)

# Load the collection and discard Mongo's `_id` column.
reportRiskPremiumDf = _risk_premium_reader.load().drop("_id")

In [None]:


# Each stock's rows in updateDate order; used to fetch the price 90 rows back.
_price_history = Window.partitionBy('stockCode').orderBy(['updateDate'])
# The current row plus up to 90 preceding rows of the same stock.
_trailing_rows = Window.partitionBy('stockCode').orderBy('updateDate').rowsBetween(-90, 0)

# Per-stock price features derived from coreEngine.Price:
#   r_s         0 when (price 90 rows ago / current price) >= 1, otherwise that ratio minus 1
#   r           constant 0.07
#   recoveryFN1 constant 0.5
#   recoveryFN2 constant 0.1
#   priceEvent  1 if any row in the trailing window has r_s < -0.2, else 0
#
# NOTE(review): the ratio is lagged / current, so r_s is negative only when the
# price has RISEN, and `r_s < -0.2` fires on a rise of more than 25%. If the
# intent is to flag a price DROP of more than 20%, the ratio should be
# current / lagged — confirm with the model owner.
priceDf = (
    spark.read.format("mongodb")
    .option("spark.mongodb.read.connection.uri", MongoUrl)
    .option("spark.mongodb.write.connection.uri", MongoUrl)
    .option("database", "coreEngine")
    .option("collection", "Price")
    .load()
    .drop("_id")
    .select("stockCode", "updateDate", "closingPrice")
    # Closing price of the same stock 90 rows earlier (null for the first 90 rows).
    .withColumn('rolling', F.lag(F.col('closingPrice'), offset=90).over(_price_history))
    .withColumn('closingPriceRolling', F.col('rolling') / F.col('closingPrice'))
    .withColumn('r_s', F.when(F.col('closingPriceRolling') >= 1, 0).otherwise(F.col('closingPriceRolling') - 1))
    .withColumn('r', F.lit(0.07))
    .withColumn('recoveryFN1', F.lit(0.5))
    .withColumn('recoveryFN2', F.lit(0.1))
    # Row-level flag, then "did any flag occur in the trailing window", then back to 0/1.
    .withColumn('priceEvent', F.when(F.col('r_s') < -0.2, 1).otherwise(0))
    .withColumn('priceEvent', F.sum(F.col('priceEvent')).over(_trailing_rows))
    .withColumn('priceEvent', F.when(F.col('priceEvent') > 0, 1).otherwise(0))
    .drop('closingPrice', 'closingPriceRolling', 'rolling')
    # Rows without a usable lagged price have a null r_s and are removed.
    .where(F.col("r_s").isNotNull())
)

# Disabled: an aggregation-pipeline date filter that could be added to the reader above.
# matchPipe = "{ $match: { updateDate: { $gte: '20200101' } } }"
#     .option("aggregation.pipeline", matchPipe) \
    

In [None]:
# Preview the first 20 rows (the defaults, spelled out). This is a Spark action,
# so it runs the Mongo read and the window computations defined for priceDf.
priceDf.show(n=20, truncate=True)

In [None]:
# Each stock's rows in updateDate order. With an ordered window and no explicit
# frame, the aggregates below run from the start of the partition up to the
# current updateDate, i.e. they are running totals.
_stock_history = Window.partitionBy('stockCode').orderBy(['updateDate'])
# Ranking of riskPremium within each (bsnsYear, quarter) group.
_period_ranking = Window.partitionBy("bsnsYear", "quarter").orderBy("riskPremium")

# Running share of rows with `event` / `priceEvent` set, per stock.
_event_rate = F.sum(F.col('event')).over(_stock_history) / F.count(F.col('event')).over(_stock_history)
_price_event_rate = F.sum(F.col('priceEvent')).over(_stock_history) / F.count(F.col('priceEvent')).over(_stock_history)

# Attach price features to the risk-premium report (left join: report rows with
# no matching price row keep nulls in the price columns) and derive:
#   TF1 / TF2        running event / priceEvent rate per stock
#   FSPctRank        percent rank of riskPremium within (bsnsYear, quarter)
#   expectedProfit   100,000,000 * 90/365 * r
#   expectedLossFN1  expectedProfit * recoveryFN1 * TF1
#   expectedLossFN2  expectedProfit * recoveryFN2 * TF2
reportRiskPremiumDfPriceDf = (
    reportRiskPremiumDf
    .join(priceDf, on=["stockCode", "updateDate"], how="left")
    .withColumn('TF1', _event_rate)
    .withColumn('TF2', _price_event_rate)
    .withColumn('FSPctRank', F.percent_rank().over(_period_ranking))
    .withColumn("expectedProfit", F.lit(100000000 * 90/365) * F.col("r"))
    .withColumn("expectedLossFN1", F.col("expectedProfit") * F.col("recoveryFN1") * F.col("TF1"))
    .withColumn("expectedLossFN2", F.col("expectedProfit") * F.col("recoveryFN2") * F.col("TF2"))
)


In [None]:
# Collect the joined Spark frame into a pandas DataFrame.
pdf = reportRiskPremiumDfPriceDf.toPandas()

# One-column frame holding each distinct updateDate once.
periodDf = pdf.loc[:, ['updateDate']].drop_duplicates()

In [None]:
def get_threshold(prob, df):
    """Score a set of predictions as a ratio of the total expected profit.

    Parameters
    ----------
    prob : pandas.Series of bool
        True where a row is predicted "bad", False where predicted "good".
        Must be aligned row-for-row with ``df``.
    df : pandas.DataFrame
        Must contain ``event``, ``priceEvent``, ``expectedProfit``,
        ``expectedLossFN1`` and ``expectedLossFN2``. Missing values are
        treated as 0.

    Returns
    -------
    float
        ``(x1 - x2 - x4 + x5) / sum(expectedProfit)`` where x1, x2, x4, x5 are
        the amounts summed over true negatives, false negatives, true
        positives and false positives respectively.

    NOTE(review): tn + fp covers every row with ``event == False`` and
    fn1 + tp covers every row with ``event == True``, so as written the result
    does not depend on ``prob``. ``x3`` (the priceEvent false-negative loss) is
    computed but never used. The intended formula should be confirmed.
    """
    filled = df.fillna(0)

    predicted_good = (prob == False).values
    predicted_bad = (prob == True).values
    actual_good = (filled['event'] == False).values
    actual_bad = (filled['event'] == True).values
    price_bad = (filled['priceEvent'] == True).values

    tn = actual_good * predicted_good   # actually good | predicted good
    fn1 = actual_bad * predicted_good   # actually bad  | predicted good
    fn2 = price_bad * predicted_good    # priceEvent ("adjusted price 20") bad | predicted good
    tp = actual_bad * predicted_bad     # actually bad  | predicted bad
    fp = actual_good * predicted_bad    # actually good | predicted bad

    # Dot products pick out and sum the amounts for each confusion-matrix cell.
    x1 = filled['expectedProfit'] @ tn
    x2 = filled['expectedLossFN1'] @ fn1
    x3 = filled['expectedLossFN2'] @ fn2  # unused below; see NOTE(review) above
    x4 = filled['expectedLossFN1'] @ tp
    x5 = filled['expectedProfit'] @ fp

    total_expected_profit = sum(filled['expectedProfit'])
    return (x1 - x2 - x4 + x5) / total_expected_profit

In [None]:
# Display the distinct update dates strictly after '20210101', in ascending order.
sorted(row[0] for row in periodDf.values if row[0] > '20210101')

In [None]:
# Fit one FSPctRank cut-off per update date strictly after '20210101' and label
# each row with it.
# NOTE(review): the comparison is strict, and the other fitting cell uses
# `< '20210101'`, so rows dated exactly '20210101' are handled by neither.
# NOTE(review): get_threshold() as written returns the same value for any
# prediction vector, so Nelder-Mead sees a flat objective here and the fitted
# value stays close to the 0.3 starting point — confirm the intended metric.
threshold_dfs = []

for period in sorted(row[0] for row in periodDf.values if row[0] > '20210101'):
    tmp_df = pdf[pdf['updateDate'] == period].reset_index()

    def negative_return(x):
        """Objective for the minimiser: minus the score of the cut-off x[0]."""
        return -get_threshold(prob=(tmp_df['FSPctRank'] > x[0]), df=tmp_df)

    res = minimize(negative_return, 0.3, method='nelder-mead', options={'disp': True})
    threshold = res.x[0]

    tmp_df['threshold'] = threshold
    # predict == 1 marks rows at or above the fitted cut-off.
    tmp_df['predict'] = (tmp_df['FSPctRank'] >= tmp_df['threshold']).astype(int)
    threshold_dfs.append(tmp_df)

threshold_df = pd.concat(threshold_dfs)

In [None]:
# Grade every row of each per-date frame, price it, and upsert the result into
# coreEngine.ReportRiskPremiumCalcurate keyed on (updateDate, stockCode).
#
# Fix: expectedRisk previously added VaRTF1 to itself and never used VaRTF2;
# it is now VaRTF1 + VaRTF2.
# NOTE(review): where TF2 is missing (no price rows matched in the left join)
# VaRTF2, and therefore expectedRisk and profitLoss, will be missing too.

# Loop-invariant column expressions.
_rank = F.col("FSPctRank")
_grade = F.col("grade")

# Grade precedence (same as the original override chain): the rank bands win
# over the model prediction, so predict == 1 only yields "F" when rank > 0.40.
_grade_expr = (
    F.when(_rank <= 0.10, F.lit("A"))
    .when(_rank <= 0.20, F.lit("B"))
    .when(_rank <= 0.40, F.lit("C"))
    .when(F.col("predict") >= 1, F.lit("F"))
    .otherwise(F.lit("D"))
)

# Rate applied per grade; grade "A" takes the 0.07 default.
_rate_expr = (
    F.when(_grade == "B", 0.075)
    .when(_grade == "C", 0.085)
    .when(_grade == "D", 0.095)
    .when(_grade == "F", 0.1)
    .otherwise(0.07)
)

# Per-grade constant from which FSPctRank is subtracted to scale basicReturn.
_margin_expr = (
    F.when(_grade == "A", F.lit(0.9))
    .when(_grade == "B", F.lit(0.8))
    .when(_grade == "C", F.lit(0.7))
    .when(_grade == "D", F.lit(0.5))
    .otherwise(F.lit(0.0))  # grade "F", the only remaining value
)

for tdf in threshold_dfs:
    sdf = (
        spark.createDataFrame(tdf)
        .withColumn("grade", _grade_expr)
        .withColumn("TT", _rate_expr)
        # Rate * 100,000,000 * 90/365, truncated to an integer before scaling.
        .withColumn("basicReturn", F.col("TT") * F.lit((90.0 / 365.0) * 100000000.0))
        .withColumn("basicReturn", F.col("basicReturn").cast('Integer') * (_margin_expr - _rank))
        .withColumn("VaRTF1", F.col("TF1") * F.lit((90.0 / 365.0) * (100000000.0 * 0.5 * 0.08)))
        .withColumn("VaRTF2", F.col("TF2") * F.lit((90.0 / 365.0) * (100000000.0 * 0.1 * 0.08)))
        .withColumn("expectedRisk", F.col("VaRTF1") + F.col("VaRTF2"))
        .withColumn("profitLoss", F.col("basicReturn") - F.col("expectedRisk"))
        # 1 when the row ranks strictly below the fitted cut-off (i.e. predict == 0).
        .withColumn("loanAvailable", F.when(_rank < F.col("threshold"), 1).otherwise(0))
        .drop("index", "TT", "TF1", "TF2", "r_s", "r", "priceEvent", "recoveryFN1", "recoveryFN2", "expectedLossFN1", "expectedLossFN2")
    )

    # Append with upsert on (updateDate, stockCode).
    (
        sdf.write.format("mongodb")
        .mode("append")
        .option("upsertDocument", "true")
        .option("idFieldList", "updateDate,stockCode")
        .option("spark.mongodb.read.connection.uri", MongoUrl)
        .option("spark.mongodb.write.connection.uri", MongoUrl)
        .option("database", "coreEngine")
        .option("collection", "ReportRiskPremiumCalcurate")
        .save()
    )

In [None]:
# Fit one FSPctRank cut-off per update date strictly before '20210101' and label
# each row with it. This rebinds `threshold_dfs` and `threshold_df`.
# NOTE(review): the comparison is strict, and the other fitting cell uses
# `> '20210101'`, so rows dated exactly '20210101' are handled by neither.
# NOTE(review): get_threshold() as written returns the same value for any
# prediction vector, so Nelder-Mead sees a flat objective here and the fitted
# value stays close to the 0.3 starting point — confirm the intended metric.
threshold_dfs = []

for period in sorted(row[0] for row in periodDf.values if row[0] < '20210101'):
    tmp_df = pdf[pdf['updateDate'] == period].reset_index()

    def negative_return(x):
        """Objective for the minimiser: minus the score of the cut-off x[0]."""
        return -get_threshold(prob=(tmp_df['FSPctRank'] > x[0]), df=tmp_df)

    res = minimize(negative_return, 0.3, method='nelder-mead', options={'disp': True})
    threshold = res.x[0]

    tmp_df['threshold'] = threshold
    # predict == 1 marks rows at or above the fitted cut-off.
    tmp_df['predict'] = (tmp_df['FSPctRank'] >= tmp_df['threshold']).astype(int)
    threshold_dfs.append(tmp_df)

threshold_df = pd.concat(threshold_dfs)

In [None]:
# Grade and price every row of each per-date frame currently in threshold_dfs.
#
# Fix: expectedRisk previously added VaRTF1 to itself and never used VaRTF2;
# it is now VaRTF1 + VaRTF2.
# NOTE(review): where TF2 is missing (no price rows matched in the left join)
# VaRTF2, and therefore expectedRisk and profitLoss, will be missing too.

# Loop-invariant column expressions.
_rank = F.col("FSPctRank")
_grade = F.col("grade")

# Grade precedence (same as the original override chain): the rank bands win
# over the model prediction, so predict == 1 only yields "F" when rank > 0.40.
_grade_expr = (
    F.when(_rank <= 0.10, F.lit("A"))
    .when(_rank <= 0.20, F.lit("B"))
    .when(_rank <= 0.40, F.lit("C"))
    .when(F.col("predict") >= 1, F.lit("F"))
    .otherwise(F.lit("D"))
)

# Rate applied per grade; grade "A" takes the 0.07 default.
_rate_expr = (
    F.when(_grade == "B", 0.075)
    .when(_grade == "C", 0.085)
    .when(_grade == "D", 0.095)
    .when(_grade == "F", 0.1)
    .otherwise(0.07)
)

# Per-grade constant from which FSPctRank is subtracted to scale basicReturn.
_margin_expr = (
    F.when(_grade == "A", F.lit(0.9))
    .when(_grade == "B", F.lit(0.8))
    .when(_grade == "C", F.lit(0.7))
    .when(_grade == "D", F.lit(0.5))
    .otherwise(F.lit(0.0))  # grade "F", the only remaining value
)

for tdf in threshold_dfs:
    sdf = (
        spark.createDataFrame(tdf)
        .withColumn("grade", _grade_expr)
        .withColumn("TT", _rate_expr)
        # Rate * 100,000,000 * 90/365, truncated to an integer before scaling.
        .withColumn("basicReturn", F.col("TT") * F.lit((90.0 / 365.0) * 100000000.0))
        .withColumn("basicReturn", F.col("basicReturn").cast('Integer') * (_margin_expr - _rank))
        .withColumn("VaRTF1", F.col("TF1") * F.lit((90.0 / 365.0) * (100000000.0 * 0.5 * 0.08)))
        .withColumn("VaRTF2", F.col("TF2") * F.lit((90.0 / 365.0) * (100000000.0 * 0.1 * 0.08)))
        .withColumn("expectedRisk", F.col("VaRTF1") + F.col("VaRTF2"))
        .withColumn("profitLoss", F.col("basicReturn") - F.col("expectedRisk"))
        # 1 when the row ranks strictly below the fitted cut-off (i.e. predict == 0).
        .withColumn("loanAvailable", F.when(_rank < F.col("threshold"), 1).otherwise(0))
        .drop("index", "TT", "TF1", "TF2", "r_s", "r", "priceEvent", "recoveryFN1", "recoveryFN2", "expectedLossFN1", "expectedLossFN2")
    )

# The Mongo write for this batch is disabled: the triple-quoted text below is a
# bare string expression, not executed code, so no Spark action runs on `sdf`
# in this cell.
"""
    sdf \
        .write.format("mongodb") \
        .mode("append") \
        .option("upsertDocument", "true") \
        .option("idFieldList", "updateDate,stockCode") \
        .option("spark.mongodb.read.connection.uri", MongoUrl) \
        .option("spark.mongodb.write.connection.uri", MongoUrl) \
        .option("database", "coreEngine") \
        .option("collection", "ReportRiskPremiumCalcurate") \
        .save()
        """