In [None]:
# Install the version-pinned PyPI packages through the notebook's SparkContext
# (`sc`), one call per package, in exactly the order listed here.
pinned_packages = (
    "pybind11==2.10.3",
    "numpy==1.19.0",
    "Pillow==8.2",
    "Cython==0.29.33",
    "scipy==1.2.0",
    "pythran==0.12.1",
    "pandas==1.0.0",
    "matplotlib==3.3.0",
    "lifelines==0.27.4",
    "s3fs==0.4.2",
    "boto3==1.26.59",
    "joblib==1.2.0",
)
for package_spec in pinned_packages:
    sc.install_pypi_package(package_spec)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from itertools import groupby
from operator import itemgetter

from lifelines import CoxPHFitter

import pandas as pd
import boto3
import os
import pickle
import tempfile
import joblib
import io
import numpy as np
import matplotlib.pyplot as plt

# MongoDB connection string used for every Spark read/write in this notebook.
# Prefer supplying it through the MONGO_URL environment variable so the real
# URI (which embeds credentials) never has to be committed with the notebook;
# the literal below is only the fallback when the variable is not set.
MongoUrl = os.environ.get("MONGO_URL", "mongodb+srv://xxxxxxxxxxxxxxxxxxxxxx/")

In [None]:
# Create the Spark session for this notebook, or reuse the one that already
# exists, with the core and executor-memory settings below.
spark = (
    SparkSession.builder
    .appName("FinancialSheets_ML_Training")
    .config("spark.cores.max", 6)
    .config("spark.executor.cores", 6)
    .config("spark.executor.memory", "36g")
    .getOrCreate()
)

In [None]:
# Load the coreEngine.ReportFeatures collection from MongoDB as a Spark
# DataFrame and discard Mongo's "_id" column.
df = (
    spark.read.format("mongodb")
    .option("spark.mongodb.read.connection.uri", MongoUrl)
    .option("spark.mongodb.write.connection.uri", MongoUrl)
    .option("database", "coreEngine")
    .option("collection", "ReportFeatures")
    .load()
    .drop("_id")
)

In [None]:
# Collect the whole Spark DataFrame onto the driver as a pandas DataFrame.
features = df.toPandas()

In [None]:
# Distinct updateDate values present in the source collection, ascending.
historicalDay = (
    df.select("updateDate")
    .distinct()
    .sort("updateDate")
)

In [None]:
# Keep only the update dates strictly after '20210506' (compared as strings).
# The original author also noted 20180101 next to this cutoff.
historicalDayList = [
    day_row[0]
    for day_row in historicalDay.toPandas().values
    if day_row[0] > '20210506'
]

In [None]:
# Normalise "event" to a boolean (missing -> False). This is written back into
# `features` itself, so both names see the converted column.
features["event"] = features["event"].fillna(0).astype(bool)

# Order rows by stock and receipt number, then keep only rows with period >= 1.0.
hf_features = features.sort_values(["stockCode", "rceptNo"])
hf_features = hf_features[hf_features["period"].ge(1.0)]

In [None]:
# Feature columns whose missing values are replaced with 0.
columns = [
    "CACL", "CATA", "CLCA", "CLGR", "CLTL", "DSR01", "DSR02", "DSR03",
    "DSR04", "DSR05", "DSR06", "DSR07", "EBTIN", "EQTA", "FAGR",
    "FFOEQ", "FFOTL", "INSL", "INTL", "LNSL", "LNTA", "MB", "NEGBE",
    "NIGR", "NISL", "RETA", "SLEQ", "SLFA", "TLEQ", "TLTA"]

# The fill value is the constant 0, so grouping by (bsnsYear, reprtCode) cannot
# change the result. The previous per-column groupby().apply() was therefore
# only overhead, and it had two side effects worth removing:
#   * rows whose bsnsYear/reprtCode is missing are excluded from the groups, so
#     index alignment on assignment turned their existing values into NaN;
#   * from pandas 2.0 on, apply() prepends the group keys to the result index,
#     which no longer aligns with hf_features.
# A single vectorised fillna avoids both. The copy detaches hf_features from
# the filtered slice it came from so the column assignment is unambiguous.
hf_features = hf_features.copy()
hf_features[columns] = hf_features[columns].fillna(0)

In [None]:
# S3 client for the bucket that stores the trained model files.
# Credentials should come from the standard AWS environment variables so that
# real keys are never stored in the notebook; the literals are only used as a
# fallback when the corresponding variable is not set.
s3 = boto3.client(
    "s3",
    region_name="ap-northeast-2",
    aws_access_key_id=os.environ.get(
        "AWS_ACCESS_KEY_ID", "xxxxxxxxxxxxxxxxxxxxxx"),
    aws_secret_access_key=os.environ.get(
        "AWS_SECRET_ACCESS_KEY", "xxxxxxxxxxxxxxxxxxxxxx"))

In [None]:
# Collect the keys of all pickled models under FinancialSheetsModel/.
#  * A paginator is used because a single list call returns at most 1000 keys.
#  * page.get("Contents", []) tolerates an empty prefix (no "Contents" entry).
#  * Keys are selected by their ".pkl" suffix instead of dropping the first
#    listing entry by position, so the result no longer depends on whether a
#    folder-placeholder object happens to be listed first.
model_files = [
    s3_object["Key"]
    for page in s3.get_paginator("list_objects_v2").paginate(
        Bucket="penta-engine", Prefix="FinancialSheetsModel/")
    for s3_object in page.get("Contents", [])
    if s3_object["Key"].endswith(".pkl")
]

In [None]:
# Zero-fill whatever is still missing, then drop any row that still holds a
# missing value afterwards.
hf_features = hf_features.fillna(0)
hf_features = hf_features.dropna()

# Store updateDate as a 32-bit integer so it can be compared with integer dates.
hf_features["updateDate"] = hf_features["updateDate"].astype("int32")

In [None]:
# Show the prepared feature frame; the bare expression is the cell's output.
hf_features

In [None]:
# One scored DataFrame per model file; concatenated in a later cell.
dfs = []

# Identifier columns. They are moved into the index before prediction so that
# the model only receives the remaining columns (minus "event").
id_columns = ["stockCode", "corpCls", "corpCode", "reprtCode", "rceptNo",
              "stockName", "updateDate", "bsnsYear", "quarter"]

for model_key in model_files:
    print(model_key)

    # NOTE: a leftover debugging line used to overwrite the loop variable with
    # "FinancialSheetsModel/20230530.pkl" here, which made every iteration
    # score the same model and filled `dfs` with identical frames. Each listed
    # model is now scored against its own date.

    # The 8 characters before ".pkl" at the end of the key are read as the
    # model's YYYYMMDD date.
    target_date = int(model_key[-12:][:8])

    # Latest row per stock with updateDate <= target_date, ordered by stock.
    df_test_set = (
        hf_features[hf_features["updateDate"] <= target_date]
        .sort_values(by="updateDate", ascending=False)
        .drop_duplicates()
        .groupby("stockCode")
        .head(1)
        .sort_values(by=["stockCode", "updateDate"], ascending=True)
        .reset_index(drop=True)
    )
    df_test_set = df_test_set.set_index(id_columns)

    # SECURITY: joblib.load unpickles the payload, which can execute arbitrary
    # code. Only load objects from a bucket whose contents are trusted.
    model_file = s3.get_object(Bucket="penta-engine", Key=model_key)
    bytes_stream = io.BytesIO(model_file['Body'].read())
    model = joblib.load(bytes_stream)
    hazard = model.predict_partial_hazard(df_test_set.drop('event', axis=1))

    # Attach the prediction, flatten the index back into columns and keep the
    # identifier columns plus the score and the event flag.
    df_test_set['riskPremium'] = hazard
    result = df_test_set.reset_index()
    result = result[id_columns + ['riskPremium', 'event']]
    result['event'] = result['event'].astype('int')
    # Stamp every row with the model date (as a string), replacing the row's
    # own updateDate.
    result['updateDate'] = str(target_date)
    dfs.append(result)
    

In [None]:
# Ad-hoc scoring of one model against one fixed cut-off date. The result stays
# in `result` for inspection and is deliberately not appended to `dfs`.
i = "FinancialSheetsModel/20230504.pkl"
target_date = 20210506

key_columns = ["stockCode", "corpCls", "corpCode", "reprtCode", "rceptNo",
               "stockName", "updateDate", "bsnsYear", "quarter"]

# Latest row per stock with updateDate <= target_date, ordered by stock.
on_or_before = hf_features[hf_features["updateDate"] <= target_date]
newest_first = on_or_before.sort_values(by="updateDate", ascending=False).drop_duplicates()
latest_per_stock = newest_first.groupby("stockCode").head(1)
df_test_set = (
    latest_per_stock
    .sort_values(by=["stockCode", "updateDate"], ascending=True)
    .reset_index(drop=True)
)
df_test_set = df_test_set.set_index(key_columns)

# Fetch the pickled model from S3 and deserialize it from memory.
model_file = s3.get_object(Bucket="penta-engine", Key=i)
bytes_stream = io.BytesIO(model_file['Body'].read())
model = joblib.load(bytes_stream)
hazard = model.predict_partial_hazard(df_test_set.drop('event', axis=1))

# Attach the prediction and flatten back to ordinary columns.
df_test_set['riskPremium'] = hazard
result = df_test_set.reset_index()
result = result[key_columns + ['riskPremium', 'event']]
result['event'] = result['event'].astype('int')
result['updateDate'] = str(target_date)
# dfs.append(result)


In [None]:
# First 30 rows of `result` after sorting by riskPremium in ascending order.
result.sort_values(by="riskPremium").iloc[:30]

In [None]:
# Stack all per-model results into one frame with a fresh 0..n-1 index.
pdf = pd.concat(dfs, ignore_index=True)

In [None]:
# Recode corpCls: values equal to 0 become 'K', every other value becomes 'Y'.
pdf['corpCls'] = np.where(pdf['corpCls'] == 0, 'K', 'Y')

In [None]:
# Convert the combined pandas frame into a Spark DataFrame.
sdf = spark.createDataFrame(pdf)

In [None]:
# Spot check: print the scored rows for stock code "005930".
sdf.filter(sdf["stockCode"] == "005930").show()

In [None]:
"""
(
    sdf
    .write.format("mongodb")
    .mode("append")
    .option("upsertDocument", "true")
    .option("idFieldList", "updateDate,stockCode,rceptNo")
    .option("spark.mongodb.read.connection.uri", MongoUrl)
    .option("spark.mongodb.write.connection.uri", MongoUrl)
    .option("database", "coreEngine")
    .option("collection", "ReportRiskPremium")
    .save()
    )
    """

In [None]:
# Number of rows in the scored Spark DataFrame. The trailing figure is the
# author's note, presumably the count from an earlier run (TODO confirm).
sdf.count() # 1498888