In [None]:
# Pinned packages, installed one by one through `sc.install_pypi_package`.
# The order is preserved from the original cell (presumably build-time
# dependencies come first — TODO confirm before reordering).
pinned_packages = [
    "pybind11==2.10.3",
    "numpy==1.19.0",
    "Pillow==8.2",
    "Cython==0.29.33",
    "scipy==1.2.0",
    "pythran==0.12.1",
    "pandas==1.0.0",
    "matplotlib==3.3.0",
    "lifelines==0.27.4",
    "s3fs==0.4.2",
    "boto3==1.26.59",
    "joblib==1.2.0",
]
for package_spec in pinned_packages:
    sc.install_pypi_package(package_spec)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from itertools import groupby
from operator import itemgetter

from lifelines import CoxPHFitter

import pandas as pd
import boto3
import os
import pickle
import tempfile
import joblib
import io
import numpy as np
import matplotlib.pyplot as plt

# MongoDB connection URI used for both the read and the write options below.
# Taken from the MONGO_URL environment variable when it is set, so the secret
# does not have to be stored in the notebook; otherwise the value that was
# previously hardcoded here is used unchanged.
MongoUrl = os.environ.get("MONGO_URL", "mongodb+srv://xxxxxxxxxxxxxxxxxxxxxx/")

1. 학습된 Cox 비례위험(CoxPH) 모델을 S3에서 불러온다.
2. 테스트 데이터셋으로 예측을 수행한다.
3. 예측 결과(partial hazard)를 risk premium(`riskPremium`)이라고 명명한다.
4. 결과를 MongoDB의 `ReportRiskPremium_New` 컬렉션에 저장한다.

In [None]:
# Obtain the Spark session via getOrCreate() with the application name and
# core/memory settings used by this notebook.
spark = (
    SparkSession.builder
    .appName("FinancialSheets_ML_Training")
    .config("spark.cores.max", 6)
    .config("spark.executor.cores", 6)
    .config("spark.executor.memory", "36g")
    .getOrCreate()
)

In [None]:
# Read the coreEngine.ReportFeatures collection through the MongoDB Spark
# source, keeping only documents whose updateDate is greater than '20230530'
# (the filter is passed to the connector as an aggregation pipeline stage),
# then drop the Mongo-generated `_id` column.
mongo_read_options = {
    "spark.mongodb.read.connection.uri": MongoUrl,
    "spark.mongodb.write.connection.uri": MongoUrl,
    "database": "coreEngine",
    "collection": "ReportFeatures",
    "aggregation.pipeline": "{ $match: { updateDate: { $gt: '20230530' } } }",
}

feature_reader = spark.read.format("mongodb")
for option_name, option_value in mongo_read_options.items():
    feature_reader = feature_reader.option(option_name, option_value)

df = feature_reader.load().drop("_id")
# df.show()

In [None]:
# Convert the Spark DataFrame into a pandas DataFrame for the pandas/lifelines
# steps that follow.
features = df.toPandas()

In [None]:
# Replace every missing value in the frame with 0 (applies to all columns).
features = features.fillna(0)

In [None]:
# Build the modelling frame from `features`:
#   * cast the two event flags to bool (missing values count as False),
#   * order rows by stockCode and rceptNo,
#   * keep only rows with period >= 1.0.
# `assign` returns a new frame, so `features` itself is no longer modified
# through an alias, and the trailing `.copy()` makes `hf_features` an
# independent frame so later column assignments do not raise
# SettingWithCopyWarning.
hf_features = (
    features
    .assign(
        event=lambda frame: frame["event"].fillna(0).astype(bool),
        plbtEvent=lambda frame: frame["plbtEvent"].fillna(0).astype(bool),
    )
    .sort_values(["stockCode", "rceptNo"])
    .loc[lambda frame: frame["period"] >= 1.0]
    .copy()
)

In [None]:
# Feature columns whose missing values are replaced with 0.
columns = [
    "DSR01", "DSR02", "DSR03", "DSR04", "DSR05",
    # Currently excluded:
    # "FFOEQ", "FFOTL", "CACL", "EQTA", "INSL", "CATA", "TLTA", "INTL",
    # "MB", "NIGR", "FAGR", "EBTIN", "CLCA", "NEGBE", "CLGR",
    "CLTL", "EBTIN2", "INTL2", "LNSL", "LNTA", "MB2", "NIGR2", "NISL", "RETA",
    "SLEQ", "SLFA", "TLEQ",
]

# The previous per-column groupby raised KeyError for an absent column; keep
# that fail-fast behaviour, because fillna with a mapping silently ignores
# keys that are not columns of the frame.
missing_columns = [name for name in columns if name not in hf_features.columns]
if missing_columns:
    raise KeyError(f"columns missing from hf_features: {missing_columns}")

# Filling with a constant does not depend on the (bsnsYear, reprtCode) group,
# so one vectorised fillna replaces the per-column groupby/apply. This also
# avoids index-alignment problems on pandas versions where GroupBy.apply
# prepends the group keys to the result index.
hf_features = hf_features.fillna({name: 0 for name in columns})

In [None]:
# Drop every row that still contains a missing value in any column.
# NOTE(review): `features` was already passed through fillna(0) earlier in the
# notebook, so this probably removes nothing — confirm it is still needed.
hf_features = hf_features.dropna()

In [None]:
# Rows whose bsnsYear compares >= this value make up the test set.
# NOTE(review): the threshold is a string, so this presumably relies on
# bsnsYear being stored as a string — confirm.
boundry = "2022"

# Identifier columns that are moved into the index of the test set.
test_index_columns = [
    "stockCode", "corpCls", "corpCode", "reprtCode", "rceptNo",
    "stockName", "updateDate", "bsnsYear", "quarter",
]

is_test_row = hf_features["bsnsYear"] >= boundry
df_test_set = (
    hf_features.loc[is_test_row]
    .drop_duplicates()
    .set_index(test_index_columns)
)

In [None]:
# Display the test set for a visual check.
df_test_set

In [None]:
# Create the S3 client (region ap-northeast-2) used to download the model.
# NOTE(review): the access key and secret are hardcoded in the notebook.
# Consider omitting both arguments so boto3 resolves credentials itself, or
# loading them from a secrets store — confirm which source applies here.
s3 = boto3.client(
    "s3",
    region_name="ap-northeast-2",
    aws_access_key_id="xxxxxxxxxxxxxxxxxxxxxx",
    aws_secret_access_key="xxxxxxxxxxxxxxxxxxxxxx")

In [None]:
# Download the serialized model object from S3 and deserialize it in memory.
#
# io.BytesIO wraps the downloaded bytes in an in-memory binary file object
# (BytesIO handles binary data, StringIO handles text), so joblib can read the
# model without writing a temporary file to disk.
#
# NOTE(review): joblib.load unpickles the payload, which can execute arbitrary
# code. Only load objects from a bucket whose contents are trusted.
model_file = s3.get_object(Bucket="penta-engine", Key="FinancialSheetsModelNew.pkl")
bytes_stream = io.BytesIO(model_file['Body'].read())
model = joblib.load(bytes_stream)  # joblib: saves and loads .pkl files

In [None]:
# Predict the partial hazard for every test row. The `event` column is removed
# from the frame before it is handed to the model.
test_covariates = df_test_set.drop(columns=['event'])
hazard = model.predict_partial_hazard(test_covariates)

In [None]:
# Store the predicted partial hazard on the test set as `riskPremium`.
df_test_set['riskPremium'] = hazard

# Columns written to the report: the identifier columns (restored from the
# index), the prediction, and the two event flags.
report_columns = [
    'stockCode', 'corpCls', 'corpCode', 'reprtCode', 'rceptNo', 'stockName',
    'updateDate', 'bsnsYear', 'quarter', 'riskPremium', 'event', 'plbtEvent',
]

# Build the report frame once (the following cell used to repeat exactly the
# same statements) and cast the flags to int and corpCls to str in one step.
result = (
    df_test_set.reset_index()[report_columns]
    .astype({'event': 'int', 'plbtEvent': 'int', 'corpCls': 'str'})
)

In [None]:
# Display the report frame for a visual check.
result

In [None]:
# Convert the pandas report frame into a Spark DataFrame.
sdf = spark.createDataFrame(result)

In [None]:
"""
(
    sdf
    .write.format("mongodb")
    .mode("append")
    .option("upsertDocument", "true")
    .option("idFieldList", "updateDate,stockCode,rceptNo")
    .option("spark.mongodb.read.connection.uri", MongoUrl)
    .option("spark.mongodb.write.connection.uri", MongoUrl)
    .option("database", "coreEngine")
    .option("collection", "ReportRiskPremium_New")
    .save()
    )
    """

In [None]:
# Stop the Spark session at the end of the notebook run.
spark.stop()