In [None]:
# Pinned PyPI requirements installed through the notebook's SparkContext,
# one call per requirement, in the order listed.
# NOTE(review): the ordering presumably puts build-time requirements
# (pybind11, numpy, Cython, ...) ahead of the packages that need them —
# confirm before reordering.
pinned_requirements = [
    "pybind11==2.10.3",
    "numpy==1.19.0",
    "Pillow==8.2",
    "Cython==0.29.33",
    "scipy==1.2.0",
    "pythran==0.12.1",
    "pandas==1.0.0",
    "matplotlib==3.3.0",
    "lifelines==0.27.4",
    "s3fs==0.4.2",
    "boto3==1.26.59",
    "joblib==1.2.0",
]

for requirement in pinned_requirements:
    sc.install_pypi_package(requirement)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from itertools import groupby
from operator import itemgetter

from lifelines import CoxPHFitter

import pandas as pd
import boto3
import os
import pickle
import tempfile
import joblib
import io
import numpy as np
import matplotlib.pyplot as plt

# MongoDB connection string used by the Spark reader below.
# Read from the MONGODB_URI environment variable when it is set, so that real
# credentials do not have to live in the notebook; the original literal is
# kept as the fallback so existing runs behave as before.
MongoUrl = os.environ.get("MONGODB_URI", "mongodb+srv://xxxxxxxxxxxxxxxxxxxxxx/")

In [None]:
# Build the Spark session for this training run, or reuse the one that is
# already active (getOrCreate).
spark = (
    SparkSession.builder
    .appName("FinancialSheets_ML_Training")
    .config("spark.cores.max", 4)
    .config("spark.executor.cores", 4)
    .config("spark.executor.memory", "36g")
    .getOrCreate()
)

In [None]:
# Load the "ReportFeatures" collection of the "coreEngine" database as a
# Spark DataFrame via the "mongodb" data source. Both the read and the write
# connection URIs are pointed at MongoUrl.
df = (
    spark.read.format("mongodb")
    .option("spark.mongodb.read.connection.uri", MongoUrl)
    .option("spark.mongodb.write.connection.uri", MongoUrl)
    .option("database", "coreEngine")
    .option("collection", "ReportFeatures")
    .load()
)

In [None]:
# Distinct updateDate values, ordered by updateDate (still a Spark DataFrame).
historicalDay = (
    df.select("updateDate")
    .distinct()
    .orderBy("updateDate")
)

In [None]:
# Bring the distinct dates to the driver and keep only those that compare
# greater than the string '20180101' (presumably YYYYMMDD dates — confirm).
# NOTE(review): the original cell carried the bare note "20180103",
# presumably the first date that passes the cutoff — confirm.
update_day_rows = historicalDay.toPandas().values
historicalDayList = [row[0] for row in update_day_rows if row[0] > '20180101']

In [None]:
# Collect the whole Spark DataFrame onto the driver as a pandas DataFrame;
# everything downstream of this cell is plain pandas.
features = df.toPandas()

In [None]:
# Prepare the modelling frame from the raw features:
#   * drop the "_id" column,
#   * turn "event" into a boolean (missing values count as False),
#   * order rows by stockCode, then rceptNo,
#   * keep only rows whose "period" is at least 1.0.
hf_features = (
    features.drop(columns=["_id"])
    .assign(event=lambda frame: frame["event"].fillna(0).astype(bool))
    .sort_values(["stockCode", "rceptNo"])
    .loc[lambda frame: frame["period"] >= 1.0]
)

In [None]:
# Feature columns whose missing values are replaced with 0.
columns = [
    "CACL", "CATA", "CLCA", "CLGR", "CLTL", "DSR01", "DSR02", "DSR03",
    "DSR04", "DSR05", "DSR06", "DSR07", "EBTIN", "EQTA", "FAGR",
    "FFOEQ", "FFOTL", "INSL", "INTL", "LNSL", "LNTA", "MB", "NEGBE",
    "NIGR", "NISL", "RETA", "SLEQ", "SLFA", "TLEQ", "TLTA"]

# Fill all feature columns in a single vectorized pass.
# The previous per-column groupby(["bsnsYear", "reprtCode"]).apply(fillna(0))
# filled with a constant, so grouping added nothing except cost. It also had
# two hazards: rows whose bsnsYear/reprtCode is missing fall out of the
# groupby and came back as NaN after index alignment (losing valid values),
# and on newer pandas the apply result carries the group keys in its index,
# which breaks the column assignment.
hf_features[columns] = hf_features[columns].fillna(0)

In [None]:
# Zero-fill every remaining missing value in every column, then drop any row
# that still contains a missing value.
# NOTE(review): after the blanket fillna(0) the dropna() is presumably a
# no-op — confirm before removing it.
hf_features = hf_features.fillna(0)
hf_features = hf_features.dropna()

In [None]:
# One cumulative snapshot per historical date: snapshot k holds the rows whose
# updateDate is on or before historicalDayList[k].
dfs = []

for cutoff_day in historicalDayList:
    # Rows available as of this date, oldest first, exact duplicate rows removed.
    snapshot = hf_features[hf_features["updateDate"] <= cutoff_day]
    snapshot = snapshot.sort_values(by="updateDate", ascending=True).drop_duplicates()
    # Keep at most the first 10000 rows of each stock (in updateDate order).
    snapshot = snapshot.groupby("stockCode").head(10000)
    # Final ordering: by stock, then by date, with a fresh 0..n-1 index.
    snapshot = snapshot.sort_values(by=["stockCode", "updateDate"], ascending=True)
    dfs.append(snapshot.reset_index(drop=True))

In [None]:
# S3 client for the ap-northeast-2 region.
# When the standard AWS_ACCESS_KEY_ID environment variable is present, no keys
# are passed and boto3 resolves credentials itself (including any session
# token). Only when it is absent are the literals from the original cell
# used, so existing runs keep working.
# NOTE(review): the literal fallback should be removed once credentials are
# supplied through the environment or an instance role.
s3_credentials = {}
if "AWS_ACCESS_KEY_ID" not in os.environ:
    s3_credentials = dict(
        aws_access_key_id="xxxxxxxxxxxxxxxxxxxxxx",
        aws_secret_access_key="xxxxxxxxxxxxxxxxxxxxxx")

s3 = boto3.client(
    "s3",
    region_name="ap-northeast-2",
    **s3_credentials)

In [None]:
# Preview the first snapshot, i.e. the one built for historicalDayList[0].
dfs[0]

In [None]:

# Fit one Cox proportional-hazards model per historical date, each on the
# cumulative snapshot built for that date.

# Columns handed to the fitter: the covariates plus the duration ("period")
# and event ("event") columns. Defined once, outside the loop.
filterColumns = [
    "CACL", "CATA", "CLCA", "CLGR", "CLTL", "DSR01", "DSR02", "DSR03",
    "DSR04", "DSR05", "DSR06", "DSR07", "EBTIN", "EQTA", "FAGR",
    "FFOEQ", "FFOTL", "INSL", "INTL", "LNSL", "LNTA", "MB", "NEGBE",
    "NIGR", "NISL", "RETA", "SLEQ", "SLFA", "TLEQ", "TLTA", "period", "event"]

# Identifier columns are moved into the index so that they stay attached to
# each row without being passed to the model as covariates.
index_columns = [
    "stockCode", "corpCls", "corpCode", "reprtCode", "rceptNo",
    "stockName", "updateDate", "bsnsYear", "quarter"]

# The S3 upload was disabled in the original cell, but with `//` markers,
# which are not Python comments and prevented the cell from parsing at all.
# It is kept here as real code behind a flag that defaults to "off"; set it
# to True to persist every fitted model.
UPLOAD_MODELS = False

# dfs holds one snapshot per entry of historicalDayList, in the same order;
# the date doubles as the model name.
for model_name, dfi in zip(historicalDayList, dfs):
    df_train_set = dfi.set_index(index_columns)[filterColumns]
    # One-line progress summary instead of printing the full frame each time.
    print(model_name, df_train_set.shape)

    # l1_ratio=1 makes the (small) penalty purely L1; step_size is forwarded
    # to the fitter's optimizer through fit_options.
    model = CoxPHFitter(penalizer=0.0001, l1_ratio=1)
    model.fit(df_train_set, duration_col="period", event_col="event", fit_options=dict(step_size=0.2))

    if UPLOAD_MODELS:
        # Serialize the model in memory and upload it under the date's name.
        f = io.BytesIO()
        joblib.dump(model, f)
        f.seek(0)
        s3.put_object(Bucket="penta-engine", Key=f"FinancialSheetsModel/{model_name}.pkl", Body=f)