In [None]:
# Install the pinned dependencies, one sc.install_pypi_package call per
# requirement, in the order listed below.
for _requirement in (
    "scipy==1.1.0",
    "Cython==0.29.33",
    "scikit-learn==0.24.2",
    "pandas==0.23.2",
    "s3fs==0.4.2",
    "boto3==1.26.59",
    "joblib==1.2.0",
):
    sc.install_pypi_package(_requirement)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from itertools import groupby
from operator import itemgetter

import pandas as pd
import boto3
import os
import pickle
import tempfile
import joblib
import io

# MongoDB connection URI used by every Spark read/write in this notebook.
# Prefer the MONGO_URL environment variable so credentials do not have to live
# in the notebook; the literal is kept only as a backward-compatible fallback.
MongoUrl = os.environ.get("MONGO_URL", "mongodb+srv://xxxxxxxxxxxxxxxxxxxxxx/")

In [None]:
# Create (or reuse) the Spark session named "Frequency" with explicit
# core and executor-memory settings.
spark = (
    SparkSession.builder
    .appName("Frequency")
    .config("spark.cores.max", 6)
    .config("spark.executor.cores", 6)
    .config("spark.executor.memory", "36g")
    .getOrCreate()
)

In [None]:
# Load the coreEngine.PriceFeatures collection through the MongoDB Spark
# connector and drop Mongo's `_id` column.
df = (
    spark.read.format("mongodb")
    .option("spark.mongodb.read.connection.uri", MongoUrl)
    .option("spark.mongodb.write.connection.uri", MongoUrl)
    .option("database", "coreEngine")
    .option("collection", "PriceFeatures")
    .load()
    .drop("_id")
)

In [None]:
# S3 client used to download the per-day model artifacts.
# Credentials are read from the standard AWS environment variables first so
# they do not have to be committed with the notebook; the literals are kept
# only as a backward-compatible fallback.
s3 = boto3.client(
    "s3",
    region_name="ap-northeast-2",
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", "xxxxxxxxxxxxxxxxxxxxxx"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", "xxxxxxxxxxxxxxxxxxxxxx"),
)

In [None]:
# Distinct update dates present in the feature table, in ascending order.
historicalDay = (
    df.select("updateDate")
    .distinct()
    .orderBy("updateDate")
)

In [None]:
# Keep only the dates strictly between 20210101 and 20221231 (both bounds are
# exclusive); values are compared as strings.
historicalDayList = [
    row[0]
    for row in historicalDay.toPandas().values
    if '20210101' < row[0] < '20221231'
]

In [None]:
# Collect the whole Spark DataFrame into a pandas DataFrame.
hf = df.toPandas()

In [None]:
# Store updateDate as int32 so it can be compared with integer dates.
hf["updateDate"] = hf["updateDate"].astype("int32")

In [None]:
# Keys of every model artifact stored under the HighFrequencyModel/ prefix.
# A paginator is used because a single list call returns at most 1000 keys,
# and `.get("Contents", [])` tolerates an empty listing.
# The folder placeholder key (the prefix itself) is excluded by name rather
# than by position, so a real model is never dropped when no placeholder
# object exists.
# NOTE(review): the original dropped the first listed key unconditionally;
# this assumes that key was the folder placeholder - confirm.
_MODEL_PREFIX = "HighFrequencyModel/"
_model_pages = s3.get_paginator("list_objects_v2").paginate(
    Bucket="penta-engine", Prefix=_MODEL_PREFIX
)
model_files = [
    _obj["Key"]
    for _page in _model_pages
    for _obj in _page.get("Contents", [])
    if _obj["Key"] != _MODEL_PREFIX
]

In [None]:
# Show the selected dates as the cell output for a visual check.
historicalDayList

In [None]:
# Score every selected day with the model saved for that day.
# For each date: download HighFrequencyModel/<date>.pkl from S3, min-max scale
# that day's features, and record the probability in the second predict_proba
# column ("riskPremium") together with the predicted label ("warningSignal").
# The per-day frames are collected in `dfs`.
dfs = []

for i in historicalDayList:
    target_date = int(i)

    # NOTE(review): joblib.load unpickles the object, which can execute
    # arbitrary code - only load artifacts from a bucket you trust.
    model_file = s3.get_object(Bucket="penta-engine", Key=f"HighFrequencyModel/{target_date}.pkl")
    bytes_stream = io.BytesIO(model_file['Body'].read())
    model = joblib.load(bytes_stream)

    # Rows for this day; the identifying columns become the index so that only
    # feature columns (plus the "event" label) remain as data.
    mask_test = hf.updateDate == target_date
    df_test = hf.loc[mask_test, :].set_index(["stockCode", "stockFullName", "updateDate"])

    df_test_set = df_test.drop(labels="event", axis=1)

    # NOTE(review): the scaler is fitted on the day being scored, not on the
    # data the model was trained with - confirm this matches training.
    scaler = MinMaxScaler()
    scaler.fit(df_test_set)
    df_test_scaled = scaler.transform(df_test_set)
    predict = model.predict_proba(df_test_scaled)

    predict_df = pd.DataFrame(predict, index=df_test_set.index)
    # Fix: predict the label from the same scaled features used for
    # predict_proba. The original passed the unscaled frame here, so the label
    # and the probability were computed from different inputs.
    predict_df['warningSignal'] = model.predict(df_test_scaled)

    pdf = predict_df.reset_index()
    pdf["stockCode"] = pdf["stockCode"].astype(str).str.zfill(6)  # left-pad to 6 characters
    pdf["updateDate"] = pdf["updateDate"].astype(str)
    # Keep predict_proba column 1 as "riskPremium" and discard column 0.
    pdf = pdf.rename(columns={1: "riskPremium"}).drop(columns=[0])

    dfs.append(pdf)

In [None]:
# Stack the per-day prediction frames into one frame with a fresh 0..n-1 index.
pdf = pd.concat(dfs, ignore_index=True)

In [None]:
# Convert the combined pandas predictions into a Spark DataFrame.
sdf = spark.createDataFrame(pdf)

In [None]:
# Disabled write-back: the statement below is wrapped in a string literal, so
# it is NOT executed. If unwrapped, it would write `sdf` in append mode to the
# coreEngine.PriceRiskPremium collection with upsertDocument enabled and
# idFieldList set to updateDate,stockCode.
"""
sdf \
    .write.format("mongodb") \
    .mode("append") \
    .option("upsertDocument", "true") \
    .option("idFieldList", "updateDate,stockCode") \
    .option("spark.mongodb.read.connection.uri", MongoUrl) \
    .option("spark.mongodb.write.connection.uri", MongoUrl) \
    .option("database", "coreEngine") \
    .option("collection", "PriceRiskPremium") \
    .save()
    """