# ToDo

2/22時点でのmodel.pyのModelクラスのリファクタリングを行う。観点は下記の通り。

- Modelクラスの責務を分割
- DRY原則の適用（重複コードの排除）
- モデル選択を行えるクラスの作成

# 準備

In [234]:
from typing import Dict

import lightgbm as lgb
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    f1_score,
    log_loss,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_curve,
)
from sklearn.model_selection import TimeSeriesSplit

from datamart import Datamart
from feature import Feature
from name import Name
from raw_data import RawData
from symbol_data import SymbolData

In [281]:
def create_datamart(name: str) -> pd.DataFrame:
    """Build the datamart for the symbol ``name``.

    Chains ``SymbolData`` -> ``RawData`` -> ``Datamart`` and returns the
    ``datamart`` attribute of the final object.
    """
    raw_data = RawData(SymbolData(name).symbol_data).raw_data
    # NOTE(review): "close", 5 and 1 are passed positionally; their meaning is
    # defined by Datamart's constructor — confirm there before changing them.
    mart = Datamart(raw_data, "close", 5, 1, name)
    return mart.datamart


def train_test_time_series_split(df: pd.DataFrame, n_splits: int = 5):
    """Split a datamart into time-series train/test folds.

    The rows are reversed and re-indexed from 0, every column except the first
    is used as a feature, and the ``"target"`` column is used as the label.
    NOTE(review): this presumably expects newest-first input with "target" as
    the first column — confirm against the Datamart layout.

    Args:
        df: Datamart containing a ``"target"`` column.
        n_splits: Number of folds handed to ``TimeSeriesSplit``. Defaults to 5,
            the value that was previously hard-coded.

    Returns:
        Four lists ``(l_X_train, l_X_test, l_y_train, l_y_test)``, each holding
        one entry per fold.
    """
    # Chain the calls instead of reset_index(inplace=True) on a sliced frame.
    ordered = df.iloc[::-1].reset_index(drop=True)
    X = ordered.iloc[:, 1:]
    y = ordered["target"]

    l_X_train, l_X_test, l_y_train, l_y_test = [], [], [], []
    for train_index, test_index in TimeSeriesSplit(n_splits=n_splits).split(X):
        l_X_train.append(X.iloc[train_index])
        l_X_test.append(X.iloc[test_index])
        # The index was reset above, so positional and label lookup coincide.
        l_y_train.append(y.iloc[train_index])
        l_y_test.append(y.iloc[test_index])
    return l_X_train, l_X_test, l_y_train, l_y_test


def create_clf_pred_prob_list(
    model_str: str, l_X_train: list, l_X_test: list, l_y_train: list
) -> Dict[str, list]:
    """Fit one classifier per fold and collect its outputs.

    Takes the per-fold lists produced by ``train_test_time_series_split`` and,
    for every fold, fits a new classifier on the training part and predicts on
    the test part.

    Args:
        model_str: ``"lgb"`` for ``lgb.LGBMClassifier()`` or ``"lr"`` for
            ``LogisticRegression(max_iter=1500)``.
        l_X_train: Training features, one entry per fold.
        l_X_test: Test features, one entry per fold.
        l_y_train: Training labels, one entry per fold.

    Returns:
        Dict with keys ``"clf"`` (fitted classifiers), ``"pred"`` (predicted
        labels), ``"prob"`` (``predict_proba`` output) and ``"prob_posi"``
        (column 1 of ``predict_proba``), each a list with one entry per fold.

    Raises:
        ValueError: If ``model_str`` is not a supported model name.
    """
    # Lambdas so that a fresh, unfitted estimator is created for every fold.
    factories = {
        "lgb": lambda: lgb.LGBMClassifier(),
        "lr": lambda: LogisticRegression(max_iter=1500),
    }
    # Fail early with a clear message instead of hitting an unbound `clf` later.
    if model_str not in factories:
        raise ValueError(
            f"Unknown model_str {model_str!r}; expected one of {sorted(factories)}"
        )
    build_clf = factories[model_str]

    l_clf = []
    l_pred = []
    l_prob = []
    l_prob_posi = []
    for X_train, X_test, y_train in zip(l_X_train, l_X_test, l_y_train):
        clf = build_clf()
        clf.fit(X_train, y_train)
        y_prob = clf.predict_proba(X_test)
        l_clf.append(clf)
        l_pred.append(clf.predict(X_test))
        l_prob.append(y_prob)
        l_prob_posi.append(y_prob[:, 1])
    return {"clf": l_clf, "pred": l_pred, "prob": l_prob, "prob_posi": l_prob_posi}


class Score:
    """Per-fold evaluation metrics for one model.

    Each ``l_idx_*`` property returns a list with one metric value per fold,
    computed from the stored test labels and the model's predictions.
    ``df_idx`` gathers all of them into a single table.
    """

    def __init__(
        self,
        l_X_train: list,
        l_X_test: list,
        l_y_train: list,
        l_y_test: list,
        l_pred: list,
        l_prob: list,
        l_prob_posi: list,
    ):
        """Store the per-fold data and predictions.

        Args:
            l_X_train: Training features, one entry per fold.
            l_X_test: Test features, one entry per fold.
            l_y_train: Training labels, one entry per fold.
            l_y_test: Test labels, one entry per fold.
            l_pred: Predicted labels, one entry per fold.
            l_prob: ``predict_proba`` outputs, one entry per fold.
            l_prob_posi: Positive-class probabilities, one entry per fold.
        """
        self.l_X_train = l_X_train
        self.l_X_test = l_X_test
        self.l_y_train = l_y_train
        self.l_y_test = l_y_test
        self.l_pred = l_pred
        self.l_prob = l_prob
        self.l_prob_posi = l_prob_posi
        # Number of folds, taken from the data so it always matches the number
        # of columns df_idx has to label.
        self.n_split = len(l_y_test)

    @staticmethod
    def _roc_auc(true, prob_posi):
        """Area under the ROC curve for one fold."""
        fpr, tpr, _ = roc_curve(true, prob_posi)
        return auc(fpr, tpr)

    @staticmethod
    def _pr_auc(true, prob_posi):
        """Area under the precision-recall curve for one fold."""
        # Scores are passed positionally because the keyword for this argument
        # differs between scikit-learn releases.
        pr_precision, pr_recall, _ = precision_recall_curve(true, prob_posi)
        return auc(pr_recall, pr_precision)

    @property
    def l_idx_accuracy(self):
        """Accuracy per fold."""
        return [
            accuracy_score(true, pred)
            for true, pred in zip(self.l_y_test, self.l_pred)
        ]

    @property
    def l_idx_precision(self):
        """Precision per fold."""
        return [
            precision_score(true, pred)
            for true, pred in zip(self.l_y_test, self.l_pred)
        ]

    @property
    def l_idx_recall(self):
        """Recall per fold."""
        return [
            recall_score(true, pred)
            for true, pred in zip(self.l_y_test, self.l_pred)
        ]

    @property
    def l_idx_f1(self):
        """F1 score per fold."""
        return [
            f1_score(true, pred) for true, pred in zip(self.l_y_test, self.l_pred)
        ]

    @property
    def l_idx_log_loss(self):
        """Negated log loss per fold.

        The sign is flipped, presumably so that a larger value is better like
        the other metrics that are averaged together with it.
        """
        return [
            -1 * log_loss(true, prob)
            for true, prob in zip(self.l_y_test, self.l_prob)
        ]

    @property
    def l_idx_roc_auc(self):
        """ROC AUC per fold."""
        return [
            self._roc_auc(true, prob_posi)
            for true, prob_posi in zip(self.l_y_test, self.l_prob_posi)
        ]

    @property
    def l_idx_pr_auc(self):
        """Precision-recall AUC per fold."""
        return [
            self._pr_auc(true, prob_posi)
            for true, prob_posi in zip(self.l_y_test, self.l_prob_posi)
        ]

    @property
    def df_idx(self):
        """Table of all metrics: one row per metric, one column per fold.

        Rows are discovered from the ``l_idx_*`` attributes of this instance
        (in ``dir()`` order) and labelled with the ``l_idx_`` prefix removed.
        Columns are numbered from 1 to ``n_split``.
        """
        # Inspect this instance, not a module-level variable, so every Score
        # object reports its own metrics.
        names = [attr for attr in dir(self) if attr.startswith("l_idx")]
        values = [getattr(self, attr) for attr in names]
        index = [attr.replace("l_idx_", "") for attr in names]
        # NOTE: n_split is the number of splits made by TimeSeriesSplit.
        columns = list(range(1, self.n_split + 1))
        return pd.DataFrame(values, index=index, columns=columns)

    @property
    def average_score(self):
        """Indicator for judging which model did best on a given metric."""
        return self.df_idx.mean(axis=1)

    @property
    def ensemble_score(self):
        """Indicator for judging which split did best within a given model."""
        return self.df_idx.mean(axis=0)

# Main

## データマート作成

In [252]:
# Build one datamart per ticker symbol, in the order msft, dia, spy.
datamart_msft, datamart_dia, datamart_spy = (
    create_datamart(symbol) for symbol in ("msft", "dia", "spy")
)

In [253]:
# Join the three datamarts column-wise; only the msft frame keeps its
# "target" column, the other two contribute features only.
datamart = pd.concat(
    [datamart_msft]
    + [frame.drop("target", axis=1) for frame in (datamart_dia, datamart_spy)],
    axis=1,
)

## データセット作成

In [254]:
# Time-series folds of the combined datamart: each name is a list with one
# entry per split.
(
    l_X_train,
    l_X_test,
    l_y_train,
    l_y_test,
) = train_test_time_series_split(datamart)

## モデル選択

In [295]:
l_model = ["lgb", "lr"]

l_score = []
l_average_score = []
ll_clf = []

for model in l_model:
    # Fit one classifier per fold and collect its predictions.
    d = create_clf_pred_prob_list(model, l_X_train, l_X_test, l_y_train)
    l_clf, l_pred, l_prob, l_prob_posi = d["clf"], d["pred"], d["prob"], d["prob_posi"]
    score = Score(
        l_X_train, l_X_test, l_y_train, l_y_test, l_pred, l_prob, l_prob_posi,
    )
    l_score.append(score)
    # Model-level figure of merit: the per-metric fold averages, summed.
    l_average_score.append(score.average_score.sum())
    ll_clf.append(l_clf)

# Select the model with the highest summed average score.
max_ = max(l_average_score)
idx_max_model = l_average_score.index(max_)

best_model_str = l_model[idx_max_model]
# Use the scores of the selected model rather than a fixed list position, so
# the split chosen below belongs to the same model as the classifier returned.
best_model_score = l_score[idx_max_model]

# Within the selected model, pick the fold with the highest mean metric value.
s = best_model_score.ensemble_score.tolist()
max_ = max(s)
idx_max_split = s.index(max_)
ll_clf[idx_max_model][idx_max_split]

LogisticRegression(max_iter=1500)