# 3. データサイエンティストによる、ノートブックのモジュール化
01_exp_on_notebook.ipynb で実施した実験をモジュール化していきます。

## モジュール化するスクリプト
以下のスクリプトを4つのモジュールに分けます。
* 前処理 : preprocess.py
* 学習 : train.py
* 推論 : predict.py
* 評価 : evaluate.py

In [None]:
# Upgrade pip and install the latest pandas
import sys

!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install -qU pandas

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
# Load the census data and keep only the columns used in this experiment.
df = pd.read_csv("./dataset/census-income.csv")
df = df[
    [
        "age",
        "education",
        "major industry code",
        "class of worker",
        "num persons worked for employer",
        "capital gains",
        "capital losses",
        "dividends from stocks",
        "income",
    ]
]

# Drop incomplete and duplicated rows, then replace the two label strings with 0 / 1.
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.replace([" - 50000.", " 50000+."], [0, 1], inplace=True)

# "income" is the target; everything else is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("income", axis=1), df["income"], test_size=0.2
)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so try the new spelling first and fall back
# to the old one on releases that do not know it yet.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

preprocess = make_column_transformer(
    (
        KBinsDiscretizer(encode="onehot-dense", n_bins=10),
        ["age", "num persons worked for employer"],
    ),
    (
        StandardScaler(),
        ["capital gains", "capital losses", "dividends from stocks"],
    ),
    (
        encoder,
        ["education", "major industry code", "class of worker"],
    ),
)
# Fit the transformer on the training split only, then reuse it for the test split.
X_train = preprocess.fit_transform(X_train)
X_test = preprocess.transform(X_test)

# Train the classifier and predict on the held-out split.
model = LogisticRegression(class_weight="balanced", solver="lbfgs", C=float(1.0), verbose=1)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Collect the metrics into a single dict and show it.
report_dict = classification_report(y_test, predictions, output_dict=True)
report_dict["accuracy"] = accuracy_score(y_test, predictions)
report_dict["roc_auc"] = roc_auc_score(y_test, predictions)
print(report_dict)

# モジュール化の方針
* はじめにライブラリの読み込み
* ノートブックから切り出した処理を関数化する
* 関数のはじめにファイル読み込みを行い、最後にファイル出力を行う
* 引数の処理を行なった後、関数を実行する

ロジックの検証はローカルでも実施します。ローカル実行の場合、モジュール間のファイルは output ディレクトリを介して受け渡します。

## preprocess.py

In [None]:
%%writefile preprocess.py

import os
import argparse

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, KBinsDiscretizer
from sklearn.compose import make_column_transformer


def preprocess(inputpath="/opt/ml/processing/input", outputpath="/opt/ml/processing/output", train_test_split_ratio=0.5):
    """Clean the census data, split it, transform the features and write four CSV files.

    Args:
        inputpath: Directory that contains ``census-income.csv``.
        outputpath: Directory that receives ``X_train.csv``, ``y_train.csv``,
            ``X_test.csv`` and ``y_test.csv``; it is created when missing.
        train_test_split_ratio: Forwarded to ``train_test_split`` as ``test_size``.
    """
    ### File input and cleaning (same steps as the notebook)
    df = pd.read_csv(f"{inputpath}/census-income.csv")
    df = df[
        [
            "age",
            "education",
            "major industry code",
            "class of worker",
            "num persons worked for employer",
            "capital gains",
            "capital losses",
            "dividends from stocks",
            "income",
        ]
    ]
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    # Replace the two label strings with 0 / 1.
    df.replace([" - 50000.", " 50000+."], [0, 1], inplace=True)

    # "income" is the target; everything else is a feature.
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop("income", axis=1), df["income"], test_size=train_test_split_ratio
    )

    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
    # `sparse_output` and 1.4 removed the old name, so try the new spelling
    # first and fall back to the old one on releases that do not know it yet.
    try:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

    # Named `transformer` rather than `preprocess` so the local does not
    # shadow this function's own name.
    transformer = make_column_transformer(
        (
            KBinsDiscretizer(encode="onehot-dense", n_bins=10),
            ["age", "num persons worked for employer"],
        ),
        (
            StandardScaler(),
            ["capital gains", "capital losses", "dividends from stocks"],
        ),
        (
            encoder,
            ["education", "major industry code", "class of worker"],
        ),
    )
    # Fit on the training split only, then apply the fitted transformer to the test split.
    X_train = transformer.fit_transform(X_train)
    X_test = transformer.transform(X_test)

    ### File output
    os.makedirs(outputpath, exist_ok=True)
    pd.DataFrame(X_train).to_csv(f"{outputpath}/X_train.csv", index=False)
    pd.DataFrame(y_train).to_csv(f"{outputpath}/y_train.csv", index=False)
    pd.DataFrame(X_test).to_csv(f"{outputpath}/X_test.csv", index=False)
    pd.DataFrame(y_test).to_csv(f"{outputpath}/y_test.csv", index=False)


if __name__ == "__main__":
    # Show the environment variables the script was started with.
    print(os.environ)

    # Parse the command line; arguments this script does not know are ignored.
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-test-split-ratio", type=float, default=0.5)
    args, _ = parser.parse_known_args()
    print(f"Received arguments {args}")

    # Without SM_MODEL_DIR the script works on the local ./dataset and
    # output/preprocess directories; with it, the function defaults are used.
    ratio = args.train_test_split_ratio
    if not os.getenv("SM_MODEL_DIR"):
        print("===== Local =====")
        preprocess(
            inputpath="./dataset",
            outputpath="output/preprocess",
            train_test_split_ratio=ratio,
        )
    else:
        print("===== SageMaker Processing Job =====")
        preprocess(train_test_split_ratio=ratio)

In [None]:
### ローカル実行
%run preprocess.py --train-test-split-ratio 0.2

## train.py

In [None]:
%%writefile train.py

import os
import argparse

import pandas as pd
from sklearn.linear_model import LogisticRegression
import joblib

def train(inputpath="/opt/ml/processing/input", outputpath="/opt/ml/processing/output", c=1.0):
    """Fit a logistic-regression model on the preprocessed training data and save it.

    Args:
        inputpath: Directory whose ``preprocess`` sub-directory holds
            ``X_train.csv`` and ``y_train.csv``.
        outputpath: Directory that receives ``model.joblib``; it is created
            when missing.
        c: Regularisation value passed to ``LogisticRegression`` as ``C``
            (converted with ``float``).
    """
    ### File input
    X_train = pd.read_csv(f"{inputpath}/preprocess/X_train.csv")
    y_train = pd.read_csv(f"{inputpath}/preprocess/y_train.csv")

    ### Notebook logic
    model = LogisticRegression(class_weight="balanced", solver="lbfgs", C=float(c), verbose=1)
    # read_csv returns the labels as a single-column DataFrame; flatten it to
    # 1-D so fit() receives the (n_samples,) target it expects instead of a
    # column vector (which makes scikit-learn emit a DataConversionWarning).
    model.fit(X_train, y_train.values.ravel())

    ### File output
    os.makedirs(outputpath, exist_ok=True)
    joblib.dump(model, f"{outputpath}/model.joblib")


if __name__ == "__main__":
    # Show the environment variables the script was started with.
    print(os.environ)

    # Parse the command line; arguments this script does not know are ignored.
    parser = argparse.ArgumentParser()
    parser.add_argument("--c", type=float, default=1.0)
    args, _ = parser.parse_known_args()
    print(f"Received arguments {args}")

    # Without SM_MODEL_DIR the script reads from ./output and writes to
    # output/train; with it, the function defaults are used.
    if not os.getenv("SM_MODEL_DIR"):
        print("===== Local =====")
        train(inputpath="./output", outputpath="output/train", c=args.c)
    else:
        print("===== SageMaker Processing Job =====")
        train(c=args.c)

In [None]:
%run train.py --c=0.5

## predict.py

In [None]:
%%writefile predict.py

import os
import argparse

import pandas as pd
import joblib


def predict(inputpath="/opt/ml/processing/input", outputpath="/opt/ml/processing/output"):
    """Run the saved model on the preprocessed test features and write the predictions.

    Args:
        inputpath: Directory holding ``preprocess/X_test.csv`` and
            ``train/model.joblib``.
        outputpath: Directory that receives ``predictions.csv``; it is
            created when missing.
    """
    # Inputs: the test features first, then the serialized model.
    features = pd.read_csv(f"{inputpath}/preprocess/X_test.csv")
    classifier = joblib.load(f"{inputpath}/train/model.joblib")

    # Same inference call as in the notebook.
    predicted = classifier.predict(features)

    # Output: one CSV without the row index.
    os.makedirs(outputpath, exist_ok=True)
    result_frame = pd.DataFrame(predicted)
    result_frame.to_csv(f"{outputpath}/predictions.csv", index=False)


if __name__ == "__main__":
    # Show the environment variables the script was started with.
    print(os.environ)

    # This script takes no command-line arguments.

    # Without SM_MODEL_DIR the script reads from ./output and writes to
    # output/predict; with it, the function defaults are used.
    if not os.getenv("SM_MODEL_DIR"):
        print("===== Local =====")
        predict(inputpath="./output", outputpath="output/predict")
    else:
        print("===== SageMaker Processing Job =====")
        predict()

In [None]:
%run predict.py

## evaluate.py

In [None]:
%%writefile evaluate.py

import os
import argparse

import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

def evaluate(inputpath="/opt/ml/processing/input", outputpath="/opt/ml/processing/output"):
    """Score the predictions against the test labels, print the report and save it as CSV.

    Args:
        inputpath: Directory holding ``preprocess/y_test.csv`` and
            ``predict/predictions.csv``.
        outputpath: Directory that receives ``report_dict.csv``; it is
            created when missing.
    """
    # Inputs: ground-truth labels and the predictions made for them.
    truth = pd.read_csv(f"{inputpath}/preprocess/y_test.csv")
    predicted = pd.read_csv(f"{inputpath}/predict/predictions.csv")

    # Same metrics as in the notebook, gathered into one dict.
    report = classification_report(truth, predicted, output_dict=True)
    report.update(
        accuracy=accuracy_score(truth, predicted),
        roc_auc=roc_auc_score(truth, predicted),
    )
    print(report)

    # Output.
    # NOTE(review): index=False presumably drops the metric names (the row
    # labels of the frame built from the report) from the CSV - confirm that
    # consumers of report_dict.csv do not need them.
    os.makedirs(outputpath, exist_ok=True)
    report_frame = pd.DataFrame(report)
    report_frame.to_csv(f"{outputpath}/report_dict.csv", index=False)


if __name__ == "__main__":
    # Show the environment variables the script was started with.
    print(os.environ)

    # This script takes no command-line arguments.

    # Without SM_MODEL_DIR the script reads from ./output and writes to
    # output/evaluate; with it, the function defaults are used.
    if not os.getenv("SM_MODEL_DIR"):
        print("===== Local =====")
        evaluate(inputpath="./output", outputpath="output/evaluate")
    else:
        print("===== SageMaker Processing Job =====")
        evaluate()

In [None]:
%run evaluate.py

ノートブックで作成したモデル構築、評価コードを.pyコードに変更することができました。
作成した .pyコードをCodeCommitにpushし、実験パイプラインを用いて記録を行います。