<a href="https://colab.research.google.com/github/tomonari-masada/course2025-sml/blob/main/08_logistic_regression_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# プランナー課題８の実行例
 * データセット: [Pima Indians Diabetes Database](https://www.kaggle.com/uciml/pima-indians-diabetes-database)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay

%config InlineBackend.figure_format = 'retina'

In [None]:
# Location of the diabetes CSV. The path assumes Google Drive is already
# mounted at /content/drive (Colab) -- TODO confirm for other environments.
DIABETES_CSV = '/content/drive/MyDrive/data/diabetes.csv'
diabetes = pd.read_csv(DIABETES_CSV)

In [None]:
# Separate the feature matrix from the target column.
TARGET = 'Outcome'
X = diabetes.drop(columns=[TARGET])
y = diabetes[TARGET]

* この分割は変えない。

In [None]:
# Hold out 15% of the rows as the test set. The fixed seed keeps the split
# identical across runs; the notebook asks that this split not be changed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
    test_size=0.15,
)

In [None]:
# One histogram per column of the training split; the trailing semicolon
# suppresses the array-of-axes repr.
X_train.hist(
    figsize=(9, 9),
    bins=50,
);

## ベースライン: チューニングなしのロジスティック回帰をテストデータで評価

In [None]:
# Untuned logistic regression fitted on the raw training features.
baseline = LogisticRegression(random_state=123, max_iter=1000).fit(X_train, y_train)

# `score` is the classifier's default metric (mean accuracy); the other two
# metrics are computed from the predicted probability in column 1, i.e. the
# second entry of `baseline.classes_`.
baseline_accuracy = baseline.score(X_test, y_test)
y_test_pred_proba = baseline.predict_proba(X_test)
positive_scores = y_test_pred_proba[:, 1]

print(f'test score: {baseline_accuracy:.4f}')
print(f'ROC AUC: {roc_auc_score(y_test, positive_scores):.4f}')
print(f'Average precision: {average_precision_score(y_test, positive_scores):.4f}')

In [None]:
# ROC curve of the baseline model on the held-out test split.
fig, ax = plt.subplots()
RocCurveDisplay.from_estimator(
    baseline,
    X_test,
    y_test,
    ax=ax,
    name="baseline",
)
ax.set(title="ROC curve");

In [None]:
# Precision-recall curve of the baseline model on the held-out test split.
fig, ax = plt.subplots()
PrecisionRecallDisplay.from_estimator(
    baseline,
    X_test,
    y_test,
    ax=ax,
    name="baseline",
)
ax.set(title="2-class Precision-Recall curve");

* これをベースラインとみなす。
* これより良い結果を得るべく、試行錯誤する。
* 試行錯誤した結果として辿り着いたモデルで、最後に一回、テストデータ上での評価を行う。

## グリッドサーチ＆交差検証 (1)

* BloodPressure, BMI, Glucose, SkinThickness, Insulinの0は欠損値とみなし、imputationの対象とする。
* Pregnanciesについては0は意味のある値なのでimputationは行わない。

In [None]:
# Columns in which a literal 0 is treated as "missing" and handed to the
# imputer. Pregnancies is deliberately not listed: 0 is a meaningful value there.
ZERO_AS_MISSING_COLUMNS = ['BloodPressure', 'BMI', 'Glucose', 'SkinThickness', 'Insulin']

preprocessor = ColumnTransformer(
    transformers=[
        # The imputation strategy is selected by the grid search. With
        # strategy="constant", fill_value=0 writes 0 back, i.e. the zeros
        # are effectively left as they are.
        ("imputer", SimpleImputer(missing_values=0, fill_value=0), ZERO_AS_MISSING_COLUMNS),
        # Polynomial expansion of Age; the degree is selected by the grid search.
        ("polynomial", PolynomialFeatures(), ['Age']),
    ],
    # ColumnTransformer drops every column that is not listed above unless told
    # otherwise, which would silently discard Pregnancies (and any other
    # unlisted feature). Pass those columns through unchanged so the model
    # still sees them.
    remainder="passthrough",
)

In [None]:
# Preprocessing followed by the classifier, bundled as one estimator so the
# grid search can tune both stages through the "preprocessor__" / "lr__" prefixes.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("lr", LogisticRegression(max_iter=1000)),
]
pipeline = Pipeline(pipeline_steps)

In [None]:
# Preprocessing options explored for every classifier configuration.
preprocessing_grid = {
    "preprocessor__imputer__strategy": ["median", "mean", "constant"],
    "preprocessor__polynomial__degree": [1, 2],
}

# Inverse regularization strengths: 1, 10, 100, 1000, 10000.
c_values = 10.0 ** np.arange(0, 5)

# A list of grids is used so that only valid (penalty, solver, C) combinations
# are fitted:
#   * no penalty -> C has no effect, so it is not varied (avoids redundant fits);
#   * l2         -> works with the solver the pipeline's LogisticRegression
#                   uses by default;
#   * l1         -> not supported by the default lbfgs solver (every such fit
#                   would fail and score NaN), so liblinear is used instead.
param_grid = [
    {**preprocessing_grid, "lr__penalty": [None]},
    {**preprocessing_grid, "lr__penalty": ["l2"], "lr__C": c_values},
    {
        **preprocessing_grid,
        "lr__penalty": ["l1"],
        "lr__solver": ["liblinear"],
        # liblinear shuffles the data internally; pin the seed so the
        # cross-validation scores are reproducible.
        "lr__random_state": [123],
        "lr__C": c_values,
    },
]

In [None]:
# Stratified 10-fold cross-validation repeated 4 times with a fixed seed;
# candidates are ranked by average precision and fitted on all available cores.
rskf = RepeatedStratifiedKFold(n_repeats=4, n_splits=10, random_state=12345)
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="average_precision",
    cv=rskf,
    n_jobs=-1,
)
clf.fit(X_train, y_train)

In [None]:
# Tabulate the search results and show the three best-ranked candidates.
cv_results = pd.DataFrame(clf.cv_results_)
cv_results.sort_values("rank_test_score").head(3)

## テストデータで最終評価

In [None]:
# Pipeline with the best cross-validated parameter combination. The attribute
# exists because the GridSearchCV above leaves `refit` at its default.
best_pipeline = clf.best_estimator_

In [None]:
# Fit the selected pipeline on the whole training split, then evaluate it once
# on the held-out test split with the same three metrics as the baseline.
best_pipeline.fit(X_train, y_train)

tuned_accuracy = best_pipeline.score(X_test, y_test)
y_test_pred_proba = best_pipeline.predict_proba(X_test)
# Column 1 holds the probability of the second entry of `classes_`.
positive_scores = y_test_pred_proba[:, 1]

print(f'test score: {tuned_accuracy:.4f}')
print(f'ROC AUC: {roc_auc_score(y_test, positive_scores):.4f}')
print(f'Average precision: {average_precision_score(y_test, positive_scores):.4f}')

In [None]:
# Overlay the ROC curves of the baseline and the tuned pipeline on one axes.
fig, ax = plt.subplots()
for label, model in (("baseline", baseline), ("ours", best_pipeline)):
    RocCurveDisplay.from_estimator(model, X_test, y_test, name=label, ax=ax)
ax.set(title="ROC curve");

In [None]:
# Overlay the precision-recall curves of the baseline and the tuned pipeline.
fig, ax = plt.subplots()
for label, model in (("baseline", baseline), ("ours", best_pipeline)):
    PrecisionRecallDisplay.from_estimator(model, X_test, y_test, name=label, ax=ax)
ax.set(title="2-class Precision-Recall curve");