[辻真吾・矢吹太朗『ゼロからはじめるデータサイエンス入門』（講談社, 2021）](https://github.com/taroyabuki/fromzero)

In [None]:
# Environment setup for Google Colaboratory.
import os
# Only the presence of the COLAB_GPU environment variable is tested; it serves
# as the signal that the notebook is running on Colab (presumably the Colab
# runtime sets it -- confirm).
if 'COLAB_GPU' in os.environ:
  # Install the extra packages via the shell; `tail -n 1` keeps only the last
  # line of pip's output.
  !python -m pip install h2o pandarallel pca pmdarima | tail -n 1

## 10.1 2値分類の性能指標

In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# True class label of each of the ten samples (0 or 1).
y = np.array([0, 1, 1, 0, 1, 0, 1, 0, 0, 1])

# Score assigned to each sample, in the same order as `y`.
y_score = np.array([0.7, 0.8, 0.3, 0.4, 0.9,
                    0.6, 0.99, 0.1, 0.2, 0.5])

In [None]:
# Predict 1 when the score is at least 0.5 and 0 otherwise.
y_ = (y_score >= 0.5).astype(int)
y_

In [None]:
# Print the confusion matrix explicitly: a bare expression that is not the
# last statement of a cell produces no output, so its value would be lost.
print(confusion_matrix(y_true=y, y_pred=y_))

# Text report of the per-class metrics for the same predictions.
print(classification_report(y_true=y, y_pred=y_))

## 10.2 トレードオフ

In [None]:
import numpy as np
from sklearn.metrics import (roc_curve, RocCurveDisplay,
    precision_recall_curve, PrecisionRecallDisplay, auc)

# True labels and the score given to each of the ten samples.
y = np.array([0, 1, 1, 0, 1, 0, 1, 0, 0, 1])
y_score = np.array([0.7, 0.8, 0.3, 0.4, 0.9,
                    0.6, 0.99, 0.1, 0.2, 0.5])
# Predict 1 when the score is at least 0.5 and 0 otherwise.
y_ = (y_score >= 0.5).astype(int)

# Each rate is the share of predicted 1s within the samples of one true class.
[(y_[y == 0] == 1).mean(),  # FPR
 (y_[y == 1] == 1).mean()]  # TPR

In [None]:
# pos_label=1 declares that the label 1 is the positive class.
my_fpr, my_tpr, _ = roc_curve(y_true=y, y_score=y_score, pos_label=1)

# Draw the ROC curve from the computed (FPR, TPR) pairs.
my_roc_display = RocCurveDisplay(fpr=my_fpr, tpr=my_tpr)
my_roc_display.plot()

In [None]:
# Area under the ROC curve (x axis: FPR, y axis: TPR).
auc(my_fpr, my_tpr)

In [None]:
# Recall looks at the truly positive samples, precision at the samples
# that were predicted positive.
[(y_[y == 1] == 1).mean(),  # Recall == TPR
 (y[y_ == 1] == 1).mean()]  # Precision

In [None]:
# The scores are passed positionally because the keyword for this argument
# differs between scikit-learn releases (`probas_pred` in older ones,
# `y_score` in newer ones); the positional form is accepted by both.
# pos_label=1 declares that the label 1 is the positive class.
my_precision, my_recall, _ = precision_recall_curve(y, y_score, pos_label=1)

# Draw the precision-recall curve from the computed pairs.
PrecisionRecallDisplay(precision=my_precision, recall=my_recall).plot()

In [None]:
# Area under the precision-recall curve (x axis: recall, y axis: precision).
auc(my_recall, my_precision)

## 10.3 タイタニック

In [None]:
import graphviz
import pandas as pd
from sklearn import tree
from sklearn.metrics import roc_curve, RocCurveDisplay, auc
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Download the Titanic CSV from the book's GitHub repository.
my_url = 'https://raw.githubusercontent.com/taroyabuki/fromzero/master/data/titanic.csv'
my_data = pd.read_csv(filepath_or_buffer=my_url)

In [None]:
# Preview the first five rows of the loaded table.
my_data.head(n=5)

In [None]:
# The first three columns are the inputs; the Survived column is the target.
X = my_data.iloc[:, :3]
y = my_data['Survived']

# One-hot encode the inputs (dropping the first level of each variable),
# then fit a decision tree limited to depth 2.
my_tree_params = dict(max_depth=2, random_state=0, min_impurity_decrease=0.01)
my_pipeline = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop='first')),
    ('tree', tree.DecisionTreeClassifier(**my_tree_params)),
])
my_pipeline.fit(X, y)

In [None]:
my_enc = my_pipeline.named_steps['ohe']    # take the encoder out of the pipeline
my_tree = my_pipeline.named_steps['tree']  # take the tree out of the pipeline

# The encoder's method for listing the encoded column names is called
# get_feature_names or get_feature_names_out depending on the scikit-learn
# release, so use whichever one exists.
if hasattr(my_enc, 'get_feature_names'):
    my_feature_names = my_enc.get_feature_names()
else:
    my_feature_names = my_enc.get_feature_names_out()

# Render the fitted tree as Graphviz DOT text and display it.
my_dot = tree.export_graphviz(
    decision_tree=my_tree,
    out_file=None,
    feature_names=my_feature_names,
    class_names=my_pipeline.classes_,
    filled=True)
graphviz.Source(my_dot)

In [None]:
# Leave-one-out cross-validation of the whole pipeline; n_jobs=-1 lets
# scikit-learn run the folds in parallel.
my_loo = LeaveOneOut()
my_scores = cross_val_score(estimator=my_pipeline, X=X, y=y,
                            cv=my_loo, n_jobs=-1)

# Average of the per-fold scores.
my_scores.mean()

In [None]:
# Predicted class probabilities, one column per class label.
my_proba = pd.DataFrame(
    my_pipeline.predict_proba(X),
    columns=my_pipeline.classes_)
y_score = my_proba.Yes  # probability column for the class 'Yes'

# ROC curve with 'Yes' as the positive class, and the area under it.
my_fpr, my_tpr, _ = roc_curve(y_true=y,
                              y_score=y_score,
                              pos_label='Yes')
my_auc = auc(x=my_fpr, y=my_tpr)
# Print the AUC explicitly: a bare expression that is not the last statement
# of a cell produces no output.
print(my_auc)

RocCurveDisplay(fpr=my_fpr, tpr=my_tpr, roc_auc=my_auc).plot()

## 10.4 ロジスティック回帰

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Sample the logistic function 1 / (1 + e^(-x)) from -6 up to (but not
# including) 6 in steps of 0.1, then plot it.
x = np.arange(start=-6, stop=6, step=0.1)
y = 1 / (np.exp(-x) + 1)
plt.plot(x, y)

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

import re

import sklearn

# Download the Titanic CSV from the book's GitHub repository.
my_url = ('https://raw.githubusercontent.com/taroyabuki'
          '/fromzero/master/data/titanic.csv')
my_data = pd.read_csv(my_url)

# The first three columns are the inputs; the Survived column is the target.
X, y = my_data.iloc[:, 0:3], my_data.Survived

# "No regularisation" is written penalty=None from scikit-learn 1.2 onward;
# the older string 'none' was deprecated in 1.2 and later removed, so the
# spelling is chosen from the installed version to run on both.
my_sklearn_version = tuple(
    int(part)
    for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups())
my_penalty = None if my_sklearn_version >= (1, 2) else 'none'

# One-hot encode the inputs (dropping the first level of each variable),
# then fit an unpenalised logistic regression.
my_pipeline = Pipeline([('ohe', OneHotEncoder(drop='first')),
                        ('lr', LogisticRegression(penalty=my_penalty))])
my_pipeline.fit(X, y)

In [None]:
# Take the fitted encoder and the fitted model out of the pipeline.
my_ohe = my_pipeline.named_steps.ohe
my_lr  = my_pipeline.named_steps.lr

# Print the intercept explicitly: a bare expression that is not the last
# statement of a cell produces no output.
print(my_lr.intercept_[0])

# The encoder's method for listing the encoded column names is called
# get_feature_names or get_feature_names_out depending on the scikit-learn
# release, so use whichever one exists.
if hasattr(my_ohe, 'get_feature_names'):
    my_feature_names = my_ohe.get_feature_names()
else:
    my_feature_names = my_ohe.get_feature_names_out()

# Coefficients labelled with the encoded column they belong to.
pd.Series(my_lr.coef_[0],
          index=my_feature_names)

In [None]:
# Leave-one-out cross-validation of the whole pipeline; n_jobs=-1 lets
# scikit-learn run the folds in parallel.
my_loo = LeaveOneOut()
my_scores = cross_val_score(estimator=my_pipeline, X=X, y=y,
                            cv=my_loo, n_jobs=-1)

# Average of the per-fold scores.
my_scores.mean()