In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Load the breast-cancer dataset bundled with scikit-learn and pull the
# feature matrix and the label vector out of the returned container.
from sklearn.datasets import load_breast_cancer

breast = load_breast_cancer()
data, target = breast.data, breast.target

# Last expression of the cell: show which fields the container exposes.
breast.keys()

dict_keys(['feature_names', 'target_names', 'data', 'target', 'DESCR'])

In [3]:
# Show the dimensions of the feature matrix and of the label vector,
# one per line.
print(data.shape, target.shape, sep="\n")

(569, 30)
(569,)


In [4]:
# Preview the first rows of the feature matrix as a table whose columns
# carry the dataset's feature names.
feature_table = pd.DataFrame(data=data, columns=breast.feature_names)
feature_table.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
# Print how many samples carry each label; the position of a name in
# target_names is the integer label it stands for.
for class_index, class_name in enumerate(breast.target_names):
    class_count = (target == class_index).sum()
    print("{}: {}".format(class_name, class_count))

malignant: 212
benign: 357


In [6]:
# Hold out 25% of the samples as a test set.
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; prefer its replacement `sklearn.model_selection`, and fall back to
# the legacy module only on installations that predate it.
try:
    from sklearn import model_selection as cv
except ImportError:  # scikit-learn < 0.18
    from sklearn import cross_validation as cv

# A fixed random_state makes the split identical on every run, so the
# notebook gives the same partition after Restart & Run All.
data_train, data_test, target_train, target_test = cv.train_test_split(
    data, target, train_size=0.75, random_state=0
)

In [7]:
from sklearn.ensemble import RandomForestClassifier as RFC

# Fit a random forest on the training split. The forest is randomised, so a
# fixed random_state keeps the fitted model (and the metrics computed from
# it) the same from run to run.
clf = RFC(random_state=0)
clf.fit(data_train, target_train)

# Hard class labels, used by the label-based metrics.
predict = clf.predict(data_test)
# Per-class probabilities: the precision-recall curve needs a scalar score
# per sample rather than a hard label.
predict_proba = clf.predict_proba(data_test)

# Metrics

In [8]:
from sklearn import metrics

In [9]:
# Accuracy: the fraction of test samples whose predicted label equals the
# true label.
metrics.accuracy_score(y_true=target_test, y_pred=predict)

0.97202797202797198

In [10]:
# Confusion matrix.
# The arguments are given as (predictions, true labels), so rows index the
# predicted class and columns index the true class -- the transpose of
# scikit-learn's usual (y_true, y_pred) layout.
confusion = metrics.confusion_matrix(predict, target_test)
print(confusion)

[[56  2]
 [ 2 83]]


> 行(縦軸)が predict、列(横軸)が target_test。scikit-learn の標準の引数順は `(y_true, y_pred)` なので、ここでは標準とは転置した向きで表示している点に注意。

## precision-recall関係
> [神嶌先生の解説参照](http://ibisforest.org/index.php?F%E5%80%A4)

In [11]:
# Precision, recall, F1 score and support (the number of samples whose TRUE
# label is each class).
# scikit-learn expects (y_true, y_pred). With the arguments the other way
# round, the precision and recall columns trade places and "support" counts
# predictions instead of true labels.
print(metrics.classification_report(target_test, predict, target_names=breast.target_names))

             precision    recall  f1-score   support

  malignant       0.97      0.97      0.97        58
     benign       0.98      0.98      0.98        85

avg / total       0.97      0.97      0.97       143



In [None]:
# Same per-class numbers as arrays, for when precision / recall need to be
# reused in later computations.
# Argument order is (y_true, y_pred); passing the predictions first would
# swap precision with recall and compute support from the predicted labels.
precision, recall, fscore, support = metrics.precision_recall_fscore_support(
    target_test, predict
)
print(precision)
print(recall)
print(fscore)
print(support)

[ 0.96551724  0.97647059]
[ 0.96551724  0.97647059]
[ 0.96551724  0.97647059]
[58 85]


In [None]:
# Precision-recall curve, computed from the predicted probability of class 1
# (column 1 of predict_proba).
precision, recall, thresholds = metrics.precision_recall_curve(target_test, predict_proba[:, 1])

plt.plot(recall, precision)
# Label the axes so the figure can be read without looking at the code:
# recall is on the x axis, precision on the y axis.
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")

In [None]:
# Tabulate the points of the precision-recall curve, one row per point.
pd.DataFrame(
    {"precision": precision, "recall": recall},
    columns=["precision", "recall"],
)

In [None]:
# Area under the precision-recall curve (x = recall, y = precision).
pr_auc = metrics.auc(recall, precision)
pr_auc

In [None]:
# Take as the precision cut-off the precision of the last curve point before
# recall first drops below the floor.
recall_floor = 0.8
first_below = next(
    (idx for idx, value in enumerate(recall) if value < recall_floor),
    None,
)
if first_below == 0:
    # The very first point is already below the floor, so there is no
    # earlier point whose precision could be reported.
    print("error")
elif first_below is not None:
    print("criteria= {0:.3}".format(precision[first_below - 1]))

In [None]:
# ROC curve from the predicted probability of class 1 (column 1 of
# predict_proba); the area under it is then computed with the generic
# auc helper.
positive_score = predict_proba[:, 1]
fp_ratio, tp_ratio, thresholds = metrics.roc_curve(target_test, positive_score)
metrics.auc(fp_ratio, tp_ratio)

In [None]:
# For ROC curves there is a dedicated function that returns the AUC directly
# from the true labels and the scores.
metrics.roc_auc_score(y_true=target_test, y_score=predict_proba[:, 1])

In [None]:
# Plot the ROC curve: false-positive rate on the x axis, true-positive rate
# on the y axis. The axes are labelled so the figure stands on its own.
plt.plot(fp_ratio, tp_ratio)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")

# スコアリング関数

In [None]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20. It is tried
# first so that `cv.*` calls written against its legacy signatures keep
# working where it is still available; otherwise fall back to its successor
# `sklearn.model_selection` so this cell no longer raises ImportError.
try:
    from sklearn import cross_validation as cv
except ImportError:
    from sklearn import model_selection as cv

In [None]:
# List the scorer names that can be passed as `scoring=` to the
# cross-validation helpers.
# `metrics.scorer` is a private module path that is absent from current
# scikit-learn releases; go through the public `sklearn.metrics` namespace,
# preferring get_scorer_names() where it exists and falling back to the
# SCORERS registry on older versions.
if hasattr(metrics, "get_scorer_names"):
    scorer_names = list(metrics.get_scorer_names())
else:
    scorer_names = list(metrics.SCORERS.keys())
print(scorer_names)

In [None]:
# Scoring can be customised: any function of the form
#     score_func(y_true, y_pred)
# that returns a scalar can be wrapped into a scorer.
# make_scorer is exposed in the public `sklearn.metrics` namespace; the
# `metrics.scorer` module path is private and absent from current releases.
# NOTE: with make_scorer's default options the metric receives the
# estimator's predict() output (hard labels), not continuous scores, so this
# scorer computes ROC AUC from labels.
custom_scorer = metrics.make_scorer(metrics.roc_auc_score)

In [None]:
# Stratified, shuffled 5-fold cross-validation of the classifier on the
# training split, scored with the custom scorer.
# The splitter API changed when `cross_validation` became `model_selection`:
# the new class takes `n_splits` and no labels in the constructor, the legacy
# one takes the labels and `n_folds`. Importing here keeps the cell
# independent of whichever module the `cv` alias is bound to.
# random_state fixes the shuffle so the folds are the same on every run.
try:
    from sklearn.model_selection import StratifiedKFold, cross_val_score
    cv_method = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import StratifiedKFold, cross_val_score
    cv_method = StratifiedKFold(target_train, n_folds=5, shuffle=True, random_state=0)

cross_val_score(clf, data_train, target_train, cv=cv_method, scoring=custom_scorer)

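> 実際には、scoring="roc_auc" で使われる組み込みのSCORERがほぼ同じ役割を果たす。ただし組み込み版は predict のラベルではなく、decision_function / predict_proba の連続スコアからAUCを計算するため、上の custom_scorer とは値が一致しないことがある。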

In [None]:
# Inspect the built-in "roc_auc" scorer. get_scorer is the public accessor;
# the `metrics.scorer.SCORERS` path is private and absent from current
# scikit-learn releases.
metrics.get_scorer("roc_auc")