### 精准率和召回率的平衡

In [1]:
import numpy as np
from sklearn import datasets

In [2]:
digits = datasets.load_digits()
X = digits.data

# Simulate a skewed binary problem: digit 9 is the positive class (1) and
# every other digit is the negative class (0). The comparison allocates a
# fresh array, so digits.target itself is never modified.
y = (digits.target == 9).astype(digits.target.dtype)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=666,
)

In [4]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
# NOTE(review): no solver is specified, so the result depends on the installed
# scikit-learn's default (liblinear before 0.22, lbfgs from 0.22 on) -- confirm
# which one produced the score recorded below if exact reproduction matters.
log_reg = LogisticRegression().fit(X_train, y_train)

# Score on the held-out test set.
log_reg.score(X_test, y_test)



0.9755555555555555

In [7]:
# Class predictions for the test set, used for the F1 score computed next.
y_log_predict = log_reg.predict(X_test)

In [21]:
from sklearn.metrics import f1_score, confusion_matrix, recall_score, precision_score

# F1 of the model's own predict() output: the baseline that the
# custom-threshold experiments further down are compared against.
f1_score(y_true=y_test, y_pred=y_log_predict)

0.8674698795180723

In [29]:
# Logistic regression classifies with a decision threshold of 0 by default.
# The raw decision scores let us move that threshold ourselves and observe
# how precision and recall change in response.
log_reg.decision_function(X_test)

array([-22.05699728, -33.02941489, -16.21335088, -80.37916297,
       -48.25125309, -24.54006521, -44.39168581, -25.0429477 ,
        -0.97828839, -19.71745384, -66.25139278, -51.09603001,
       -31.49349159, -46.05334697, -38.67877272, -29.80472044,
       -37.58850411, -82.57570297, -37.81904378, -11.01165598,
        -9.17440282, -85.13004858, -16.71616451, -46.2372651 ,
        -5.32994343, -47.91762424, -11.66730613, -39.19606675,
       -25.25294596, -14.36647263, -16.99784322, -28.91906343,
       -34.33942078, -29.47608376,  -7.85813331,  -3.82093432,
       -24.08165466, -22.16362793, -33.61221199, -23.14024239,
       -26.91805368, -62.38938376, -38.85691793, -66.7726097 ,
       -20.14483425, -17.47887025, -18.06800222, -22.22226081,
       -29.62304646, -19.73171496,   1.49551643,   8.32080732,
       -36.2931167 , -42.50733657, -25.90459635, -34.98961158,
        -8.42012522, -50.04726791, -51.48209177,  19.88959953,
        -8.91888592, -31.99344717, -11.66100579,  -0.47

In [10]:
# Show only the first 10 decision scores rather than the whole array.
log_reg.decision_function(X_test)[: 10]

array([-22.05699728, -33.02941489, -16.21335088, -80.37916297,
       -48.25125309, -24.54006521, -44.39168581, -25.0429477 ,
        -0.97828839, -19.71745384])

In [12]:
# Predicted labels for the same first 10 test samples, for comparison with
# their decision scores.
log_reg.predict(X_test)[: 10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [13]:
# Keep the raw decision scores so custom thresholds can be applied to them.
decision_scores = log_reg.decision_function(X_test)

In [15]:
# Lowest decision score on the test set (lower end of the threshold range).
decision_scores.min()

-85.68609464141575

In [16]:
# Highest decision score on the test set (upper end of the threshold range).
decision_scores.max()

19.889599525838047

In [17]:
# Stricter threshold: only samples whose score is at least 5 are labelled 1.
y2_predict = (decision_scores >= 5).astype(int)

In [20]:
# Confusion matrix for the threshold-5 predictions.
confusion_matrix(y_true=y_test, y_pred=y2_predict)

array([[404,   1],
       [ 21,  24]])

In [22]:
# Precision for the threshold-5 predictions.
precision_score(y_true=y_test, y_pred=y2_predict)

0.96

In [23]:
# Recall for the threshold-5 predictions.
recall_score(y_true=y_test, y_pred=y2_predict)

0.5333333333333333

In [25]:
# Looser threshold: any sample whose score is at least -5 is labelled 1.
y3_predict = (decision_scores >= -5).astype(int)

In [26]:
# Confusion matrix for the threshold -5 predictions.
confusion_matrix(y_true=y_test, y_pred=y3_predict)

array([[390,  15],
       [  5,  40]])

In [27]:
# Precision for the threshold -5 predictions.
precision_score(y_true=y_test, y_pred=y3_predict)

0.7272727272727273

In [28]:
# Recall for the threshold -5 predictions.
recall_score(y_true=y_test, y_pred=y3_predict)

0.8888888888888888