### 对于极度偏斜的数据，只使用分类准确度是不够的

#### TN(真值：N，预测值：N), FP(真值：N，预测值：P), FN(真值：P，预测值：N), TP(真值：P，预测值：P)
#### precision = TP / (TP + FP) , recall = TP / (TP + FN) , F1 = 2 * precision * recall / (precision + recall)

#### 有时会注重精准率，如股票预测
#### 有时会注重召回率，如病人诊断

In [1]:
import numpy as np
import sklearn.datasets as ds
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load the digits dataset through sklearn.datasets.
digits = ds.load_digits()

In [3]:
# Features, plus a *copy* of the labels so that they can be rewritten
# without modifying digits.target itself.
X = digits.data
y = digits.target.copy()

In [4]:
# Turn the digit labels into a skewed binary problem:
# digit 9 becomes the positive class (1), every other digit becomes 0.
# The assignment is in place, so y keeps its original dtype.
y[:] = np.where(digits.target == 9, 1, 0)

In [5]:
# Display the shape of the label vector.
y.shape

(1797,)

In [6]:
# Default hyperparameters. NOTE: with these defaults the recorded fit output
# reports that the iteration limit (max_iter=100) was reached on this data.
log_res = LogisticRegression()

In [7]:
# Split into train and test parts; the fixed random_state keeps the split
# (and therefore the numbers shown in this notebook) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

In [8]:
# Train on the training split only.
log_res.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
# Accuracy on the test split. On skewed labels this number alone is not
# enough to judge the classifier, hence the metrics computed further down.
log_res.score(X_test, y_test)

0.9755555555555555

In [10]:
# Predicted labels for the test split; every metric below is computed from
# y_test and y_predict.
y_predict = log_res.predict(X_test)

In [11]:
def TN(y_test, y_predict):
    """Count true negatives: true label 0 and predicted label 0.

    Args:
        y_test: Ground-truth binary labels (0/1); any array-like.
        y_predict: Predicted binary labels (0/1), same shape as ``y_test``.

    Returns:
        Number of positions where both the true and predicted label are 0.
    """
    # Coerce to arrays first: comparing a plain list with ``== 0`` yields a
    # single ``False`` instead of an element-wise mask, which would silently
    # produce a count of 0.
    y_test = np.asarray(y_test)
    y_predict = np.asarray(y_predict)
    return np.sum((y_test == 0) & (y_predict == 0))

def FP(y_test, y_predict):
    """Count false positives: true label 0 but predicted label 1.

    Args:
        y_test: Ground-truth binary labels (0/1); any array-like.
        y_predict: Predicted binary labels (0/1), same shape as ``y_test``.

    Returns:
        Number of positions where the true label is 0 and the prediction is 1.
    """
    # Coerce to arrays first: comparing a plain list with ``== 0`` yields a
    # single ``False`` instead of an element-wise mask, which would silently
    # produce a count of 0.
    y_test = np.asarray(y_test)
    y_predict = np.asarray(y_predict)
    return np.sum((y_test == 0) & (y_predict == 1))

def FN(y_test, y_predict):
    """Count false negatives: true label 1 but predicted label 0.

    Args:
        y_test: Ground-truth binary labels (0/1); any array-like.
        y_predict: Predicted binary labels (0/1), same shape as ``y_test``.

    Returns:
        Number of positions where the true label is 1 and the prediction is 0.
    """
    # Coerce to arrays first: comparing a plain list with ``== 1`` yields a
    # single ``False`` instead of an element-wise mask, which would silently
    # produce a count of 0.
    y_test = np.asarray(y_test)
    y_predict = np.asarray(y_predict)
    return np.sum((y_test == 1) & (y_predict == 0))

def TP(y_test, y_predict):
    """Count true positives: true label 1 and predicted label 1.

    Args:
        y_test: Ground-truth binary labels (0/1); any array-like.
        y_predict: Predicted binary labels (0/1), same shape as ``y_test``.

    Returns:
        Number of positions where both the true and predicted label are 1.
    """
    # Coerce to arrays first: comparing a plain list with ``== 1`` yields a
    # single ``False`` instead of an element-wise mask, which would silently
    # produce a count of 0.
    y_test = np.asarray(y_test)
    y_predict = np.asarray(y_predict)
    return np.sum((y_test == 1) & (y_predict == 1))

In [12]:
def precision(y_test, y_predict):
    """Return the precision, TP / (TP + FP), of binary predictions.

    Args:
        y_test: Ground-truth binary labels (0/1); any array-like.
        y_predict: Predicted binary labels (0/1), same shape as ``y_test``.

    Returns:
        The fraction of positive predictions that are correct, or 0.0 when
        there are no true or false positives at all.
    """
    y_test = np.asarray(y_test)
    y_predict = np.asarray(y_predict)
    predicted_positive = y_predict == 1
    true_positives = np.sum((y_test == 1) & predicted_positive)
    false_positives = np.sum((y_test == 0) & predicted_positive)
    denominator = true_positives + false_positives
    # NumPy scalar division by zero does not raise: it returns nan and emits
    # a RuntimeWarning. A try/except around the division therefore never
    # fires, so the empty case has to be tested explicitly.
    if denominator == 0:
        return 0.0
    return true_positives / denominator

def recall(y_test, y_predict):
    """Return the recall, TP / (TP + FN), of binary predictions.

    Args:
        y_test: Ground-truth binary labels (0/1); any array-like.
        y_predict: Predicted binary labels (0/1), same shape as ``y_test``.

    Returns:
        The fraction of actual positives that were predicted positive, or
        0.0 when there are no true positives or false negatives at all.
    """
    y_test = np.asarray(y_test)
    y_predict = np.asarray(y_predict)
    actual_positive = y_test == 1
    true_positives = np.sum(actual_positive & (y_predict == 1))
    false_negatives = np.sum(actual_positive & (y_predict == 0))
    denominator = true_positives + false_negatives
    # NumPy scalar division by zero does not raise: it returns nan and emits
    # a RuntimeWarning. A try/except around the division therefore never
    # fires, so the empty case has to be tested explicitly.
    if denominator == 0:
        return 0.0
    return true_positives / denominator
    
def f1_score(y_test, y_predict):
    """Return the F1 score: the harmonic mean of precision and recall.

    Args:
        y_test: Ground-truth binary labels (0/1).
        y_predict: Predicted binary labels (0/1), same shape as ``y_test``.

    Returns:
        2 * precision * recall / (precision + recall), or 0.0 when that sum
        is zero or undefined.
    """
    precision_value = precision(y_test, y_predict)
    recall_value = recall(y_test, y_predict)
    denominator = precision_value + recall_value
    # Dividing NumPy floats by zero yields nan with a RuntimeWarning rather
    # than an exception, so a try/except cannot catch it. ``not (... > 0)``
    # covers both a zero sum and a nan sum (nan compares False to everything).
    if not denominator > 0:
        return 0.0
    return 2 * precision_value * recall_value / denominator

In [13]:
# Precision of the classifier on the test split (hand-written metric).
precision(y_test, y_predict)

0.9473684210526315

In [14]:
# Recall of the classifier on the test split (hand-written metric).
recall(y_test, y_predict)

0.8

In [15]:
# F1 score on the test split, combining the precision and recall above.
f1_score(y_test, y_predict)

0.8674698795180723

In [16]:
def confusion_matrix(y_test, y_predict):
    """Build the 2x2 confusion matrix for binary labels.

    Rows follow the true class (0, then 1) and columns follow the predicted
    class (0, then 1), giving the layout ``[[TN, FP], [FN, TP]]``.

    Args:
        y_test: Ground-truth binary labels (0/1).
        y_predict: Predicted binary labels (0/1).

    Returns:
        A 2x2 NumPy array of counts.
    """
    actual_negative_row = [TN(y_test, y_predict), FP(y_test, y_predict)]
    actual_positive_row = [FN(y_test, y_predict), TP(y_test, y_predict)]
    return np.array([actual_negative_row, actual_positive_row])

In [17]:
# Hand-written confusion matrix, laid out as [[TN, FP], [FN, TP]].
confusion_matrix(y_test, y_predict)

array([[403,   2],
       [  9,  36]])

### scikit-learn

In [18]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [19]:
# From here on, confusion_matrix is the sklearn.metrics function: the import
# in the previous cell rebinds the name used by the hand-written version.
confusion_matrix(y_test, y_predict)

array([[403,   2],
       [  9,  36]])

In [20]:
# scikit-learn's precision on the same labels and predictions.
precision_score(y_test, y_predict)

0.9473684210526315

In [21]:
# scikit-learn's recall on the same labels and predictions.
recall_score(y_test, y_predict)

0.8

In [22]:
# f1_score now refers to the sklearn.metrics function: the import above
# rebinds the name used by the hand-written version.
f1_score(y_test, y_predict)

0.8674698795180723