In [1]:
import numpy as np
import sklearn.metrics
import scipy.stats
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Load the breast-cancer dataset once and unpack features / labels directly,
# instead of invoking the loader a second time just to read `.target`.
data, target = load_breast_cancer(return_X_y=True)

Модель

In [3]:
# Pin the solver explicitly ('liblinear' is what the recorded estimator repr
# reports) so the fitted model does not depend on the library's default solver.
model = LogisticRegression(solver='liblinear', random_state=42)

In [4]:
# Fit the baseline classifier on the full dataset (no train/test split here).
model.fit(X=data, y=target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Считаем Accuracy

In [5]:
# Accuracy by hand: the share of samples whose predicted label equals the true
# label (true positives + true negatives over all samples). The model is asked
# for predictions once and the comparison is vectorised, instead of predicting
# twice and summing element by element in Python.
np.mean(model.predict(data) == target)

0.95957820738137078

In [6]:
# Cross-check the hand-computed accuracy against the library implementation.
sklearn.metrics.accuracy_score(y_true=target, y_pred=model.predict(data))

0.95957820738137078

Пишем функцию для задачи 1 и убеждаемся, что если в качестве alpha передать долю объектов, которые модель относит к классу 1, функция возвращает обычный recall модели

In [7]:
def custom_scorer_1(y_true, pred_proba, alpha = 0.1):
    """Recall obtained when the top ``alpha`` share of samples is labelled positive.

    The decision threshold is the ``100 - 100 * alpha`` percentile of the
    positive-class probabilities; samples strictly above it are predicted
    positive, and recall is computed for that prediction.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Binary ground-truth labels (0 = negative, non-zero = positive).
    pred_proba : array-like of shape (n_samples, 2) or (n_samples,)
        Either the two-column class-probability matrix (column 1 is the
        positive class) or the positive-class probabilities alone.
    alpha : real number, default 0.1
        Share of samples to flag as positive, expected in [0, 1]. Built-in and
        NumPy ints/floats are accepted.

    Returns
    -------
    float or int
        Recall at the derived threshold. ``0`` when ``alpha`` is not a real
        number, ``0.0`` when ``y_true`` contains no positive samples.
    """
    # Accept any real scalar (an exact `type(alpha) != float` check rejects
    # np.float64 and int). bool and non-numeric values still yield 0.
    is_real = isinstance(alpha, (int, float, np.integer, np.floating))
    if isinstance(alpha, bool) or not is_real:
        return 0

    actual_positive = np.asarray(y_true).astype(bool)
    scores = np.asarray(pred_proba)
    if scores.ndim == 2:
        # Two-column probability matrix: keep the positive-class column.
        scores = scores[:, 1]

    n_positive = np.count_nonzero(actual_positive)
    if n_positive == 0:
        # Recall is undefined without positives; report 0.0 rather than 0/0.
        return 0.0

    cutoff = np.percentile(scores, 100 - 100 * alpha)
    predicted_positive = scores > cutoff

    return np.count_nonzero(predicted_positive & actual_positive) / n_positive

In [8]:
# alpha = share of samples the fitted model labels as class 1. It is converted
# with float(...) instead of np.asscalar, which is deprecated in NumPy and
# absent from recent releases; the scorer receives a built-in float either way.
custom_scorer_1(target, model.predict_proba(data), float(model.predict(data).mean()))

0.97478991596638653

In [9]:
# Recall by hand: true positives divided by all actual positives.
np.sum(target & model.predict(data)) / np.sum(target)

0.97478991596638653

Пишем функцию для задачи 2. Целевой recall оставляем параметром (по условию задачи он должен быть равен 0.99) и проверяем, что если задать целевой recall равным recall построенной регрессии, то precision совпадёт с precision, полученным из самой модели

In [10]:
def custom_scorer_2(y_true, pred_proba, alpha = 0.99):
    """Precision at the threshold chosen to reach a target recall of ``alpha``.

    The threshold is the ``100 - 100 * alpha`` percentile of the positive-class
    probabilities taken over the truly positive samples only; samples strictly
    above it are predicted positive, and precision is computed for that
    prediction.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Binary ground-truth labels (0 = negative, non-zero = positive).
    pred_proba : array-like of shape (n_samples, 2) or (n_samples,)
        Either the two-column class-probability matrix (column 1 is the
        positive class) or the positive-class probabilities alone.
    alpha : float, default 0.99
        Target recall, expected in [0, 1].

    Returns
    -------
    float
        Precision at the derived threshold; ``0.0`` when there are no positive
        samples or when nothing is predicted positive.
    """
    actual_positive = np.asarray(y_true).astype(bool)
    scores = np.asarray(pred_proba)
    if scores.ndim == 2:
        # Two-column probability matrix: keep the positive-class column.
        scores = scores[:, 1]

    if not actual_positive.any():
        # No positives means no threshold can be derived; avoid a NaN score.
        return 0.0

    # Percentile over the positives only, selected by boolean indexing.
    # nanpercentile is kept so NaN scores are ignored when picking the cutoff.
    cutoff = np.nanpercentile(scores[actual_positive], 100 - 100 * alpha)
    predicted_positive = scores > cutoff

    n_predicted = np.count_nonzero(predicted_positive)
    if n_predicted == 0:
        # Precision is undefined without predictions; report 0.0, not 0/0.
        return 0.0

    return np.count_nonzero(predicted_positive & actual_positive) / n_predicted

In [11]:
# Target recall = recall of the fitted model's own predictions, computed here
# rather than pasted in as a literal, so the check stays valid if the model
# or the data changes.
custom_scorer_2(target, model.predict_proba(data), np.sum(target & model.predict(data)) / np.sum(target))

0.96132596685082872

In [12]:
# Precision by hand: true positives divided by everything predicted positive.
np.sum(target & model.predict(data)) / np.sum(model.predict(data))

0.96132596685082872

Создаем скореры и оптимизируем результат

In [13]:
# Wrap the metric function in a scorer object that is fed predicted probabilities.
# The name `custom_scorer_1` is rebound here: after this cell it refers to the
# scorer object, no longer to the plain function, so re-run the cell defining
# the function before re-running this one.
# NOTE(review): newer scikit-learn releases replace `needs_proba` with
# `response_method` - confirm against the installed version.
custom_scorer_1 = sklearn.metrics.make_scorer(custom_scorer_1,needs_proba=True)

In [14]:
# Wrap the metric function in a scorer object that is fed predicted probabilities.
# The name `custom_scorer_2` is rebound here: after this cell it refers to the
# scorer object, no longer to the plain function, so re-run the cell defining
# the function before re-running this one.
# NOTE(review): newer scikit-learn releases replace `needs_proba` with
# `response_method` - confirm against the installed version.
custom_scorer_2 = sklearn.metrics.make_scorer(custom_scorer_2,needs_proba=True)

In [15]:
# Fresh, unfitted estimator for the hyper-parameter searches. The solver is
# pinned to 'liblinear' because the search space samples penalty='l1', and the
# result should not depend on whatever solver the library uses by default.
model = LogisticRegression(solver='liblinear', random_state=42)

In [16]:
# Search space for RandomizedSearchCV: lists are sampled uniformly,
# scipy distributions are sampled through their `rvs` method.
param_dist = {
    'penalty': ['l1', 'l2'],
    'C': scipy.stats.randint(1, 1000),         # integers 1..999 (upper bound exclusive)
    'fit_intercept': [True, False],            # real booleans rather than 1/0
    'max_iter': scipy.stats.randint(1, 1000),  # integers 1..999
    'tol': 10. ** np.arange(-1, -10, -1),      # 1e-1, 1e-2, ..., 1e-9
}

In [26]:
%%time

optimize_accuracy = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=100, cv = 3, random_state = 42, scoring = 'accuracy')
optimize_accuracy.fit(data, target)
print(optimize_accuracy.best_params_, optimize_accuracy.best_score_)

{'C': 36, 'fit_intercept': 1, 'max_iter': 20, 'penalty': 'l1', 'tol': 1e-08} 0.970123022847
Wall time: 11min 29s


In [27]:
%%time

optimize_custom_1 = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=100, cv = 3, random_state = 42, scoring = custom_scorer_1)
optimize_custom_1.fit(data, target)
print(optimize_custom_1.best_params_, optimize_custom_1.best_score_)

  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)


{'C': 103, 'fit_intercept': 0, 'max_iter': 861, 'penalty': 'l1', 'tol': 1e-08} 0.159663865546
Wall time: 13min 8s


In [28]:
%%time

optimize_custom_2 = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=100, cv = 3, random_state = 42, scoring = custom_scorer_2)
optimize_custom_2.fit(data, target)
print(optimize_custom_2.best_params_, optimize_custom_2.best_score_)

  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)


{'C': 736, 'fit_intercept': 1, 'max_iter': 385, 'penalty': 'l1', 'tol': 0.01} 0.969857681312
Wall time: 13min 33s


  np.exp(prob, prob)
