In [24]:
# Базовые библиотеки
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Reproducibility: random seed constant for the notebook
RANDOM_STATE = 42

# Библиотеки sklearn для данных, моделей, разбиения и метрик
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    RocCurveDisplay,
    PrecisionRecallDisplay,
    brier_score_loss,
)

# Для калибровки
from sklearn.calibration import CalibratedClassifierCV

In [31]:
# Load the dataset from a comma-separated file.
# The path is relative, so it is resolved against the current working directory.
url = "S05-hw-dataset.csv"
data = pd.read_csv(filepath_or_buffer=url, sep=",")
data

Unnamed: 0,client_id,age,income,years_employed,credit_score,debt_to_income,num_credit_cards,num_late_payments,has_mortgage,has_car_loan,savings_balance,checking_balance,region_risk_score,phone_calls_to_support_last_3m,active_loans,customer_tenure_years,default
0,1,25,94074,22,839,0.547339,1,7,0,0,26057,5229,0.080052,19,1,8,0
1,2,58,51884,26,565,0.290882,1,1,0,1,16221,11595,0.428311,15,0,7,0
2,3,53,48656,39,561,0.522340,1,13,0,0,55448,-2947,0.770883,15,4,5,0
3,4,42,81492,30,582,0.709123,2,10,1,1,35188,17727,0.357619,0,2,7,1
4,5,42,94713,8,642,0.793392,3,3,0,0,0,-404,0.414260,17,1,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,51,131729,6,687,0.283093,3,1,1,0,13136,14209,0.251680,19,3,2,0
2996,2997,35,54454,36,634,0.396180,5,5,0,0,0,10858,0.506146,2,2,12,0
2997,2998,38,63305,16,573,0.577888,2,13,0,1,24870,6689,0.366375,16,2,10,1
2998,2999,42,55606,33,689,0.659278,6,9,1,0,32677,3252,0.295284,16,2,8,1


In [6]:
data.head(5)  # show the first 5 rows

Unnamed: 0,client_id,age,income,years_employed,credit_score,debt_to_income,num_credit_cards,num_late_payments,has_mortgage,has_car_loan,savings_balance,checking_balance,region_risk_score,phone_calls_to_support_last_3m,active_loans,customer_tenure_years,default
0,1,25,94074,22,839,0.547339,1,7,0,0,26057,5229,0.080052,19,1,8,0
1,2,58,51884,26,565,0.290882,1,1,0,1,16221,11595,0.428311,15,0,7,0
2,3,53,48656,39,561,0.52234,1,13,0,0,55448,-2947,0.770883,15,4,5,0
3,4,42,81492,30,582,0.709123,2,10,1,1,35188,17727,0.357619,0,2,7,1
4,5,42,94713,8,642,0.793392,3,3,0,0,0,-404,0.41426,17,1,10,1


In [None]:
data.info()  # column names, dtypes and non-null counts

In [None]:
data.describe()  # descriptive statistics

In [18]:
data["default"].value_counts(normalize=True)  # class shares of the target (normalize=True -> proportions, not counts)

default
0    0.589667
1    0.410333
Name: proportion, dtype: float64

In [None]:
# Build the feature matrix and the target vector.
# Both the target ("default") and "client_id" are removed from the features.

X = data.drop(columns=["default", "client_id"])
Y = data.loc[:, "default"]
print(X.shape, Y.shape, sep="\n")


(3000, 15)
(3000,)


In [None]:
# Split into training and test sets (test share 0.2), stratified by the target
# so both parts keep the same class proportions.

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=RANDOM_STATE,  # seed constant from the setup cell instead of a repeated literal
    stratify=Y,
)
print("Обучающие данные:",X_train.shape, Y_train.shape)
print("Тестовые данные:",X_test.shape, Y_test.shape)

Обучающие данные: (2400, 15) (2400,)
Тестовые данные: (600, 15) (600,)


In [11]:
# Baseline: a "dumb" model that ignores the features and always predicts the
# most frequent class of the training target.
# NOTE: Y_pred / Y_proba are reassigned later by the grid-search cell; only
# dummy_acc / dummy_roc are reused for the final comparison table.

dummy = DummyClassifier(strategy="most_frequent", random_state=RANDOM_STATE)
dummy.fit(X_train, Y_train)

Y_pred = dummy.predict(X_test)
# Column 1 of predict_proba is the second class (default == 1 in this dataset).
Y_proba = dummy.predict_proba(X_test)[:, 1]

dummy_acc = accuracy_score(Y_test, Y_pred)
dummy_roc = roc_auc_score(Y_test, Y_proba)

print(f"dummy: accuracy = {dummy_acc:.4f}, ROC-AUC = {dummy_roc:.4f}")

dummy: accuracy = 0.5900, ROC-AUC = 0.5000


In [16]:
# Build the pipeline: feature standardization followed by logistic regression.
# Keeping the scaler inside the pipeline means it is fitted together with the
# model on whatever data the pipeline is fitted on.

pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        # Seed constant from the setup cell instead of a repeated literal.
        ("logreg", LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)),
    ]
)
pipe

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('scaler', ...), ('logreg', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"copy  copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.",True
,"with_mean  with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.",True
,"with_std  with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).",True

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",42
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. 
seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [28]:
# Tune two hyperparameters with GridSearchCV: the inverse regularization
# strength (C) and class_weight (class-imbalance handling).

param_grid = {
    "logreg__C": [0.01, 0.1, 1.0, 10.0],
    "logreg__class_weight": [None, "balanced"],
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="roc_auc",  # candidates are ranked by cross-validated ROC-AUC
    cv=5,
    n_jobs=-1,
    refit=True,  # refit the best candidate on the full training set
)

grid.fit(X_train, Y_train)

print("Лучшие параметры:", grid.best_params_)
# best_score_ is a single number: the mean cross-validated ROC-AUC of the best
# candidate. The label was misspelled and read as if two values were printed.
print("Лучший ROC-AUC на кросс-валидации:", grid.best_score_)

best_model = grid.best_estimator_

# Evaluate the refitted best model on the held-out test set.
Y_pred = best_model.predict(X_test)
Y_proba = best_model.predict_proba(X_test)[:, 1]

acc = accuracy_score(Y_test, Y_pred)
roc = roc_auc_score(Y_test, Y_proba)
prec = precision_score(Y_test, Y_pred)
rec = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

print(f"Метрики на тестовой выборке| accuracy={acc:.4f} | roc_auc={roc:.4f} | precision={prec:.4f} | recall={rec:.4f} | f1={f1:.4f}")

Лучшие параметры: {'logreg__C': 10.0, 'logreg__class_weight': None}
Лучише CV и ROC-AUC: 0.8606590016322577
Метрики на тестовой выборке| accuracy=0.8000 | roc_auc=0.8756 | precision=0.7944 | recall=0.6911 | f1=0.7391


In [None]:
# Plot the ROC curve of the tuned model, save it to figures/ and show it inline.

figures_dir = Path("figures")
figures_dir.mkdir(exist_ok=True)

# Create the figure and axes explicitly and pass the axes to scikit-learn.
# Without `ax=`, RocCurveDisplay.from_predictions draws on a new figure of its
# own, so a figure opened beforehand with plt.figure() stays empty and is the
# one the notebook ends up displaying.
fig, ax = plt.subplots()
RocCurveDisplay.from_predictions(Y_test, Y_proba, ax=ax)
ax.set_title("ROC кривая лучшей модели логистической регрессии")
fig.savefig(figures_dir / "roc_curve_logreg_grid.png", dpi=150, bbox_inches="tight")
plt.show()
plt.close(fig)  # release the figure explicitly; harmless if it is already closed

<Figure size 640x480 with 0 Axes>

In [None]:
# Compare the baseline and the tuned logistic regression on the test set.

comparison_df = pd.DataFrame(
    {
        "Модель": ["DummyClassifier", "Логистическая регрессия"],
        # Column header was misspelled as "Accurecy".
        "Accuracy": [dummy_acc, acc],
        "ROC-AUC": [dummy_roc, roc],
    }
)
comparison_df

Unnamed: 0,Модель,Accurecy,ROC-AUC
0,DummyClassifier,0.59,0.5
1,Логистическая регрессия,0.8,0.875637


Бейзлайн-модель не использует признаки и всегда предсказывает наиболее частый класс. Её значение ROC-AUC равно 0.5, что соответствует случайному угадыванию, в то время как логистическая регрессия показала себя намного лучше: accuracy выросла с 0.59 до 0.80, а ROC-AUC — с 0.50 до 0.876. Подбор параметра регуляризации C при помощи GridSearchCV позволил улучшить качество модели. Логистическая регрессия — более разумный выбор, так как она обеспечила более высокое качество.