In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

def fill_nan(table):
    """Return a copy of ``table`` with missing values replaced by column medians.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame that may contain missing values.

    Returns
    -------
    pandas.DataFrame
        A new frame. The frame passed in is not modified, so re-running the
        calling cell always starts from the same input. Columns that
        ``median(numeric_only=True)`` skips (e.g. text columns) are returned
        unchanged instead of raising.
    """
    # numeric_only=True keeps a stray non-numeric column from raising in median().
    column_medians = table.median(numeric_only=True)
    # Filling with a Series matches its labels against the column names,
    # so each column receives its own median.
    return table.fillna(column_medians)

In [3]:
# Read the semicolon-separated credit-scoring sample and fill missing values
# with per-column medians (see fill_nan).
data = fill_nan(pd.read_csv("../data/credit_scoring_sample.csv", sep=";"))

# Target is the SeriousDlqin2yrs column; every other column is a feature.
y = data["SeriousDlqin2yrs"]
X = data.drop(columns="SeriousDlqin2yrs")

data.head()

Unnamed: 0,SeriousDlqin2yrs,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,MonthlyIncome,NumberOfDependents
0,0,64,0,0.249908,0,0,8158.0,0.0
1,0,58,0,3870.0,0,0,5166.0,0.0
2,0,41,0,0.456127,0,0,6666.0,0.0
3,0,43,0,0.00019,0,0,10500.0,2.0
4,1,49,0,0.27182,0,0,400.0,0.0


In [4]:
# Seed the global NumPy RNG so the random resampling in this cell is repeatable.
np.random.seed(0)

# Ages of the rows whose SeriousDlqin2yrs flag equals 1, as a NumPy array.
flagged_rows = data['SeriousDlqin2yrs'] == 1
churn_age = data.loc[flagged_rows, 'age'].values

def get_bootstrap_samples(data, n_samples):
    """Draw bootstrap resamples (sampling with replacement) from a 1-D sample.

    Parameters
    ----------
    data : array-like
        Observations to resample. The input is passed through ``np.asarray``,
        so a plain list is accepted as well as an ndarray.
    n_samples : int
        Number of resamples to draw.

    Returns
    -------
    numpy.ndarray
        For 1-D input, an array of shape ``(n_samples, len(data))`` whose
        rows are the individual resamples.

    Notes
    -----
    Indices are drawn from the global NumPy random state; call
    ``np.random.seed`` beforehand for reproducible output.
    """
    # Indexing with a 2-D integer array only works on an ndarray; a Python
    # list would raise TypeError here.
    values = np.asarray(data)
    size = len(values)
    indices = np.random.randint(0, size, (n_samples, size))
    return values[indices]

def stat_intervals(stat, alpha):
    """Percentile interval that leaves ``alpha / 2`` of ``stat`` in each tail.

    Parameters
    ----------
    stat : array-like
        Values of the statistic (e.g. one value per bootstrap resample).
    alpha : float
        Total tail mass; ``alpha=0.1`` yields the 5th and 95th percentiles.

    Returns
    -------
    numpy.ndarray
        Two-element array ``[lower, upper]``.
    """
    lower_pct = 100 * alpha / 2.
    upper_pct = 100 * (1 - alpha / 2.)
    return np.percentile(stat, [lower_pct, upper_pct])

# Mean age inside each of 1000 bootstrap resamples of churn_age.
age_resamples = get_bootstrap_samples(churn_age, 1000)
bootstrap_mean_ages = age_resamples.mean(axis=1)

# alpha = 0.1 -> interval between the 5th and 95th percentiles of those means.
stat_intervals(bootstrap_mean_ages, 0.1)

array([45.71379414, 46.12700479])

In [None]:
from scipy.stats import binom

# Probability of at least 3 successes in 5 trials with success probability 0.7,
# obtained by summing the binomial pmf over k = 3, 4, 5.
n_trials, p_success, min_successes = 5, 0.7, 3
prob = sum(
    binom.pmf(successes, n_trials, p_success)
    for successes in range(min_successes, n_trials + 1)
)
print(f"{prob:.4%}")

83.6920%


In [6]:
# Standardise every feature column before fitting.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Class-balanced logistic regression with a small C (strong regularisation),
# fitted on the standardised features.
lr_final = LogisticRegression(
    C=0.001,
    random_state=5,
    class_weight="balanced",
    solver='liblinear',
)
lr_final.fit(X_scaled, y)

# One row per feature: signed coefficient and its magnitude,
# displayed with the largest magnitude first.
scaled_coefs = lr_final.coef_[0]
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'coef': scaled_coefs,
    'abs_coef': np.abs(scaled_coefs),
})
feature_importance.sort_values(by='abs_coef', ascending=False)

Unnamed: 0,feature,coef,abs_coef
1,NumberOfTime30-59DaysPastDueNotWorse,0.724004,0.724004
3,NumberOfTimes90DaysLate,0.517673,0.517673
0,age,-0.416304,0.416304
4,NumberOfTime60-89DaysPastDueNotWorse,0.194732,0.194732
5,MonthlyIncome,-0.162864,0.162864
6,NumberOfDependents,0.101326,0.101326
2,DebtRatio,-0.024082,0.024082


In [7]:
# Softmax over the lr_final coefficients: exp(coef_i) / sum_j exp(coef_j).
coefs = lr_final.coef_[0]
exp_coefs = np.exp(coefs)
softmax_coefs = exp_coefs / exp_coefs.sum()

feature_impact = pd.DataFrame(
    {'feature': X.columns, 'softmax_impact': softmax_coefs}
)

# Print only the DebtRatio row.
is_debt_ratio = feature_impact['feature'] == 'DebtRatio'
print(feature_impact[is_debt_ratio])

     feature  softmax_impact
2  DebtRatio        0.114205


In [8]:
# Logistic regression fitted on the unscaled features, so the `age`
# coefficient refers to one unit of the raw `age` column.
lr_raw = LogisticRegression(
    C=0.001, random_state=5, class_weight="balanced", solver='liblinear'
).fit(X, y)

age_position = X.columns.get_loc('age')
age_coef = lr_raw.coef_[0][age_position]

# exp(20 * coef): factor by which the model's odds are multiplied when
# age rises by 20 units (a value below 1 means the odds shrink).
odds_increase = np.exp(20 * age_coef)

print(f"Odds increase: {odds_increase:.4f}")

Odds increase: 0.6951


In [10]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Shuffled, stratified 5-fold splitter; also passed as cv= to later searches.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)

rf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
    class_weight="balanced",
)

# Exhaustive grid: 3 x 4 x 3 = 36 candidate settings.
parameters_rf = dict(
    max_features=[1, 2, 4],
    min_samples_leaf=[3, 5, 7, 9],
    max_depth=[5, 10, 15],
)

# Each candidate is scored by ROC AUC on the folds produced by skf.
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=parameters_rf,
    n_jobs=-1,
    scoring="roc_auc",
    cv=skf,
)
rf_grid.fit(X, y)

print("Best RF Params:", rf_grid.best_params_)
print("Best RF Score:", rf_grid.best_score_)

Best RF Params: {'max_depth': 10, 'max_features': 2, 'min_samples_leaf': 9}
Best RF Score: 0.8357747225971666


In [11]:
# Logistic-regression score to compare against. `best_lr_score` is used when
# it already exists in the kernel; otherwise the literal below is substituted.
# NOTE(review): no visible cell defines `best_lr_score`, so on a fresh
# "Restart & Run All" the hard-coded 0.8305 is what gets compared — confirm
# where that number comes from.
LR_SCORE_FALLBACK = 0.8305
try:
    lr_score = best_lr_score
except NameError:
    lr_score = LR_SCORE_FALLBACK

rf_score = rf_grid.best_score_
diff = rf_score - lr_score
print(f"RF Score: {rf_score}")
print(f"LR Score: {lr_score}")
print(f"Improvement over LR: {diff:.4f}")

RF Score: 0.8357747225971666
LR Score: 0.8305
Improvement over LR: 0.0053


In [12]:
# Forest refitted by the grid search with its best parameter combination.
best_rf = rf_grid.best_estimator_

# Pair each feature name with the forest's importance value for it.
importances = pd.DataFrame(
    {'feature': X.columns, 'importance': best_rf.feature_importances_}
)

# Least important feature first (sort_values is ascending by default).
importances.sort_values('importance')

Unnamed: 0,feature,importance
6,NumberOfDependents,0.0145
5,MonthlyIncome,0.058009
2,DebtRatio,0.076118
0,age,0.115844
4,NumberOfTime60-89DaysPastDueNotWorse,0.156463
3,NumberOfTimes90DaysLate,0.278794
1,NumberOfTime30-59DaysPastDueNotWorse,0.300271


In [13]:
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import RandomizedSearchCV

# Ensemble of 100 class-balanced logistic regressions.
base_lr = LogisticRegression(class_weight="balanced", solver='liblinear')
bg = BaggingClassifier(
    estimator=base_lr,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)

# 3 x 3 x 6 = 54 possible combinations; the search below samples 20 of them.
# The "estimator__" prefix routes C to the wrapped LogisticRegression.
bagging_params = {
    "max_features": [2, 3, 4],
    "max_samples": [0.5, 0.7, 0.9],
    "estimator__C": [0.0001, 0.001, 0.01, 1, 10, 100],
}

# Randomized search scored by ROC AUC on the same skf folds as the forest search.
bg_search = RandomizedSearchCV(
    estimator=bg,
    param_distributions=bagging_params,
    n_iter=20,
    scoring="roc_auc",
    cv=skf,
    random_state=1,
    n_jobs=-1,
)
bg_search.fit(X, y)

print("Best Bagging Score:", bg_search.best_score_)
print("Best Bagging Params:", bg_search.best_params_)



Best Bagging Score: 0.8093395546305292
Best Bagging Params: {'max_samples': 0.7, 'max_features': 2, 'estimator__C': 0.001}


Упр 2.11: Лучшее значение ROC AUC на кросс-валидации получилось ≈ 0.809.

Упр 2.12: Лучшие параметры: max_features=2 (мало признаков), max_samples=0.7 и estimator__C=0.001.

Интерпретация: Для бэггинга важно, чтобы модели были разнообразными (некоррелированными). Использование малого числа признаков (max_features) для каждой модели снижает корреляцию между ними, что улучшает качество ансамбля.