In [1]:
from sklearn.datasets import load_breast_cancer

from sklearn.metrics import precision_score, recall_score, fbeta_score, accuracy_score, average_precision_score, log_loss
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor

from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
import numpy as np

In [2]:
# Сетка гиперпараметров для перебора
param_grid = {'max_depth': [1, 2, 3, 4, 5]}

### Классификация

In [3]:
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Обучение
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Предикт
y_pred = clf.predict(X_test)


In [5]:
# Расчёт метрик
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
fbeta = fbeta_score(y_test, y_pred, beta=0.5, average='macro')
accuracy = accuracy_score(y_test, y_pred)
ap = average_precision_score(y_test, y_pred, average='macro')
cross_entropy = log_loss(y_test, y_pred)

In [6]:
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F-beta: {fbeta}')
print(f'Accuracy: {accuracy}')
print(f'Average Precision: {ap}')
print(f'Cross-entropy: {cross_entropy}')

Precision: 0.9439895185063871
Recall: 0.9439895185063871
F-beta: 0.9439895185063871
Accuracy: 0.9473684210526315
Average Precision: 0.9435941072677727
Cross-entropy: 1.8178513786118837


Теперь 10 штук

In [7]:
# Генерация датасетов для классификации
datasets = []
for _ in range(10):
    X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=_)
    datasets.append((X, y))


In [8]:
# Обучение и оценка дерева каждого датасета
for i, (X, y) in enumerate(datasets):
    # пропорции 60-20-20
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    clf = DecisionTreeClassifier()
    #clf.fit(X_train, y_train)

    # Поиск по сетке с кросс-валидацией
    grid_search = GridSearchCV(clf, param_grid, cv=5)
    grid_search.fit(X_val, y_val)

    # Получение наилучшего гиперпараметра
    best_max_depth = grid_search.best_params_['max_depth']

    # Обучение модели с наилучшим параметром на обучающей + валидационной выборках
    best_clf = DecisionTreeClassifier(max_depth=best_max_depth)
    best_clf.fit(X_train, y_train)
    
    
    y_pred = best_clf.predict(X_test)
    
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    fbeta = fbeta_score(y_test, y_pred, beta=0.5, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    ap = average_precision_score(y_test, y_pred, average='macro')
    cross_entropy = log_loss(y_test, y_pred)

    print(f"Metrics for dataset {i+1}:")
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F-beta: {fbeta}')
    print(f'Accuracy: {accuracy}')
    print(f'Average Precision: {ap}')
    print(f'Cross-entropy: {cross_entropy}')
    print("-" * 30)

Metrics for dataset 1:
Precision: 0.9299999999999999
Recall: 0.930172068827531
F-beta: 0.9299748795980735
Accuracy: 0.93
Average Precision: 0.9062745098039215
Cross-entropy: 2.417738335566654
------------------------------
Metrics for dataset 2:
Precision: 0.839421918908069
Recall: 0.839421918908069
F-beta: 0.839421918908069
Accuracy: 0.84
Average Precision: 0.7685468537799909
Cross-entropy: 5.526268190980126
------------------------------
Metrics for dataset 3:
Precision: 0.868431855500821
Recall: 0.8598997493734335
F-beta: 0.8604914834150714
Accuracy: 0.855
Average Precision: 0.8506235827664398
Cross-entropy: 5.0081385692106535
------------------------------
Metrics for dataset 4:
Precision: 0.9330357142857143
Recall: 0.9271844660194175
F-beta: 0.9284706119568504
Accuracy: 0.925
Average Precision: 0.8660714285714286
Cross-entropy: 2.5904681994255667
------------------------------
Metrics for dataset 5:
Precision: 0.8565208687819037
Recall: 0.8584943639291465
F-beta: 0.855254736303123

### Регрессия


In [9]:
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Обучение
reg = DecisionTreeRegressor()
reg.fit(X_train_reg, y_train_reg)

# Предикт
y_pred_reg = reg.predict(X_test_reg)


In [11]:
#функция для Quantile standart error
def quantile_standard_error(y_test_reg, y_pred_reg, q):
    errors = y_test_reg - y_pred_reg
    qse = np.maximum(q * errors, (q - 1) * errors)
    return np.mean(qse)

In [12]:
q = 0.95 # берём 95 перцентиль

In [13]:
# Расчёт метрик
mse = mean_squared_error(y_test_reg, y_pred_reg)
mae = mean_absolute_error(y_test_reg, y_pred_reg)
msle = mean_squared_log_error(y_test_reg, y_pred_reg)
qse = quantile_standard_error(y_test_reg, y_pred_reg, q)

In [14]:
print(f'Mean Squared Error (MSE): {mse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Log Error (MSLE): {msle}')
print(f"Quantile Standard Error (QSE) for {q}-quantile: {qse}")

Mean Squared Error (MSE): 0.06140350877192982
Mean Absolute Error (MAE): 0.06140350877192982
Mean Squared Log Error (MSLE): 0.0295015008546264
Quantile Standard Error (QSE) for 0.95-quantile: 0.03464912280701755


Теперь 10 штук

In [15]:
# Генерация регрессионных датасетов
reg_datasets = []
for _ in range(10):
    X_reg, y_reg = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=_)
    y_reg = abs(y_reg)
    reg_datasets.append((X_reg, y_reg))

In [16]:
# Обучение и оценка каждого датасета
for i, (X_reg, y_reg) in enumerate(reg_datasets):
    # пропорции 60-20-20
    X_train_reg, X_temp_reg, y_train_reg, y_temp_reg = train_test_split(X_reg, y_reg, test_size=0.4, random_state=42)
    X_val_reg, X_test_reg, y_val_reg, y_test_reg = train_test_split(X_temp_reg, y_temp_reg, test_size=0.5, random_state=42)

    
    reg = DecisionTreeRegressor()
    #reg.fit(X_train_reg, y_train_reg)
    

    # Поиск по сетке с кросс-валидацией
    grid_search = GridSearchCV(reg, param_grid, cv=5)
    grid_search.fit(X_val_reg, y_val_reg)

    # Получение наилучшего гиперпараметра
    best_max_depth = grid_search.best_params_['max_depth']

    # Обучение модели с наилучшим параметром на обучающей + валидационной выборках
    best_reg = DecisionTreeRegressor(max_depth=best_max_depth)
    best_reg.fit(X_train_reg, y_train_reg)
    

    y_pred_reg = best_reg.predict(X_test_reg)

    mse = mean_squared_error(y_test_reg, y_pred_reg)
    mae = mean_absolute_error(y_test_reg, y_pred_reg)
    msle = mean_squared_log_error(y_test_reg, y_pred_reg)
    qse = quantile_standard_error(y_test_reg, y_pred_reg, q)

    print(f"Metrics for regression dataset {i+1}:")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Log Error (MSLE): {msle}")
    print(f"Quantile Standard Error (QSE) for {q}-quantile: {qse}")
    print("-" * 30)

Metrics for regression dataset 1:
Mean Squared Error (MSE): 7226.202502727297
Mean Absolute Error (MAE): 68.34735064872149
Mean Squared Log Error (MSLE): 1.3187687906850363
Quantile Standard Error (QSE) for 0.95-quantile: 30.69657219269441
------------------------------
Metrics for regression dataset 2:
Mean Squared Error (MSE): 10302.334815220744
Mean Absolute Error (MAE): 81.73662543975887
Mean Squared Log Error (MSLE): 0.9463243793894369
Quantile Standard Error (QSE) for 0.95-quantile: 43.23963143866975
------------------------------
Metrics for regression dataset 3:
Mean Squared Error (MSE): 7029.262269162499
Mean Absolute Error (MAE): 67.97721014684325
Mean Squared Log Error (MSLE): 1.2568225003524747
Quantile Standard Error (QSE) for 0.95-quantile: 35.76875653144484
------------------------------
Metrics for regression dataset 4:
Mean Squared Error (MSE): 8299.017103073173
Mean Absolute Error (MAE): 71.90596167114046
Mean Squared Log Error (MSLE): 1.2541559405922924
Quantile Stan