In [2]:
import pandas.plotting as pd_plt
import matplotlib.colors as plt_colors
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import plotly.subplots 
from collections import OrderedDict

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

from sklearn.model_selection import train_test_split, GridSearchCV, \
                                    ShuffleSplit, KFold

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

from sklearn.preprocessing import MinMaxScaler, StandardScaler, \
                                  RobustScaler, PolynomialFeatures, \
                                  OrdinalEncoder, LabelEncoder, \
                                  OneHotEncoder, TargetEncoder, \
                                  QuantileTransformer, PowerTransformer, \
                                  KBinsDiscretizer

from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression, RANSACRegressor, \
                                 Ridge, Lasso, LinearRegression

from sklearn.metrics import recall_score, precision_score, \
                            f1_score, ConfusionMatrixDisplay, \
                            confusion_matrix, roc_auc_score, \
                            RocCurveDisplay, PrecisionRecallDisplay, \
                            roc_curve, precision_recall_curve, \
                            PrecisionRecallDisplay, make_scorer, \
                            mean_squared_error, auc

from sklearn.compose import ColumnTransformer, make_column_selector,\
                            make_column_transformer, TransformedTargetRegressor
import category_encoders as ce

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, \
                             RandomForestClassifier, VotingRegressor, \
                             StackingRegressor

### Ниже построение случайного леса и подбор параметров

In [None]:
# Baseline random forest: hold out a test split, fit the forest, then report
# accuracy on the training data, on the out-of-bag samples and on the test split.
X_train, X_test, y_train, y_test = train_test_split(
    data1.data, data1.target, random_state=13
)
forest = RandomForestClassifier(
    n_estimators=1000,
    max_features=10,
    oob_score=True,   # required for forest.oob_score_ below
    n_jobs=-1,        # build the trees in parallel, as the tuning cells do
    random_state=13,
)
forest.fit(X_train, y_train)
print(f'Правильность на обучающем наборе: {forest.score(X_train, y_train):.3f}')
print(f'Правильность на OOB: {forest.oob_score_:.3f}')
print(f'Правильность на тестовом наборе: {forest.score(X_test, y_test):.3f}')

In [None]:
# Criterion selection: compare training accuracy and out-of-bag accuracy of the
# forest for the 'gini' and 'entropy' split criteria over several ensemble sizes.
n_estimators = [200, 500, 1000, 5000]


def _forest_scores(criterion):
    """Fit one forest per size in `n_estimators` and collect its scores.

    Returns a pair of lists (train accuracy, OOB accuracy), one entry per size.
    """
    train_scores, oob_scores = [], []
    for n in n_estimators:
        rf = RandomForestClassifier(
            n_estimators=n,
            criterion=criterion,
            random_state=13,
            n_jobs=-1,
            oob_score=True,
        )
        rf.fit(X_train, y_train)
        train_scores.append(rf.score(X_train, y_train))
        oob_scores.append(rf.oob_score_)
    return train_scores, oob_scores


# 'gini' is the estimator's default criterion; it is spelled out so both runs read alike.
sc_train1, sc_oob1 = _forest_scores('gini')
sc_train2, sc_oob2 = _forest_scores('entropy')

# One panel per criterion, each showing the train and OOB curves.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
panels = [
    (ax1, 'gini', sc_train1, sc_oob1),
    (ax2, 'entropy', sc_train2, sc_oob2),
]
for ax, criterion, train_scores, oob_scores in panels:
    ax.plot(n_estimators, train_scores, marker='o', label='train')
    ax.plot(n_estimators, oob_scores, marker='o', label='OOB')
    ax.set_title(f'criterion={criterion}')
    ax.set_xlabel('n_estimators')
    ax.set_ylabel('accuracy')
    ax.legend()
plt.show()

In [None]:
# Feature importances; less relevant once Shapley values are available.
def plot_feature_importances_cancer(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        Tick labels, one per feature. When omitted, the model's
        ``feature_names_in_`` is used if it has one, otherwise generic
        ``feature <index>`` labels.

    Raises
    ------
    ValueError
        If the number of names differs from the number of importances.
    """
    importances = np.asarray(model.feature_importances_)
    # Take the feature count from the model itself rather than from a global dataset.
    n_features = importances.shape[0]

    if feature_names is None:
        feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is None:
        feature_names = [f"feature {i}" for i in range(n_features)]
    if len(feature_names) != n_features:
        raise ValueError(
            f"expected {n_features} feature names, got {len(feature_names)}"
        )

    positions = np.arange(n_features)
    plt.barh(positions, importances, align='center')
    plt.yticks(positions, feature_names)
    plt.xlabel("Важность признака")
    plt.ylabel("Признак")


plt.figure(figsize=(15, 10))
# Use the dataset's own feature names when it provides them.
plot_feature_importances_cancer(forest, feature_names=getattr(data1, "feature_names", None))

In [None]:
# Selecting max_features: out-of-bag error of a 500-tree forest for every
# possible number of features per split, for both split criteria.
min_features = 1
# Feature count comes from the training matrix that the forests are fitted on.
max_features = X_train.shape[1]

ensemble_clfs = [
    (
        f"RandomForestClassifier, {criterion}",
        RandomForestClassifier(
            criterion=criterion,
            n_estimators=500,
            min_samples_leaf=5,
            n_jobs=-1,
            oob_score=True,
            random_state=13,
        ),
    )
    for criterion in ('gini', 'entropy')
]

# Map each classifier label to a list of (max_features, OOB error rate) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

for label, clf in ensemble_clfs:
    for i in range(min_features, max_features + 1):
        clf.set_params(max_features=i)
        clf.fit(X_train, y_train)

        # Record the OOB error for the setting max_features=i.
        oob_error = 1 - clf.oob_score_
        error_rate[label].append((i, oob_error))

In [None]:
# Plot the OOB error rate against the number of features per split,
# one curve per classifier label.
for clf_label, points in error_rate.items():
    feature_counts, oob_errors = map(list, zip(*points))
    plt.plot(feature_counts, oob_errors, label=clf_label)

plt.xlabel("features")
plt.ylabel("OOB error rate")
plt.xlim(min_features, max_features)
plt.legend(loc="upper right")
plt.show()


In [None]:
# max_features selection: out-of-bag accuracy of a 500-tree forest for
# max_features = 1..14.
y_values_max_features = []
for i in range(1, 15):
    RF = RandomForestClassifier(
        n_estimators=500,
        n_jobs=-1,
        max_features=i,
        min_samples_leaf=5,
        oob_score=True,    # without it the fitted forest has no oob_score_
        random_state=13,   # same seed as the other cells, so reruns match
    )
    RF.fit(X_train, y_train)
    y_values_max_features.append(RF.oob_score_)

In [None]:
# Selection plot: OOB score against max_features, with the best value marked.
# The x grid is derived from the number of collected scores, which are assumed
# to correspond to max_features = 1, 2, 3, ... in order.
max_features_grid = range(1, len(y_values_max_features) + 1)
best_max_features = max_features_grid[int(np.argmax(y_values_max_features))]

plt.figure(figsize=(8, 5))
plt.plot(max_features_grid, y_values_max_features, marker='o', label='OOB score')
plt.axvline(x=best_max_features, color='red', linestyle='--',
            label=f'best = {best_max_features}')
plt.xlabel("max_features")
plt.ylabel("score")
plt.legend()  # called after all labelled artists exist
plt.show()

In [None]:
# Tree depth selection: out-of-bag accuracy of a 500-tree forest for
# max_depth = 1..9.
y_values_max_depth = []
for n in range(1, 10):
    RF = RandomForestClassifier(
        n_estimators=500,
        n_jobs=-1,
        max_features='log2',
        min_samples_leaf=5,
        max_depth=n,
        oob_score=True,    # without it the fitted forest has no oob_score_
        random_state=13,   # same seed as the other cells, so reruns match
    )
    RF.fit(X_train, y_train)
    y_values_max_depth.append(RF.oob_score_)

In [None]:
# Selection plot: OOB score against max_depth, with the best depth marked.
max_depth_grid = range(1, 10)
best_max_depth = max_depth_grid[int(np.argmax(y_values_max_depth))]

plt.figure(figsize=(8, 5))
plt.plot(max_depth_grid, y_values_max_depth, marker='o', label='OOB score')
plt.axvline(x=best_max_depth, color='red', linestyle='--',
            label=f'best = {best_max_depth}')
plt.xlabel("max_depth")
plt.ylabel("score")
plt.legend()  # called after all labelled artists exist
plt.show()

In [None]:
# min_samples_leaf selection: out-of-bag accuracy of a 500-tree forest for
# min_samples_leaf = 1..30.
y_values_min_samples = []
for n in range(1, 31):
    RF = RandomForestClassifier(
        n_estimators=500,
        n_jobs=-1,
        max_features='log2',
        min_samples_leaf=n,
        oob_score=True,    # without it the fitted forest has no oob_score_
        random_state=13,   # same seed as the other cells, so reruns match
    )
    RF.fit(X_train, y_train)
    y_values_min_samples.append(RF.oob_score_)

In [None]:
# Selection plot: OOB score against min_samples_leaf.
plt.figure(figsize=(8, 5))
plt.plot(range(1, 31), y_values_min_samples, label='OOB score')
# Reference line at 5, the min_samples_leaf value the other forest cells use.
plt.axvline(x=5, color='red', linestyle='--', label='min_samples_leaf = 5')
plt.xlabel("min_samples_leaf")
plt.ylabel("score")
plt.legend()  # called after all labelled artists exist
plt.show()

### Градиентный бустинг

In [None]:
# Baseline gradient boosting regressor. A regressor's .score() returns the
# coefficient of determination R^2, not accuracy, so the output is labelled R^2.
gbrt = GradientBoostingRegressor(
    random_state=13,
    max_depth=5,
    n_estimators=500,
    learning_rate=0.05,
    max_features='sqrt',
    subsample=0.5,
)
gbrt.fit(X_train, y_train)
print(f'R^2 на обучающем наборе: {gbrt.score(X_train, y_train):.3f}')
print(f'R^2 на тестовом наборе: {gbrt.score(X_test, y_test):.3f}')

In [None]:
# Learning rate and subsample: test-set score of a 500-tree booster for every
# (learning_rate, subsample) combination.
# NOTE(review): the scores come from the test split, so values picked from them
# are tuned to that split.
y_values_GB_0_5 = []
y_values_GB_0_75 = []
y_values_GB_1 = []

# Each subsample fraction fills its own result list, one entry per learning rate.
scores_by_subsample = {
    0.5: y_values_GB_0_5,
    0.75: y_values_GB_0_75,
    1.0: y_values_GB_1,
}

for lr in [0.01, 0.05, 0.1, 0.5]:
    for subsample, scores in scores_by_subsample.items():
        GB = GradientBoostingRegressor(
            random_state=13,
            n_estimators=500,
            learning_rate=lr,
            subsample=subsample,
        )
        GB.fit(X_train, y_train)
        scores.append(GB.score(X_test, y_test))

In [None]:
# Selection plot: score against learning rate (log axis), one curve per
# subsample fraction, with the best learning rate overall marked.
x = [0.01, 0.05, 0.1, 0.5]
curves = {
    'subsample=0.5': y_values_GB_0_5,
    'subsample=0.75': y_values_GB_0_75,
    'subsample=1': y_values_GB_1,
}

plt.figure(figsize=(8, 5))
for curve_label, scores in curves.items():
    plt.plot(x, scores, marker='o', label=curve_label)

# Find the curve holding the highest score, then the learning rate at that peak.
best_label, best_scores = max(curves.items(), key=lambda item: max(item[1]))
best_lr = x[int(np.argmax(best_scores))]
plt.axvline(x=best_lr, color='red', linestyle='--',
            label=f'best: learning_rate={best_lr}, {best_label}')

plt.xlabel("learning_rate_GB")
plt.ylabel("score")
plt.xscale('log')
plt.legend()
plt.show()

In [None]:
# Maximum number of features per split for gradient boosting: test-set score
# of a 500-tree booster for max_features = 1, 3, ..., 19.
y_values_max_features_GB = []
for features_per_split in range(1, 20, 2):
    GB = GradientBoostingRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_features=features_per_split,
        subsample=0.5,
        random_state=13,
    )
    y_values_max_features_GB.append(GB.fit(X_train, y_train).score(X_test, y_test))

In [None]:
# Selection plot: score against max_features for gradient boosting, with the
# best tested value marked.
max_features_grid_GB = range(1, 20, 2)
best_max_features_GB = max_features_grid_GB[int(np.argmax(y_values_max_features_GB))]

plt.figure(figsize=(8, 5))
plt.plot(max_features_grid_GB, y_values_max_features_GB, marker='o', label='score')
plt.axvline(x=best_max_features_GB, color='red', linestyle='--',
            label=f'best = {best_max_features_GB}')
plt.xlabel("max_features_GB")
plt.ylabel("score")
plt.legend()  # called after all labelled artists exist
plt.show()

In [None]:
# Tree depth for gradient boosting: test-set score of a 500-tree booster for
# max_depth = 1..5, with the other settings held fixed.
y_values_max_depth_GB = []
for depth in range(1, 6):
    GB = GradientBoostingRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=depth,
        max_features='sqrt',
        subsample=0.5,
        random_state=13,
    )
    y_values_max_depth_GB.append(GB.fit(X_train, y_train).score(X_test, y_test))

In [None]:
# Selection plot: score against max_depth for gradient boosting, with the best
# tested depth marked.
max_depth_grid_GB = range(1, 6)
best_max_depth_GB = max_depth_grid_GB[int(np.argmax(y_values_max_depth_GB))]

plt.figure(figsize=(8, 5))
plt.plot(max_depth_grid_GB, y_values_max_depth_GB, marker='o', label='score')
plt.axvline(x=best_max_depth_GB, color='red', linestyle='--',
            label=f'best = {best_max_depth_GB}')
plt.xlabel("max_depth_GB")
plt.ylabel("score")
plt.legend()  # called after all labelled artists exist
plt.show()

In [None]:
# Minimum samples per leaf for gradient boosting: test-set score of a 500-tree
# booster for min_samples_leaf = 1, 11, ..., 91.
y_values_min_samples_GB = []
for leaf_size in range(1, 100, 10):
    GB = GradientBoostingRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=5,
        max_features='sqrt',
        subsample=0.5,
        min_samples_leaf=leaf_size,
        random_state=13,
    )
    y_values_min_samples_GB.append(GB.fit(X_train, y_train).score(X_test, y_test))

In [None]:
# Selection plot: score against min_samples_leaf for gradient boosting.
# The marker sits on the best tested value; the tested grid is 1, 11, ..., 91.
min_samples_grid_GB = range(1, 100, 10)
best_min_samples_GB = min_samples_grid_GB[int(np.argmax(y_values_min_samples_GB))]

plt.figure(figsize=(8, 5))
plt.plot(min_samples_grid_GB, y_values_min_samples_GB, marker='o', label='score')
plt.axvline(x=best_min_samples_GB, color='red', linestyle='--',
            label=f'best = {best_min_samples_GB}')
plt.xlabel("min_samples_leaf_GB")
plt.ylabel("score")
plt.legend()  # called after all labelled artists exist
plt.show()

### Голосование (voting) и стекинг (stacking)

In [None]:
# Base regressors for the ensembles; replace them with your own models here.
# The stochastic ones are seeded like the rest of the notebook so reruns match.
v_reg1 = GradientBoostingRegressor(random_state=13)
v_reg2 = RandomForestRegressor(random_state=13)
v_reg3 = LinearRegression()

# VotingRegressor fits clones of the base models, so v_reg1..v_reg3 themselves
# stay unfitted. Fit them separately if their individual predictions are needed:
# v_reg1.fit(X_train, y_train)
# v_reg2.fit(X_train, y_train)
# v_reg3.fit(X_train, y_train)

ve_reg = VotingRegressor([("gb", v_reg1), ("rf", v_reg2), ("lr", v_reg3)])
ve_reg.fit(X_train, y_train)
display(ve_reg.score(X_test, y_test))

In [None]:
# Disabled cell: plots the first 20 predictions of each base regressor and of
# the voting ensemble against the true targets.
# NOTE(review): it reads Xd and yd, which no visible cell defines, and calls
# predict() on v_reg1..v_reg3 directly, so those must be fitted first.
# Confirm both before re-enabling.
# xt = Xd[:20]
# v_pred1 = v_reg1.predict(xt)
# v_pred2 = v_reg2.predict(xt)
# v_pred3 = v_reg3.predict(xt)
# v_pred4 = ve_reg.predict(xt)

# plt.figure(figsize=(15, 10))
# plt.plot(v_pred1, "gd", label="GradientBoostingRegressor")
# plt.plot(v_pred2, "b^", label="RandomForestRegressor")
# plt.plot(v_pred3, "ys", label="LinearRegression")
# plt.plot(v_pred4, "r*", ms=10, label="VotingRegressor")
# plt.plot(yd[:20], "ko", label="y_true")

# plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
# plt.ylabel("predicted")
# plt.xlabel("training samples")
# plt.legend(loc="best")
# plt.title("Regressor predictions and their average")

# plt.show()

In [None]:
# Stacking ensemble over the same three base regressors. To use a custom
# meta-model, pass it as final_estimator=... to StackingRegressor.
base_regressors = [("gb", v_reg1), ("rf", v_reg2), ("lr", v_reg3)]
se_reg = StackingRegressor(estimators=base_regressors)
se_reg.fit(X_train, y_train)

stacking_test_score = se_reg.score(X_test, y_test)
display(stacking_test_score)

In [None]:

# Disabled cell: the regressor comparison plot with a StackingRegressor series added.
# NOTE(review): v_pred1..v_pred4 are only assigned in commented-out code, and
# v_pred5 is not assigned anywhere in the visible cells (presumably
# se_reg.predict(xt)). yd is not defined in the visible cells either.
# Confirm before re-enabling.
# plt.figure(figsize=(15, 10))
# plt.plot(v_pred1, "gd", label="GradientBoostingRegressor")
# plt.plot(v_pred2, "b^", label="RandomForestRegressor")
# plt.plot(v_pred3, "ys", label="LinearRegression")
# plt.plot(v_pred4, "r*", ms=10, label="VotingRegressor")
# plt.plot(v_pred5, "g*", ms=10, label="StackingRegressor")
# plt.plot(yd[:20], "ko", label="y_true")

# plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
# plt.ylabel("predicted")
# plt.xlabel("training samples")
# plt.legend(loc="best")
# plt.title("Regressor predictions and their average")

# plt.show()