In [11]:
import numpy as np
import pandas as pd
import optuna
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.model_selection import cross_val_score

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#Получаем данные для обучения
text_embeddings = np.load("text_embeddings.npy")

df = pd.read_csv("esg_data.csv")
df = df[[20 <= len(text.split()) <= 1000 for text in df['text']]]   

ratings_E = df["E"].values 
ratings_S = df["S"].values
ratings_G = df["G"].values
ratings_ESG = df["ESG"].values

In [4]:
X_train, X_test, y_train_E, y_test_E = train_test_split(text_embeddings, ratings_E, test_size=0.2, random_state=42)
_, _, y_train_S, y_test_S = train_test_split(text_embeddings, ratings_S, test_size=0.2, random_state=42)
_, _, y_train_G, y_test_G = train_test_split(text_embeddings, ratings_G, test_size=0.2, random_state=42)
_, _, y_train_ESG, y_test_ESG = train_test_split(text_embeddings, ratings_ESG, test_size=0.2, random_state=42)

In [15]:
# Функция для подбора гиперпараметров
def optimize_xgboost(trial, X, y):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300, step=50),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
    }

    model = xgb.XGBRegressor(**params, n_jobs=-1)

    # Оцениваем модель с кросс-валидацией
    score = cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error').mean()
    return score

In [17]:
study_E = optuna.create_study(direction="maximize")
study_E.optimize(lambda trial: optimize_xgboost(trial, X_train, y_train_E), n_trials=20)

[I 2025-03-24 14:28:30,194] A new study created in memory with name: no-name-d8392022-635e-43e0-8004-fc964fff8c9a
[I 2025-03-24 14:30:44,599] Trial 0 finished with value: -20.14693627476235 and parameters: {'n_estimators': 300, 'learning_rate': 0.24488645247299864, 'max_depth': 7, 'subsample': 0.931387356148973, 'colsample_bytree': 0.6860717232095006}. Best is trial 0 with value: -20.14693627476235.
[I 2025-03-24 14:32:08,177] Trial 1 finished with value: -19.619293700293984 and parameters: {'n_estimators': 300, 'learning_rate': 0.13913897243010412, 'max_depth': 5, 'subsample': 0.7581498126974126, 'colsample_bytree': 0.6399902051421923}. Best is trial 1 with value: -19.619293700293984.
[I 2025-03-24 14:32:26,032] Trial 2 finished with value: -20.484751023581477 and parameters: {'n_estimators': 100, 'learning_rate': 0.2686868346371796, 'max_depth': 3, 'subsample': 0.8045083954766062, 'colsample_bytree': 0.6589990114802932}. Best is trial 1 with value: -19.619293700293984.
[I 2025-03-24 

In [18]:
study_S = optuna.create_study(direction="maximize")
study_S.optimize(lambda trial: optimize_xgboost(trial, X_train, y_train_S), n_trials=20)

[I 2025-03-24 15:17:48,794] A new study created in memory with name: no-name-83418c22-a313-4ae5-8fcf-b5d58cf95471
[I 2025-03-24 15:18:59,167] Trial 0 finished with value: -23.69981439522085 and parameters: {'n_estimators': 150, 'learning_rate': 0.2643822471774941, 'max_depth': 7, 'subsample': 0.6753519716450556, 'colsample_bytree': 0.7702883068868174}. Best is trial 0 with value: -23.69981439522085.
[I 2025-03-24 15:22:05,182] Trial 1 finished with value: -22.499075446651606 and parameters: {'n_estimators': 300, 'learning_rate': 0.16406561324426147, 'max_depth': 8, 'subsample': 0.8417857343014872, 'colsample_bytree': 0.6790483886114407}. Best is trial 1 with value: -22.499075446651606.
[I 2025-03-24 15:22:33,241] Trial 2 finished with value: -23.387223618998902 and parameters: {'n_estimators': 100, 'learning_rate': 0.2598479586000306, 'max_depth': 5, 'subsample': 0.6392073655534625, 'colsample_bytree': 0.9232755924325222}. Best is trial 1 with value: -22.499075446651606.
[I 2025-03-24 

In [19]:
study_G = optuna.create_study(direction="maximize")
study_G.optimize(lambda trial: optimize_xgboost(trial, X_train, y_train_G), n_trials=20)

[I 2025-03-24 16:19:09,426] A new study created in memory with name: no-name-af355d24-e13a-4f05-87de-f90da5e24769
[I 2025-03-24 16:20:39,067] Trial 0 finished with value: -21.28688182701688 and parameters: {'n_estimators': 300, 'learning_rate': 0.17606122556732562, 'max_depth': 5, 'subsample': 0.7721038548580296, 'colsample_bytree': 0.7994134089544475}. Best is trial 0 with value: -21.28688182701688.
[I 2025-03-24 16:23:33,448] Trial 1 finished with value: -21.747699808580517 and parameters: {'n_estimators': 100, 'learning_rate': 0.22983493532108865, 'max_depth': 10, 'subsample': 0.8469095621121108, 'colsample_bytree': 0.9626576985991364}. Best is trial 0 with value: -21.28688182701688.
[I 2025-03-24 16:25:12,754] Trial 2 finished with value: -21.081488592197015 and parameters: {'n_estimators': 200, 'learning_rate': 0.15272456091144235, 'max_depth': 7, 'subsample': 0.839939088427309, 'colsample_bytree': 0.7402928532988206}. Best is trial 2 with value: -21.081488592197015.
[I 2025-03-24

In [20]:
best_params_E = study_E.best_params
best_params_S = study_S.best_params
best_params_G = study_G.best_params

In [21]:
model_E = xgb.XGBRegressor(**best_params_E, n_jobs=-1)
model_S = xgb.XGBRegressor(**best_params_S, n_jobs=-1)
model_G = xgb.XGBRegressor(**best_params_G, n_jobs=-1)

In [None]:
model_E.fit(X_train, y_train_E)
model_S.fit(X_train, y_train_S)
model_G.fit(X_train, y_train_G)

Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x0000015990BB3A10>>
Traceback (most recent call last):
  File "C:\Users\waksi\PycharmProjects\ESG_2025\venv\Lib\site-packages\xgboost\core.py", line 585, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument

KeyboardInterrupt: 


In [6]:
X_meta_train = np.column_stack([
    model_E.predict(X_train),
    model_S.predict(X_train),
    model_G.predict(X_train)
])

X_meta_test = np.column_stack([
    model_E.predict(X_test),
    model_S.predict(X_test),
    model_G.predict(X_test)
])

In [None]:
study_ESG = optuna.create_study(direction="maximize")
study_ESG.optimize(lambda trial: optimize_xgboost(trial, X_meta_train, y_train_ESG), n_trials=20)  # Метамодель

In [None]:
best_params_ESG = study_ESG.best_params

In [7]:
meta_model = xgb.XGBRegressor(**best_params_ESG, n_jobs=-1)
meta_model.fit(X_meta_train, y_train_ESG, verbose=True)

In [8]:
y_pred_ESG = meta_model.predict(X_meta_test)
rmse = np.sqrt(mean_squared_error(y_test_ESG, y_pred_ESG))
print(f"RMSE мета-модели: {rmse:.4f}")

# Сохраняем модели
joblib.dump(model_E, "model_E.pkl")
joblib.dump(model_S, "model_S.pkl")
joblib.dump(model_G, "model_G.pkl")
joblib.dump(meta_model, "meta_model.pkl")
print("Модели сохранены!")

RMSE мета-модели: 30.4797
Модели сохранены!
