# Мини‑таск 3: Обучение моделей для предсказания биологической активности

### Импорт необходимых библиотек

In [29]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras import layers, callbacks
from tensorflow.keras.models import Model

Для воспроизводимости результатов фиксируем seed

In [30]:
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Загрузка датасета

In [31]:
df = pd.read_csv("data.csv")

In [32]:
X = df.drop(columns=["Smiles", "Standard Value", "Activity"]).values

In [33]:
y_nm = df["Activity"].values

Переводим IC50 → pIC50 = 9 − log10(IC50)

In [34]:
y = 9 - np.log10(y_nm + 1e-9)

### Разбиение датасета и стандартизация

In [35]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

In [36]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled  = scaler.transform(X_test)

### Классические модели

In [37]:
classical_models = {
    "RF":  RandomForestRegressor(n_estimators=200, random_state=SEED, n_jobs=-1),
    "GB":  GradientBoostingRegressor(n_estimators=200, random_state=SEED),
    "XGB": XGBRegressor(n_estimators=200, random_state=SEED, verbosity=0),
    "LGB": LGBMRegressor(n_estimators=200, random_state=SEED, n_jobs=-1),
}

5-фолдовая CV с рандомизацией

In [38]:
cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
cv_results = []

In [39]:
for name, model in classical_models.items():
    # RMSE через отрицательную MSE
    rmse = np.mean(
        np.sqrt(-cross_val_score(
            model, X_train_scaled, y_train,
            cv=cv, scoring="neg_mean_squared_error", n_jobs=-1
        ))
    )
    # MAE через отрицательную MAE
    mae = np.mean(
        -cross_val_score(
            model, X_train_scaled, y_train,
            cv=cv, scoring="neg_mean_absolute_error", n_jobs=-1
        )
    )
    # Коэффициент детерминации R²
    r2 = np.mean(
        cross_val_score(
            model, X_train_scaled, y_train,
            cv=cv, scoring="r2", n_jobs=-1
        )
    )
    cv_results.append((name, rmse, mae, r2))

In [40]:
df_cv = pd.DataFrame(cv_results, columns=["Model", "RMSE", "MAE", "R2"])
print(df_cv)

  Model      RMSE       MAE        R2
0    RF  0.128982  0.010707  0.989329
1    GB  0.133300  0.018968  0.989714
2   XGB  0.188920  0.021479  0.976202
3   LGB  0.221422  0.029384  0.971170


### Нейронные сети

In [41]:
def build_mlp(input_dim):
    inp = layers.Input(shape=(input_dim,))
    x   = layers.Dense(256, activation="relu")(inp)
    x   = layers.Dropout(0.2)(x)
    x   = layers.Dense(128, activation="relu")(x)
    x   = layers.Dropout(0.2)(x)
    out = layers.Dense(1, activation="linear")(x)

    m = Model(inp, out)
    m.compile(
        optimizer="adam",
        loss="mse",
        metrics=["mae"]
    )
    return m

Строим и обучаем MLP

In [42]:
mlp = build_mlp(X_train_scaled.shape[1])
mlp.fit(
    X_train_scaled, y_train,
    validation_split=0.1,      # 10% из train на валидацию
    epochs=50,
    batch_size=64,
    callbacks=[callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
    verbose=1
)

Epoch 1/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 11.7569 - mae: 2.5671 - val_loss: 1.3952 - val_mae: 0.9141
Epoch 2/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 2.2794 - mae: 1.1362 - val_loss: 1.2029 - val_mae: 0.8179
Epoch 3/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1.6056 - mae: 0.9823 - val_loss: 1.1268 - val_mae: 0.8034
Epoch 4/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1.4557 - mae: 0.9132 - val_loss: 1.1121 - val_mae: 0.7774
Epoch 5/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1.3635 - mae: 0.8749 - val_loss: 1.1700 - val_mae: 0.8068
Epoch 6/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1.2308 - mae: 0.8372 - val_loss: 1.1592 - val_mae: 0.7780
Epoch 7/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1.1166 

<keras.src.callbacks.history.History at 0x203865aa560>

1D-сверточная сеть

In [43]:
X_train_c = X_train_scaled[..., np.newaxis]
X_test_c  = X_test_scaled[...,  np.newaxis]

def build_cnn(seq_len):
    inp = layers.Input(shape=(seq_len, 1))
    x   = layers.Conv1D(64, 3, activation="relu")(inp)
    x   = layers.MaxPooling1D(2)(x)
    x   = layers.Conv1D(32, 3, activation="relu")(x)
    x   = layers.GlobalMaxPooling1D()(x)
    x   = layers.Dense(64, activation="relu")(x)
    out = layers.Dense(1, activation="linear")(x)

    m = Model(inp, out)
    m.compile(
        optimizer="adam",
        loss="mse",
        metrics=["mae"]
    )
    return m

Строим и обучаем CNN

In [44]:
cnn = build_cnn(X_train_scaled.shape[1])
cnn.fit(
    X_train_c, y_train,
    validation_split=0.1,
    epochs=50,
    batch_size=64,
    callbacks=[callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
    verbose=1
)

Epoch 1/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 11.8335 - mae: 2.7715 - val_loss: 5.9062 - val_mae: 1.8204
Epoch 2/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 4.9661 - mae: 1.7445 - val_loss: 3.8058 - val_mae: 1.5105
Epoch 3/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 3.3967 - mae: 1.4629 - val_loss: 2.3223 - val_mae: 1.2373
Epoch 4/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 2.4377 - mae: 1.2337 - val_loss: 1.9352 - val_mae: 1.1358
Epoch 5/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 2.2349 - mae: 1.1591 - val_loss: 1.8365 - val_mae: 1.1015
Epoch 6/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 2.1627 - mae: 1.1362 - val_loss: 1.8034 - val_mae: 1.0901
Epoch 7/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 

<keras.src.callbacks.history.History at 0x2038365d330>

### Результаты

In [45]:
final_results = []

Классические модели

In [46]:
for name, model in classical_models.items():
    model.fit(X_train_scaled, y_train)
    p = model.predict(X_test_scaled)
    final_results.append((
        name,
        np.sqrt(mean_squared_error(y_test, p)),  # RMSE
        mean_absolute_error(y_test, p),          # MAE
        r2_score(y_test, p)                      # R2
    ))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014063 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11541
[LightGBM] [Info] Number of data points in the train set: 5113, number of used features: 976
[LightGBM] [Info] Start training from score 5.924489




MLP

In [47]:
p_mlp = mlp.predict(X_test_scaled).ravel()
final_results.append((
    "MLP",
    np.sqrt(mean_squared_error(y_test, p_mlp)),
    mean_absolute_error(y_test, p_mlp),
    r2_score(y_test, p_mlp),
))

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


CNN

In [48]:
p_cnn = cnn.predict(X_test_c).ravel()
final_results.append((
    "CNN",
    np.sqrt(mean_squared_error(y_test, p_cnn)),
    mean_absolute_error(y_test, p_cnn),
    r2_score(y_test, p_cnn),
))

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [49]:
df_ho = pd.DataFrame(final_results, columns=["Model", "RMSE", "MAE", "R2"])
print(df_ho)

  Model      RMSE       MAE        R2
0    RF  0.065935  0.008676  0.998161
1    GB  0.113823  0.019295  0.994520
2   XGB  0.084485  0.018895  0.996981
3   LGB  0.135795  0.028047  0.992200
4   MLP  1.162620  0.849886  0.428247
5   CNN  1.355170  1.026530  0.223180


Random Forest лидирует по всем трём метрикам (наименьшие ошибки, наибольший R² ≈ 0.989).

Gradient Boosting почти не отстаёт (R² ≈ 0.990), чуть хуже MAE.

XGBoost и LightGBM демонстрируют более высокие ошибки и чуть более низкий R², но всё ещё очень высокую объясняющую способность (> 0.97).

MLP и CNN показали значительно худшие результаты по сравнению с классическими моделями.