# Препроцессинг

In [3]:
import numpy as np
import pandas as pd
import re
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix, f1_score, roc_auc_score, roc_curve
from optuna.integration import OptunaSearchCV
import optuna
import json
import os
import subprocess


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def convert_to_number(val):
    if pd.isna(val):
        return np.nan
    val = str(val).replace(' ', '')  # удаляем пробелы
    # обработка значений '<число'
    if val.startswith('<'):
        num = re.findall(r'<(\d+\.?\d*)', val)
        return float(num[0]) if num else np.nan
    # обработка значений с ±
    elif '±' in val:
        nums = re.findall(r'([\d\.]+)±([\d\.]+)', val)
        if nums:
            main, uncertainty = nums[0]
            return float(main)  # берём только среднее (первое число)
        else:
            return np.nan
    # обработка значений с '/'
    elif '/' in val:
        nums = re.findall(r'([\d\.]+)/([\d\.]+)', val)
        if nums:
            num1, num2 = nums[0]
            return (float(num1) + float(num2)) / 2
        else:
            return np.nan
    # пробуем просто преобразовать в число
    else:
        try:
            return float(val)
        except:
            return np.nan

In [191]:
# Load dataset
df = pd.read_csv("for_regr_descriptors_full.csv")
df['raw_efficiency'] = df['raw_efficiency'].apply(convert_to_number)

# Load embeddings
blomap_embeddings = np.load("blomap_regr.npy")
fingerprints_embeddings = np.load("fingerprints_regr.npy")
protbert_embeddings = np.load("protbert_regr.npy")

In [192]:
# Select numerical features
fp_path_index = df.columns.get_loc('fp_path')
selected_features = ['raw_efficiency', 'uptake_type'] + list(df.columns[fp_path_index + 1:])

X_numerical = df[selected_features].copy()

In [193]:
X_numerical

Unnamed: 0,raw_efficiency,uptake_type,MW,GRAVY,pI,Charge,Charge_Density,Aromaticity,Flexibility,Aliphatic_Index,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,750.0,Fluorescence intensity,3151.8362,-1.492308,11.608322,12.577710,0.483758,0.192308,1.021711,0.269231,...,0,0,0,0,0,0,0,0,12,0
1,1400.0,Fluorescence intensity,2785.1939,-1.272727,11.839377,5.637233,0.256238,0.136364,1.000826,0.272727,...,1,0,0,0,0,0,0,0,2,0
2,75.0,Relative Mean Fluorescence intensity (%),1439.6794,-3.710000,11.999968,6.608334,0.660833,0.000000,1.021405,0.100000,...,0,0,0,0,0,0,0,0,1,0
3,95.0,Relative Mean Fluorescence intensity (%),1439.6794,-3.710000,11.999968,6.608334,0.660833,0.000000,1.006857,0.100000,...,0,0,0,0,0,0,0,0,1,0
4,66.0,Relative Mean Fluorescence intensity (%),1439.6794,-3.710000,11.999968,6.608334,0.660833,0.000000,1.039310,0.100000,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
872,80.0,Cellular uptake (%),515.6103,-3.325000,11.999968,2.554897,0.638724,0.000000,,0.000000,...,0,0,0,0,0,0,0,0,2,0
873,424.0,Relative fluorescence (%),178.2095,1.050000,5.518123,-0.467050,-0.233525,0.000000,,0.000000,...,0,0,0,0,0,0,0,0,0,0
874,100.0,Relative cellular uptake (%),2600.6253,-3.215789,7.516013,0.216729,0.011407,0.000000,0.950467,0.000000,...,0,0,0,0,0,0,0,0,0,0
875,25.0,Mean Fluorescence intensity,3712.9716,-3.611538,11.999968,8.179901,0.314612,0.000000,0.967821,0.000000,...,0,0,0,0,0,0,0,0,0,0


In [175]:
# One-hot encoding for cell_line
if "cell_line" in df.columns:
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    cell_line_encoded = enc.fit_transform(df[['cell_line']])
    cell_line_feature_names = enc.get_feature_names_out(["cell_line"])
    X_cell_line = pd.DataFrame(cell_line_encoded, columns=cell_line_feature_names)

In [154]:
# Функция для удаления выбросов методом IQR
def remove_outliers(df, target_column):
    """
    Удаляет выбросы из числовых колонок методом межквартильного размаха (IQR).

    Аргументы:
    df — pandas DataFrame с числовыми признаками.
    target_column — название столбца с таргетом (raw_efficiency).

    Возвращает:
    Очищенный DataFrame без выбросов.
    """
    df_clean = df.copy()

    Q1 = df_clean['raw_efficiency'].quantile(0.25)
    Q3 = df_clean['raw_efficiency'].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df_clean = df_clean[(df_clean['raw_efficiency'] >= lower_bound) & (df_clean['raw_efficiency'] <= upper_bound)]

    return df_clean

In [176]:
# Фильтруем датафрейм по столбцу 'uptake_type'
X_numerical_filtered = X_numerical[X_numerical['uptake_type'].isin(['Mean Fluorescence intensity', 'Fluorescence intensity'])].copy()
# X_numerical_filtered = X_numerical_filtered[X_numerical_filtered['raw_efficiency'] <= 50000]

# Удаляем выбросы из числовых признаков
X_numerical_filtered_no_outliers = remove_outliers(X_numerical_filtered, 'raw_efficiency')

# Проверяем размер до и после удаления выбросов
print(f"Размер данных ДО удаления выбросов: {X_numerical_filtered.shape}")
print(f"Размер данных ПОСЛЕ удаления выбросов: {X_numerical_filtered_no_outliers.shape}")

# Получаем индексы отфильтрованных молекул
filtered_indices = X_numerical_filtered_no_outliers.index

# Фильтруем массивы эмбеддингов
blomap_embeddings_filtered = blomap_embeddings[filtered_indices]
fingerprints_embeddings_filtered = fingerprints_embeddings[filtered_indices]
protbert_embeddings_filtered = protbert_embeddings[filtered_indices]
X_cell_line_filtered = X_cell_line.loc[filtered_indices]

Размер данных ДО удаления выбросов: (312, 23)
Размер данных ПОСЛЕ удаления выбросов: (268, 23)


In [180]:
target = 'raw_efficiency'
rdkit_descriptors = X_numerical_filtered_no_outliers.drop(columns=['uptake_type', target])

In [181]:
imputer = SimpleImputer(strategy="mean")
rdkit_descriptors = pd.DataFrame(imputer.fit_transform(rdkit_descriptors))

In [182]:
rdkit_descriptors = rdkit_descriptors.reset_index(drop=True)
blomap_embeddings_filtered = pd.DataFrame(blomap_embeddings_filtered).reset_index(drop=True)
fingerprints_embeddings_filtered = pd.DataFrame(fingerprints_embeddings_filtered).reset_index(drop=True)
protbert_embeddings_filtered = pd.DataFrame(protbert_embeddings_filtered).reset_index(drop=True)
X_cell_line_filtered = pd.DataFrame(X_cell_line_filtered).reset_index(drop=True)

In [183]:
list_of_dfs = [
rdkit_descriptors,
blomap_embeddings_filtered,
fingerprints_embeddings_filtered,
protbert_embeddings_filtered
]

# Объединяем датафреймы из списка по столбцам
combined_df_concat = pd.concat(list_of_dfs, axis=1)

print("Успешно объединенный датафрейм (pd.concat):")
print(combined_df_concat)
print("\nИнформация об объединенном датафрейме:")
combined_df_concat.info()

Успешно объединенный датафрейм (pd.concat):
          0         1          2          3         4         5         6     \
0    3151.8362 -1.492308  11.608322  12.577710  0.483758  0.192308  1.021711   
1    2785.1939 -1.272727  11.839377   5.637233  0.256238  0.136364  1.000826   
2    2078.3658 -1.123529  11.824485   3.657547  0.215150  0.000000  0.983251   
3    3208.5313 -0.066667   4.139095  -2.395473 -0.079849  0.100000  0.993229   
4    2624.9800  0.100000   4.783081  -1.396580 -0.053715  0.000000  0.999597   
..         ...       ...        ...        ...       ...       ...       ...   
263   424.5177  0.266667   8.249713   0.532924  0.177641  0.333333  0.996992   
264   390.5015  0.833333   8.249713   0.532924  0.177641  0.000000  0.996992   
265    89.0932  1.800000   5.570017  -0.392198 -0.392198  0.000000  0.996992   
266  3712.9716 -3.611538  11.999968   8.179901  0.314612  0.000000  0.967821   
267  3712.9716 -3.611538  11.999968   8.179901  0.314612  0.000000  0.967821

In [184]:
X = pd.concat([rdkit_descriptors, X_cell_line_filtered], axis=1)
y = np.log1p(X_numerical_filtered_no_outliers[target]).reset_index(drop=True)

In [185]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [82]:
def apply_scaler (train, test):
    train.columns = train.columns.astype(str)
    test.columns = test.columns.astype(str)

    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = pd.DataFrame(scaler.fit_transform(train))
    test_scaled = pd.DataFrame(scaler.transform(test))
    return train_scaled, test_scaled

In [83]:
def apply_pca (X_train, X_test, threshold=0.95):
    pca = PCA(n_components=threshold, svd_solver='full')
    train_transformed = pd.DataFrame(pca.fit_transform(X_train))
    test_transformed = pd.DataFrame(pca.transform(X_test))
    dispersion=round((sum(pca.explained_variance_ratio_)*100),2)
    number_of_comp = pca.n_components_
    print(number_of_comp,dispersion)

    return train_transformed, test_transformed

In [186]:
X_train_scaled, X_test_scaled = apply_scaler (X_train, X_test)
X_train_transformed, X_test_transformed = apply_pca (X_train_scaled, X_test_scaled)

30 95.22


## Describe

In [18]:
y_train.describe()

count    240.000000
mean       5.263885
std        2.535125
min        0.000000
25%        3.179944
50%        5.354222
75%        7.191704
max       10.732781
Name: raw_efficiency, dtype: float64

In [19]:
y_test.describe()

count    61.000000
mean      5.693010
std       2.559058
min       0.000000
25%       3.713572
50%       6.274762
75%       7.650169
max      10.070738
Name: raw_efficiency, dtype: float64

In [20]:
X_train_transformed.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
count,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,...,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0
mean,-1.480297e-17,1.110223e-16,5.921189000000001e-17,-5.551115e-18,-2.312965e-18,5.551115e-18,-1.480297e-17,-1.050086e-16,-3.23815e-17,5.1810410000000004e-17,...,-6.938894e-18,3.4694470000000005e-17,2.2204460000000003e-17,3.515706e-17,-3.7932620000000004e-17,-2.2435760000000003e-17,-3.851086e-17,-2.312965e-17,-1.480297e-17,2.6830390000000003e-17
std,1.272786,0.8186397,0.7376091,0.5797919,0.5195548,0.4817326,0.4394338,0.4151679,0.3866189,0.3497617,...,0.182031,0.1768045,0.1649148,0.1535916,0.1521741,0.1477393,0.1413964,0.1388675,0.1348582,0.1280971
min,-1.96544,-1.379869,-2.864736,-1.686678,-1.405151,-2.585827,-1.129805,-0.9184562,-1.413026,-1.096961,...,-0.6527295,-0.6869134,-0.4855656,-0.4161874,-0.4479006,-0.6320621,-0.4223442,-0.3943208,-0.5615375,-0.3236971
25%,-0.8618459,-0.8219735,-0.2332418,-0.3275711,-0.2466079,-0.174987,-0.2481082,-0.2719088,-0.17008,-0.1637774,...,-0.1065281,-0.06893674,-0.09215177,-0.07090407,-0.06426667,-0.06689714,-0.0751658,-0.0633355,-0.07163601,-0.08053163
50%,-0.1773972,0.05699123,0.07311377,0.1218503,-0.03167169,0.008145083,-0.0002538494,-0.01635784,-0.009801625,-0.05251453,...,-0.006726567,-0.002213239,-0.02267681,-0.004670004,0.01262641,0.001057987,-0.003116924,-0.01551345,0.002177639,-0.007635684
75%,0.4316677,0.7688228,0.4255782,0.3018353,0.1754976,0.1868054,0.2780078,0.2646072,0.1519039,0.1450116,...,0.08635289,0.08054209,0.07386376,0.07731287,0.07334667,0.05460041,0.06505729,0.0445758,0.06900418,0.07286904
max,6.289197,1.704504,1.800595,1.637605,1.900389,1.663029,1.617632,0.9819365,0.817667,1.108351,...,0.4892399,0.4924286,0.6584128,0.7565124,0.3814527,0.4342774,0.5933998,0.8514134,0.4089468,0.5327979


In [21]:
X_test_transformed.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
count,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,...,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
mean,-0.12162,-0.132343,-0.09751,-0.04092,0.016798,0.023605,0.153955,0.013409,-0.065572,0.014037,...,-0.005066,-0.023699,-0.042413,-0.008293,-0.028573,-0.022994,-0.036142,0.02737,-0.007746,-0.007914
std,1.002221,0.820522,0.597391,0.72452,0.491218,0.568724,0.577743,0.389276,0.513935,0.377261,...,0.230022,0.220255,0.133789,0.138368,0.188349,0.133305,0.18393,0.148096,0.167668,0.150598
min,-2.244204,-1.385896,-1.738986,-2.206963,-1.106289,-2.161688,-0.835065,-0.754857,-2.082711,-1.463621,...,-0.68239,-0.573973,-0.484112,-0.376816,-0.505614,-0.335953,-0.587269,-0.225886,-0.736731,-0.42148
25%,-0.772537,-0.847,-0.415368,-0.316927,-0.348895,-0.184905,-0.177802,-0.278426,-0.190907,-0.161895,...,-0.127145,-0.118033,-0.106933,-0.090636,-0.140948,-0.102766,-0.133699,-0.060107,-0.066284,-0.109115
50%,-0.435549,-0.390371,-0.096181,0.077986,-0.027697,0.058828,0.068481,0.008451,-0.028296,0.037461,...,-0.006666,-0.007286,-0.058839,-0.010459,-0.003165,-0.011713,-0.029561,0.013217,-0.005586,0.000167
75%,0.331,0.381126,0.228356,0.296788,0.191613,0.216392,0.33223,0.200616,0.139683,0.201162,...,0.117236,0.089889,0.040441,0.070244,0.094913,0.060823,0.043551,0.082175,0.064995,0.085054
max,2.805501,1.822887,1.50323,1.462923,1.512882,2.146285,2.444992,0.756069,0.830455,0.900849,...,0.44896,0.400708,0.36166,0.281639,0.393675,0.334128,0.509213,0.669058,0.415241,0.343931


# Облучение

In [22]:
from sklearn.svm import SVR

In [23]:
# Определяем функцию для оценки модели
def evaluate_model(model, X_train, y_train, X_test, y_test):
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    train_r2 = r2_score(y_train, y_pred_train)
    train_rmse = mean_squared_error(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    test_rmse = mean_squared_error(y_test, y_pred_test)

    print(f'Train RMSE: {train_rmse:.4f}')
    print(f'Train R2: {train_r2:.4f}')
    print(f'Test RMSE: {test_rmse:.4f}')
    print(f'Test R2: {test_r2:.4f}')

In [24]:
# Инициализируем модель SVR и оцениваем её
def svr_learning(X_train, y_train, X_test, y_test):
    model = SVR()
    model.fit(X_train, y_train)
    evaluate_model(model, X_train, y_train, X_test, y_test)

In [25]:
def svr_optuna(X_train, y_train, X_test, y_test):
    # Целевая функция для оптимизации
    def objective(trial):
        params = {
            "C": trial.suggest_float("C", 1, 100, log=True),
            "epsilon": trial.suggest_float("epsilon", 1e-3, 1.0, log=True)
        }
        model = SVR(**params)
        score = cross_val_score(
            model,
            X_train,
            y_train,
            cv=5,
            scoring="neg_root_mean_squared_error",
            n_jobs=-1
        )
        return score.mean()

    # Создание и оптимизация исследования
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100)

    print("Лучшие гиперпараметры:", study.best_params)
    print("Лучшая средняя ошибка (CV):", -study.best_value)

    # Обучаем модель с лучшими гиперпараметрами
    best_model = SVR(**study.best_params)
    best_model.fit(X_train, y_train)

    # Оцениваем модель (предположим, у тебя есть функция evaluate_model)
    evaluate_model(best_model, X_train, y_train, X_test, y_test)

In [187]:
svr_learning(X_train_transformed, y_train, X_test_transformed, y_test)

Train RMSE: 2.3035
Train R2: 0.5289
Test RMSE: 3.8219
Test R2: 0.3075


In [188]:
svr_optuna(X_train_transformed, y_train, X_test_transformed, y_test)

[I 2025-04-10 03:13:42,804] A new study created in memory with name: no-name-d875e656-bcd6-41a4-b9e2-ae4a1976fa8e
[I 2025-04-10 03:13:42,832] Trial 0 finished with value: -1.8012620778131727 and parameters: {'C': 2.7089932116486493, 'epsilon': 0.006328929747113507}. Best is trial 0 with value: -1.8012620778131727.
[I 2025-04-10 03:13:42,859] Trial 1 finished with value: -1.8428806249840535 and parameters: {'C': 22.014065982971623, 'epsilon': 0.08152541153836422}. Best is trial 0 with value: -1.8012620778131727.
[I 2025-04-10 03:13:42,886] Trial 2 finished with value: -1.8012821635782053 and parameters: {'C': 1.7459262484456486, 'epsilon': 0.021590315566974466}. Best is trial 0 with value: -1.8012620778131727.
[I 2025-04-10 03:13:42,912] Trial 3 finished with value: -1.828705000900802 and parameters: {'C': 5.493207050044478, 'epsilon': 0.009905093340584195}. Best is trial 0 with value: -1.8012620778131727.
[I 2025-04-10 03:13:42,950] Trial 4 finished with value: -1.9183345773340548 and 

Лучшие гиперпараметры: {'C': 55.17712606830758, 'epsilon': 0.7888087519680829}
Лучшая средняя ошибка (CV): 1.7586157312151918
Train RMSE: 0.6110
Train R2: 0.8751
Test RMSE: 3.9514
Test R2: 0.2841


In [None]:
def svr_optuna(X_train, y_train, X_test, y_test):
    # Определяем распределение гиперпараметров для оптимизации
    model = SVR()
    param_distrs = {
        "epsilon": optuna.distributions.FloatDistribution(1e-3, 1),
        "C": optuna.distributions.FloatDistribution(1, 100)
    }

    # Инициализируем OptunaSearchCV для поиска лучших гиперпараметров
    opt_search = OptunaSearchCV(model, param_distrs, cv=5, n_trials=30, scoring='neg_root_mean_squared_error', n_jobs=-1, random_state=42, refit=True)
    opt_search.fit(X_train, y_train)

    # Выводим лучшие гиперпараметры и метрику
    print('Best hyperparameters:', opt_search.best_params_)
    print('Best neg MSE for cross validation:', opt_search.best_score_)

    # Обучаем модель с лучшими гиперпараметрами и оцениваем её
    best_model = SVR(**opt_search.best_params_)
    best_model.fit(X_train, y_train)
    evaluate_model(best_model, X_train, y_train, X_test, y_test)

In [57]:
svr_optuna(X_train_transformed, y_train, X_test_transformed, y_test)

  opt_search = OptunaSearchCV(model, param_distrs, cv=5, n_trials=30, scoring='neg_root_mean_squared_error', n_jobs=1, random_state=42, refit=True)
[I 2025-04-08 17:55:52,933] A new study created in memory with name: no-name-7a95c365-bb58-476d-be84-f83a902c4c11
[I 2025-04-08 17:55:52,981] Trial 0 finished with value: -2.3240043517871554 and parameters: {'epsilon': 0.45524635116915074, 'C': 83.43057119918102}. Best is trial 0 with value: -2.3240043517871554.
[I 2025-04-08 17:55:53,026] Trial 1 finished with value: -2.347738995599049 and parameters: {'epsilon': 0.0699822895327306, 'C': 32.62527785293697}. Best is trial 0 with value: -2.3240043517871554.
[I 2025-04-08 17:55:53,067] Trial 2 finished with value: -2.296352759784128 and parameters: {'epsilon': 0.4857020057829874, 'C': 30.52790150514794}. Best is trial 2 with value: -2.296352759784128.
[I 2025-04-08 17:55:53,111] Trial 3 finished with value: -2.308963821457725 and parameters: {'epsilon': 0.21321384771823035, 'C': 17.5079369590

RecursionError: maximum recursion depth exceeded