In [60]:
import pandas as pd
import json
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import f1_score, make_scorer, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from joblib import dump
from evaluations import plot_classification_performance, plot_regression_performance

Использование индексов для валидации из статьи Clark et al.doi: 10.1038/s42004-023-01037-7

In [3]:
with open('CVIDS.json', 'r') as file:
    cvids = json.load(file)

# Преобразование индексов в список для каждого фолда
cv_indices = []
for train_indices, test_indices in [cvids[str(i)] for i in range(5)]:
    train_indices = list(map(int, train_indices))
    test_indices = list(map(int, test_indices))
    cv_indices.append([train_indices, test_indices])

# Список чисел, которые нужно пропустить, мы их удалили из датасета (в них цистеин мутировал)
skip_numbers = [461, 462, 463, 631, 632, 1366, 1367, 1368, 1536, 1537]

# Функция для перенумерации чисел
def renumber_lists(lists, skip_numbers):
    for sublist in lists:
        for inner_list in sublist:
            for i in range(len(inner_list)):
                # Находим количество чисел в skip_numbers, которые меньше текущего числа
                skip_count = sum(1 for num in skip_numbers if num < inner_list[i])
                # Уменьшаем число на количество пропущенных чисел
                inner_list[i] -= skip_count
                # Уменьшаем число на 1, чтобы начать нумерацию с 0
                inner_list[i] -= 1
    return lists

cv_indices = renumber_lists(cv_indices, skip_numbers)

Загрузка данных

In [26]:
abag_data = pd.read_csv('../feature_engineering/features/AbAg_features.csv', index_col='ID')

# AbRFC

In [17]:
def perform_custom_cv_clf(model, X, y, cv_indices):

    # Определение метрик: хотим F1-measure и MCC
    scorers = {
        'f1': make_scorer(f1_score, average='binary'),
        'mcc': make_scorer(matthews_corrcoef),
        'auc': make_scorer(roc_auc_score, needs_proba=True)
    }

    # Выполнение кросс-валидации для каждой метрики
    for name, scorer in scorers.items():
        scores = cross_val_score(model, X, y, cv=cv_indices, scoring=scorer, n_jobs=-1)
        print(f"Среднее значение {name}: {scores.mean()}")

In [19]:
abag_data['class'] = abag_data['ddG'].apply(lambda x: 0 if x > -0.21 else 1) # 0 - безвредная, 1 - вредная
y = abag_data['class']
X = abag_data.drop(['ddG', 'class'], axis=1)
X

Unnamed: 0_level_0,Hyd_Hyd_4,Hyd_Pos_4,Hyd_Neg_4,Hyd_Acc_4,Hyd_Don_4,Hyd_Aro_4,Hyd_Sul_4,Hyd_Neu_4,Pos_Hyd_4,Pos_Pos_4,...,∆Neu,∆AAvolume,∆AAhydropathy,∆AAarea,∆AAweight,∆AAcharge,∆AAflexibily,∆AAchemical,∆AAsize,∆AAhbonds
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,91.0,6,6,45,52,85,1,64,6,0.0,...,-1,-22.5,5.3,-35.0,-44.010,1.0,-17.0,-2.0,-1.0,-3.0
2,89.0,9,16,64,57,69,0,94,9,1.0,...,0,-78.1,-2.0,-55.0,-42.081,0.0,-8.0,0.0,-3.0,0.0
3,57.0,12,7,63,60,13,0,101,12,2.0,...,-1,-25.5,5.3,-45.0,-43.025,0.0,-35.0,-3.0,-1.0,-2.0
4,201.0,16,9,97,103,246,0,116,16,1.0,...,-2,-27.5,2.5,-25.0,-30.026,0.0,-2.0,-5.0,-1.0,-2.0
5,142.0,15,4,100,94,173,0,101,15,3.0,...,-2,-27.5,2.5,-25.0,-30.026,0.0,-2.0,-5.0,-1.0,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1806,161.0,0,6,66,76,224,0,96,0,1.0,...,2,27.5,-2.5,25.0,30.026,-0.0,2.0,5.0,1.0,2.0
1807,184.0,2,7,72,73,255,0,122,2,0.0,...,0,105.0,-3.1,115.0,92.097,-0.0,17.0,6.0,4.0,2.0
1808,164.0,0,5,60,72,221,1,97,0,0.0,...,0,105.0,-3.1,115.0,92.097,-0.0,17.0,6.0,4.0,2.0
1809,306.0,0,6,95,115,483,5,122,0,0.0,...,0,105.0,-3.1,115.0,92.097,-0.0,17.0,6.0,4.0,2.0


In [20]:
# параметры из статьи Clark et al. , только random_state с None на 42 поменял
clf_params = {"bootstrap": True, "class_weight": None, "criterion": "entropy", "max_depth": 50, "max_features": "sqrt", "max_leaf_nodes": None, "min_impurity_decrease": 0.0, "min_samples_leaf": 10, "min_samples_split": 10, "min_weight_fraction_leaf": 0.0, "n_estimators": 1000, "n_jobs": None, "oob_score": False, "random_state": 42, "verbose": 0, "warm_start": False}

In [21]:
AbRFC_from_article = RandomForestClassifier(**clf_params)
AbRFC_from_article.fit(X, y)

In [22]:
dump(AbRFC_from_article, 'AbRFC_from_article.joblib')

['AbRFC_from_article.joblib']

In [23]:
perform_custom_cv_clf(AbRFC_from_article, X, y, cv_indices)



Среднее значение f1: 0.7863356521781405
Среднее значение mcc: 0.289626203701541
Среднее значение auc: 0.6725439581363543


In [24]:
f1_scorer = make_scorer(f1_score, average='binary')

param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'bootstrap': [True]
}

AbRFC = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=AbRFC, param_grid=param_grid, cv=cv_indices, scoring=f1_scorer, n_jobs=-1)
grid_search.fit(X, y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Лучшие параметры: {best_params}")
print(f"Лучшая оценка: {best_score}")

Лучшие параметры: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Лучшая оценка: 0.7888496073173672


In [25]:
AbRFС = RandomForestClassifier(**best_params)
AbRFС.fit(X, y)

In [26]:
dump(AbRFC, 'AbRFC.joblib')

['AbRFC.joblib']

In [27]:
perform_custom_cv_clf(AbRFС, X, y, cv_indices)



Среднее значение f1: 0.778502837602464
Среднее значение mcc: 0.2701564900212405
Среднее значение auc: 0.6446701870994053


# AbRFR

In [None]:
y = abag_data['ddG']
X = abag_data.drop(['ddG', 'class'], axis=1)

In [None]:
# Функция для вычисления корреляции Пирсона
def pearson_correlation(y_true, y_pred):
    correlation, _ = pearsonr(y_true, y_pred)
    return correlation
pearson_scorer = make_scorer(pearson_correlation, greater_is_better=True)

In [None]:
def perform_custom_cv_reg(model, X, y):

    # Функция для вычисления корреляции Пирсона
    def pearson_correlation(y_true, y_pred):
        correlation, _ = pearsonr(y_true, y_pred)
        return correlation
    # Создаем скорер для корреляции Пирсона
    pearson_scorer = make_scorer(pearson_correlation, greater_is_better=True)

    # Определение метрик: хотим r2, rmse и коэффициент Пирсона смотреть
    scorers = {
        'r2': make_scorer(r2_score),
        'rmse': make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)), greater_is_better=False),
        'pearson': pearson_scorer
    }

    # Выполнение кросс-валидации для каждой метрики
    for name, scorer in scorers.items():
        scores = cross_val_score(model, X, y, cv=cv_indices, scoring=scorer, n_jobs=-1)
        print(f"Среднее значение {name}: {scores.mean()}")

In [None]:
randomsearch_pearson = {'warm_start': True, 'random_state': None, 'oob_score': True, 'n_estimators': 500, 'min_weight_fraction_leaf': 0.0, 'min_samples_split': 5, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.3, 'max_leaf_nodes': None, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': True}

In [None]:
randomsearch_rmse = {'warm_start': True, 'random_state': 42, 'oob_score': True, 'n_estimators': 500, 'min_weight_fraction_leaf': 0.3, 'min_samples_split': 5, 'min_samples_leaf': 10, 'min_impurity_decrease': 0.0, 'max_leaf_nodes': None, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}

In [43]:
# из статьи параметры
reg_params = {"bootstrap": True, "max_depth": 50, "max_features": "sqrt", "max_leaf_nodes": None, "min_impurity_decrease": 0.0, "min_samples_leaf": 10, "min_samples_split": 10, "min_weight_fraction_leaf": 0.0, "n_estimators": 1000, "n_jobs": None, "oob_score": False, "random_state": 42, "verbose": 0, "warm_start": False}

In [36]:
y = abag_data['ddG']
X = abag_data.drop(['ddG'], axis=1)

In [47]:
my_regparams = {'bootstrap': False, 'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
AbRFR = RandomForestRegressor(**my_regparams)
AbRFR.fit(X, y)

In [48]:
dump(AbRFR, 'AbRFR.joblib')

['AbRFR.joblib']

In [55]:
AbRFR_from_article = RandomForestRegressor(**reg_params)
AbRFR_from_article.fit(X, y)


In [56]:
dump(AbRFR_from_article, 'AbRFR_from_article.joblib')

['AbRFR_from_article.joblib']

Тест на одном из фолдов кросс-валидации

In [49]:
# Создание обучающего и тестового наборов на основе индексов из кросс-валидации из статьи Clark
train_indices, test_indices = cv_indices[4][0], cv_indices[4][1]

X_train = abag_data.iloc[train_indices]
X_test = abag_data.iloc[train_indices]
y_train = abag_data.iloc[train_indices]['ddG']
y_test = abag_data.iloc[test_indices]['ddG']

In [52]:
AbRFR_from_article_4fold = RandomForestRegressor(**reg_params)
AbRFR_from_article_4fold.fit(X_train, y_train)

In [54]:
dump(AbRFR_from_article_4fold, 'AbRFR_from_article_4fold.joblib')

['AbRFR_from_article_4fold.joblib']