In [1]:
from kan import KAN
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
import torch
from sklearn.utils import shuffle
import random
import numpy as np
# Explicitly set PyTorch's default floating-point dtype to float32.
torch.set_default_dtype(torch.float32)

In [2]:
protein_feature_train = []

# Parse the tab-separated feature file for the training set: one row of
# floats per data line.
with open('/tmp/pycharm_project_763/feature/feature_train/phychem_train.txt', 'r') as handle:
    for raw_line in handle:
        record = raw_line.strip()
        # Skip identifier lines (those starting with '>') and blank lines;
        # a blank line would otherwise reach float('') and raise ValueError.
        if not record or record.startswith('>'):
            continue
        # Convert every tab-separated field of the line to a float.
        protein_feature_train.append([float(field) for field in record.split('\t')])

In [3]:
protein_feature_test = []

# Parse the tab-separated feature file for the test set: one row of floats
# per data line.
with open('/tmp/pycharm_project_763/feature/feature_test/phychem_test.txt', 'r') as handle:
    for raw_line in handle:
        record = raw_line.strip()
        # Skip identifier lines (those starting with '>') and blank lines;
        # a blank line would otherwise reach float('') and raise ValueError.
        if not record or record.startswith('>'):
            continue
        # Convert every tab-separated field of the line to a float.
        protein_feature_test.append([float(field) for field in record.split('\t')])

In [4]:
import pandas as pd
# Training table: pull the 'sequence' and 'label' columns out as plain lists.
df = pd.read_csv('/tmp/pycharm_project_763/data/trainCPP.csv')
sequences, y = df['sequence'].tolist(), df['label'].tolist()

# Test table: same two columns, same treatment.
df2 = pd.read_csv('/tmp/pycharm_project_763/data/testCPP.csv')
sequences_test, y_test = df2['sequence'].tolist(), df2['label'].tolist()

In [5]:
from sklearn.utils import shuffle
# Shuffle features and labels together so every feature row stays paired
# with its label (fixed seed for reproducibility).
X_train, y_train = shuffle(protein_feature_train, y, random_state=42)
# Take the test labels straight from df2 rather than from y_test: this cell
# rebinds y_test, so feeding y_test back in made the cell non-idempotent —
# re-running it paired the unshuffled features with already-shuffled labels.
X_test, y_test = shuffle(protein_feature_test, df2['label'].tolist(), random_state=42)

In [6]:
# Convert the shuffled feature/label containers to NumPy arrays in one pass.
X_train, X_test, y_train, y_test = map(np.array, (X_train, X_test, y_train, y_test))

In [7]:
# Display the dimensions of the training feature array as a sanity check.
X_train.shape

(1164, 24)

In [8]:
# Reshape the label arrays into single-column form: (number of rows, 1).
y_train = y_train.reshape(len(y_train), 1)
y_test = y_test.reshape(len(y_test), 1)

In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
# Hyper-parameter search space for the MLP grid search:
# 3 layer layouts x 2 activations x 2 solvers x 2 alpha values x 2 learning-rate
# schedules = 48 candidate combinations.
param_grid = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant', 'adaptive'],
)


In [10]:

# Base estimator for the grid search: capped at 30 training iterations,
# with a fixed seed.
mlp = MLPClassifier(
    random_state=1,
    max_iter=30,
)

In [11]:
# Build the GridSearchCV object: every combination in param_grid is scored
# with 3-fold cross-validation, using all available cores.
grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, n_jobs=-1, cv=3, verbose=2)
# y_train is an (n, 1) column vector at this point; flatten it to 1-D so
# scikit-learn does not emit a DataConversionWarning (column_or_1d) on every fit.
grid_search.fit(X_train, y_train.ravel())

Fitting 3 folds for each of 48 candidates, totalling 144 fits


  y = column_or_1d(y, warn=True)


GridSearchCV(cv=3, estimator=MLPClassifier(max_iter=30, random_state=1),
             n_jobs=-1,
             param_grid={'activation': ['tanh', 'relu'],
                         'alpha': [0.0001, 0.05],
                         'hidden_layer_sizes': [(50,), (100,), (50, 50)],
                         'learning_rate': ['constant', 'adaptive'],
                         'solver': ['sgd', 'adam']},
             verbose=2)

In [12]:
# Keep the best estimator found by the search and report its parameters.
best_mlp = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant', 'solver': 'adam'}


In [13]:
# Predict labels for both splits with the best model from the grid search.
predictions_train, predictions_test = (
    best_mlp.predict(split) for split in (X_train, X_test)
)

In [14]:
# Accuracy of the current predictions on each split.
train_accuracy, test_accuracy = (
    accuracy_score(y_train, predictions_train),
    accuracy_score(y_test, predictions_test),
)

print(f'Train Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Train Accuracy: 0.7457
Test Accuracy: 0.7124


In [14]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Hand-configured network: one hidden layer of 100 units trained with SGD.
# Note: this rebinds the name `mlp` that was earlier passed to GridSearchCV.
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    solver='sgd',
    learning_rate_init=0.1,
    alpha=1e-4,
    max_iter=300,
    random_state=1,
    verbose=10,  # prints the loss at every iteration
)

In [15]:
# y_train is an (n, 1) column vector; pass it flattened to 1-D so
# scikit-learn does not raise a DataConversionWarning (column_or_1d).
mlp.fit(X_train, y_train.ravel())

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.71159803
Iteration 2, loss = 0.66928930
Iteration 3, loss = 0.65123909
Iteration 4, loss = 0.63663658
Iteration 5, loss = 0.62767529
Iteration 6, loss = 0.61979814
Iteration 7, loss = 0.61249474
Iteration 8, loss = 0.60339833
Iteration 9, loss = 0.59612704
Iteration 10, loss = 0.59018149
Iteration 11, loss = 0.58316539
Iteration 12, loss = 0.57642724
Iteration 13, loss = 0.57137561
Iteration 14, loss = 0.56464076
Iteration 15, loss = 0.56007672
Iteration 16, loss = 0.55377403
Iteration 17, loss = 0.54935112
Iteration 18, loss = 0.54339061
Iteration 19, loss = 0.53814306
Iteration 20, loss = 0.53493608
Iteration 21, loss = 0.52850590
Iteration 22, loss = 0.52281338
Iteration 23, loss = 0.51865820
Iteration 24, loss = 0.51370225
Iteration 25, loss = 0.50791669
Iteration 26, loss = 0.50459006
Iteration 27, loss = 0.50010764
Iteration 28, loss = 0.49475367
Iteration 29, loss = 0.49009283
Iteration 30, loss = 0.48738877
Iteration 31, loss = 0.48313797
Iteration 32, los

MLPClassifier(learning_rate_init=0.1, max_iter=300, random_state=1,
              solver='sgd', verbose=10)

In [16]:
# Hard label predictions of the fitted `mlp` for both splits.
predictions_train, predictions_test = mlp.predict(X_train), mlp.predict(X_test)

# Compute accuracy for each split.
train_accuracy = accuracy_score(y_true=y_train, y_pred=predictions_train)
test_accuracy = accuracy_score(y_true=y_test, y_pred=predictions_test)

print(f'Train Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Train Accuracy: 0.9545
Test Accuracy: 0.8027


In [17]:
from sklearn.metrics import roc_auc_score, confusion_matrix, matthews_corrcoef, recall_score

# Extended evaluation of `mlp`. Reuses predictions_train / predictions_test and
# train_accuracy / test_accuracy, which are not computed in this cell.

# Scores for the ROC curve: column 1 of predict_proba
# (presumably the positive class — confirm against mlp.classes_).
probabilities_train = mlp.predict_proba(X_train)[:, 1]
probabilities_test = mlp.predict_proba(X_test)[:, 1]
train_auc, test_auc = (
    roc_auc_score(y_train, probabilities_train),
    roc_auc_score(y_test, probabilities_test),
)

# Confusion matrices from the hard predictions.
cm_train = confusion_matrix(y_train, predictions_train)
cm_test = confusion_matrix(y_test, predictions_test)

# Specificity (SP) = TN / (TN + FP); the flattened matrix is unpacked as
# tn, fp, fn, tp, which requires it to be 2x2 (exactly two classes).
tn_train, fp_train, fn_train, tp_train = cm_train.flatten()
tn_test, fp_test, fn_test, tp_test = cm_test.flatten()
train_specificity = tn_train / (fp_train + tn_train)
test_specificity = tn_test / (fp_test + tn_test)

# Matthews correlation coefficient (MCC).
train_mcc, test_mcc = (
    matthews_corrcoef(y_train, predictions_train),
    matthews_corrcoef(y_test, predictions_test),
)

# Sensitivity (SN) is the recall of the positive class.
train_sensitivity, test_sensitivity = (
    recall_score(y_train, predictions_train),
    recall_score(y_test, predictions_test),
)

# Report every metric, train first and test second.
print(f'Train Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')
print(f'Train AUC: {train_auc:.4f}')
print(f'Test AUC: {test_auc:.4f}')
print(f'Train Specificity: {train_specificity:.4f}')
print(f'Test Specificity: {test_specificity:.4f}')
print(f'Train MCC: {train_mcc:.4f}')
print(f'Test MCC: {test_mcc:.4f}')
print(f'Train Sensitivity: {train_sensitivity:.4f}')
print(f'Test Sensitivity: {test_sensitivity:.4f}')

Train Accuracy: 0.9545
Test Accuracy: 0.8027
Train AUC: 0.9931
Test AUC: 0.8762
Train Specificity: 0.9296
Test Specificity: 0.7450
Train MCC: 0.9101
Test MCC: 0.6092
Train Sensitivity: 0.9794
Test Sensitivity: 0.8600
