<a href="https://colab.research.google.com/github/xuanwWu11/bishe/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

data = pd.read_csv('/content/df1.txt',delimiter='\t')
amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
amino_acid_to_int = {acid: i for i, acid in enumerate(amino_acids)}

def sequence_to_int(sequence):
    encoded_sequence = []
    for acid in sequence:
        if acid in amino_acid_to_int:
            encoded_sequence.append(amino_acid_to_int[acid])
        else:
            # 如果氨基酸不在编码字典中，则将其编码为一个特殊值（例如，最后一个索引）
            encoded_sequence.append(len(amino_acids) - 1)
    return encoded_sequence
data['Sequence_encoded'] = data['Sequence.window'].apply(sequence_to_int)
X = pad_sequences(data['Sequence_encoded'])
y = data['Label']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [4]:
data

Unnamed: 0,Sequence.window,Label,Sequence_encoded
0,VASVESSSGEAFHVGKTPIVGQPSIPGGPVR,0,"[17, 0, 15, 17, 3, 15, 15, 15, 5, 3, 0, 4, 6, ..."
1,SLLAPLNVELDPEIQKVRAQEREQIKALNNK,0,"[15, 9, 9, 0, 12, 9, 11, 17, 3, 9, 2, 12, 3, 7..."
2,LSSGTLVPGSPTLPAKPSPSPGRAQEPAPRS,1,"[9, 15, 15, 5, 16, 9, 17, 12, 5, 15, 12, 16, 9..."
3,DAEKAFDKIQQPFMLKTLNKLGIDGMYLKII,1,"[2, 0, 3, 8, 0, 4, 2, 8, 7, 13, 13, 12, 4, 10,..."
4,AFDKIQQPFMLKTLNKLGIDGMYLKIIRAIY,1,"[0, 4, 2, 8, 7, 13, 13, 12, 4, 10, 9, 8, 16, 9..."
...,...,...,...
4710,RSVKANGQVSLPHFPRTHRLPKEMTPVEPAT,1,"[14, 15, 17, 8, 0, 11, 5, 13, 17, 15, 9, 12, 6..."
4711,RIVSRGRTQLFSLNPRSGTLVTAGRIDREEL,1,"[14, 7, 17, 15, 14, 5, 14, 16, 13, 9, 4, 15, 9..."
4712,WPSSGGSEPSVTVPLRSMSDPDQDFDKEPDS,1,"[18, 12, 15, 15, 5, 5, 15, 3, 12, 15, 17, 16, ..."
4713,GPSQPTKWSHLATEARALARSHRDTATKIAA,0,"[5, 12, 15, 13, 12, 16, 8, 18, 15, 6, 9, 0, 16..."


In [None]:
# 划分训练集、验证集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# 初始化模型
model = Sequential()
model.add(Embedding(input_dim=np.max(X_train) + 1, output_dim=128))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])

# 训练模型
history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_val, y_val), verbose=2)

# 使用StratifiedKFold进行交叉验证
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 初始化ROC曲线参数
mean_fpr = np.linspace(0, 1, 100)
tprs = []
aucs = []

# 绘制ROC曲线
plt.figure()  # 创建新的图形窗口
for train_index, val_index in skf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    model.fit(X_train_fold, y_train_fold, epochs=5, batch_size=64, verbose=0)
    y_score = model.predict(X_val_fold)
    fpr, tpr, _ = roc_curve(y_val_fold, y_score)
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = roc_auc_score(y_val_fold, y_score)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3)

# 计算平均AUC和绘制平均ROC曲线
mean_tpr = np.mean(tprs, axis=0)
mean_auc = roc_auc_score(y_val_fold, y_score)
plt.plot(mean_fpr, mean_tpr, 'b', label='ROC (AUC = %0.2f)' % mean_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.savefig('roc_curve.pdf', format='pdf')
plt.show()



Epoch 1/100
45/45 - 13s - loss: 0.6126 - accuracy: 0.7112 - val_loss: 0.5807 - val_accuracy: 0.7381 - 13s/epoch - 294ms/step
Epoch 2/100
45/45 - 9s - loss: 0.5913 - accuracy: 0.7215 - val_loss: 0.5768 - val_accuracy: 0.7381 - 9s/epoch - 192ms/step
Epoch 3/100
45/45 - 9s - loss: 0.5889 - accuracy: 0.7215 - val_loss: 0.5811 - val_accuracy: 0.7381 - 9s/epoch - 208ms/step
Epoch 4/100
45/45 - 8s - loss: 0.5815 - accuracy: 0.7222 - val_loss: 0.5657 - val_accuracy: 0.7402 - 8s/epoch - 169ms/step
Epoch 5/100
45/45 - 9s - loss: 0.5670 - accuracy: 0.7229 - val_loss: 0.5509 - val_accuracy: 0.7370 - 9s/epoch - 202ms/step
Epoch 6/100
45/45 - 9s - loss: 0.5600 - accuracy: 0.7310 - val_loss: 0.5489 - val_accuracy: 0.7402 - 9s/epoch - 211ms/step
Epoch 7/100
45/45 - 8s - loss: 0.5574 - accuracy: 0.7356 - val_loss: 0.5549 - val_accuracy: 0.7296 - 8s/epoch - 167ms/step
Epoch 8/100
45/45 - 10s - loss: 0.5568 - accuracy: 0.7335 - val_loss: 0.5448 - val_accuracy: 0.7370 - 10s/epoch - 228ms/step
Epoch 9/100
