In [1]:
# ! python featurizer.py

In [2]:
import os

# TF_CPP_MIN_LOG_LEVEL must be in the environment before TensorFlow is first
# loaded. The `tool` imports below appear to load TensorFlow (its start-up log
# is emitted by this cell), so the variable is set ahead of them; set any later
# and the INFO/WARNING lines are not filtered.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tool import model as md
from tool import config as cfg

def load_and_preprocess_data():
    """
    Load the dataset, attach the precomputed features and encode the labels.

    The CSV at ``cfg.DATA_PATH`` is left-joined on 'uniprot_id' with the
    feature table ``feature_esm2.feather`` found under ``cfg.FEATURE_PATH``.
    Rows for which no feature was found ('f1' is null after the join) are
    dropped.

    Args:
        None.

    Returns:
        pd.DataFrame: The merged dataset. Its columns are the CSV columns
        followed by the feature columns, including:
            - 'uniprot_id': UniProt ID (renamed from 'Entry').
            - 'seq': Sequence (renamed from 'Sequence').
            - 'label': Integer class id produced by LabelEncoder. This
              overwrites the original label values; no separate column is
              added, so positional column offsets stay unchanged.
            - 'f1', ...: Feature values from the feather file.
    """
    dataset = pd.read_csv(cfg.DATA_PATH)
    feature = pd.read_feather(f'{cfg.FEATURE_PATH}feature_esm2.feather')
    dataset = dataset.rename(columns={'Entry': 'uniprot_id', 'Sequence': 'seq'})
    data_df = dataset.merge(feature, on='uniprot_id', how='left')

    # Take an explicit copy of the filtered rows: assigning a column into a
    # boolean-filtered frame is the pattern that raises SettingWithCopyWarning.
    data_df = data_df[data_df['f1'].notnull()].copy()
    data_df['label'] = LabelEncoder().fit_transform(data_df['label'])

    return data_df

def reshape_features(data):
    """
    Reshape input data to have 3 dimensions.

    Args:
        data (array-like): Input data to be reshaped. Anything ``np.array``
            accepts (ndarray, DataFrame, nested lists); the first axis is the
            sample axis.

    Returns:
        np.ndarray: Reshaped copy of the data with shape
        (n_samples, 1, n_features), where n_features is the product of all
        trailing dimensions (1 for one-dimensional input).
    """
    # Convert first so the shape is read from the array, not from the raw
    # input (plain lists have no `.shape`).
    arr = np.array(data)
    n_samples = arr.shape[0]
    # Spell out the feature width instead of using -1, which NumPy cannot
    # resolve when there are zero samples.
    n_features = int(np.prod(arr.shape[1:]))
    return arr.reshape(n_samples, 1, n_features)


2024-02-22 03:01:17.478879: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-22 03:01:17.478922: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-22 03:01:17.478957: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-22 03:01:17.491311: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load and preprocess the dataset.
dataset = load_and_preprocess_data()

# Create the output locations up front so a missing directory cannot fail the
# run after training has already been done.
cv_labels_path = 'output/cv_labels.xlsx'
folder_path = "model/"
os.makedirs(os.path.dirname(cv_labels_path), exist_ok=True)
os.makedirs(folder_path, exist_ok=True)

best_val_accuracy = 0.0
best_model_path = ''

# StratifiedKFold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Using the Excel writer as a context manager closes the workbook even when a
# fold raises, so the sheets already written for finished folds are not lost.
with pd.ExcelWriter(cv_labels_path, engine='xlsxwriter') as excel_writer:
    # Columns from position 3 onwards are used as the model features.
    for fold, (train_index, val_index) in enumerate(skf.split(dataset.iloc[:, 3:], dataset['label'])):
        print(f"\nFold {fold + 1}:")

        train_data, vali_data = dataset.iloc[train_index], dataset.iloc[val_index]

        X_train = reshape_features(train_data.iloc[:, 3:])
        X_val = reshape_features(vali_data.iloc[:, 3:])

        print(f'X_train shape: {X_train.shape}')
        print(f'X_val shape: {X_val.shape}')

        # A fresh model is built for every fold so no weights carry over.
        gru_attention_model = md.GRUWithAttentionModel(input_shape=cfg.INPUT_SHAPE, num_classes=cfg.NUM_CLASSES)
        gru_attention_model.compile_model()

        history = gru_attention_model.train(
            X_train,
            train_data['label'],
            X_val,
            vali_data['label'],
            batch_size=cfg.BATCH_SIZE,
            epochs=cfg.EPOCHS
        )

        # Track and save only the best model, judged by the validation
        # accuracy of the last recorded epoch.
        # NOTE(review): if train() restores earlier "best" weights, the last
        # entry may not describe the weights being saved - confirm.
        final_val_accuracy = history.history['val_accuracy'][-1]
        if final_val_accuracy > best_val_accuracy:
            best_val_accuracy = final_val_accuracy
            best_model_path = f'{folder_path}deepsub_new.h5'
            gru_attention_model.save_model(best_model_path)

        # eval
        val_predictions = gru_attention_model.model.predict(X_val, batch_size=cfg.BATCH_SIZE)
        ground_truth_labels = vali_data['label'].values
        predicted_labels = np.argmax(val_predictions, axis=1)

        # Export data to different sheets. Sheet names are zero-based
        # (fold0 ... fold9), unlike the 1-based fold number printed above.
        export_data = pd.DataFrame({'GroundTruth': ground_truth_labels, 'PredictedLabels': predicted_labels})
        sheet_name = f'fold{fold}'
        export_data.to_excel(excel_writer, sheet_name=sheet_name, index=False)

print("\nTrain Over")


Fold 1:
X_train shape: (86644, 1, 1280)
X_val shape: (9628, 1, 1280)


2024-02-22 03:01:45.314082: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 31277 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:73:00.0, compute capability: 8.6
2024-02-22 03:01:46.064580: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Epoch 1/200


2024-02-22 03:01:57.777702: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700
2024-02-22 03:01:58.198902: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f591c00abc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-22 03:01:58.198959: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6
2024-02-22 03:01:58.213799: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-02-22 03:01:58.436892: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 7

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score
import numpy as np
from sklearn.metrics import classification_report

# Per-fold scores, plus the raw label arrays of every fold (kept as lists of
# arrays so they can be pooled afterwards).
accuracies, f1_scores, recalls = [], [], []
all_ground_truth_labels = []
all_predicted_labels = []

for fold in range(10):
    print(f"\nFold {fold + 1}:")

    # Read this fold's ground-truth and predicted labels from its sheet.
    sheet = pd.read_excel('output/cv_labels.xlsx', sheet_name=f'fold{fold}')
    y_true = sheet['GroundTruth'].values
    y_pred = sheet['PredictedLabels'].values

    # Keep the fold's labels for the pooled report.
    all_ground_truth_labels.append(y_true)
    all_predicted_labels.append(y_pred)

    # Accuracy, weighted F1 and weighted recall for this fold.
    fold_accuracy = accuracy_score(y_true, y_pred)
    fold_f1 = f1_score(y_true, y_pred, average='weighted')
    fold_recall = recall_score(y_true, y_pred, average='weighted')

    for bucket, score in (
        (accuracies, fold_accuracy),
        (f1_scores, fold_f1),
        (recalls, fold_recall),
    ):
        bucket.append(score)

    print(f'Accuracy: {fold_accuracy:.4f}')
    print(f'F1 Score: {fold_f1:.4f}')
    print(f'Recall: {fold_recall:.4f}')

# Mean of each metric across the cross-validation folds.
avg_accuracy, avg_f1, avg_recall = (
    np.mean(scores) for scores in (accuracies, f1_scores, recalls)
)

print(f'Average Accuracy Across Folds: {avg_accuracy:.4f}')
print(f'Average F1 Score Across Folds: {avg_f1:.4f}')
print(f'Average Recall Across Folds: {avg_recall:.4f}')




Fold 1:
Accuracy: 0.9778
F1 Score: 0.9777
Recall: 0.9778

Fold 2:
Accuracy: 0.9758
F1 Score: 0.9757
Recall: 0.9758

Fold 3:
Accuracy: 0.9759
F1 Score: 0.9758
Recall: 0.9759

Fold 4:
Accuracy: 0.9758
F1 Score: 0.9758
Recall: 0.9758

Fold 5:
Accuracy: 0.9758
F1 Score: 0.9757
Recall: 0.9758

Fold 6:
Accuracy: 0.9744
F1 Score: 0.9744
Recall: 0.9744

Fold 7:
Accuracy: 0.9743
F1 Score: 0.9742
Recall: 0.9743

Fold 8:
Accuracy: 0.9756
F1 Score: 0.9755
Recall: 0.9756

Fold 9:
Accuracy: 0.9746
F1 Score: 0.9746
Recall: 0.9746

Fold 10:
Accuracy: 0.9769
F1 Score: 0.9768
Recall: 0.9769
Average Accuracy Across Folds: 0.9757
Average F1 Score Across Folds: 0.9756
Average Recall Across Folds: 0.9757


In [None]:
# Merge the per-fold label arrays into one flat array each.
# Both names are rebound to the merged result, so only merge while they are
# still lists: re-running this cell would otherwise pass a 1-D array to
# np.concatenate, which raises ValueError.
if isinstance(all_ground_truth_labels, list):
    all_ground_truth_labels = np.concatenate(all_ground_truth_labels)
if isinstance(all_predicted_labels, list):
    all_predicted_labels = np.concatenate(all_predicted_labels)

In [None]:
print("\nOverall Classification Report:")
# Build the report as a dict, then transpose so each class / summary entry is
# a row and precision, recall, f1-score and support are the columns.
report_dict = classification_report(
    all_ground_truth_labels,
    all_predicted_labels,
    output_dict=True,
)
report = pd.DataFrame(report_dict).transpose()
report


Overall Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.973111,0.96122,0.967129,25451.0
1,0.974282,0.986222,0.980215,60384.0
2,0.983095,0.962372,0.972623,5076.0
3,0.975273,0.965651,0.970438,13887.0
4,0.985384,0.967703,0.976463,836.0
5,0.987909,0.973271,0.980536,6884.0
6,0.892857,0.714286,0.793651,35.0
7,0.977925,0.940552,0.958874,942.0
8,0.998108,0.986904,0.992474,1069.0
9,0.981516,0.958484,0.969863,554.0
