#### Import Data

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# from tool import model as md
from tool import config as cfg
from test import run
# Read the table from the configured path and rename the 'Entry' and
# 'Sequence' columns to the names used by the rest of the notebook.
dataset = pd.read_csv(cfg.DATA_PATH).rename(
    columns={'Entry': 'uniprot_id', 'Sequence': 'seq'}
)
dataset

2024-02-22 08:24:55.805663: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-22 08:24:55.805715: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-22 08:24:55.805752: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-22 08:24:55.816044: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Unnamed: 0,uniprot_id,seq,label
0,P61981,MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS...,2
1,P31947,MERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSCEERNLLSV...,2
2,Q7NXD4,MANIDLHFHSRTSDGALTPTEVIDRAAARAPALLALTDHDCTGGLA...,1
3,B0XQT1,MGKSILLINGPNLNLLGTREPHIYGNTTLADVEASCKAHAESLGAT...,12
4,B8NIM4,MGKSILLINGPNLNLLGTREPHIYGSTTLADVEASSKAHAASLGAT...,12
...,...,...,...
96319,Q9ULJ3,MEGLLHYINPAHAISLLSALNEERLKGQLCDVLLIVGDQKFRAHKN...,2
96320,Q6XR72,MGRYSGKTCRLLFMLVLTVAFFVAELVSGYLGNSIALLSDSFNMLS...,2
96321,Q07157,MSARAAAAKSTAMEETAIWEQHTVTLHRAPGFGFGIAISGGRDNPH...,2
96322,Q9UDY2,MPVRGDRGFPPRRELSGWLRAPGMEELIWEQYTVTLQKDSKRGFGI...,2


#### Split

In [2]:
# Stratified 80/20 split on the class label, with a fixed seed so the split is
# reproducible.
train, test = train_test_split(
    dataset,
    test_size=0.2,
    stratify=dataset['label'],
    random_state=42,
)

# Only the identifier and the sequence are needed for the FASTA files.
fasta_columns = ["uniprot_id", "seq"]
train_fasta_df = train[fasta_columns]
test_fasta_df = test[fasta_columns]

for split_name, split_df in (("TrainSet:", train), ("TestSet:", test)):
    print(split_name, len(split_df))

TrainSet: 77059
TestSet: 19265


In [3]:
# Save the training split to train.csv in the working directory, without the
# DataFrame index column.
train.to_csv("train.csv",index=False)

In [4]:
# Save the held-out split, without the DataFrame index column.
# NOTE(review): "quenn.csv" may be a typo (e.g. for "query.csv") — confirm
# against whatever reads this file before renaming it.
test.to_csv("quenn.csv",index=False)

#### Make Fasta

In [5]:

def write_fasta(records, fasta_path):
    """Write protein records to ``fasta_path`` as two-line FASTA entries.

    Parameters
    ----------
    records : pandas.DataFrame
        Must provide string columns ``uniprot_id`` (written after ``>`` as the
        header line) and ``seq`` (written unwrapped on the next line).
    fasta_path : str or pathlib.Path
        Destination file. Its parent directory is created when missing and an
        existing file is overwritten.

    Returns
    -------
    pathlib.Path
        The path that was written.

    Raises
    ------
    TypeError
        If an identifier or sequence is not a string (e.g. a missing value).
    """
    fasta_path = Path(fasta_path)
    # A fresh checkout may not have the output directory yet.
    fasta_path.parent.mkdir(parents=True, exist_ok=True)
    with open(fasta_path, 'w') as fasta_file:
        # Iterate the two columns directly: same output as iterrows(), without
        # building a Series object for every row.
        for uniprot_id, seq in zip(records['uniprot_id'], records['seq']):
            fasta_file.write('>' + uniprot_id + '\n')
            fasta_file.write(seq + '\n')
    return fasta_path


# Query set for BLAST: the held-out proteins.
fasta_file_path = 'blastp/test.fasta'
write_fasta(test_fasta_df, fasta_file_path)
print(f'Fasta Saved: {fasta_file_path}')

# Subject set for BLAST: the training proteins.
fasta_file_path = 'blastp/train.fasta'
write_fasta(train_fasta_df, fasta_file_path)
print(f'Fasta Saved: {fasta_file_path}')

Fasta Saved: blastp/test.fasta
Fasta Saved: blastp/train.fasta


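#### blastp code:

The commands below are presumably run from inside the `blastp/` directory, where the FASTA files above were written. Note that with `-subject` BLAST+ aligns against the FASTA file directly: the database built by `makeblastdb` is not used and `-num_threads` is ignored. To search the indexed database with multiple threads, pass `-db train.fasta` instead of `-subject train.fasta`.

    makeblastdb -in train.fasta -dbtype prot
    blastp -query test.fasta -subject train.fasta -out output.txt -outfmt 6
    blastp -query test.fasta -subject train.fasta -out output.txt -outfmt 6 -max_target_seqs 1 -num_threads 120 -evalue 1e-5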

#### Evaluate

In [6]:
# Read the BLAST output file into a DataFrame. The file is tab-separated and
# has no header row, so the column names are supplied here.
blast_columns = [
    "Query", "Subject", "Identity", "Length", "Mismatches", "Gaps",
    "Query_start", "Query_end", "Subject_start", "Subject_end",
    "E_value", "Bit_score",
]
blast_df = pd.read_csv(
    "blastp/output.txt",
    sep='\t',
    header=None,
    names=blast_columns,
)

# Group by the Query column and keep, for each query, the hit with the
# highest bit score (its most similar record).
best_hit_rows = blast_df.groupby('Query')['Bit_score'].idxmax()
best_matches_df = blast_df.loc[best_hit_rows]

best_matches_df

Unnamed: 0,Query,Subject,Identity,Length,Mismatches,Gaps,Query_start,Query_end,Subject_start,Subject_end,E_value,Bit_score
3647263,A0A024BTN9,K9N7B7,88.400,500,54,2,1,498,1,498,0.000000e+00,922.0
2090201,A0A086F3E3,S5VBU1,35.156,128,75,2,3,122,4,131,2.450000e-19,82.4
973494,A0A0B5L585,F1DBB2,95.803,548,2,1,1,548,1,527,0.000000e+00,1062.0
1273177,A0A0B6CGH9,A2A1A0,43.855,358,177,6,25,370,7,352,5.880000e-96,290.0
6020572,A0A0C3RR82,P9WEM8,53.333,375,168,5,3,374,2,372,5.660000e-137,404.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5676004,V5NC32,V5NDL4,76.678,596,138,1,5,600,1,595,0.000000e+00,954.0
4202908,V5YM14,B2RID1,32.798,747,446,17,1,720,1,718,1.070000e-103,333.0
5700944,V9M2S5,O23530,33.362,1148,647,32,24,1133,21,1088,8.040000e-153,501.0
4303263,W3VKA4,Q2SR15,36.538,52,31,1,11,60,9,60,1.500000e+00,32.3


In [7]:
# Display the complete BLAST hit table (all hits per query, before the
# best-hit selection).
blast_df

Unnamed: 0,Query,Subject,Identity,Length,Mismatches,Gaps,Query_start,Query_end,Subject_start,Subject_end,E_value,Bit_score
0,Q74L27,Q045X9,97.595,499,12,0,1,499,1,499,0.000,1013.0
1,Q74L27,A8YTD2,87.375,499,63,0,1,499,1,499,0.000,898.0
2,Q74L27,Q5FM35,86.974,499,65,0,1,499,1,499,0.000,895.0
3,Q74L27,Q1G8Y5,82.565,499,87,0,1,499,1,499,0.000,848.0
4,Q74L27,O86083,80.561,499,97,0,1,499,1,499,0.000,824.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6102392,B3CP03,Q8EB99,36.207,58,30,2,5,55,196,253,0.008,38.5
6102393,B3CP03,Q87ST2,33.333,60,33,1,5,57,218,277,0.028,37.0
6102394,B3CP03,Q6D0E4,36.735,49,24,2,21,62,241,289,0.240,33.9
6102395,B3CP03,Q65RA1,27.869,61,37,1,4,57,221,281,0.310,33.9


In [8]:
# Nearest-hit label transfer: the label of the best-scoring training subject
# becomes the prediction, and the query's own label is the ground truth.
# Renaming before the merges avoids the label_x/label_y suffixes and the
# positional overwrite of `.columns` on a slice.
subject_labels = train[['uniprot_id', 'label']].rename(
    columns={'uniprot_id': 'Subject', 'label': 'Predict'}
)
query_labels = test[['uniprot_id', 'label']].rename(
    columns={'uniprot_id': 'Query', 'label': 'GroundTruth'}
)

label_merge = (
    best_matches_df
    .merge(subject_labels, on='Subject')
    .merge(query_labels, on='Query')
)
result = label_merge

# Both merges are inner joins, so a test protein with no BLAST hit (or whose
# best subject is not in `train`) gets no row and is left out of every metric
# computed from `result`. Surface that number instead of dropping it silently.
n_test_proteins = test['uniprot_id'].nunique()
n_unscored = n_test_proteins - result['Query'].nunique()
print(f"Test proteins without a BLAST-based prediction: {n_unscored} of {n_test_proteins}")

result

Unnamed: 0,Query,Subject,Identity,Length,Mismatches,Gaps,Query_start,Query_end,Subject_start,Subject_end,E_value,Bit_score,Predict,GroundTruth
0,A0A024BTN9,K9N7B7,88.400,500,54,2,1,498,1,498,0.000000e+00,922.0,1,1
1,A0A086F3E3,S5VBU1,35.156,128,75,2,3,122,4,131,2.450000e-19,82.4,4,4
2,K9UJK2,S5VBU1,44.615,195,105,2,8,201,2,194,5.840000e-46,151.0,4,4
3,A0A0B5L585,F1DBB2,95.803,548,2,1,1,548,1,527,0.000000e+00,1062.0,4,4
4,A0A0B6CGH9,A2A1A0,43.855,358,177,6,25,370,7,352,5.880000e-96,290.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19235,U5PZ28,Q98UF9,78.099,484,105,1,1,483,1,484,0.000000e+00,776.0,1,2
19236,V5NC32,V5NDL4,76.678,596,138,1,5,600,1,595,0.000000e+00,954.0,1,1
19237,V5YM14,B2RID1,32.798,747,446,17,1,720,1,718,1.070000e-103,333.0,2,2
19238,V9M2S5,O23530,33.362,1148,647,32,24,1133,21,1088,8.040000e-153,501.0,2,2


In [9]:
# Per-class precision / recall / F1 of the BLAST nearest-hit baseline.
# The class list is derived from the data instead of being hard-coded: a fixed
# `target_names` list makes classification_report raise when the number of
# classes present differs, and mislabels rows when the count matches but the
# classes do not.
class_names = sorted(
    set(result["GroundTruth"].tolist()) | set(result["Predict"].tolist())
)
blast_report = pd.DataFrame(
    classification_report(
        result["GroundTruth"].values,
        result["Predict"].values,
        labels=class_names,        # fixes the row order ...
        target_names=class_names,  # ... so every row name matches its class
        zero_division=0,           # undefined metrics are reported as 0
        output_dict=True,
    )
).T
blast_report

Unnamed: 0,precision,recall,f1-score,support
1,0.964105,0.95263,0.958333,4201.0
2,0.977344,0.978308,0.977826,10142.0
3,0.952381,0.942675,0.947503,785.0
4,0.95568,0.9686,0.962097,2293.0
5,0.964539,0.964539,0.964539,141.0
6,0.964516,0.978723,0.971568,1222.0
7,1.0,1.0,1.0,3.0
8,0.941176,0.941176,0.941176,153.0
10,0.985437,0.990244,0.987835,205.0
12,0.978495,0.957895,0.968085,95.0


### Identity Range

In [10]:
# Reload the raw BLAST hit table so this section can start from the unfiltered
# hits. The file is tab-separated with no header row.
blast_columns = [
    "Query", "Subject", "Identity", "Length", "Mismatches", "Gaps",
    "Query_start", "Query_end", "Subject_start", "Subject_end",
    "E_value", "Bit_score",
]
blast_df = pd.read_csv(
    "blastp/output.txt",
    sep='\t',
    header=None,
    names=blast_columns,
)
blast_df

Unnamed: 0,Query,Subject,Identity,Length,Mismatches,Gaps,Query_start,Query_end,Subject_start,Subject_end,E_value,Bit_score
0,Q74L27,Q045X9,97.595,499,12,0,1,499,1,499,0.000,1013.0
1,Q74L27,A8YTD2,87.375,499,63,0,1,499,1,499,0.000,898.0
2,Q74L27,Q5FM35,86.974,499,65,0,1,499,1,499,0.000,895.0
3,Q74L27,Q1G8Y5,82.565,499,87,0,1,499,1,499,0.000,848.0
4,Q74L27,O86083,80.561,499,97,0,1,499,1,499,0.000,824.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6102392,B3CP03,Q8EB99,36.207,58,30,2,5,55,196,253,0.008,38.5
6102393,B3CP03,Q87ST2,33.333,60,33,1,5,57,218,277,0.028,37.0
6102394,B3CP03,Q6D0E4,36.735,49,24,2,21,62,241,289,0.240,33.9
6102395,B3CP03,Q65RA1,27.869,61,37,1,4,57,221,281,0.310,33.9


In [11]:
# Identity cut-off applied to the BLAST "Identity" column for the low-similarity subset.
# NOTE(review): the frame below is named `blast_per30`, but the filter has
# always used 20 — confirm which threshold is intended.
IDENTITY_CUTOFF = 20

# Every BLAST hit below the cut-off. A query is selected as soon as ANY of its
# hits falls below the cut-off, not only its best hit.
# NOTE(review): confirm this is the intended definition of the subset.
blast_per30 = blast_df[blast_df["Identity"] < IDENTITY_CUTOFF]

# Look the selected queries up in the full dataset to recover sequence and label.
low_identity_queries = blast_per30["Query"].unique()
test_dataset = dataset[dataset['uniprot_id'].isin(low_identity_queries)]
test_dataset

Unnamed: 0,uniprot_id,seq,label
56,Q9NZK5,MLVDGPSERPALCFLLLAVAMSFFGSALSIDETRAHLLLKEKMMRL...,2
106,Q95182,MKLLLLCLGLILVCAQQEENSDVAIRNFDISKISGEWYSIFLASDV...,2
458,Q5HP11,MKSIQVPIILVGFMGTGKTTVGKYLSDLYNLSYVDLDNFIEVNECK...,1
604,B8IER1,MKLLHIDTSILGAGSVSRELSALIVERLTRGTQAEVTYRDLAAENL...,2
608,Q2ST93,MSKVLVLKTTAQADEVSNSVALTNRFLEEYKKFNPDDEIIIVDLNK...,2
...,...,...,...
96159,G3XD01,MSYYQHPSAIVDDGAQIGSDSRVWHFVHICAGARIGAGVSLGQNVF...,3
96184,C5B9M4,MSEKYVVTWDVLQMHTRKLAARLLPAERWTGIIAVSRGGLVPAAIL...,4
96196,A6LW55,MESLHKRILEEGQALSENVLKVDSFLNHQVDPELMYEMGTYFKNYF...,2
96255,B7ICU2,MKKLKSFGGKNLSGKSMNQLQKLQEEMQKKLQEVEEGFSNVEVEVS...,2


In [12]:
# Pass the selected subset to `run` (imported from the `test` module).
# NOTE(review): `run` is not shown in this notebook; presumably it returns the
# ground-truth labels and the model's predictions, in that order — confirm.
label, predict = run(test_dataset)

2024-02-22 08:25:25.307485: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 19292 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:73:00.0, compute capability: 8.6
2024-02-22 08:25:25.883930: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


In [None]:
# Per-class metrics for the labels/predictions returned by `run`; metrics that
# would divide by zero are reported as 0 (zero_division=0).
model_report = pd.DataFrame(
    classification_report(label, predict, zero_division=0, output_dict=True)
).T
model_report

Unnamed: 0,precision,recall,f1-score,support
0,0.935622,0.931624,0.933619,234.0
1,0.977038,0.988386,0.982679,861.0
2,1.0,0.915254,0.955752,59.0
3,0.959302,0.948276,0.953757,174.0
4,1.0,0.894737,0.944444,19.0
5,0.97619,0.97619,0.97619,42.0
6,0.0,0.0,0.0,3.0
7,0.0,0.0,0.0,7.0
8,0.0,0.0,0.0,0.0
accuracy,0.962116,0.962116,0.962116,0.962116
