#### Import Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# from tool import model as md
from tool import config as cfg
from sklearn.metrics import classification_report
from test import run
dataset = pd.read_csv(cfg.DATA_PATH)
dataset = dataset.rename(columns={'Entry': 'uniprot_id', 'Sequence': 'seq'})
dataset

2024-02-22 11:12:54.756730: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-22 11:12:54.756788: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-22 11:12:54.760717: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-22 11:12:55.115795: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Unnamed: 0,uniprot_id,seq,label
0,P61981,MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS...,2
1,P31947,MERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSCEERNLLSV...,2
2,Q7NXD4,MANIDLHFHSRTSDGALTPTEVIDRAAARAPALLALTDHDCTGGLA...,1
3,B0XQT1,MGKSILLINGPNLNLLGTREPHIYGNTTLADVEASCKAHAESLGAT...,12
4,B8NIM4,MGKSILLINGPNLNLLGTREPHIYGSTTLADVEASSKAHAASLGAT...,12
...,...,...,...
96319,Q9ULJ3,MEGLLHYINPAHAISLLSALNEERLKGQLCDVLLIVGDQKFRAHKN...,2
96320,Q6XR72,MGRYSGKTCRLLFMLVLTVAFFVAELVSGYLGNSIALLSDSFNMLS...,2
96321,Q07157,MSARAAAAKSTAMEETAIWEQHTVTLHRAPGFGFGIAISGGRDNPH...,2
96322,Q9UDY2,MPVRGDRGFPPRRELSGWLRAPGMEELIWEQYTVTLQKDSKRGFGI...,2


#### Split

In [2]:
train, test = train_test_split(dataset, test_size=0.2,stratify=dataset['label'], random_state=42)
print("TrainSet:", len(train))
print("TestSet:", len(test))
test_fasta_df = test[["uniprot_id","seq"]]
train_fasta_df = train[["uniprot_id","seq"]]

TrainSet: 77059
TestSet: 19265


#### Make Fasta

In [4]:
fasta_file_path = 'blastp/test.fasta'
with open(fasta_file_path, 'w') as fasta_file:
    for index, row in test_fasta_df.iterrows():
        header = '>' + row['uniprot_id'] + '\n'
        sequence = row['seq'] + '\n'
        fasta_file.write(header)
        fasta_file.write(sequence)

print(f'Fasta Saved: {fasta_file_path}')


fasta_file_path = 'blastp/train.fasta'

with open(fasta_file_path, 'w') as fasta_file:
    for index, row in train_fasta_df.iterrows():
        header = '>' + row['uniprot_id'] + '\n'
        sequence = row['seq'] + '\n'
        fasta_file.write(header)
        fasta_file.write(sequence)

print(f'Fasta Saved: {fasta_file_path}')

Fasta Saved: blastp/test.fasta
Fasta Saved: blastp/train.fasta


#### blastp code:

    makeblastdb -in train.fasta -dbtype prot
    blastp -query test.fasta -subject train.fasta -out output.txt -outfmt 6


#### Evaluate

In [8]:
# 读取BLAST输出文件到DataFrame
blast_columns = ["Query", "Subject", "Identity", "Length", "Mismatches", "Gaps", "Query_start", "Query_end", "Subject_start", "Subject_end", "E_value", "Bit_score"]
blast_df = pd.read_csv("blastp/output.txt", sep='\t', header=None, names=blast_columns)

# 按照Query列分组，取每个组中最相似的记录
best_matches_df = blast_df.loc[blast_df.groupby('Query')['Bit_score'].idxmax()]

best_matches_df

Unnamed: 0,Query,Subject,Identity,Length,Mismatches,Gaps,Query_start,Query_end,Subject_start,Subject_end,E_value,Bit_score
8062617,A0A009IHW8,C0RGW8,42.913,254,133,1,13,266,34,275,1.300000e-63,200.0
6590821,A0A023W421,A9VIN2,65.068,146,51,0,1,146,1,146,1.330000e-68,203.0
6863027,A0A024BTN9,J7H670,93.173,498,34,0,1,498,17,514,0.000000e+00,979.0
1588562,A0A024SIB3,P29417,60.269,297,117,1,45,340,29,325,1.440000e-126,366.0
10526973,A0A059ZV61,P31922,49.801,753,368,5,45,792,51,798,0.000000e+00,769.0
...,...,...,...,...,...,...,...,...,...,...,...,...
7735522,W8JWV8,Q9SK86,45.161,372,196,7,12,376,14,384,3.280000e-108,322.0
7304864,W8JWW7,W8JCF0,91.758,364,28,1,1,364,1,362,0.000000e+00,691.0
7442242,X1WER2,A7MAZ3,78.304,401,82,2,3,398,4,404,0.000000e+00,646.0
704731,X2L4E2,Q6STF1,86.895,496,65,0,1,496,8,503,0.000000e+00,917.0


In [9]:
label_merge = pd.merge(best_matches_df,test[['uniprot_id','label']], left_on="Subject", right_on="uniprot_id")
label_merge = pd.merge(label_merge,train[['uniprot_id','label']], left_on="Query", right_on="uniprot_id")
result = label_merge[['Query', 'Subject', 'Identity', 'Length', 'Mismatches', 'Gaps',
       'Query_start', 'Query_end', 'Subject_start', 'Subject_end', 'E_value',
       'Bit_score','label_x', 'label_y']]
result.columns = ['Query', 'Subject', 'Identity', 'Length', 'Mismatches', 'Gaps',
       'Query_start', 'Query_end', 'Subject_start', 'Subject_end', 'E_value',
       'Bit_score','Predict', 'GroundTruth']
result

Unnamed: 0,Query,Subject,Identity,Length,Mismatches,Gaps,Query_start,Query_end,Subject_start,Subject_end,E_value,Bit_score,Predict,GroundTruth
0,A0A009IHW8,C0RGW8,42.913,254,133,1,13,266,34,275,1.300000e-63,200.0,2,2
1,A0A023W421,A9VIN2,65.068,146,51,0,1,146,1,146,1.330000e-68,203.0,2,2
2,A0A024BTN9,J7H670,93.173,498,34,0,1,498,17,514,0.000000e+00,979.0,2,1
3,A0A024SIB3,P29417,60.269,297,117,1,45,340,29,325,1.440000e-126,366.0,1,1
4,A0A059ZV61,P31922,49.801,753,368,5,45,792,51,798,0.000000e+00,769.0,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91998,W8JWV8,Q9SK86,45.161,372,196,7,12,376,14,384,3.280000e-108,322.0,2,2
91999,W8JWW7,W8JCF0,91.758,364,28,1,1,364,1,362,0.000000e+00,691.0,2,2
92000,X1WER2,A7MAZ3,78.304,401,82,2,3,398,4,404,0.000000e+00,646.0,2,2
92001,X2L4E2,Q6STF1,86.895,496,65,0,1,496,8,503,0.000000e+00,917.0,2,2


In [12]:
blast_report = pd.DataFrame(classification_report(result["GroundTruth"].values, result["Predict"].values, zero_division=0, output_dict=True, target_names=[1,2,3,4,5,6,7,8,10,12])).T.head(11)
blast_report

Unnamed: 0,precision,recall,f1-score,support
1,0.941129,0.926589,0.933803,20324.0
2,0.963569,0.965506,0.964537,48269.0
3,0.935171,0.927568,0.931354,4059.0
4,0.936746,0.951176,0.943906,11101.0
5,0.987461,0.944528,0.965517,667.0
6,0.95132,0.962209,0.956734,5504.0
7,0.95,0.678571,0.791667,28.0
8,0.867684,0.904509,0.885714,754.0
10,0.945006,0.984795,0.96449,855.0
12,0.913349,0.882353,0.897583,442.0


### Identity Range

In [10]:
blast_columns = ["Query", "Subject", "Identity", "Length", "Mismatches", "Gaps", "Query_start", "Query_end", "Subject_start", "Subject_end", "E_value", "Bit_score"]
blast_df = pd.read_csv("blastp/output1.txt", sep='\t', header=None, names=blast_columns)
blast_df

Unnamed: 0,Query,Subject,Identity,Length,Mismatches,Gaps,Query_start,Query_end,Subject_start,Subject_end,E_value,Bit_score
0,Q63HB2,Q81VV3,100.000,485,0,0,1,485,1,485,0.000000e+00,999.0
1,Q63HB2,A0R8F9,100.000,485,0,0,1,485,1,485,0.000000e+00,999.0
2,Q63HB2,Q6HPT0,100.000,485,0,0,1,485,1,485,0.000000e+00,999.0
3,Q63HB2,Q73FB9,99.794,485,1,0,1,485,1,485,0.000000e+00,996.0
4,Q63HB2,A9VNA0,96.082,485,19,0,1,485,1,485,0.000000e+00,971.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6844813,A8AXG8,C0RJL2,54.098,122,53,1,4,122,3,124,1.330000e-20,81.3
6844814,A8AXG8,C4LBV3,49.180,122,61,1,1,122,1,121,1.490000e-20,81.3
6844815,A8AXG8,Q4A7A8,45.690,116,61,1,6,121,7,120,1.590000e-20,81.3
6844816,A8AXG8,A2SLG6,56.000,125,52,1,1,122,1,125,1.710000e-20,81.3


In [16]:
blast_per30 = blast_df[blast_df["Identity"]<20]
test_dataset = dataset[dataset['uniprot_id'].isin(blast_per30.Query.unique())]
test_dataset

Unnamed: 0,uniprot_id,seq,label
41,A0A0H3JRU9,MKQIKKLLVANRGEIAIRIFRAAAELDISTVAIYSNEDKSSLHRYK...,4
43,A0A0H3LKL4,MQGKPRIAVIGAGLGGTAGAALMARAGFNVRLYEQAPAFSRLGAGI...,1
242,A4ECA9,MNLREKYGEWGLILGATEGVGKAFCEKIAAGGMNVVMVGRREEKLN...,2
323,A7MB74,MTVKTEAARDTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,2
330,A7VJC2,MEKTLETVPLERKKREKEQFRKLFIGGLSFETTEESLRNYYEQWGK...,2
...,...,...,...
114832,Q9ZD33,MYNTDIVIIGSGPVGLFAVFQAGMLGMKCHVIDAQEVIGGQCITLY...,2
114841,Q9ZDY3,MEEYFNKTGYLFSGNAVFVEELYRQYLANPNSVDQTWQEFFADIKD...,2
114865,Q9ZJP7,MDTPNKDDSIIRFSVSLQQNLLDELDNRIIKNGYSSRSELVRDMIR...,4
114882,Q9ZL58,MKAFLKICMVLIFVGVAHAKNPLTLSKEEEVLQNLQSFSAHFKQVL...,1


In [17]:
label, predict = run(test_dataset)



In [18]:
pd.DataFrame(classification_report(label, predict, zero_division=0, output_dict=True, target_names=[1,2,3,4,5,6,7,8,10,12])).T

Unnamed: 0,precision,recall,f1-score,support
1,0.955128,0.937107,0.946032,318.0
2,0.972927,0.981229,0.97706,1172.0
3,1.0,0.932432,0.965035,74.0
4,0.952862,0.959322,0.956081,295.0
5,1.0,1.0,1.0,23.0
6,0.96875,0.96875,0.96875,64.0
7,1.0,1.0,1.0,1.0
8,1.0,0.909091,0.952381,11.0
10,1.0,1.0,1.0,7.0
12,1.0,1.0,1.0,4.0
