In [1]:
import pandas as pd
import os
from tqdm.auto import tqdm
from pandarallel import pandarallel
from rdkit import Chem
from tqdm import tqdm as top_tqdm

In [2]:
def get_structure_sequence(pdb_file):
    """Return the amino-acid sequence RDKit extracts from a PDB file.

    Args:
        pdb_file: Path of the PDB file to parse.

    Returns:
        The string produced by ``Chem.MolToSequence`` for the parsed
        structure, or an empty string when the structure cannot be read or
        converted.
    """
    try:
        mol = Chem.MolFromPDBFile(pdb_file)
        # RDKit reports an unparsable file by returning None rather than
        # raising, so handle that case explicitly instead of relying on the
        # follow-up call to fail.
        if mol is None:
            return ''
        return Chem.MolToSequence(mol)
    except Exception:
        # Deliberately best-effort: any parsing/conversion error means "no
        # sequence". Catching Exception (not a bare except) keeps
        # KeyboardInterrupt and SystemExit working during long batch runs.
        return ''

def multiprocess_structure_check(df, nb_workers, pdb_file_path):
    """Compare each annotated sequence with the one read from its PDB file.

    Three columns are assigned on ``df`` itself (the frame is modified in
    place and also returned):

    * ``pdb_files`` - ``AF-<alphafolddb-id>-F1-model_v4.pdb`` joined onto
      ``pdb_file_path``;
    * ``aa_sequence_calculated`` - result of ``get_structure_sequence`` for
      that file;
    * ``is_valid`` - whether the calculated sequence equals ``aa_sequence``.

    Args:
        df: Frame holding at least ``alphafolddb-id`` and ``aa_sequence``.
        nb_workers: Worker count handed to pandarallel; ``0`` selects the
            single-process tqdm ``progress_apply`` path instead.
        pdb_file_path: Directory containing the PDB files.

    Returns:
        The same ``df`` object with the three columns added.
    """
    def build_pdb_path(af_id):
        return os.path.join(pdb_file_path, f'AF-{af_id}-F1-model_v4.pdb')

    def read_sequence(pdb_file):
        return get_structure_sequence(pdb_file)

    # Register the requested apply flavour on pandas objects, then remember
    # which Series method to call for both passes.
    if nb_workers == 0:
        top_tqdm.pandas(desc='pandas bar')
        apply_method = 'progress_apply'
    else:
        pandarallel.initialize(nb_workers=nb_workers, progress_bar=True)
        apply_method = 'parallel_apply'

    df['pdb_files'] = getattr(df['alphafolddb-id'], apply_method)(build_pdb_path)
    df['aa_sequence_calculated'] = getattr(df['pdb_files'], apply_method)(read_sequence)

    df['is_valid'] = (df['aa_sequence_calculated'] == df['aa_sequence'])

    return df
def get_blast_database(dir, fasta_path):
    """Merge every CSV in a directory into one table and write it as FASTA.

    Each file in ``dir`` is read with ``pd.read_csv`` and reduced to the
    columns ``alphafolddb-id``, ``aa_sequence``, ``site_labels`` and
    ``site_types``. ``;`` characters are stripped from the ids, exact
    duplicate rows (over those four columns) are dropped, and one FASTA
    record per remaining row is written to ``fasta_path``.

    Args:
        dir: Directory whose files are all read as CSV. (The name shadows the
            ``dir`` builtin but is kept so keyword callers keep working.)
        fasta_path: Destination of the FASTA file; overwritten if present.

    Returns:
        The merged, de-duplicated DataFrame with a fresh 0..n-1 index. It is
        empty (but has the four columns) when ``dir`` contains no files.
    """
    columns = ['alphafolddb-id', 'aa_sequence', 'site_labels', 'site_types']

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result and of the FASTA file reproducible.
    csv_fnames = sorted(os.listdir(dir))

    # Collect the frames and concatenate once; concatenating inside the loop
    # re-copies the accumulated rows on every iteration.
    frames = [
        pd.read_csv(os.path.join(dir, fname))[columns]
        for fname in tqdm(csv_fnames, total=len(csv_fnames))
    ]
    if frames:
        database_df = pd.concat(frames, ignore_index=True)
    else:
        database_df = pd.DataFrame(columns=columns)

    # Strip ';' BEFORE de-duplicating (as get_query_database does), so ids
    # that differ only by a ';' collapse into a single row / FASTA record.
    # The .str accessor leaves missing ids untouched instead of raising.
    database_df['alphafolddb-id'] = database_df['alphafolddb-id'].str.replace(';', '', regex=False)
    database_df = database_df.drop_duplicates(subset=columns).reset_index(drop=True)

    with open(fasta_path, 'w', encoding='utf-8') as f:
        records = zip(database_df['alphafolddb-id'], database_df['aa_sequence'])
        for af_id, sequence in tqdm(records, total=len(database_df)):
            f.write('>{}\n'.format(af_id))
            f.write('{}\n'.format(sequence))
    return database_df

def get_query_database(path, fasta_path, pdb_file_path, nb_workers=12):
    """Load a query CSV, validate it against PDB structures and write FASTA.

    The CSV is reduced to ``alphafolddb-id``, ``aa_sequence``,
    ``site_labels`` and ``site_types``; ``;`` characters are stripped from
    the ids; then ``multiprocess_structure_check`` adds the ``pdb_files``,
    ``aa_sequence_calculated`` and ``is_valid`` columns.

    Only the FASTA file is de-duplicated (over the four annotation columns).
    The returned frame keeps every row of the CSV, including rows whose
    ``is_valid`` is False - callers filter on that column themselves.

    Args:
        path: CSV file to read.
        fasta_path: Destination of the FASTA file; overwritten if present.
        pdb_file_path: Directory holding the ``AF-<id>-F1-model_v4.pdb`` files.
        nb_workers: Worker count forwarded to
            ``multiprocess_structure_check`` (``0`` = single process).
            Defaults to 12, the value that used to be hard-coded here.

    Returns:
        DataFrame with the four annotation columns plus the three validation
        columns, one row per CSV row.
    """
    columns = ['alphafolddb-id', 'aa_sequence', 'site_labels', 'site_types']

    # Take an explicit copy: columns are assigned on this frame below and in
    # multiprocess_structure_check, which on a column subset of the CSV frame
    # can trigger pandas' SettingWithCopyWarning.
    database_df = pd.read_csv(path)[columns].copy()
    # The .str accessor leaves missing ids untouched instead of raising.
    database_df['alphafolddb-id'] = database_df['alphafolddb-id'].str.replace(';', '', regex=False)

    database_df = multiprocess_structure_check(
        database_df, nb_workers=nb_workers, pdb_file_path=pdb_file_path)

    write_database_df = database_df.drop_duplicates(subset=columns).reset_index(drop=True)

    with open(fasta_path, 'w', encoding='utf-8') as f:
        records = zip(write_database_df['alphafolddb-id'], write_database_df['aa_sequence'])
        for af_id, sequence in tqdm(records, total=len(write_database_df)):
            f.write('>{}\n'.format(af_id))
            f.write('{}\n'.format(sequence))
    return database_df


In [3]:
# Locations of the benchmark split, the query FASTA written for the baseline,
# and the directory that collects baseline outputs.
dataset_path = '../../dataset/ec_site_dataset/uniprot_ecreact_cluster_split_merge_dataset_limit_100'
test_dataset_fasta_path = os.path.join(dataset_path, 'test_dataset.fasta')
baseline_results_path = 'baseline_results'

train_database_df = pd.read_pickle(
    '../../dataset/raw_dataset/ec_datasets/split_ec_dataset/train_ec_uniprot_dataset_cluster_sample.pkl'
)

# Build the test table (this also writes the query FASTA), then keep only the
# rows whose annotated sequence matches the sequence read from the PDB file.
test_dataset = get_query_database(
    os.path.join(dataset_path, 'test_dataset', 'uniprot_ecreact_merge.csv'),
    fasta_path=test_dataset_fasta_path,
    pdb_file_path=os.path.join(os.path.dirname(dataset_path), 'structures', 'alphafolddb_download'),
)
test_dataset = test_dataset[test_dataset['is_valid']]
test_dataset

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=75), Label(value='0 / 75'))), HBox…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=75), Label(value='0 / 75'))), HBox…

  0%|          | 0/853 [00:00<?, ?it/s]

Unnamed: 0,alphafolddb-id,aa_sequence,site_labels,site_types,pdb_files,aa_sequence_calculated,is_valid
0,A0A1S7LCW6,MKLKGTTIVALGMLVVAIMVLASMIDLPGSDMSATPAPPDTPRGAP...,"[[206], [212], [215], [216], [246], [252], [25...","[0, 0, 0, 0, 0, 0, 0, 0]",../../dataset/ec_site_dataset/structures/alpha...,MKLKGTTIVALGMLVVAIMVLASMIDLPGSDMSATPAPPDTPRGAP...,True
1,Q9F0J6,MQATKIIDGFHLVGAIDWNSRDFHGYTLSPMGTTYNAYLVEDEKTT...,"[[79], [81], [83], [146], [165], [165], [226]]","[0, 0, 0, 0, 0, 0, 0]",../../dataset/ec_site_dataset/structures/alpha...,MQATKIIDGFHLVGAIDWNSRDFHGYTLSPMGTTYNAYLVEDEKTT...,True
2,Q5BEJ7,MADHEQEQEPLSIAIIGGGIIGLMTALGLLHRNIGKVTIYERASAW...,"[[41, 42], [245, 247], [320], [330, 334]]","[0, 0, 0, 0]",../../dataset/ec_site_dataset/structures/alpha...,MADHEQEQEPLSIAIIGGGIIGLMTALGLLHRNIGKVTIYERASAW...,True
3,Q9HUH4,MPQALSTDILIVGGGIAGLWLNARLRRAGYATVLVESASLGGGQSV...,"[[17], [36], [44, 45], [49, 51], [346, 347]]","[0, 0, 0, 0, 0]",../../dataset/ec_site_dataset/structures/alpha...,MPQALSTDILIVGGGIAGLWLNARLRRAGYATVLVESASLGGGQSV...,True
4,P96692,MAEFTHLVNERRSASNFLSGHPITKEDLNEMFELVALAPSAFNLQH...,"[[11, 13], [68, 70], [157, 158], [193], [196]]","[0, 0, 0, 0, 0]",../../dataset/ec_site_dataset/structures/alpha...,MAEFTHLVNERRSASNFLSGHPITKEDLNEMFELVALAPSAFNLQH...,True
...,...,...,...,...,...,...,...
889,O30144,MFLKVRAEKRLGNFRLNVDFEMGRDYCVLLGPTGAGKSVFLELIAG...,"[[31, 38]]",[0],../../dataset/ec_site_dataset/structures/alpha...,MFLKVRAEKRLGNFRLNVDFEMGRDYCVLLGPTGAGKSVFLELIAG...,True
890,P28737,MSRKFDLKTITDLSVLVGTGISLYYLVSRLLNDVESGPLSGKSRES...,"[[133, 140]]",[0],../../dataset/ec_site_dataset/structures/alpha...,MSRKFDLKTITDLSVLVGTGISLYYLVSRLLNDVESGPLSGKSRES...,True
891,P37093,MTEMVISPAERQSIRRLPFSFANRFKLVLDWNEDFSQASIYYLAPL...,"[[397], [400], [430], [433]]","[0, 0, 0, 0]",../../dataset/ec_site_dataset/structures/alpha...,MTEMVISPAERQSIRRLPFSFANRFKLVLDWNEDFSQASIYYLAPL...,True
892,P94360,MAELRMEHIYKFYDQKEPAVDDFNLHIADKEFIVFVGPSGCGKSTT...,"[[37, 44]]",[0],../../dataset/ec_site_dataset/structures/alpha...,MAELRMEHIYKFYDQKEPAVDDFNLHIADKEFIVFVGPSGCGKSTT...,True


In [10]:
# Number of distinct AlphaFold DB ids left in the filtered test set.
len({af_id for af_id in test_dataset['alphafolddb-id']})

851

In [4]:
import subprocess

# Machine-specific locations: the DeepFRI checkout and the Python interpreter
# of the conda environment it is installed in.
deepfri_model_root = '/home/xiaoruiwang/data/ubuntu_work_beta/protein_work/DeepFRI'
deepfri_python = '/home/xiaoruiwang/software/miniconda3/envs/deepfri_env/bin/python'

# The results file is read back from baseline_results_path afterwards, so make
# sure the directory exists before DeepFRI tries to write into it.
os.makedirs(baseline_results_path, exist_ok=True)

# All paths are made absolute because the subprocess runs with
# cwd=deepfri_model_root, not in the notebook's working directory.
args = [
    "--fasta_fn", os.path.abspath(test_dataset_fasta_path),
    "-ont", "ec",
    "-v",
    "--saliency",
    "--use_guided_grads",
    "-o", os.path.abspath(os.path.join(baseline_results_path, 'DeepFRI')),
    "--model_config", os.path.abspath(os.path.join(deepfri_model_root, 'trained_models/model_config.json')),
]

deepfir_results_fname = 'DeepFRI_EC_saliency_maps.json'

# Pass the command as an argument list and run it without a shell: every path
# reaches predict.py as exactly one argument even if it contains spaces or
# shell metacharacters, which a ' '.join()-ed string with shell=True does not
# guarantee.
python_cmd = [deepfri_python, os.path.join(deepfri_model_root, 'predict.py'), *args]
command = python_cmd
subprocess.run(command, check=True, cwd=deepfri_model_root)

2024-05-18 17:07:56.306005: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/xiaoruiwang/software/amber20/lib:/home/xiaoruiwang/software/openbabel/lib/:
2024-05-18 17:07:56.306024: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-05-18 17:07:56.963107: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2024-05-18 17:07:56.984536: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-05-18 17:07:56.984620: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVID

### Computing predictions from fasta...
Protein GO-term/EC-number Score GO-term/EC-number name
Q5BEJ7 1.1.1.- 0.14849 1.1.1.-
Q9HUH4 1.1.1.- 0.10051 1.1.1.-
P96692 2.7.7.- 0.27287 2.7.7.-
L0E4H0 1.1.1.- 0.18617 1.1.1.-
O34678 1.1.1.- 0.20533 1.1.1.-
P31808 1.1.1.- 0.37966 1.1.1.-
L0E4F8 1.1.1.- 0.26933 1.1.1.-
Q05016 1.1.1.- 0.59278 1.1.1.-
A5CAL1 1.1.1.- 0.15057 1.1.1.-
Q59787 1.1.1.- 0.75677 1.1.1.-
Q9LBG2 1.1.1.- 0.78132 1.1.1.-
P14295 1.1.1.- 0.59457 1.1.1.-
Q4DJ07 2.7.7.- 0.11959 2.7.7.-
P42327 1.1.1.- 0.85313 1.1.1.-
P42327 1.1.1.1 0.12235 1.1.1.1
P73574 1.1.1.- 0.93364 1.1.1.-
P73574 1.1.1.37 0.12716 1.1.1.37
O67610 1.1.1.- 0.75755 1.1.1.-
Q9RPT1 1.1.1.- 0.71428 1.1.1.-
P11759 1.1.1.- 0.14429 1.1.1.-
O34661 1.1.1.- 0.69844 1.1.1.-
P75691 1.1.1.- 0.63244 1.1.1.-
P50098 1.1.1.- 0.13695 1.1.1.-
Q19905 1.1.1.- 0.24516 1.1.1.-
Q0P8H3 1.1.1.- 0.42166 1.1.1.-
Q606Q2 1.1.1.- 0.89819 1.1.1.-
Q606Q2 1.1.1.23 0.25086 1.1.1.23
P44774 1.1.1.- 0.24231 1.1.1.-
Q5KWX7 1.1.1.- 0.93289 1.1.1.-
Q5

CompletedProcess(args='/home/xiaoruiwang/software/miniconda3/envs/deepfri_env/bin/python /home/xiaoruiwang/data/ubuntu_work_beta/protein_work/DeepFRI/predict.py --fasta_fn /home/xiaoruiwang/data/ubuntu_work_beta/single_step_work/EasIFA_v2/dataset/ec_site_dataset/uniprot_ecreact_cluster_split_merge_dataset_limit_100/test_dataset.fasta -ont ec -v --saliency --use_guided_grads -o /home/xiaoruiwang/data/ubuntu_work_beta/single_step_work/EasIFA_v2/script/baseline_methods/baseline_results/DeepFRI --model_config /home/xiaoruiwang/data/ubuntu_work_beta/protein_work/DeepFRI/trained_models/model_config.json', returncode=0)

In [5]:
import json

# Read back the saliency-map JSON stored under the baseline results directory.
with open(os.path.join(baseline_results_path, deepfir_results_fname), 'r') as f:
    deepfir_results = json.loads(f.read())

In [11]:
# Column labels of the results viewed as a DataFrame - presumably one per
# protein DeepFRI produced output for (TODO confirm against the JSON layout).
caculated_ids = list(pd.DataFrame(deepfir_results).columns)

In [9]:
# Number of top-level entries in the loaded DeepFRI results.
len(deepfir_results)

466

In [16]:
# First sequence of the test set whose id is absent from caculated_ids.
# Rows and column are selected in a single .loc call instead of chaining.
test_dataset.loc[~test_dataset['alphafolddb-id'].isin(caculated_ids), 'aa_sequence'].tolist()[0]

'MKLKGTTIVALGMLVVAIMVLASMIDLPGSDMSATPAPPDTPRGAPIVGGQGQAMGLPVAMQRRRGEQRAPVPALSDANGGFVAPNVQFSEAHWQGMEALPLSIELKRKLKLPLDLEGLLIDETSLNAAVSGLLAGDVLVAINGRKVKTLKKMQKETRRVQMDRRASLTVYRKGRLLTLTLSEEKNLGLAQVETAPMILPGDIMPHPYRGPCTQCHAIGTTGHITPDPDGIVLPPGPIRAGAKMPHRDRGPCAACHAIIQ'