In [1]:
import os
import pandas as pd

In [2]:
# Load the enzyme dataset downloaded from UniProt (tab-separated file).
uniprot_raw_path = 'ec_datasets/uniprot_raw'
uniprot_tsv_file = os.path.join(uniprot_raw_path, 'uniprot-download_sequence_site_ec.tsv')

download_uniprot_dataset = pd.read_csv(uniprot_tsv_file, delimiter='\t')
download_uniprot_dataset.head()

Unnamed: 0,Entry,Organism,Length,EC number,AlphaFoldDB,Active site,Binding site,Site,PDB,Sequence,in_ec_react
0,A0A1B0GTW7,Homo sapiens (Human),788,3.4.24.-,A0A1B0GTW7;,"ACT_SITE 306; /evidence=""ECO:0000255|PROSITE-P...","BINDING 305; /ligand=""Zn(2+)""; /ligand_id=""ChE...",,,MLLLLLLLLLLPPLVLRVAASRCLHDETQKSVSLLRPPFSQLPSKS...,True
1,A1L3X0,Homo sapiens (Human),281,2.3.1.199,A1L3X0;,"ACT_SITE 150; /note=""Nucleophile""; /evidence=""...","BINDING 124; /ligand=""3-oxoeicosanoyl-CoA""; /l...",,6Y7F;,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,True
2,A2RUC4,Homo sapiens (Human),315,1.14.11.42,A2RUC4;,,"BINDING 106; /ligand=""2-oxoglutarate""; /ligand...",,3AL5;3AL6;,MAGQHLPVPRLEGVSREQFMQHLYPQRKPLVLEGIDLGPCTSKWTV...,True
3,A5PLL7,Homo sapiens (Human),270,1.14.19.77,A5PLL7;,,,"SITE 95; /note=""Essential for catalytic activi...",,MAGAENWPGQQLELDEDEASCCRWGAQHAGARELAALYSPGKRLQE...,True
4,C9JRZ8,Homo sapiens (Human),316,1.1.1.-; 1.1.1.216; 1.1.1.300; 1.1.1.54; 1.1.1.64,C9JRZ8;,"ACT_SITE 49; /note=""Proton donor""; /evidence=""...","BINDING 20..22; /ligand=""NADP(+)""; /ligand_id=...","SITE 78; /note=""Lowers pKa of active site Tyr""...",,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...,True


In [3]:
# Remove the 'in_ec_react' column, then summarise the sequence-length distribution.
download_uniprot_dataset = download_uniprot_dataset.drop(columns=['in_ec_react'])
download_uniprot_dataset.loc[:, 'Length'].describe()

count    238979.000000
mean        417.244440
std         334.709189
min           4.000000
25%         253.000000
50%         352.000000
75%         485.000000
max       35213.000000
Name: Length, dtype: float64

In [4]:
# To save compute, keep only proteins of at most 600 residues; rows without an
# AlphaFoldDB cross-reference are dropped as well.
max_sequence_length = 600
within_length_limit = download_uniprot_dataset['Length'].le(max_sequence_length)
has_alphafold_entry = download_uniprot_dataset['AlphaFoldDB'].notna()
download_uniprot_dataset = download_uniprot_dataset[within_length_limit & has_alphafold_entry]
download_uniprot_dataset['Length'].describe()

count    204265.000000
mean        332.479994
std         121.810267
min          16.000000
25%         240.000000
50%         327.000000
75%         425.000000
max         600.000000
Name: Length, dtype: float64

In [5]:
# List the column names that remain after 'in_ec_react' was dropped.
download_uniprot_dataset.columns.tolist()

['Entry',
 'Organism',
 'Length',
 'EC number',
 'AlphaFoldDB',
 'Active site',
 'Binding site',
 'Site',
 'PDB',
 'Sequence']

In [6]:
# Summarise dtypes and per-column non-null counts of the filtered frame.
download_uniprot_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 204265 entries, 1 to 238978
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Entry         204265 non-null  object
 1   Organism      204265 non-null  object
 2   Length        204265 non-null  int64 
 3   EC number     204265 non-null  object
 4   AlphaFoldDB   204265 non-null  object
 5   Active site   75520 non-null   object
 6   Binding site  133961 non-null  object
 7   Site          21044 non-null   object
 8   PDB           8990 non-null    object
 9   Sequence      204265 non-null  object
dtypes: int64(1), object(9)
memory usage: 17.1+ MB


In [7]:
# Count the rows that carry an annotation in each individual site column.
site_columns = ['Active site', 'Binding site', 'Site']
for col in site_columns:
    print(f'列 {col} 包含位点的数据：')
    print(int(download_uniprot_dataset[col].notna().sum()))

# Count the rows annotated in at least one of the three site columns.
print('至少包含一种位点的数据：')
print(int(download_uniprot_dataset[site_columns].notna().any(axis=1).sum()))
    

列 Active site 包含位点的数据：
75520
列 Binding site 包含位点的数据：
133961
列 Site 包含位点的数据：
21044
至少包含一种位点的数据：
155987


In [8]:
def split_id(id_data):
    """Split a ';'-separated identifier string into a list of identifiers.

    Args:
        id_data: A string such as '3AL5;3AL6;' or '1.1.1.-; 1.1.1.216', or a
            missing value (NaN/None).

    Returns:
        A list of identifiers with surrounding whitespace removed and empty
        pieces discarded. Missing input is returned unchanged.
    """
    if pd.notna(id_data):
        # Strip first, then filter: a piece made only of whitespace (e.g. the
        # tail of '3AL5; 3AL6; ') must not survive as an empty string.
        stripped_pieces = (piece.strip() for piece in id_data.split(';'))
        return [piece for piece in stripped_pieces if piece]
    return id_data

In [9]:
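# NOTE: this step is disabled. 'EC number' is split and exploded in a later cell
# and 'PDB' stays a raw ';'-separated string, so split_id is not called in the
# cells shown here.
# col_splits = ['EC number', 'PDB']
# for col in col_splits:
#     download_uniprot_dataset[col] = download_uniprot_dataset[col].apply(lambda x:split_id(x))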

In [10]:
# Preview the first rows of the filtered frame.
download_uniprot_dataset.head()

Unnamed: 0,Entry,Organism,Length,EC number,AlphaFoldDB,Active site,Binding site,Site,PDB,Sequence
1,A1L3X0,Homo sapiens (Human),281,2.3.1.199,A1L3X0;,"ACT_SITE 150; /note=""Nucleophile""; /evidence=""...","BINDING 124; /ligand=""3-oxoeicosanoyl-CoA""; /l...",,6Y7F;,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...
2,A2RUC4,Homo sapiens (Human),315,1.14.11.42,A2RUC4;,,"BINDING 106; /ligand=""2-oxoglutarate""; /ligand...",,3AL5;3AL6;,MAGQHLPVPRLEGVSREQFMQHLYPQRKPLVLEGIDLGPCTSKWTV...
3,A5PLL7,Homo sapiens (Human),270,1.14.19.77,A5PLL7;,,,"SITE 95; /note=""Essential for catalytic activi...",,MAGAENWPGQQLELDEDEASCCRWGAQHAGARELAALYSPGKRLQE...
4,C9JRZ8,Homo sapiens (Human),316,1.1.1.-; 1.1.1.216; 1.1.1.300; 1.1.1.54; 1.1.1.64,C9JRZ8;,"ACT_SITE 49; /note=""Proton donor""; /evidence=""...","BINDING 20..22; /ligand=""NADP(+)""; /ligand_id=...","SITE 78; /note=""Lowers pKa of active site Tyr""...",,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...
9,O00506,Homo sapiens (Human),426,2.7.11.1,O00506;,"ACT_SITE 140; /note=""Proton acceptor""; /eviden...","BINDING 26..34; /ligand=""ATP""; /ligand_id=""ChE...",,2XIK;3W8H;4NZW;7Z4V;,MAHLRGFANQHSRVDPEELFTKLDRIGKGSFGEVYKGIDNHTKEVV...


In [11]:
# Show dtypes and non-null counts again, before the de-duplication step below.
download_uniprot_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 204265 entries, 1 to 238978
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Entry         204265 non-null  object
 1   Organism      204265 non-null  object
 2   Length        204265 non-null  int64 
 3   EC number     204265 non-null  object
 4   AlphaFoldDB   204265 non-null  object
 5   Active site   75520 non-null   object
 6   Binding site  133961 non-null  object
 7   Site          21044 non-null   object
 8   PDB           8990 non-null    object
 9   Sequence      204265 non-null  object
dtypes: int64(1), object(9)
memory usage: 17.1+ MB


In [12]:
from collections import OrderedDict
def convert_site_data(site_data, split_flag):
    """Parse one raw site-annotation string into a list of per-site dictionaries.

    The raw text is a sequence of records, each starting with ``split_flag``
    (e.g. 'ACT_SITE', 'BINDING', 'SITE') followed by a position and optional
    qualifiers, for example::

        ACT_SITE 150; /note="Nucleophile"; /evidence="ECO:..."; ACT_SITE 200; ...

    Args:
        site_data: Raw annotation string, or a missing value (NaN/None).
        split_flag: Feature keyword that starts every record in ``site_data``.

    Returns:
        A list with one ``OrderedDict`` per site. Each dict holds 'function'
        (the ``split_flag``), 'position' (the text before the first qualifier)
        and one key per qualifier with the double quotes removed from its
        value. Missing input is returned unchanged.
    """
    if pd.isna(site_data):
        return site_data

    reformate_site_data_list = []
    for one_site in site_data.split(split_flag):
        one_site = one_site.strip()
        if not one_site:
            # Text before the first keyword (normally empty) carries no site.
            continue

        position, *qualifiers = one_site.split('; /')
        one_site_dict = OrderedDict()
        one_site_dict['function'] = split_flag
        # A site without qualifiers keeps the ';' that separated it from the
        # next record; drop it so the position is clean.
        one_site_dict['position'] = position.rstrip(';').strip()

        for qualifier in qualifiers:
            # Split on the first '=' only: qualifier values may themselves
            # contain '=' and must not break the name/value unpacking.
            info_name, _, info = qualifier.partition('=')
            info = info.replace('";', '').replace('"', '')
            one_site_dict[info_name] = info

        reformate_site_data_list.append(one_site_dict)
    return reformate_site_data_list

In [13]:
# Report how many distinct sequences the frame contains.
print('去重之后的序列数：')
print(download_uniprot_dataset['Sequence'].nunique(dropna=False))

去重之后的序列数：
170757


In [14]:
# Collapse rows that share the same sequence and identical raw site annotations,
# keeping the first occurrence, then renumber the index from 0.
# NOTE(review): 'EC number' is not part of the key, so rows that differ only in
# their EC number are merged here - confirm this is intended.
dedup_keys = ['Active site', 'Binding site', 'Site', 'Sequence']
is_repeated_row = download_uniprot_dataset.duplicated(subset=dedup_keys)
download_uniprot_dataset = download_uniprot_dataset[~is_repeated_row].reset_index(drop=True)
download_uniprot_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172054 entries, 0 to 172053
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Entry         172054 non-null  object
 1   Organism      172054 non-null  object
 2   Length        172054 non-null  int64 
 3   EC number     172054 non-null  object
 4   AlphaFoldDB   172054 non-null  object
 5   Active site   64071 non-null   object
 6   Binding site  112908 non-null  object
 7   Site          17879 non-null   object
 8   PDB           8605 non-null    object
 9   Sequence      172054 non-null  object
dtypes: int64(1), object(9)
memory usage: 13.1+ MB


In [15]:
# Preview the de-duplicated frame (index renumbered from 0).
download_uniprot_dataset.head()

Unnamed: 0,Entry,Organism,Length,EC number,AlphaFoldDB,Active site,Binding site,Site,PDB,Sequence
0,A1L3X0,Homo sapiens (Human),281,2.3.1.199,A1L3X0;,"ACT_SITE 150; /note=""Nucleophile""; /evidence=""...","BINDING 124; /ligand=""3-oxoeicosanoyl-CoA""; /l...",,6Y7F;,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...
1,A2RUC4,Homo sapiens (Human),315,1.14.11.42,A2RUC4;,,"BINDING 106; /ligand=""2-oxoglutarate""; /ligand...",,3AL5;3AL6;,MAGQHLPVPRLEGVSREQFMQHLYPQRKPLVLEGIDLGPCTSKWTV...
2,A5PLL7,Homo sapiens (Human),270,1.14.19.77,A5PLL7;,,,"SITE 95; /note=""Essential for catalytic activi...",,MAGAENWPGQQLELDEDEASCCRWGAQHAGARELAALYSPGKRLQE...
3,C9JRZ8,Homo sapiens (Human),316,1.1.1.-; 1.1.1.216; 1.1.1.300; 1.1.1.54; 1.1.1.64,C9JRZ8;,"ACT_SITE 49; /note=""Proton donor""; /evidence=""...","BINDING 20..22; /ligand=""NADP(+)""; /ligand_id=...","SITE 78; /note=""Lowers pKa of active site Tyr""...",,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...
4,O00506,Homo sapiens (Human),426,2.7.11.1,O00506;,"ACT_SITE 140; /note=""Proton acceptor""; /eviden...","BINDING 26..34; /ligand=""ATP""; /ligand_id=""ChE...",,2XIK;3W8H;4NZW;7Z4V;,MAHLRGFANQHSRVDPEELFTKLDRIGKGSFGEVYKGIDNHTKEVV...


In [16]:

# Re-organise the site annotations: parse each raw site column into a list with
# one dictionary per site and store it in a new '<column> reformate' column.
reformate_cols = ['Active site', 'Binding site', 'Site',]
split_flag_dict = {
    'Active site':'ACT_SITE', 'Binding site':'BINDING', 'Site':'SITE',
}
for col in reformate_cols:
    feature_keyword = split_flag_dict[col]
    parsed_sites = download_uniprot_dataset[col].apply(convert_site_data, args=(feature_keyword,))
    download_uniprot_dataset[f'{col} reformate'] = parsed_sites

In [17]:
# Preview the frame with the three added '... reformate' columns.
download_uniprot_dataset.head()

Unnamed: 0,Entry,Organism,Length,EC number,AlphaFoldDB,Active site,Binding site,Site,PDB,Sequence,Active site reformate,Binding site reformate,Site reformate
0,A1L3X0,Homo sapiens (Human),281,2.3.1.199,A1L3X0;,"ACT_SITE 150; /note=""Nucleophile""; /evidence=""...","BINDING 124; /ligand=""3-oxoeicosanoyl-CoA""; /l...",,6Y7F;,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,"[{'function': 'ACT_SITE', 'position': '150', '...","[{'function': 'BINDING', 'position': '124', 'l...",
1,A2RUC4,Homo sapiens (Human),315,1.14.11.42,A2RUC4;,,"BINDING 106; /ligand=""2-oxoglutarate""; /ligand...",,3AL5;3AL6;,MAGQHLPVPRLEGVSREQFMQHLYPQRKPLVLEGIDLGPCTSKWTV...,,"[{'function': 'BINDING', 'position': '106', 'l...",
2,A5PLL7,Homo sapiens (Human),270,1.14.19.77,A5PLL7;,,,"SITE 95; /note=""Essential for catalytic activi...",,MAGAENWPGQQLELDEDEASCCRWGAQHAGARELAALYSPGKRLQE...,,,"[{'function': 'SITE', 'position': '95', 'note'..."
3,C9JRZ8,Homo sapiens (Human),316,1.1.1.-; 1.1.1.216; 1.1.1.300; 1.1.1.54; 1.1.1.64,C9JRZ8;,"ACT_SITE 49; /note=""Proton donor""; /evidence=""...","BINDING 20..22; /ligand=""NADP(+)""; /ligand_id=...","SITE 78; /note=""Lowers pKa of active site Tyr""...",,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...,"[{'function': 'ACT_SITE', 'position': '49', 'n...","[{'function': 'BINDING', 'position': '20..22',...","[{'function': 'SITE', 'position': '78', 'note'..."
4,O00506,Homo sapiens (Human),426,2.7.11.1,O00506;,"ACT_SITE 140; /note=""Proton acceptor""; /eviden...","BINDING 26..34; /ligand=""ATP""; /ligand_id=""ChE...",,2XIK;3W8H;4NZW;7Z4V;,MAHLRGFANQHSRVDPEELFTKLDRIGKGSFGEVYKGIDNHTKEVV...,"[{'function': 'ACT_SITE', 'position': '140', '...","[{'function': 'BINDING', 'position': '26..34',...",


In [18]:
# Duplicate rows that list several EC numbers so that every EC number maps
# directly to its own row: split on ';', explode, then trim whitespace.
# The exploded rows keep the index label of the row they came from.
ec_column = 'EC number'
split_ec_numbers = download_uniprot_dataset[ec_column].map(lambda ec_field: ec_field.split(';'))
download_uniprot_dataset[ec_column] = split_ec_numbers
download_uniprot_dataset = download_uniprot_dataset.explode(ec_column)
download_uniprot_dataset[ec_column] = download_uniprot_dataset[ec_column].map(lambda ec_number: ec_number.strip())

In [19]:
# Preview the first 10 rows; an entry with several EC numbers now spans several
# rows that share one index label.
download_uniprot_dataset.head(10)

Unnamed: 0,Entry,Organism,Length,EC number,AlphaFoldDB,Active site,Binding site,Site,PDB,Sequence,Active site reformate,Binding site reformate,Site reformate
0,A1L3X0,Homo sapiens (Human),281,2.3.1.199,A1L3X0;,"ACT_SITE 150; /note=""Nucleophile""; /evidence=""...","BINDING 124; /ligand=""3-oxoeicosanoyl-CoA""; /l...",,6Y7F;,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,"[{'function': 'ACT_SITE', 'position': '150', '...","[{'function': 'BINDING', 'position': '124', 'l...",
1,A2RUC4,Homo sapiens (Human),315,1.14.11.42,A2RUC4;,,"BINDING 106; /ligand=""2-oxoglutarate""; /ligand...",,3AL5;3AL6;,MAGQHLPVPRLEGVSREQFMQHLYPQRKPLVLEGIDLGPCTSKWTV...,,"[{'function': 'BINDING', 'position': '106', 'l...",
2,A5PLL7,Homo sapiens (Human),270,1.14.19.77,A5PLL7;,,,"SITE 95; /note=""Essential for catalytic activi...",,MAGAENWPGQQLELDEDEASCCRWGAQHAGARELAALYSPGKRLQE...,,,"[{'function': 'SITE', 'position': '95', 'note'..."
3,C9JRZ8,Homo sapiens (Human),316,1.1.1.-,C9JRZ8;,"ACT_SITE 49; /note=""Proton donor""; /evidence=""...","BINDING 20..22; /ligand=""NADP(+)""; /ligand_id=...","SITE 78; /note=""Lowers pKa of active site Tyr""...",,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...,"[{'function': 'ACT_SITE', 'position': '49', 'n...","[{'function': 'BINDING', 'position': '20..22',...","[{'function': 'SITE', 'position': '78', 'note'..."
3,C9JRZ8,Homo sapiens (Human),316,1.1.1.216,C9JRZ8;,"ACT_SITE 49; /note=""Proton donor""; /evidence=""...","BINDING 20..22; /ligand=""NADP(+)""; /ligand_id=...","SITE 78; /note=""Lowers pKa of active site Tyr""...",,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...,"[{'function': 'ACT_SITE', 'position': '49', 'n...","[{'function': 'BINDING', 'position': '20..22',...","[{'function': 'SITE', 'position': '78', 'note'..."
3,C9JRZ8,Homo sapiens (Human),316,1.1.1.300,C9JRZ8;,"ACT_SITE 49; /note=""Proton donor""; /evidence=""...","BINDING 20..22; /ligand=""NADP(+)""; /ligand_id=...","SITE 78; /note=""Lowers pKa of active site Tyr""...",,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...,"[{'function': 'ACT_SITE', 'position': '49', 'n...","[{'function': 'BINDING', 'position': '20..22',...","[{'function': 'SITE', 'position': '78', 'note'..."
3,C9JRZ8,Homo sapiens (Human),316,1.1.1.54,C9JRZ8;,"ACT_SITE 49; /note=""Proton donor""; /evidence=""...","BINDING 20..22; /ligand=""NADP(+)""; /ligand_id=...","SITE 78; /note=""Lowers pKa of active site Tyr""...",,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...,"[{'function': 'ACT_SITE', 'position': '49', 'n...","[{'function': 'BINDING', 'position': '20..22',...","[{'function': 'SITE', 'position': '78', 'note'..."
3,C9JRZ8,Homo sapiens (Human),316,1.1.1.64,C9JRZ8;,"ACT_SITE 49; /note=""Proton donor""; /evidence=""...","BINDING 20..22; /ligand=""NADP(+)""; /ligand_id=...","SITE 78; /note=""Lowers pKa of active site Tyr""...",,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...,"[{'function': 'ACT_SITE', 'position': '49', 'n...","[{'function': 'BINDING', 'position': '20..22',...","[{'function': 'SITE', 'position': '78', 'note'..."
4,O00506,Homo sapiens (Human),426,2.7.11.1,O00506;,"ACT_SITE 140; /note=""Proton acceptor""; /eviden...","BINDING 26..34; /ligand=""ATP""; /ligand_id=""ChE...",,2XIK;3W8H;4NZW;7Z4V;,MAHLRGFANQHSRVDPEELFTKLDRIGKGSFGEVYKGIDNHTKEVV...,"[{'function': 'ACT_SITE', 'position': '140', '...","[{'function': 'BINDING', 'position': '26..34',...",
5,O00746,Homo sapiens (Human),187,2.7.4.6,O00746;,"ACT_SITE 151; /note=""Pros-phosphohistidine int...","BINDING 45; /ligand=""ATP""; /ligand_id=""ChEBI:C...",,1EHW;,MGGLFWRSALRGLRCGPRAPGPSLLVRHGSGGPSWTRERTLVAVKP...,"[{'function': 'ACT_SITE', 'position': '151', '...","[{'function': 'BINDING', 'position': '45', 'li...",


In [20]:
# Flag rows that have an annotation in at least one of the three raw site columns.
site_annotation_cols = ['Active site', 'Site', 'Binding site']
download_uniprot_dataset['Site labeled'] = download_uniprot_dataset[site_annotation_cols].notna().any(axis=1)

In [21]:
# Rows without any site annotation at all.
has_no_site_label = ~download_uniprot_dataset['Site labeled']
site_nolabeled_dataset = download_uniprot_dataset[has_no_site_label]
site_nolabeled_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41410 entries, 9 to 172053
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Entry                   41410 non-null  object
 1   Organism                41410 non-null  object
 2   Length                  41410 non-null  int64 
 3   EC number               41410 non-null  object
 4   AlphaFoldDB             41410 non-null  object
 5   Active site             0 non-null      object
 6   Binding site            0 non-null      object
 7   Site                    0 non-null      object
 8   PDB                     1284 non-null   object
 9   Sequence                41410 non-null  object
 10  Active site reformate   0 non-null      object
 11  Binding site reformate  0 non-null      object
 12  Site reformate          0 non-null      object
 13  Site labeled            41410 non-null  bool  
dtypes: bool(1), int64(1), object(12)
memory usage: 4.5+ M

In [22]:
# Rows that carry at least one site annotation.
has_site_label = download_uniprot_dataset['Site labeled']
site_labeled_dataset = download_uniprot_dataset[has_site_label]
site_labeled_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 143398 entries, 0 to 172049
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Entry                   143398 non-null  object
 1   Organism                143398 non-null  object
 2   Length                  143398 non-null  int64 
 3   EC number               143398 non-null  object
 4   AlphaFoldDB             143398 non-null  object
 5   Active site             70740 non-null   object
 6   Binding site            122618 non-null  object
 7   Site                    19384 non-null   object
 8   PDB                     8523 non-null    object
 9   Sequence                143398 non-null  object
 10  Active site reformate   70740 non-null   object
 11  Binding site reformate  122618 non-null  object
 12  Site reformate          19384 non-null   object
 13  Site labeled            143398 non-null  bool  
dtypes: bool(1), int64(1), object(12)
mem

In [23]:
# Persist the unlabeled subset, the labeled subset and the full cleaned frame
# as pickle files next to the raw download.
pickle_outputs = [
    ('uniprot-download_sequence_site_ec_clean-site-nolabeled.pkl', site_nolabeled_dataset),
    ('uniprot-download_sequence_site_ec_clean-site-labeled.pkl', site_labeled_dataset),
    ('uniprot-download_sequence_site_ec_clean.pkl', download_uniprot_dataset),
]
for pickle_name, frame_to_save in pickle_outputs:
    frame_to_save.to_pickle(os.path.join(uniprot_raw_path, pickle_name))