In [109]:
# 1. Process CAFA3 training set

In [9]:
import pandas as pd
import ast

In [10]:
# Load the raw CAFA3 training table; its PID column is used as the protein key below.
df = pd.read_csv('raw_data/CAFA3_training_data/training_set.csv')

In [11]:
# Export the unique protein IDs, one per line. Upload the resulting file to the
# UniProt ID Mapping tool (https://www.uniprot.org/id-mapping) to obtain STRING IDs.
PID_list = df.PID.unique().tolist()
with open("PID_list.txt", "w") as pid_file:
    pid_file.writelines(pid + "\n" for pid in PID_list)

In [12]:
# Load the mapping table downloaded from the UniProt ID Mapping tool (saved here
# as "UniProt2STRING.tsv") and rename its From/To columns to PID/STRING.
mapping_columns = {'From': "PID", "To": "STRING"}
uniprot_string_mapping = pd.read_csv(
    './raw_data/UniProt2STRING.tsv', sep='\t'
).rename(columns=mapping_columns)

In [13]:
# Left join on PID: every training row is kept, with STRING left as NaN when
# the mapping has no entry for that PID.
merge_taxon = df.merge(
    uniprot_string_mapping,
    how='left',
    left_on='PID',
    right_on='PID',
)

In [14]:
# If a PID occurs more than once after the join, keep only its last occurrence.
merge_taxon = merge_taxon.drop_duplicates(subset=["PID"], keep="last")

In [15]:
# merge_taxon['TaxonID'] = merge_taxon['STRING'].apply(lambda x: x.split('.')[0])
import numpy as np
def extract_taxon_id(string_value):
    """Return the part of a STRING ID before the first '.', or NaN if it is missing.

    Example: "10090.ENSMUSP00000052181" -> "10090".
    """
    if not pd.isna(string_value):
        taxon_id, _, _ = string_value.partition('.')
        return taxon_id
    return np.nan

# Derive the taxon ID column from the STRING IDs (NaN where no STRING ID was mapped).
merge_taxon['TaxonID'] = merge_taxon['STRING'].map(extract_taxon_id)

In [16]:
def safe_literal_eval(s):
    """Parse the string form of a Python literal (e.g. "['GO:0008150']") into an object.

    Parameters
    ----------
    s : object
        Cell value to parse, normally a string such as "[]" or "['GO:...', ...]".

    Returns
    -------
    object
        The parsed literal, or ``s`` itself, unchanged, when it cannot be parsed
        (for example a missing value or free text).
    """
    try:
        # Check for the empty-list representation first.
        if s == '[]':
            return []
        # Otherwise, try to parse the value as a Python literal.
        return ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Unparseable input: hand back the original value untouched.
        # Only parse-related failures are caught; a bare `except:` would also
        # swallow KeyboardInterrupt/SystemExit and make the cell impossible to stop.
        return s

# Turn the string-encoded values in the three GO annotation columns into Python
# objects; values that cannot be parsed are left as they are.
go_columns = ('BP', 'CC', 'MF')
for go_col in go_columns:
    merge_taxon[go_col] = merge_taxon[go_col].map(safe_literal_eval)

In [17]:
# Drop the final row of the table (positional slice).
# NOTE(review): presumably the last row is a non-protein summary row (an 'all'
# row is re-attached from another pickle in a later cell) — confirm.
merge_taxon = merge_taxon.iloc[:-1]

In [18]:
# Index the table by protein ID, then persist it as the processed training set.
merge_taxon = merge_taxon.set_index("PID")
merge_taxon.to_pickle("data/CAFA3/train_df.pkl")

In [19]:
# Sanity check: number of rows in the processed training table.
len(merge_taxon)

66841

In [None]:
# Re-attach the special 'all' row from the original training pickle to the
# processed training table.
# NOTE(review): "data/CAFA3/training_df.pkl" is not produced by this notebook;
# it is assumed to be a pre-existing file that contains an 'all' row — confirm.
df1 = pd.read_pickle("data/CAFA3/training_df.pkl")
df2 = pd.read_pickle("data/CAFA3/train_df.pkl")

# This cell overwrites its own input file, so first drop any 'all' row left by
# a previous run; otherwise re-running the cell appends a duplicate 'all' row.
df2 = df2.drop(index='all', errors='ignore')

all_row = df1.loc['all']
# Keep only the fields of the 'all' row that are also columns of the processed table.
all_row_filtered = all_row[df2.columns.intersection(all_row.index)].to_frame().T
all_row_filtered.index = ['all']
df2_combined = pd.concat([df2, all_row_filtered])
df2_combined.to_pickle("data/CAFA3/train_df.pkl")

In [5]:
# 2. Prepare for the test set
import pandas as pd
# Load the CAFA3 test table and turn its index into a regular "PID" column.
test_df = (
    pd.read_pickle("data/CAFA3/test/test_df.pkl")
    .reset_index()
    .rename(columns={"index": "PID"})
)


In [6]:
# Load the pickled lookup from CAFA3 target IDs (e.g. "T72270011510") to mapped IDs (e.g. 'Q9VS51').
uid_dict = pd.read_pickle("data/CAFA3/test/test_uid.pkl")

In [7]:
# Look up each PID in uid_dict; PIDs missing from the lookup become NaN.
test_df['mapped_id'] = test_df['PID'].map(uid_dict)

In [8]:
# Spot-check a single entry of the lookup.
uid_dict["T72270011510"]

'Q9VS51'

In [9]:
# Reload the ID lookup and redo the mapping (repeats the cells above), then keep
# only the rows whose PID was found in the lookup.
uid_dict = pd.read_pickle("data/CAFA3/test/test_uid.pkl")
test_df['mapped_id'] = test_df['PID'].map(uid_dict)
has_mapping = test_df['mapped_id'].notna()
test_df = test_df[has_mapping]

In [10]:
# Export the unique mapped IDs, one per line. Upload the resulting file to the
# UniProt ID Mapping tool (https://www.uniprot.org/id-mapping) to obtain STRING IDs.
PID_list = test_df.mapped_id.unique().tolist()
with open("data/CAFA3/test/test_PID_list.txt", "w") as pid_file:
    pid_file.writelines(pid + "\n" for pid in PID_list)

In [11]:
# Attach STRING IDs to the test rows via their mapped ID (left join, so unmapped
# rows are kept), drop the helper column, rename From/To to UID/STRING, and
# record the length of each sequence.
unid_string = pd.read_csv("data/CAFA3/test/test_uid2string.tsv", sep="\t")
merged_df = (
    test_df.merge(unid_string, left_on='mapped_id', right_on='From', how='left')
    .drop(columns=['mapped_id'])
    .rename(columns={"From": "UID", "To": "STRING"})
    .assign(length=lambda frame: frame.apply(lambda row: len(row['sequence']), axis=1))
)

In [12]:
# If a PID occurs more than once after the join, keep only its last occurrence.
merged_df = merged_df.drop_duplicates(subset=["PID"], keep="last")

In [13]:
import numpy as np
def extract_taxon_id(string_value):
    """Return the text before the first '.' of a STRING ID; NaN for a missing ID.

    NOTE: identical to the definition in the CAFA3 training-set cells; it could
    be defined once and reused.
    """
    if not pd.isna(string_value):
        taxon_id, _, _ = string_value.partition('.')
        return taxon_id
    return np.nan
# Derive the taxon ID column from the STRING IDs (NaN where no STRING ID was mapped).
merged_df['TaxonID'] = merged_df['STRING'].map(extract_taxon_id)

In [14]:
# Preview the first rows of the merged CAFA3 test table.
merged_df.head()

Unnamed: 0,PID,BP,CC,MF,sequence,UID,STRING,length,TaxonID
0,T100900000026,"[GO:0030030, GO:0008150, GO:0065007, GO:000998...","[GO:0030054, GO:0005575, GO:0070160, GO:000591...","[GO:0003824, GO:0003924, GO:0016817, GO:001681...",MAESFKELDPDSSMGKALEMTCAIQNQLARILAEFEMTLERDVLQP...,P55194,10090.ENSMUSP00000052181,601,10090
1,T100900000046,[],[],"[GO:0042802, GO:0003674, GO:0005515, GO:0005488]",MRLCIPQVLLALFLSMLTAPGEGSRRRATQEDTTQPALLRLSDHLL...,P23979,10090.ENSMUSP00000003826,487,10090
2,T100900000115,"[GO:0044238, GO:0008150, GO:0008104, GO:004426...","[GO:0043226, GO:0005768, GO:0044444, GO:009879...","[GO:0016788, GO:0003674, GO:0016787, GO:000382...",MNNLSFSELCCLFCCPPCPGKIASKLAFLPPDPTYTLMCDESGSRW...,Q6AY17,10116.ENSRNOP00000047508,288,10116
3,T100900000116,"[GO:0033365, GO:0009057, GO:0044238, GO:007072...","[GO:0014069, GO:0098794, GO:0045202, GO:000557...","[GO:0003824, GO:0016788, GO:0016790, GO:000367...",MPEPGPRMNGFSLGELCWLFCCPPCPSRIAAKLAFLPPEPTYTVLA...,B5DFK7,10116.ENSRNOP00000017362,320,10116
4,T100900000161,"[GO:0042439, GO:0008150, GO:0071704, GO:190156...",[],"[GO:0052689, GO:0016298, GO:0003824, GO:001678...",MADDLEQQPQGWLSSWLPTWRPTSMSQLKNVEARILQCLQNKFLAR...,Q8VD66,10090.ENSMUSP00000044134,342,10090


In [15]:
# Index the table by PID, then persist it as the processed CAFA3 test set.
merged_df = merged_df.set_index("PID")
merged_df.to_pickle("data/CAFA3/test_df.pkl")

In [16]:
# Display the saved CAFA3 test table (now indexed by PID).
merged_df

Unnamed: 0_level_0,BP,CC,MF,sequence,UID,STRING,length,TaxonID
PID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
T100900000026,"[GO:0030030, GO:0008150, GO:0065007, GO:000998...","[GO:0030054, GO:0005575, GO:0070160, GO:000591...","[GO:0003824, GO:0003924, GO:0016817, GO:001681...",MAESFKELDPDSSMGKALEMTCAIQNQLARILAEFEMTLERDVLQP...,P55194,10090.ENSMUSP00000052181,601,10090
T100900000046,[],[],"[GO:0042802, GO:0003674, GO:0005515, GO:0005488]",MRLCIPQVLLALFLSMLTAPGEGSRRRATQEDTTQPALLRLSDHLL...,P23979,10090.ENSMUSP00000003826,487,10090
T100900000115,"[GO:0044238, GO:0008150, GO:0008104, GO:004426...","[GO:0043226, GO:0005768, GO:0044444, GO:009879...","[GO:0016788, GO:0003674, GO:0016787, GO:000382...",MNNLSFSELCCLFCCPPCPGKIASKLAFLPPDPTYTLMCDESGSRW...,Q6AY17,10116.ENSRNOP00000047508,288,10116
T100900000116,"[GO:0033365, GO:0009057, GO:0044238, GO:007072...","[GO:0014069, GO:0098794, GO:0045202, GO:000557...","[GO:0003824, GO:0016788, GO:0016790, GO:000367...",MPEPGPRMNGFSLGELCWLFCCPPCPSRIAAKLAFLPPEPTYTVLA...,B5DFK7,10116.ENSRNOP00000017362,320,10116
T100900000161,"[GO:0042439, GO:0008150, GO:0071704, GO:190156...",[],"[GO:0052689, GO:0016298, GO:0003824, GO:001678...",MADDLEQQPQGWLSSWLPTWRPTSMSQLKNVEARILQCLQNKFLAR...,Q8VD66,10090.ENSMUSP00000044134,342,10090
...,...,...,...,...,...,...,...,...
T72270011510,[],[],[],MEFARRVSARFETKRLPEDVDDGLETLEEYKQRWRSVRIIYFTMFL...,Q9VS51,7227.FBpp0304979,546,7227
T833330001696,[],[],[],MASQLTDAFARKFYYLRLSITDVCNFRCTYCLPDGYKPSGVTNKGF...,P30745,511145.b0781,329,511145
T833330003120,[],[],[],MSASALVCLAPGSEETEAVTTIDLLVRGGIKVTTASVASDGNLAIT...,Q46948,511145.b0424,196,511145
T96060015362,[],[],[],MAGPNQLCIRRWTTKHVAVWLKDEGFFEYVDILCNKHRLDGITLLT...,Q96LT4,9606.ENSP00000500411,415,9606


In [60]:
# 3. PDBch dataset preparation
import pandas as pd
# Load the PDBch training table and turn its index into a regular "PID" column.
train_df = (
    pd.read_pickle("data/PDBch/pdbch_training_df.pkl")
    .reset_index()
    .rename(columns={"index": "PID"})
)

In [61]:
# Look up each PID in the PDBch ID dictionary and keep only the rows that have
# a mapped ID.
uid_dict = pd.read_pickle("data/PDBch/pdbch_uid_dict.pkl")
train_df['mapped_id'] = train_df['PID'].map(uid_dict)
has_mapping = train_df['mapped_id'].notna()
train_df = train_df[has_mapping]

In [62]:
# Export the unique mapped IDs, one per line.
# NOTE(review): presumably this file is uploaded to the UniProt ID Mapping tool
# to produce train_uid2string.tsv, as in the CAFA3 section — confirm.
PID_list = train_df.mapped_id.unique().tolist()
with open("data/PDBch/pdbch_PID_list.txt", "w") as pid_file:
    pid_file.writelines(pid + "\n" for pid in PID_list)

In [63]:
# Sanity check: number of unique mapped IDs written to the list file.
len(PID_list)

27842

In [64]:
# Attach STRING IDs to the training rows via their mapped ID (left join, so
# unmapped rows are kept), drop the helper column, rename From/To to UID/STRING,
# and record the length of each sequence.
unid_string = pd.read_csv("data/PDBch/train_uid2string.tsv", sep="\t")
merged_df = (
    train_df.merge(unid_string, left_on='mapped_id', right_on='From', how='left')
    .drop(columns=['mapped_id'])
    .rename(columns={"From": "UID", "To": "STRING"})
    .assign(length=lambda frame: frame.apply(lambda row: len(row['sequence']), axis=1))
)

In [65]:
# If a PID occurs more than once after the join, keep only its last occurrence.
merged_df = merged_df.drop_duplicates(subset=["PID"], keep="last")

In [66]:
import numpy as np
def extract_taxon_id(string_value):
    """Return the text before the first '.' of a STRING ID; NaN for a missing ID.

    NOTE: identical to the definitions in the CAFA3 cells; it could be defined
    once and reused.
    """
    if not pd.isna(string_value):
        taxon_id, _, _ = string_value.partition('.')
        return taxon_id
    return np.nan
# Derive the taxon ID column from the STRING IDs (NaN where no STRING ID was mapped).
merged_df['TaxonID'] = merged_df['STRING'].map(extract_taxon_id)

In [67]:
# Index the table by PID, then persist it as the processed PDBch training set.
merged_df = merged_df.set_index("PID")
merged_df.to_pickle("data/PDBch/train_df.pkl")

In [104]:
# Re-attach the special 'all' row from the original PDBch training pickle to the
# processed training table.
# NOTE(review): assumes the original pickle contains a row labelled 'all' — confirm.
df1 = pd.read_pickle("data/PDBch/pdbch_training_df.pkl")
df2 = pd.read_pickle("data/PDBch/train_df.pkl")

# This cell overwrites its own input file, so first drop any 'all' row left by
# a previous run; otherwise re-running the cell appends a duplicate 'all' row.
df2 = df2.drop(index='all', errors='ignore')

all_row = df1.loc['all']
# Keep only the fields of the 'all' row that are also columns of the processed table.
all_row_filtered = all_row[df2.columns.intersection(all_row.index)].to_frame().T
all_row_filtered.index = ['all']
df2_combined = pd.concat([df2, all_row_filtered])
df2_combined.to_pickle("data/PDBch/train_df.pkl")

In [68]:
import pandas as pd
# Load the PDBch test table and turn its index into a regular "PID" column.
test_df = (
    pd.read_pickle("data/PDBch/pdbch_test_df.pkl")
    .reset_index()
    .rename(columns={"index": "PID"})
)

In [69]:
# Look up each PID in the PDBch ID dictionary and keep only the rows that have
# a mapped ID.
uid_dict = pd.read_pickle("data/PDBch/pdbch_uid_dict.pkl")
test_df['mapped_id'] = test_df['PID'].map(uid_dict)
has_mapping = test_df['mapped_id'].notna()
test_df = test_df[has_mapping]

In [70]:
# Export the unique mapped IDs of the test set, one per line.
# NOTE(review): presumably this file is uploaded to the UniProt ID Mapping tool
# to produce test_uid2string.tsv, as in the CAFA3 section — confirm.
PID_list = test_df.mapped_id.unique().tolist()
with open("data/PDBch/test/pdbch_test_PID_list.txt", "w") as pid_file:
    pid_file.writelines(pid + "\n" for pid in PID_list)

In [71]:
# Attach STRING IDs to the test rows via their mapped ID (left join, so unmapped
# rows are kept), drop the helper column, rename From/To to UID/STRING, and
# record the length of each sequence.
unid_string = pd.read_csv("data/PDBch/test/test_uid2string.tsv", sep="\t")
merged_df = (
    test_df.merge(unid_string, left_on='mapped_id', right_on='From', how='left')
    .drop(columns=['mapped_id'])
    .rename(columns={"From": "UID", "To": "STRING"})
    .assign(length=lambda frame: frame.apply(lambda row: len(row['sequence']), axis=1))
)

In [72]:
# If a PID occurs more than once after the join, keep only its last occurrence.
merged_df = merged_df.drop_duplicates(subset=["PID"], keep="last")

In [73]:
import numpy as np
def extract_taxon_id(string_value):
    """Return the text before the first '.' of a STRING ID; NaN for a missing ID.

    NOTE: identical to the definitions in the earlier cells; it could be defined
    once and reused.
    """
    if not pd.isna(string_value):
        taxon_id, _, _ = string_value.partition('.')
        return taxon_id
    return np.nan
# Derive the taxon ID column from the STRING IDs (NaN where no STRING ID was mapped).
merged_df['TaxonID'] = merged_df['STRING'].map(extract_taxon_id)

In [74]:
# Display the merged PDBch test table before it is indexed by PID and saved.
merged_df

Unnamed: 0,PID,BP,MF,CC,sequence,UID,STRING,length,TaxonID
0,11AS-A,"[GO:0006520, GO:0046394, GO:0006974, GO:003278...","[GO:0032559, GO:0017076, GO:0005524, GO:003563...",[GO:0005829],MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQD...,P00963,511145.b3744,330,511145
1,18GS-A,"[GO:0032496, GO:0009966, GO:0031099, GO:000657...","[GO:0008144, GO:0004364, GO:0030234, GO:190168...","[GO:0031974, GO:0012505, GO:0030141, GO:010100...",MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...,P09211,9606.ENSP00000381607,210,9606
2,1A0P-A,"[GO:0009314, GO:0006310, GO:0051301, GO:000625...","[GO:0140097, GO:0003677]","[GO:1902494, GO:1905348]",QDLARIEQFLDALWLEKNLAENTLNAYRRDLSMMVEWLHHRGLTLA...,P0A8P8,511145.b2894,290,511145
3,1A22-A,"[GO:0009966, GO:0048513, GO:0048017, GO:007170...","[GO:0005126, GO:0098772, GO:0030546, GO:005142...","[GO:0031974, GO:0043235, GO:0005768, GO:001250...",FPTIPLSRLFDNAMLRAHRLHQLAFDTYQEFEEAYIPKEQKYSFLQ...,P01241,9606.ENSP00000312673,191,9606
4,1A4E-A,"[GO:0006979, GO:0098754, GO:1901700, GO:001003...","[GO:0016209, GO:0004601, GO:0046906, GO:001668...","[GO:0031974, GO:0005777, GO:0070013, GO:000573...",DVREDRVVTNSTGNPINEPFVTQRIGEHGPLLLQDYNLIDSLAHFN...,P15202,4932.YDR256C,488,4932
...,...,...,...,...,...,...,...,...,...
3409,6R3V-A,"[GO:0009966, GO:0071702, GO:0046907, GO:004852...","[GO:0030234, GO:0019904, GO:0098772, GO:003126...","[GO:0098805, GO:0005768, GO:0012505, GO:000557...",MDPLSELQDDLTLDDTSEALNQLKLASIDEKNWPSDEMPDFPKSDD...,Q07960,9606.ENSP00000310491,439,9606
3410,6R5K-D,"[GO:0051236, GO:0071702, GO:0031326, GO:001060...","[GO:0030234, GO:0098772, GO:0019901, GO:000485...","[GO:0035770, GO:0043228, GO:0005840, GO:199090...",GPDSMADITDKTAEQLENLNIQDDQKQAATGSESQSVENSSASLYV...,P04147,4932.YER165W,581,4932
3411,6R7Z-A,"[GO:0034220, GO:0015698, GO:0006812, GO:005117...","[GO:0022857, GO:0015103, GO:0005216, GO:002280...","[GO:0071944, GO:0031224, GO:0016021, GO:0005886]",MKVTLSALDTSESSFTPLVVIELAQDVKEETKEWLKNRIIAKKKDG...,Q9NW15,9606.ENSP00000292246,667,9606
3412,7ODC-A,"[GO:0048513, GO:0048522, GO:0006520, GO:000657...","[GO:0016829, GO:0042803, GO:0016830, GO:004280...","[GO:0048471, GO:0005829]",MSSFTKDEFDCHILDEGFTAKDILDQKINEVSSSDDKDAFYVADLG...,P00860,10090.ENSMUSP00000128661,424,10090


In [75]:
# Index the table by PID, then persist it as the processed PDBch test set.
merged_df = merged_df.set_index("PID")
merged_df.to_pickle("data/PDBch/test_df.pkl")