In [1]:
import os
import gdown
import pickle
import tarfile
import numpy as np
import pandas as pd
from tqdm import tqdm

In [7]:
file='data-cafa.tar.gz'
if not os.path.exists(file):
    url = "https://drive.google.com/uc?id=1ByatMjSiucHhtyNfARG2jjpHUwkKvsxt&confirm=t"
    output = file
    gdown.download(url=url, output=output, quiet=False, fuzzy=False)

Downloading...
From: https://drive.google.com/uc?id=1ByatMjSiucHhtyNfARG2jjpHUwkKvsxt&confirm=t
To: /home/191it109/project/home/cafa3/data-cafa.tar.gz
100%|██████████████████████████████████████| 1.07G/1.07G [02:13<00:00, 8.05MB/s]


In [9]:
# open file
file = tarfile.open('data-cafa.tar.gz')
  
# extracting file
file.extractall('./data_cafa3/')
  
file.close()

In [5]:
with open(r'data_cafa3/terms.pkl', 'rb') as input_file:
    go_df = pickle.load(input_file)
go_df

Unnamed: 0,terms
0,GO:0097458
1,GO:0004497
2,GO:0003674
3,GO:0036477
4,GO:0003824
...,...
5215,GO:0000131
5216,GO:0016237
5217,GO:1902534
5218,GO:0005762


In [6]:
namespaces_df = pd.read_csv('data_cafa3/CAFA3_training_data/uniprot_sprot_exp.txt',sep='\t',
                            header=None,names=['accession','term','aspect'])
namespaces_df

Unnamed: 0,accession,term,aspect
0,P0DJZ0,GO:0030430,C
1,P32234,GO:0005525,F
2,P83011,GO:0043231,C
3,P83010,GO:0043231,C
4,P81928,GO:0007275,P
...,...,...,...
386192,Q8IYH5,GO:0005730,C
386193,Q8IYH5,GO:0005634,C
386194,Q6KAQ7,GO:0005671,C
386195,O73557,GO:0046761,P


In [25]:
aspect_wise_terms = {
    'F':set(),
    'C':set(),
    'P':set()
}

for i in tqdm(range(len(namespaces_df))):
    aspect_wise_terms[namespaces_df.iloc[i]['aspect']].add(namespaces_df.iloc[i]['term'])

print('No. of MF:',len(aspect_wise_terms['F']))
print('No. of CC:',len(aspect_wise_terms['C']))
print('No. of BP:',len(aspect_wise_terms['P']))

100%|████████████████████████████████| 386197/386197 [00:34<00:00, 11199.80it/s]

No. of MF: 5966
No. of CC: 2183
No. of BP: 16117





In [27]:
u_set = aspect_wise_terms['F'].union(aspect_wise_terms['C'])
u_set = u_set.union(aspect_wise_terms['P'])
print(len(u_set))

go_set = set(go_df['terms'])
u_set = u_set.union(go_set)
print(len(u_set))

24266
25077


In [7]:
def load(filename, with_rels=False):
        ont = dict()
        obj = None
        with open(filename, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                if line == '[Term]':
                    if obj is not None:
                        ont[obj['id']] = obj
                    obj = dict()
                    obj['is_a'] = list()
                    obj['part_of'] = list()
                    obj['regulates'] = list()
                    obj['alt_ids'] = list()
                    obj['is_obsolete'] = False
                    continue
                elif line == '[Typedef]':
                    if obj is not None:
                        ont[obj['id']] = obj
                    obj = None
                else:
                    if obj is None:
                        continue
                    l = line.split(": ")
                    if l[0] == 'id':
                        obj['id'] = l[1]
                    elif l[0] == 'alt_id':
                        obj['alt_ids'].append(l[1])
                    elif l[0] == 'namespace':
                        obj['namespace'] = l[1]
                    elif l[0] == 'is_a':
                        obj['is_a'].append(l[1].split(' ! ')[0])
                    elif with_rels and l[0] == 'relationship':
                        it = l[1].split()
                        # add all types of relationships
                        obj['is_a'].append(it[1])
                    elif l[0] == 'name':
                        obj['name'] = l[1]
                    elif l[0] == 'is_obsolete' and l[1] == 'true':
                        obj['is_obsolete'] = True
            if obj is not None:
                ont[obj['id']] = obj
        for term_id in list(ont.keys()):
            for t_id in ont[term_id]['alt_ids']:
                ont[t_id] = ont[term_id]
            if ont[term_id]['is_obsolete']:
                del ont[term_id]
        for term_id, val in ont.items():
            if 'children' not in val:
                val['children'] = set()
            for p_id in val['is_a']:
                if p_id in ont:
                    if 'children' not in ont[p_id]:
                        ont[p_id]['children'] = set()
                    ont[p_id]['children'].add(term_id)
        return ont

In [8]:
ont = load('data_cafa3/go.obo')

In [12]:
aspect_wise_terms = {
    'molecular_function': 
    {
        'count': 0, 
        'terms' : set()
    },
    'cellular_component':
    {
        'count': 0, 
        'terms' : set()
    },
    'biological_process': 
    {
        'count': 0, 
        'terms' : set()
    }
}

for i in range(len(go_df)):
    aspect_wise_terms[ont[go_df['terms'][i]]['namespace']]['count']+=1
    aspect_wise_terms[ont[go_df['terms'][i]]['namespace']]['terms'].add(go_df['terms'][i])

print('MF count:',aspect_wise_terms['molecular_function']['count'])
print('BP count:',aspect_wise_terms['biological_process']['count'])
print('CC count:',aspect_wise_terms['cellular_component']['count'])

MF count: 677
BP count: 3992
CC count: 551


In [13]:
with open(r'data_cafa3/train_data.pkl', 'rb') as input_file:
    train_df = pickle.load(input_file)
train_df

Unnamed: 0,proteins,sequences,annotations
0,A0A060X6Z0,MPISSSSSSSTKSMRRAASELERSDSVTSPRFIGRRQSLIEDARKE...,"{GO:0043005, GO:0005623, GO:0004511, GO:001649..."
1,A0A068FIK2,MEVGGGSEECCVKVAVHVRPLIGDEKVQGCKDCVTVIPGKPQVQIG...,"{GO:0005623, GO:0005856, GO:0043232, GO:000588..."
2,A0A075F932,MVSESHHEALAAPPATTVAAAPPSNVTEPASPGGGGGKEDAFSKLK...,"{GO:0046883, GO:0009987, GO:0005623, GO:004476..."
3,A0A078CGE6,MARQMTSSQFHKSKTLDNKYMLGDEIGKGAYGRVYIGLDLENGDFV...,"{GO:0044428, GO:0009987, GO:0071704, GO:004317..."
4,A0A086F3E3,MTKGRLEAFSDGVLAIIITIMVLELKVPEGSSWASLQPILPRFLAY...,"{GO:0022803, GO:0009987, GO:0008324, GO:002284..."
...,...,...,...
66836,W5EP13,MLLFAPTPPPSPATAHRRPGGSAASCIRCSSVRELDRSPSRPPLPP...,"{GO:0009987, GO:0005623, GO:0016787, GO:000957..."
66837,W8DXL4,MWLSACLCLVLSFLGGVNGTCPSQCSCEYHGRHDGSGSRLVLCNDL...,"{GO:0003008, GO:0050877, GO:0050953, GO:000815..."
66838,W8E7I1,MSSEEGKLFVGGLNFNTDERALEDHFSSFGPISEVVVVKDRETQRS...,"{GO:0009987, GO:0009631, GO:0070417, GO:005171..."
66839,X1WGX5,MEGKPRKKSFTPRDGKKPSFKSKGKPGGKPQGKRPFKPHNNDKGKG...,"{GO:0008354, GO:0009987, GO:0044763, GO:005117..."


In [22]:
def replace_row_mf(row):
    row_terms = list(row.intersection(aspect_wise_terms['molecular_function']['terms']))
    if len(row_terms) > 0:
        return ';'.join(row_terms)
    else:
        return np.nan

In [79]:
mf_df = train_df.copy(deep=True)
mf_df['annotations'] = mf_df['annotations'].apply(replace_row_mf)
mf_df = mf_df.dropna().reset_index(drop=True)
mf_df.rename(columns = {'proteins':'Entry','sequences':'Sequence','annotations':'Gene Ontology (molecular function)'}, inplace = True)
mf_df

Unnamed: 0,Entry,Sequence,Gene Ontology (molecular function)
0,A0A060X6Z0,MPISSSSSSSTKSMRRAASELERSDSVTSPRFIGRRQSLIEDARKE...,GO:0003824;GO:0016491;GO:0016705;GO:0004497;GO...
1,A0A078CGE6,MARQMTSSQFHKSKTLDNKYMLGDEIGKGAYGRVYIGLDLENGDFV...,GO:0016773;GO:0003824;GO:0004672;GO:0004674;GO...
2,A0A086F3E3,MTKGRLEAFSDGVLAIIITIMVLELKVPEGSSWASLQPILPRFLAY...,GO:0022892;GO:0022838;GO:0005216;GO:0015075;GO...
3,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,GO:0003824;GO:0016491;GO:0016705;GO:0004497;GO...
4,A0A096SRM5,MAANGGDHTSARPHVVLLPSAGMGHLVPFARLAVALSEGHGCNVSV...,GO:0003824;GO:0016758;GO:0016740;GO:0008194;GO...
...,...,...,...
36105,V5JFY4,MGPWTLLLLHLPLVVSMLPAPTNVSIVSFNLEHTLTWLPGPETPDN...,GO:0004872;GO:0004888;GO:0003674;GO:0005488;GO...
36106,V5YM14,MRPNLLAAAIAVPLSLLAAQIAQAGEGMWVPQQLPEIAGPLKKAGL...,GO:0003824;GO:0005515;GO:0042803;GO:0046983;GO...
36107,V5YMB3,MRHPAFRLTLLASTVAFALAPQAAQAAPSAADRIAGTELIARDALF...,GO:0003824;GO:0008233;GO:0016787;GO:0003674;GO...
36108,V9GXG1,MPYAEITVNLGKVTLGEENRKKMTNSCLKRHENSSLVQAVCALLNS...,GO:0004518;GO:0003824;GO:0016787;GO:0044877;GO...


In [70]:
mf_df.to_csv('cafa3_train/mf_df.csv',index=False)

In [14]:
def replace_row_cc(row):
    row_terms = list(row.intersection(aspect_wise_terms['cellular_component']['terms']))
    if len(row_terms) > 0:
        return ';'.join(row_terms)
    else:
        return np.nan

In [15]:
cc_df = train_df.copy(deep=True)
cc_df['annotations'] = cc_df['annotations'].apply(replace_row_cc)
cc_df = cc_df.dropna().reset_index(drop=True)
cc_df.rename(columns = {'proteins':'Entry','sequences':'Sequence','annotations':'Gene Ontology (cellular component)'}, inplace = True)
cc_df

Unnamed: 0,Entry,Sequence,Gene Ontology (cellular component)
0,A0A060X6Z0,MPISSSSSSSTKSMRRAASELERSDSVTSPRFIGRRQSLIEDARKE...,GO:0098805;GO:0071944;GO:0005575;GO:0005623;GO...
1,A0A068FIK2,MEVGGGSEECCVKVAVHVRPLIGDEKVQGCKDCVTVIPGKPQVQIG...,GO:0005623;GO:0005856;GO:0043232;GO:0005881;GO...
2,A0A075F932,MVSESHHEALAAPPATTVAAAPPSNVTEPASPGGGGGKEDAFSKLK...,GO:0005575;GO:0005623
3,A0A078CGE6,MARQMTSSQFHKSKTLDNKYMLGDEIGKGAYGRVYIGLDLENGDFV...,GO:0044428;GO:0005623;GO:0043233;GO:0043232;GO...
4,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,GO:0005575;GO:0043231;GO:0005623;GO:0044464;GO...
...,...,...,...
50591,U3JAG9,MHTTRSPSASIQAGAAGDALDLSLNGSQLTMGRRPSSASPGKHFSR...,GO:0098805;GO:0044459;GO:0005623;GO:0005575;GO...
50592,U4PR86,MSKYEVLQGFYAVHDELGSGGFGKVRLATHLLTNQKVAIKIIDKKQ...,GO:0005813;GO:0005575;GO:0005623;GO:0044464;GO...
50593,V5JFY4,MGPWTLLLLHLPLVVSMLPAPTNVSIVSFNLEHTLTWLPGPETPDN...,GO:0016020;GO:0005575
50594,V9GXG1,MPYAEITVNLGKVTLGEENRKKMTNSCLKRHENSSLVQAVCALLNS...,GO:0005575;GO:0043231;GO:0005623;GO:0044464;GO...


In [16]:
cc_df.to_csv('cafa3_train/cc_df.csv',index=False)

In [17]:
def replace_row_bp(row):
    row_terms = list(row.intersection(aspect_wise_terms['biological_process']['terms']))
    if len(row_terms) > 0:
        return ';'.join(row_terms)
    else:
        return np.nan

In [18]:
bp_df = train_df.copy(deep=True)
bp_df['annotations'] = bp_df['annotations'].apply(replace_row_bp)
bp_df = bp_df.dropna().reset_index(drop=True)
bp_df.rename(columns = {'proteins':'Entry','sequences':'Sequence','annotations':'Gene Ontology (biological process)'}, inplace = True)
bp_df

Unnamed: 0,Entry,Sequence,Gene Ontology (biological process)
0,A0A060X6Z0,MPISSSSSSSTKSMRRAASELERSDSVTSPRFIGRRQSLIEDARKE...,GO:0008152;GO:0055114;GO:0044710;GO:0008150;GO...
1,A0A075F932,MVSESHHEALAAPPATTVAAAPPSNVTEPASPGGGGGKEDAFSKLK...,GO:0046883;GO:0009987;GO:0044763;GO:0044765;GO...
2,A0A078CGE6,MARQMTSSQFHKSKTLDNKYMLGDEIGKGAYGRVYIGLDLENGDFV...,GO:0009987;GO:0071704;GO:0043170;GO:0043412;GO...
3,A0A086F3E3,MTKGRLEAFSDGVLAIIITIMVLELKVPEGSSWASLQPILPRFLAY...,GO:0009987;GO:0044763;GO:0044765;GO:0098655;GO...
4,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,GO:0009410;GO:0070887;GO:0009987;GO:0051716;GO...
...,...,...,...
53495,W5EP13,MLLFAPTPPPSPATAHRRPGGSAASCIRCSSVRELDRSPSRPPLPP...,GO:0009987;GO:0016311;GO:0008152;GO:0044237;GO...
53496,W8DXL4,MWLSACLCLVLSFLGGVNGTCPSQCSCEYHGRHDGSGSRLVLCNDL...,GO:0003008;GO:0050877;GO:0050953;GO:0007600;GO...
53497,W8E7I1,MSSEEGKLFVGGLNFNTDERALEDHFSSFGPISEVVVVKDRETQRS...,GO:0009987;GO:0051716;GO:0070417;GO:0009266;GO...
53498,X1WGX5,MEGKPRKKSFTPRDGKKPSFKSKGKPGGKPQGKRPFKPHNNDKGKG...,GO:0008354;GO:0009987;GO:0032504;GO:0044763;GO...


In [19]:
bp_df.to_csv('cafa3_train/bp_df.csv',index=False)

In [20]:
with open(r'data_cafa3/test_data.pkl', 'rb') as input_file:
    test_df = pickle.load(input_file)
test_df

Unnamed: 0,proteins,sequences,annotations
0,T100900000026,MAESFKELDPDSSMGKALEMTCAIQNQLARILAEFEMTLERDVLQP...,"{GO:0019222, GO:0009987, GO:0044763, GO:001681..."
1,T100900000046,MRLCIPQVLLALFLSMLTAPGEGSRRRATQEDTTQPALLRLSDHLL...,"{GO:0042802, GO:0005515, GO:0003674, GO:0005488}"
2,T100900000115,MNNLSFSELCCLFCCPPCPGKIASKLAFLPPDPTYTLMCDESGSRW...,"{GO:0044380, GO:0071704, GO:0043412, GO:005117..."
3,T100900000116,MPEPGPRMNGFSLGELCWLFCCPPCPSRIAAKLAFLPPEPTYTVLA...,"{GO:0044380, GO:0071704, GO:0043170, GO:000562..."
4,T100900000141,MAVPGPTARAGARPRLDLQLVQRFVRIQKVFFPSWSSQNVLMFMTL...,"{GO:0010033, GO:0009987, GO:0070887, GO:005171..."
...,...,...,...
3323,T992870001087,MIDGKTANEIFDSIRQHIIAGTLRAEDSLPPVRELASELKVNRNTV...,"{GO:0034645, GO:0097659, GO:0051253, GO:007170..."
3324,T992870001259,MKQGLQLRLSQQLAMTPQLQQAIRLLQLSTLELQQELQQALENNPL...,"{GO:0034645, GO:0097659, GO:0071704, GO:003132..."
3325,T992870001336,MDYQNNVSEERVAEMIWDAVSEGATLKDVHGIPQDMMDGLYAHAYE...,"{GO:0042802, GO:0005515, GO:0003674, GO:0005488}"
3326,T992870001601,MTVDSNTSSGRGNDPEQIDLIELLLQLWRGKMTIIVAVIIAILLAV...,"{GO:0042802, GO:0005515, GO:0003674, GO:0005488}"


In [23]:
mf_df = test_df.copy(deep=True)
mf_df['annotations'] = mf_df['annotations'].apply(replace_row_mf)
mf_df = mf_df.dropna().reset_index(drop=True)
mf_df.rename(columns = {'proteins':'Entry','sequences':'Sequence','annotations':'Gene Ontology (molecular function)'}, inplace = True)
mf_df

Unnamed: 0,Entry,Sequence,Gene Ontology (molecular function)
0,T100900000026,MAESFKELDPDSSMGKALEMTCAIQNQLARILAEFEMTLERDVLQP...,GO:0003824;GO:0016818;GO:0016462;GO:0003924;GO...
1,T100900000046,MRLCIPQVLLALFLSMLTAPGEGSRRRATQEDTTQPALLRLSDHLL...,GO:0005515;GO:0005488;GO:0003674;GO:0042802
2,T100900000115,MNNLSFSELCCLFCCPPCPGKIASKLAFLPPDPTYTLMCDESGSRW...,GO:0016790;GO:0003824;GO:0016788;GO:0016787;GO...
3,T100900000116,MPEPGPRMNGFSLGELCWLFCCPPCPSRIAAKLAFLPPEPTYTVLA...,GO:0016790;GO:0003824;GO:0016788;GO:0016787;GO...
4,T100900000161,MADDLEQQPQGWLSSWLPTWRPTSMSQLKNVEARILQCLQNKFLAR...,GO:0004620;GO:0003824;GO:0052689;GO:0016788;GO...
...,...,...,...
1132,T992870001087,MIDGKTANEIFDSIRQHIIAGTLRAEDSLPPVRELASELKVNRNTV...,GO:0030170;GO:0003700;GO:1901363;GO:0001071;GO...
1133,T992870001259,MKQGLQLRLSQQLAMTPQLQQAIRLLQLSTLELQQELQQALENNPL...,GO:1901363;GO:1990837;GO:0001071;GO:0003690;GO...
1134,T992870001336,MDYQNNVSEERVAEMIWDAVSEGATLKDVHGIPQDMMDGLYAHAYE...,GO:0005515;GO:0005488;GO:0003674;GO:0042802
1135,T992870001601,MTVDSNTSSGRGNDPEQIDLIELLLQLWRGKMTIIVAVIIAILLAV...,GO:0005515;GO:0005488;GO:0003674;GO:0042802


In [24]:
mf_df.to_csv('cafa3_test/mf_df.csv',index=False)

In [25]:
cc_df = test_df.copy(deep=True)
cc_df['annotations'] = cc_df['annotations'].apply(replace_row_cc)
cc_df = cc_df.dropna().reset_index(drop=True)
cc_df.rename(columns = {'proteins':'Entry','sequences':'Sequence','annotations':'Gene Ontology (cellular component)'}, inplace = True)
cc_df

Unnamed: 0,Entry,Sequence,Gene Ontology (cellular component)
0,T100900000026,MAESFKELDPDSSMGKALEMTCAIQNQLARILAEFEMTLERDVLQP...,GO:0005911;GO:0005575;GO:0043296;GO:0030054;GO...
1,T100900000115,MNNLSFSELCCLFCCPPCPGKIASKLAFLPPDPTYTLMCDESGSRW...,GO:0005623;GO:0098794;GO:0005886;GO:0043227;GO...
2,T100900000116,MPEPGPRMNGFSLGELCWLFCCPPCPSRIAAKLAFLPPEPTYTVLA...,GO:0005575;GO:0005623;GO:0044464;GO:0044456;GO...
3,T100900000167,MEKSWMLWSFIERWLLALASWSWALCRISLLPLIVTFHLYGGIVLL...,GO:0005623;GO:0043005;GO:0032838;GO:0005886;GO...
4,T100900000453,MRIGLLWLVPLFTLTEGTDGFLQQKNDGRRTKEIVSMVEERHPVHE...,GO:0005575;GO:0005623;GO:0044464;GO:0016020;GO...
...,...,...,...
1260,T992870000225,MVLGKPQTDPTLEWFLSHCHIHKYPSKSTLIHQGEKAETLYYIVKG...,GO:0032993;GO:0032991;GO:0005575
1261,T992870000466,MIPEKRIIRRIQSGGCAIHCQDCSISQLCIPFTLNEHELDQLDNII...,GO:0032993;GO:0032991;GO:0005575
1262,T992870000685,MPEVKTEKPHLLDMGKPQLRMVDLNLLTVFDAVMQEQNITRAAHTL...,GO:0032993;GO:0032991;GO:0005575
1263,T992870001023,MMRVLVVEDNALLRHHLKVQLQDSGHQVDAAEDAREADYYLNEHLP...,GO:0032993;GO:0032991;GO:0005575


In [26]:
cc_df.to_csv('cafa3_test/cc_df.csv',index=False)

In [27]:
bp_df = test_df.copy(deep=True)
bp_df['annotations'] = bp_df['annotations'].apply(replace_row_bp)
bp_df = bp_df.dropna().reset_index(drop=True)
bp_df.rename(columns = {'proteins':'Entry','sequences':'Sequence','annotations':'Gene Ontology (biological process)'}, inplace = True)
bp_df

Unnamed: 0,Entry,Sequence,Gene Ontology (biological process)
0,T100900000026,MAESFKELDPDSSMGKALEMTCAIQNQLARILAEFEMTLERDVLQP...,GO:0019222;GO:0009987;GO:0044763;GO:0008150;GO...
1,T100900000115,MNNLSFSELCCLFCCPPCPGKIASKLAFLPPDPTYTLMCDESGSRW...,GO:0044380;GO:0071704;GO:0043170;GO:0043412;GO...
2,T100900000116,MPEPGPRMNGFSLGELCWLFCCPPCPSRIAAKLAFLPPEPTYTVLA...,GO:0044380;GO:0071704;GO:0009057;GO:0043170;GO...
3,T100900000141,MAVPGPTARAGARPRLDLQLVQRFVRIQKVFFPSWSSQNVLMFMTL...,GO:0010033;GO:0009987;GO:0070887;GO:0051716;GO...
4,T100900000161,MADDLEQQPQGWLSSWLPTWRPTSMSQLKNVEARILQCLQNKFLAR...,GO:0009987;GO:0071704;GO:0044763;GO:0009308;GO...
...,...,...,...
2387,T992870000466,MIPEKRIIRRIQSGGCAIHCQDCSISQLCIPFTLNEHELDQLDNII...,GO:0034645;GO:0097659;GO:0016070;GO:0071704;GO...
2388,T992870000685,MPEVKTEKPHLLDMGKPQLRMVDLNLLTVFDAVMQEQNITRAAHTL...,GO:0034645;GO:0097659;GO:0016070;GO:0071704;GO...
2389,T992870001023,MMRVLVVEDNALLRHHLKVQLQDSGHQVDAAEDAREADYYLNEHLP...,GO:0034645;GO:0097659;GO:0016070;GO:0071704;GO...
2390,T992870001087,MIDGKTANEIFDSIRQHIIAGTLRAEDSLPPVRELASELKVNRNTV...,GO:0034645;GO:0097659;GO:0051253;GO:0071704;GO...


In [28]:
bp_df.to_csv('cafa3_test/bp_df.csv',index=False)