In [1]:
import os
import gdown
import pickle
import tarfile
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
file='data-2016.tar.gz'
if not os.path.exists(file):
    url = "https://drive.google.com/uc?id=1vW3GHb0L85Auk5ojuo0x-SV7K51W-c-5&confirm=t"
    output = file
    gdown.download(url=url, output=output, quiet=False, fuzzy=False)

Downloading...
From: https://drive.google.com/uc?id=1vW3GHb0L85Auk5ojuo0x-SV7K51W-c-5&confirm=t
To: /home/191it109/project/home/golabeler/data-2016.tar.gz
100%|████████████████████████████████████████| 759M/759M [00:07<00:00, 95.8MB/s]


In [3]:
# open file
file = tarfile.open('data-2016.tar.gz')
  
# extracting file
file.extractall('./data_golabeler/')
  
file.close()

In [7]:
with open(r'data_golabeler/terms.pkl', 'rb') as input_file:
    go_df = pickle.load(input_file)
go_df

Unnamed: 0,terms
0,GO:0044217
1,GO:0043656
2,GO:0005575
3,GO:0044215
4,GO:0018995
...,...
5096,GO:0000314
5097,GO:0005665
5098,GO:0005839
5099,GO:0004812


In [6]:
# namespaces_df = pd.read_csv('data_cafa3/CAFA3_training_data/uniprot_sprot_exp.txt',sep='\t',
#                             header=None,names=['accession','term','aspect'])
# namespaces_df

Unnamed: 0,accession,term,aspect
0,P0DJZ0,GO:0030430,C
1,P32234,GO:0005525,F
2,P83011,GO:0043231,C
3,P83010,GO:0043231,C
4,P81928,GO:0007275,P
...,...,...,...
386192,Q8IYH5,GO:0005730,C
386193,Q8IYH5,GO:0005634,C
386194,Q6KAQ7,GO:0005671,C
386195,O73557,GO:0046761,P


In [25]:
aspect_wise_terms = {
    'F':set(),
    'C':set(),
    'P':set()
}

for i in tqdm(range(len(namespaces_df))):
    aspect_wise_terms[namespaces_df.iloc[i]['aspect']].add(namespaces_df.iloc[i]['term'])

print('No. of MF:',len(aspect_wise_terms['F']))
print('No. of CC:',len(aspect_wise_terms['C']))
print('No. of BP:',len(aspect_wise_terms['P']))

100%|████████████████████████████████| 386197/386197 [00:34<00:00, 11199.80it/s]

No. of MF: 5966
No. of CC: 2183
No. of BP: 16117





In [27]:
u_set = aspect_wise_terms['F'].union(aspect_wise_terms['C'])
u_set = u_set.union(aspect_wise_terms['P'])
print(len(u_set))

go_set = set(go_df['terms'])
u_set = u_set.union(go_set)
print(len(u_set))

24266
25077


In [8]:
def load(filename, with_rels=False):
        ont = dict()
        obj = None
        with open(filename, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                if line == '[Term]':
                    if obj is not None:
                        ont[obj['id']] = obj
                    obj = dict()
                    obj['is_a'] = list()
                    obj['part_of'] = list()
                    obj['regulates'] = list()
                    obj['alt_ids'] = list()
                    obj['is_obsolete'] = False
                    continue
                elif line == '[Typedef]':
                    if obj is not None:
                        ont[obj['id']] = obj
                    obj = None
                else:
                    if obj is None:
                        continue
                    l = line.split(": ")
                    if l[0] == 'id':
                        obj['id'] = l[1]
                    elif l[0] == 'alt_id':
                        obj['alt_ids'].append(l[1])
                    elif l[0] == 'namespace':
                        obj['namespace'] = l[1]
                    elif l[0] == 'is_a':
                        obj['is_a'].append(l[1].split(' ! ')[0])
                    elif with_rels and l[0] == 'relationship':
                        it = l[1].split()
                        # add all types of relationships
                        obj['is_a'].append(it[1])
                    elif l[0] == 'name':
                        obj['name'] = l[1]
                    elif l[0] == 'is_obsolete' and l[1] == 'true':
                        obj['is_obsolete'] = True
            if obj is not None:
                ont[obj['id']] = obj
        for term_id in list(ont.keys()):
            for t_id in ont[term_id]['alt_ids']:
                ont[t_id] = ont[term_id]
            if ont[term_id]['is_obsolete']:
                del ont[term_id]
        for term_id, val in ont.items():
            if 'children' not in val:
                val['children'] = set()
            for p_id in val['is_a']:
                if p_id in ont:
                    if 'children' not in ont[p_id]:
                        ont[p_id]['children'] = set()
                    ont[p_id]['children'].add(term_id)
        return ont

In [9]:
ont = load('data_golabeler/go.obo')

In [10]:
aspect_wise_terms = {
    'molecular_function': 
    {
        'count': 0, 
        'terms' : set()
    },
    'cellular_component':
    {
        'count': 0, 
        'terms' : set()
    },
    'biological_process': 
    {
        'count': 0, 
        'terms' : set()
    }
}

for i in range(len(go_df)):
    aspect_wise_terms[ont[go_df['terms'][i]]['namespace']]['count']+=1
    aspect_wise_terms[ont[go_df['terms'][i]]['namespace']]['terms'].add(go_df['terms'][i])

print('MF count:',aspect_wise_terms['molecular_function']['count'])
print('BP count:',aspect_wise_terms['biological_process']['count'])
print('CC count:',aspect_wise_terms['cellular_component']['count'])

MF count: 652
BP count: 3904
CC count: 545


In [11]:
with open(r'data_golabeler/train_data.pkl', 'rb') as input_file:
    train_df = pickle.load(input_file)
train_df

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs
0,225,11K_PAVHV,P0DJZ0;,MQNNTTGMDTKSLKNCGQPKAVCTHCKHSPPCPQPGCVTKRPPVPP...,"[GO:0044217, GO:0043656, GO:0005575, GO:004421...",[],648237
1,240,128UP_DROME,P32234; Q9V648;,MSTILEKISAIESEMARTQKNKATSAHLGLLKAKLAKLRRELISPK...,"[GO:0097367, GO:0043168, GO:0003674, GO:190126...","[IPR031167, IPR031662, IPR006074, IPR006073, I...",7227
2,252,13KDA_SCYCA,P83011;,MIFTAXDRSAIEXV,"[GO:0044464, GO:0043226, GO:0005575, GO:004322...",[],7830
3,253,13KDA_TRISC,P83010;,AGEPANNEDRFNY,"[GO:0044464, GO:0043226, GO:0005575, GO:004322...",[],30494
4,259,140U_DROME,P81928; Q9VFM8;,MNFLWKGRRFLIAGILPTFEGAADEIVDKENKTYKAFLASKPPEET...,"[GO:0032501, GO:0044699, GO:0007275, GO:004470...",[IPR003397],7227
...,...,...,...,...,...,...,...
65023,550268,ZYX_HUMAN,Q15942; A4D2G6; B4DQX7; Q6I9S4;,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,"[GO:0071840, GO:0071560, GO:0070887, GO:000591...",[IPR001781],9606
65024,550269,ZYX_MOUSE,Q62523; P70461; Q3UGQ3;,MAAPRPPPAISVSVSAPAFYAPQKKFAPVVAPKPKVNPFRPGDSEP...,"[GO:0007178, GO:0071560, GO:0070887, GO:007136...",[IPR001781],10090
65025,550270,ZYX_XENLA,A5H447;,MDPAAPATRMTSSFTINISTPSFYNPPKKFAPVVPPKPKINPFKAP...,"[GO:0010467, GO:1901362, GO:0080090, GO:000680...",[IPR001781],8355
65026,550274,ZZZ3_HUMAN,Q8IYH5; B7WPC6; Q6N004; Q6N070; Q8IYP0; Q8IYR1...,MAASRSTRVTRSTVGLNGLDESFCGRTLRNRSIAHPEEISSNSQVR...,"[GO:0044464, GO:0043226, GO:0071840, GO:001604...","[IPR009057, IPR017930, IPR001005, IPR000433]",9606


In [13]:
train_df.iloc[0]['annotations']

list

In [14]:
def replace_row_mf(row):
    row = set(row)
    row_terms = list(row.intersection(aspect_wise_terms['molecular_function']['terms']))
    if len(row_terms) > 0:
        return ';'.join(row_terms)
    else:
        return np.nan

In [15]:
mf_df = train_df.copy(deep=True)
mf_df['annotations'] = mf_df['annotations'].apply(replace_row_mf)
mf_df = mf_df.dropna().reset_index(drop=True)
mf_df.rename(columns = {'proteins':'Entry','sequences':'Sequence','annotations':'Gene Ontology (molecular function)'}, inplace = True)
mf_df

Unnamed: 0,index,Entry,accessions,Sequence,Gene Ontology (molecular function),interpros,orgs
0,240,128UP_DROME,P32234; Q9V648;,MSTILEKISAIESEMARTQKNKATSAHLGLLKAKLAKLRRELISPK...,GO:0036094;GO:0043167;GO:0032553;GO:0017076;GO...,"[IPR031167, IPR031662, IPR006074, IPR006073, I...",7227
1,261,14310_ARATH,P48347; Q9LME5;,MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV...,GO:0036094;GO:0043167;GO:0032553;GO:0017076;GO...,"[IPR000308, IPR023409, IPR023410]",3702
2,263,14311_ARATH,Q9S9Z8; A0JQ87; F4HWN0; Q0WL19;,MENERAKQVYLAKLNEQAERYDEMVEAMKKVAALDVELTIEERNLL...,GO:0051117;GO:0005515;GO:0005488;GO:0019899;GO...,"[IPR000308, IPR023409, IPR023410]",3702
3,287,14333_ARATH,P42644; F4KBI7; Q945L2;,MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...,GO:0036094;GO:0043167;GO:0032553;GO:0017076;GO...,"[IPR000308, IPR023409, IPR023410]",3702
4,296,14335_ARATH,P42645;,MSSDSSREENVYLAKLAEQAERYEEMVEFMEKVAKTVETEELTVEE...,GO:0036094;GO:0043167;GO:0032553;GO:0017076;GO...,"[IPR000308, IPR023409, IPR023410]",3702
...,...,...,...,...,...,...,...
34483,550264,ZYM1_SCHPO,Q9UTC0;,MEHTTQCKSKQGKPCDCQSKCGCQDCKESCGCKSSAVDNCKCSSCK...,GO:0043167;GO:0046914;GO:0043169;GO:0046872;GO...,[IPR001008],284812
34484,550266,ZYX_CAEEL,Q9U3F4; H2L2F5; H2L2F6; H2L2F7; Q9U3F5;,MGPPPPPPPPPLLPSGEILPSRKWKTEDAPRRNNHPAPAPPKPSRP...,GO:0003674;GO:0005515;GO:0005488;GO:0019899,[IPR001781],6239
34485,550267,ZYX_CHICK,Q04584;,MASPGTPGTRMTTTVSINISTPSFYNPQKKFAPVVAPKPKVNPFKT...,GO:0003674;GO:0008092;GO:0005515;GO:0005488,[IPR001781],9031
34486,550268,ZYX_HUMAN,Q15942; A4D2G6; B4DQX7; Q6I9S4;,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,GO:0003676;GO:0005488;GO:0003723;GO:0003674;GO...,[IPR001781],9606


In [17]:
mf_df.to_csv('golabeler_train/mf_df.csv',index=False)

In [18]:
def replace_row_cc(row):
    row = set(row)
    row_terms = list(row.intersection(aspect_wise_terms['cellular_component']['terms']))
    if len(row_terms) > 0:
        return ';'.join(row_terms)
    else:
        return np.nan

In [19]:
cc_df = train_df.copy(deep=True)
cc_df['annotations'] = cc_df['annotations'].apply(replace_row_cc)
cc_df = cc_df.dropna().reset_index(drop=True)
cc_df.rename(columns = {'proteins':'Entry','sequences':'Sequence','annotations':'Gene Ontology (cellular component)'}, inplace = True)
cc_df

Unnamed: 0,index,Entry,accessions,Sequence,Gene Ontology (cellular component),interpros,orgs
0,225,11K_PAVHV,P0DJZ0;,MQNNTTGMDTKSLKNCGQPKAVCTHCKHSPPCPQPGCVTKRPPVPP...,GO:0030430;GO:0033643;GO:0044216;GO:0044215;GO...,[],648237
1,252,13KDA_SCYCA,P83011;,MIFTAXDRSAIEXV,GO:0043227;GO:0005575;GO:0005622;GO:0044424;GO...,[],7830
2,253,13KDA_TRISC,P83010;,AGEPANNEDRFNY,GO:0043227;GO:0005575;GO:0005622;GO:0044424;GO...,[],30494
3,261,14310_ARATH,P48347; Q9LME5;,MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV...,GO:0005622;GO:0043231;GO:0043226;GO:0005829;GO...,"[IPR000308, IPR023409, IPR023410]",3702
4,266,14331_CAEEL,P41932; Q21537;,MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...,GO:0043227;GO:0005634;GO:0044464;GO:0005938;GO...,"[IPR000308, IPR023409, IPR023410]",6239
...,...,...,...,...,...,...,...
49341,550266,ZYX_CAEEL,Q9U3F4; H2L2F5; H2L2F6; H2L2F7; Q9U3F5;,MGPPPPPPPPPLLPSGEILPSRKWKTEDAPRRNNHPAPAPPKPSRP...,GO:0005634;GO:0030017;GO:0005622;GO:0044449;GO...,[IPR001781],6239
49342,550267,ZYX_CHICK,Q04584;,MASPGTPGTRMTTTVSINISTPSFYNPQKKFAPVVAPKPKVNPFKT...,GO:0030055;GO:0005622;GO:0043226;GO:0005925;GO...,[IPR001781],9031
49343,550268,ZYX_HUMAN,Q15942; A4D2G6; B4DQX7; Q6I9S4;,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,GO:0005634;GO:0030055;GO:0005622;GO:0016021;GO...,[IPR001781],9606
49344,550274,ZZZ3_HUMAN,Q8IYH5; B7WPC6; Q6N004; Q6N070; Q8IYP0; Q8IYR1...,MAASRSTRVTRSTVGLNGLDESFCGRTLRNRSIAHPEEISSNSQVR...,GO:0005634;GO:0031981;GO:0005622;GO:0043233;GO...,"[IPR009057, IPR017930, IPR001005, IPR000433]",9606


In [20]:
cc_df.to_csv('golabeler_train/cc_df.csv',index=False)

In [21]:
def replace_row_bp(row):
    row = set(row)
    row_terms = list(row.intersection(aspect_wise_terms['biological_process']['terms']))
    if len(row_terms) > 0:
        return ';'.join(row_terms)
    else:
        return np.nan

In [22]:
bp_df = train_df.copy(deep=True)
bp_df['annotations'] = bp_df['annotations'].apply(replace_row_bp)
bp_df = bp_df.dropna().reset_index(drop=True)
bp_df.rename(columns = {'proteins':'Entry','sequences':'Sequence','annotations':'Gene Ontology (biological process)'}, inplace = True)
bp_df

Unnamed: 0,index,Entry,accessions,Sequence,Gene Ontology (biological process),interpros,orgs
0,259,140U_DROME,P81928; Q9VFM8;,MNFLWKGRRFLIAGILPTFEGAADEIVDKENKTYKAFLASKPPEET...,GO:0007275;GO:0044707;GO:0032502;GO:0044699;GO...,[IPR003397],7227
1,261,14310_ARATH,P48347; Q9LME5;,MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV...,GO:0014070;GO:0043401;GO:0001101;GO:0051716;GO...,"[IPR000308, IPR023409, IPR023410]",3702
2,265,14331_ARATH,P42643; Q945M2; Q9M0S7;,MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...,GO:0050789;GO:0019222;GO:0065007;GO:0008150;GO...,"[IPR000308, IPR023409, IPR023410]",3702
3,266,14331_CAEEL,P41932; Q21537;,MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...,GO:0009792;GO:0009566;GO:0040001;GO:0044763;GO...,"[IPR000308, IPR023409, IPR023410]",6239
4,276,14332_ARATH,Q01525;,MASGREEFVYMAKLAEQAERYEEMVEFMEKVSAAVDGDELTVEERN...,GO:0014070;GO:0043401;GO:0051716;GO:0065007;GO...,"[IPR000308, IPR023409, IPR023410]",3702
...,...,...,...,...,...,...,...
51711,550266,ZYX_CAEEL,Q9U3F4; H2L2F5; H2L2F6; H2L2F7; Q9U3F5;,MGPPPPPPPPPLLPSGEILPSRKWKTEDAPRRNNHPAPAPPKPSRP...,GO:0000003;GO:0008150,[IPR001781],6239
51712,550268,ZYX_HUMAN,Q15942; A4D2G6; B4DQX7; Q6I9S4;,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,GO:1902589;GO:0051716;GO:0065007;GO:0007167;GO...,[IPR001781],9606
51713,550269,ZYX_MOUSE,Q62523; P70461; Q3UGQ3;,MAAPRPPPAISVSVSAPAFYAPQKKFAPVVAPKPKVNPFRPGDSEP...,GO:0051716;GO:0065007;GO:0007179;GO:0007167;GO...,[IPR001781],10090
51714,550270,ZYX_XENLA,A5H447;,MDPAAPATRMTSSFTINISTPSFYNPPKKFAPVVPPKPKINPFKAP...,GO:0019438;GO:0019222;GO:0044237;GO:0006725;GO...,[IPR001781],8355


In [23]:
bp_df.to_csv('golabeler_train/bp_df.csv',index=False)

In [27]:
with open(r'data_golabeler/test_data.pkl', 'rb') as input_file:
    test_df = pickle.load(input_file)
test_df

Unnamed: 0,level_0,index,proteins,accessions,sequences,annotations,interpros,orgs
84,95,617,1B48_HUMAN,P30486; Q29764;,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...,"[GO:0030139, GO:0098576, GO:0030662, GO:003065...","[IPR007110, IPR013783, IPR003006, IPR003597, I...",9606
181,204,1028,3BHS7_MOUSE,Q9EQC1; A2RTR5;,MADSAQVPTLVYLVTGGCGFLGEHIVRMLLEREPRLRELRVFDLHL...,"[GO:0060326, GO:0008152, GO:0030595, GO:007088...","[IPR002225, IPR016040]",10090
183,208,1043,3BP1_MOUSE,P55194; E9QMQ2; Q99KK8;,MAESFKELDPDSSMGKALEMTCAIQNQLARILAEFEMTLERDVLQP...,"[GO:0005515, GO:0005488, GO:0017124, GO:001990...","[IPR027267, IPR004148, IPR008936, IPR000198]",10090
304,394,2418,6PGD_DROME,P41572; Q9W519;,MSGQADIALIGLAVMGQNLILNMDEKGFVVCAYNRTVAKVKEFLAN...,"[GO:0044710, GO:0008152, GO:0010906, GO:004426...","[IPR008927, IPR013328, IPR012284, IPR006114, I...",7227
326,427,2607,8ODP_RAT,P53369;,MSTSRLYTLVLVLQPQRVLLGMKKRGFGAGRWNGFGGKVQEGETIE...,"[GO:0010035, GO:0005622, GO:0061458, GO:004444...","[IPR020476, IPR020084, IPR000086, IPR015797, I...",10116
...,...,...,...,...,...,...,...,...
55732,67106,552113,ZN809_MOUSE,G3X9G7; Q4KL58; Q8BIJ2;,MGLVSFEDVAVDFTLEEWQDLDAAQRTLYRDVMLETYSSLVFLDPC...,"[GO:0019080, GO:0001171, GO:0006351, GO:005079...","[IPR001909, IPR007087, IPR015880, IPR013087]",10090
55751,67125,552243,ZNF8_MOUSE,Q8BGV5; Q52KP6; Q8BJ50;,MDHQDKAATVAMASRPQATQLQEPVTFRDVAVDFTQEEWGQLDPTQ...,"[GO:0010468, GO:0006351, GO:0006725, GO:005078...","[IPR001909, IPR007087, IPR015880, IPR013087]",10090
55763,67138,552282,ZNRF4_HUMAN,Q8WWF5; A8K886; O75866;,MPLCRPEHLMPRASRVPVAASLPLSHAVIPTQLPSRPGHRPPGRPR...,"[GO:0008152, GO:0005622, GO:0044432, GO:000646...","[IPR003137, IPR001841, IPR013083]",9606
55764,67139,552284,ZNRF4_MOUSE,Q9DAH2; Q9WTN2;,MARFAWTRVAPVALVTFWLVLSLSPTDAQVNLSSVDFLDLPALLGV...,"[GO:0005622, GO:0005575, GO:0005623, GO:000573...","[IPR001841, IPR013083]",10090


In [29]:
test_df = test_df.drop(['level_0'],axis=1)
test_df.reset_index(inplace = True)
test_df

Unnamed: 0,level_0,index,proteins,accessions,sequences,annotations,interpros,orgs
0,84,617,1B48_HUMAN,P30486; Q29764;,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...,"[GO:0030139, GO:0098576, GO:0030662, GO:003065...","[IPR007110, IPR013783, IPR003006, IPR003597, I...",9606
1,181,1028,3BHS7_MOUSE,Q9EQC1; A2RTR5;,MADSAQVPTLVYLVTGGCGFLGEHIVRMLLEREPRLRELRVFDLHL...,"[GO:0060326, GO:0008152, GO:0030595, GO:007088...","[IPR002225, IPR016040]",10090
2,183,1043,3BP1_MOUSE,P55194; E9QMQ2; Q99KK8;,MAESFKELDPDSSMGKALEMTCAIQNQLARILAEFEMTLERDVLQP...,"[GO:0005515, GO:0005488, GO:0017124, GO:001990...","[IPR027267, IPR004148, IPR008936, IPR000198]",10090
3,304,2418,6PGD_DROME,P41572; Q9W519;,MSGQADIALIGLAVMGQNLILNMDEKGFVVCAYNRTVAKVKEFLAN...,"[GO:0044710, GO:0008152, GO:0010906, GO:004426...","[IPR008927, IPR013328, IPR012284, IPR006114, I...",7227
4,326,2607,8ODP_RAT,P53369;,MSTSRLYTLVLVLQPQRVLLGMKKRGFGAGRWNGFGGKVQEGETIE...,"[GO:0010035, GO:0005622, GO:0061458, GO:004444...","[IPR020476, IPR020084, IPR000086, IPR015797, I...",10116
...,...,...,...,...,...,...,...,...
1783,55732,552113,ZN809_MOUSE,G3X9G7; Q4KL58; Q8BIJ2;,MGLVSFEDVAVDFTLEEWQDLDAAQRTLYRDVMLETYSSLVFLDPC...,"[GO:0019080, GO:0001171, GO:0006351, GO:005079...","[IPR001909, IPR007087, IPR015880, IPR013087]",10090
1784,55751,552243,ZNF8_MOUSE,Q8BGV5; Q52KP6; Q8BJ50;,MDHQDKAATVAMASRPQATQLQEPVTFRDVAVDFTQEEWGQLDPTQ...,"[GO:0010468, GO:0006351, GO:0006725, GO:005078...","[IPR001909, IPR007087, IPR015880, IPR013087]",10090
1785,55763,552282,ZNRF4_HUMAN,Q8WWF5; A8K886; O75866;,MPLCRPEHLMPRASRVPVAASLPLSHAVIPTQLPSRPGHRPPGRPR...,"[GO:0008152, GO:0005622, GO:0044432, GO:000646...","[IPR003137, IPR001841, IPR013083]",9606
1786,55764,552284,ZNRF4_MOUSE,Q9DAH2; Q9WTN2;,MARFAWTRVAPVALVTFWLVLSLSPTDAQVNLSSVDFLDLPALLGV...,"[GO:0005622, GO:0005575, GO:0005623, GO:000573...","[IPR001841, IPR013083]",10090


In [30]:
mf_df = test_df.copy(deep=True)
mf_df['annotations'] = mf_df['annotations'].apply(replace_row_mf)
mf_df = mf_df.dropna().reset_index(drop=True)
mf_df.rename(columns = {'proteins':'Entry','sequences':'Sequence','annotations':'Gene Ontology (molecular function)'}, inplace = True)
mf_df

Unnamed: 0,level_0,index,Entry,accessions,Sequence,Gene Ontology (molecular function),interpros,orgs
0,181,1028,3BHS7_MOUSE,Q9EQC1; A2RTR5;,MADSAQVPTLVYLVTGGCGFLGEHIVRMLLEREPRLRELRVFDLHL...,GO:0016491;GO:0003824;GO:0003674;GO:0016616;GO...,"[IPR002225, IPR016040]",10090
1,183,1043,3BP1_MOUSE,P55194; E9QMQ2; Q99KK8;,MAESFKELDPDSSMGKALEMTCAIQNQLARILAEFEMTLERDVLQP...,GO:0017124;GO:0005515;GO:0005488;GO:0003674;GO...,"[IPR027267, IPR004148, IPR008936, IPR000198]",10090
2,392,3170,AA3R_MOUSE,Q61618; Q9R202;,MEADNTTETDWLNITYITMEAAIGLCAVVGNMLVIWVVKLNPTLRT...,GO:0004930;GO:0004872;GO:0003674;GO:0004871;GO...,"[IPR000466, IPR001634, IPR000276, IPR017452]",10090
3,462,3586,AARE_ARATH,Q84LM4; O23313;,MDSSGTDSAKELHVGLDPTTEEEYATQSKLLQEFINIPSIDKAWIF...,GO:0008233;GO:0017171;GO:0016787;GO:0008236;GO...,"[IPR011042, IPR029058, IPR011659, IPR001375]",3702
4,716,4346,ABHD2_HUMAN,P08910; Q53G48; Q53GU0; Q5FVD9; Q8TC79;,MNAMLETPELPAVFDGVKLAAVAAVLYVIVRCLNLKSPTAPPDLYF...,GO:0005488;GO:0016787;GO:0016298;GO:0052689;GO...,"[IPR029058, IPR000073, IPR000952, IPR012020]",9606
...,...,...,...,...,...,...,...,...
674,55712,551992,ZN683_HUMAN,Q8IZ20; Q5T141; Q5T146; Q5T147; Q5T149; Q8NEN4;,MKEESAAQLGCCHRPMALGGTGGSLSPSLDFQLFRGDQVFSACRPL...,GO:0044212;GO:0001191;GO:0005515;GO:0005488;GO...,"[IPR007087, IPR015880, IPR013087]",9606
675,55713,551993,ZN683_MOUSE,I7HJS4;,MKALDGLRESLYPSLDFQLYQDDQVCSADASQPLADSVGAHDLAWS...,GO:0044877;GO:0005488;GO:0003674;GO:0003682,"[IPR007087, IPR015880, IPR013087]",10090
676,55718,552022,ZN704_MOUSE,Q9ERQ3; Q7TQL8; Q8BJW9;,MQARRLAKRPSLGSRRGGAAPAPAPEAAALGLPPPGPSPAAAPGSW...,GO:0035326;GO:0003690;GO:0044212;GO:0003676;GO...,[IPR007087],10090
677,55732,552113,ZN809_MOUSE,G3X9G7; Q4KL58; Q8BIJ2;,MGLVSFEDVAVDFTLEEWQDLDAAQRTLYRDVMLETYSSLVFLDPC...,GO:0004527;GO:0016779;GO:0016772;GO:0016740;GO...,"[IPR001909, IPR007087, IPR015880, IPR013087]",10090


In [31]:
mf_df.to_csv('golabeler_test/mf_df.csv',index=False)

In [32]:
cc_df = test_df.copy(deep=True)
cc_df['annotations'] = cc_df['annotations'].apply(replace_row_cc)
cc_df = cc_df.dropna().reset_index(drop=True)
cc_df.rename(columns = {'proteins':'Entry','sequences':'Sequence','annotations':'Gene Ontology (cellular component)'}, inplace = True)
cc_df

Unnamed: 0,level_0,index,Entry,accessions,Sequence,Gene Ontology (cellular component),interpros,orgs
0,84,617,1B48_HUMAN,P30486; Q29764;,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...,GO:0044433;GO:0045335;GO:0031988;GO:0005794;GO...,"[IPR007110, IPR013783, IPR003006, IPR003597, I...",9606
1,326,2607,8ODP_RAT,P53369;,MSTSRLYTLVLVLQPQRVLLGMKKRGFGAGRWNGFGGKVQEGETIE...,GO:0005634;GO:0031988;GO:0005622;GO:0043231;GO...,"[IPR020476, IPR020084, IPR000086, IPR015797, I...",10116
2,392,3170,AA3R_MOUSE,Q61618; Q9R202;,MEADNTTETDWLNITYITMEAAIGLCAVVGNMLVIWVVKLNPTLRT...,GO:0005622;GO:0016021;GO:0000323;GO:0043231;GO...,"[IPR000466, IPR001634, IPR000276, IPR017452]",10090
3,462,3586,AARE_ARATH,Q84LM4; O23313;,MDSSGTDSAKELHVGLDPTTEEEYATQSKLLQEFINIPSIDKAWIF...,GO:0043227;GO:0005634;GO:0005575;GO:0005622;GO...,"[IPR011042, IPR029058, IPR011659, IPR001375]",3702
4,575,3984,ABA4_ARATH,Q8LFP9; Q9FZH4;,MGFSSFISQPLSSSLSVMKRNVSAKRSELCLDSSKIRLDHRWSFIG...,GO:0005622;GO:0043231;GO:0043226;GO:0044435;GO...,[IPR025461],3702
...,...,...,...,...,...,...,...,...
1143,55732,552113,ZN809_MOUSE,G3X9G7; Q4KL58; Q8BIJ2;,MGLVSFEDVAVDFTLEEWQDLDAAQRTLYRDVMLETYSSLVFLDPC...,GO:0043227;GO:0005634;GO:0044464;GO:0005622;GO...,"[IPR001909, IPR007087, IPR015880, IPR013087]",10090
1144,55751,552243,ZNF8_MOUSE,Q8BGV5; Q52KP6; Q8BJ50;,MDHQDKAATVAMASRPQATQLQEPVTFRDVAVDFTQEEWGQLDPTQ...,GO:0043227;GO:0005634;GO:0044464;GO:0005622;GO...,"[IPR001909, IPR007087, IPR015880, IPR013087]",10090
1145,55763,552282,ZNRF4_HUMAN,Q8WWF5; A8K886; O75866;,MPLCRPEHLMPRASRVPVAASLPLSHAVIPTQLPSRPGHRPPGRPR...,GO:0005783;GO:0005622;GO:0043231;GO:0043226;GO...,"[IPR003137, IPR001841, IPR013083]",9606
1146,55764,552284,ZNRF4_MOUSE,Q9DAH2; Q9WTN2;,MARFAWTRVAPVALVTFWLVLSLSPTDAQVNLSSVDFLDLPALLGV...,GO:0005575;GO:0005622;GO:0044424;GO:0005623;GO...,"[IPR001841, IPR013083]",10090


In [33]:
cc_df.to_csv('golabeler_test/cc_df.csv',index=False)

In [34]:
bp_df = test_df.copy(deep=True)
bp_df['annotations'] = bp_df['annotations'].apply(replace_row_bp)
bp_df = bp_df.dropna().reset_index(drop=True)
bp_df.rename(columns = {'proteins':'Entry','sequences':'Sequence','annotations':'Gene Ontology (biological process)'}, inplace = True)
bp_df

Unnamed: 0,level_0,index,Entry,accessions,Sequence,Gene Ontology (biological process),interpros,orgs
0,84,617,1B48_HUMAN,P30486; Q29764;,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...,GO:0045087;GO:0051716;GO:0006952;GO:0071346;GO...,"[IPR007110, IPR013783, IPR003006, IPR003597, I...",9606
1,181,1028,3BHS7_MOUSE,Q9EQC1; A2RTR5;,MADSAQVPTLVYLVTGGCGFLGEHIVRMLLEREPRLRELRVFDLHL...,GO:0044710;GO:0051716;GO:0009605;GO:0048870;GO...,"[IPR002225, IPR016040]",10090
2,304,2418,6PGD_DROME,P41572; Q9W519;,MSGQADIALIGLAVMGQNLILNMDEKGFVVCAYNRTVAKVKEFLAN...,GO:0019222;GO:0044237;GO:0065007;GO:0005996;GO...,"[IPR008927, IPR013328, IPR012284, IPR006114, I...",7227
3,326,2607,8ODP_RAT,P53369;,MSTSRLYTLVLVLQPQRVLLGMKKRGFGAGRWNGFGGKVQEGETIE...,GO:0000003;GO:0007548;GO:0003006;GO:0022414;GO...,"[IPR020476, IPR020084, IPR000086, IPR015797, I...",10116
4,392,3170,AA3R_MOUSE,Q61618; Q9R202;,MEADNTTETDWLNITYITMEAAIGLCAVVGNMLVIWVVKLNPTLRT...,GO:0002279;GO:0009611;GO:0009605;GO:0002694;GO...,"[IPR000466, IPR001634, IPR000276, IPR017452]",10090
...,...,...,...,...,...,...,...,...
1429,55718,552022,ZN704_MOUSE,Q9ERQ3; Q7TQL8; Q8BJW9;,MQARRLAKRPSLGSRRGGAAPAPAPEAAALGLPPPGPSPAAAPGSW...,GO:0019222;GO:0044237;GO:0019438;GO:0006725;GO...,[IPR007087],10090
1430,55732,552113,ZN809_MOUSE,G3X9G7; Q4KL58; Q8BIJ2;,MGLVSFEDVAVDFTLEEWQDLDAAQRTLYRDVMLETYSSLVFLDPC...,GO:0006260;GO:0019083;GO:0090305;GO:0060255;GO...,"[IPR001909, IPR007087, IPR015880, IPR013087]",10090
1431,55751,552243,ZNF8_MOUSE,Q8BGV5; Q52KP6; Q8BJ50;,MDHQDKAATVAMASRPQATQLQEPVTFRDVAVDFTQEEWGQLDPTQ...,GO:0006355;GO:0060255;GO:0044763;GO:1901360;GO...,"[IPR001909, IPR007087, IPR015880, IPR013087]",10090
1432,55763,552282,ZNRF4_HUMAN,Q8WWF5; A8K886; O75866;,MPLCRPEHLMPRASRVPVAASLPLSHAVIPTQLPSRPGHRPPGRPR...,GO:0009057;GO:0044237;GO:0043412;GO:0036211;GO...,"[IPR003137, IPR001841, IPR013083]",9606


In [35]:
bp_df.to_csv('golabeler_test/bp_df.csv',index=False)