In [1]:
import os
import os.path as osp

import pickle

from blast import *

from src.python.preprocess2 import *



In [2]:
def preds_by_attr(hits_per_uid, attr, func=lambda x: x):
    preds = {}
    pbar = tqdm(range(len(hits_per_uid)), desc="sequences processed")
    for uid, hits in hits_per_uid.items():
        pbar.update(1)
        if len(hits) == 0:
            continue
        preds[uid] = {}
        for go, hits in hits.items():
            hs =[getattr(h, attr) for h in hits if h.evalue < 0.001]
            if len(hs) == 0:
                continue
            preds[uid][go] = func(max(hs))
    pbar.close()
    return preds

def load_object(pth):
    with open(pth, 'rb') as f:
        loaded_dist_mat = pickle.load(f)
        assert len(loaded_dist_mat) > 0
    return loaded_dist_mat

In [3]:
pth_to_cco_hsps = "../../Data/cafapi_blast_CCO_hsp_p2v2"
pth_to_bpo_hsps = "../../Data/cafapi_blast_BPO_hsp_p2v2"

hits_cco_per_uid = load_object(pth_to_cco_hsps)
hits_bpo_per_uid = load_object(pth_to_bpo_hsps)

preds_cco_pident = preds_by_attr(hits_cco_per_uid, "pident", func=lambda x: x/100)
preds_bpo_pident = preds_by_attr(hits_bpo_per_uid, "pident", func=lambda x: x/100)

sequences processed: 100%|██████████| 18313/18313 [00:00<00:00, 19189.42it/s]
sequences processed: 100%|██████████| 18313/18313 [00:02<00:00, 7728.76it/s]


In [4]:
PATH = "../../Data/CAFA_PI/submission"

if not osp.exists(PATH):
    os.makedirs(PATH)

In [5]:
targets_by_sp = {"208963": load_targets("../../Data/CAFA_PI/targetFiles/target.208963.fasta"),
                 "237561": load_targets("../../Data/CAFA_PI/targetFiles/target.237561.fasta")}

len(targets_by_sp["208963"]), len(targets_by_sp["237561"]), sum(map(len, targets_by_sp.values()))

0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%0%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%1%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%2%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%3%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%4%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%6%6%6%6%6%6%6%6%

39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%39%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%41%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%42%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%43%

100%
Finished loading 5892 sequences!
100%
Finished loading 12421 sequences!


(5892, 12421, 18313)

In [6]:
def get_go_dict(targets, preds_by_uid):
    go_dict = {}
    for uid in targets.keys():
        if uid not in preds_by_uid:
            continue
        for go, prob in preds_by_uid[uid].items():
            if go not in go_dict:
                go_dict[go] = [(uid, prob)]
            else:
                go_dict[go].append((uid, prob))
    return go_dict            

In [13]:
num_model = 1
group_id = "yotamfr"
go_targets = {"GO:0001539": {
    "ancestors": ["GO:0048870", "GO:0040011", "GO:0051674", "GO:0006928"],
    "related": ["GO:0005929", "GO:0031514", "GO:0097589", "GO:0009288", "GO:0055040"]
}, "GO:0042710": {
    "ancestors": ["GO:0044764", "GO:0051704"],
    "related": ["GO:0097311", "GO:0097312", "GO:0097313", "GO:0044420", "GO:0031012", "GO:0044421", "GO:000576"]
}}
        

for sp in ["208963", "237561"]:
    go_cco_dict = get_go_dict(targets_by_sp[sp], preds_cco_pident)
    go_bpo_dict = get_go_dict(targets_by_sp[sp], preds_bpo_pident)
    
    for go in go_targets.keys():
        
        candidates = []
        if go in go_bpo_dict:
            for uid, prob in go_bpo_dict[go]:
                candidates.append((uid, prob))
                print(go, uid, prob)
        
        for anc in go_targets[go]["ancestors"]:
            if anc in go_bpo_dict:
                for uid1, prob1 in go_bpo_dict[anc]:
                    for rel in go_targets[go]["related"]:
                        if rel in go_cco_dict:
                            for uid2, prob2 in go_cco_dict[rel]:
                                if uid1 == uid2:
                                    candidates.append((uid1, max(prob1, prob2)))
                                    print(go, uid1, max(prob1, prob2))
                            
        _, go_id = go.split(":")

        lines = ["AUTHOR\t%s\n" % group_id, "MODEL\t%d\n" % num_model, "KEYWORDS\tsequence alignment, homolog, other functional information\n"]
        with open(osp.join(PATH, "%s_%d_%s_%s.txt" % (group_id, num_model, sp, go_id)), "w+") as f:
            for uid, prob in candidates:
                lines.append("%s\t%.2f\n" % (uid, prob))
            lines.append("END")
            f.writelines(lines)

GO:0001539 T2089630000610 0.33030000000000004
GO:0001539 T2089630000660 0.3333
GO:0001539 T2089630004698 0.31980000000000003
GO:0001539 T2089630000621 0.2789
GO:0001539 T2089630001139 0.4857
GO:0001539 T2089630001204 0.35369999999999996
GO:0001539 T2089630001225 0.3778
GO:0001539 T2089630001619 0.3195
GO:0001539 T2089630001694 0.3346
GO:0001539 T2089630002626 0.35350000000000004
GO:0001539 T2089630002956 0.3211
GO:0001539 T2089630004698 0.3506
GO:0001539 T2089630005007 0.2922
GO:0001539 T2089630005123 0.3423
GO:0001539 T2089630005598 0.285
GO:0001539 T2375610000271 0.26789999999999997
GO:0001539 T2375610000271 0.2667
GO:0001539 T2375610000272 0.26789999999999997
GO:0001539 T2375610000272 0.2667
GO:0001539 T2375610000368 0.2977
GO:0001539 T2375610000368 0.2944
GO:0001539 T2375610000369 0.2977
GO:0001539 T2375610000369 0.2944
GO:0001539 T2375610000516 0.3772
GO:0001539 T2375610000535 0.6082
GO:0001539 T2375610000535 0.2945
GO:0001539 T2375610000536 0.6082
GO:0001539 T2375610000536 0.2945

GO:0001539 T2375610011687 0.4606
GO:0001539 T2375610011687 0.3243
GO:0001539 T2375610011688 0.4606
GO:0001539 T2375610011688 0.3243
GO:0001539 T2375610011690 0.3741
GO:0001539 T2375610011691 0.3741
GO:0001539 T2375610011793 0.3933
GO:0001539 T2375610011965 0.4817
GO:0001539 T2375610011968 0.4817
GO:0001539 T2375610012244 0.4096
GO:0001539 T2375610012244 0.4096
GO:0001539 T2375610012245 0.4096
GO:0001539 T2375610012245 0.4096
GO:0001539 T2375610012394 0.2407
GO:0001539 T2375610012395 0.2407
GO:0001539 T2375610000027 0.3059
GO:0001539 T2375610000028 0.3059
GO:0001539 T2375610000080 0.3824
GO:0001539 T2375610000081 0.3824
GO:0001539 T2375610000197 0.4524
GO:0001539 T2375610000198 0.4524
GO:0001539 T2375610000230 0.3432
GO:0001539 T2375610000288 0.32
GO:0001539 T2375610000289 0.32
GO:0001539 T2375610000471 0.32909999999999995
GO:0001539 T2375610000501 0.4714
GO:0001539 T2375610000516 0.3224
GO:0001539 T2375610000581 0.29710000000000003
GO:0001539 T2375610000581 0.29710000000000003
GO:00015

GO:0001539 T2375610008015 0.3444
GO:0001539 T2375610008097 0.2667
GO:0001539 T2375610008097 0.2667
GO:0001539 T2375610008098 0.2583
GO:0001539 T2375610008098 0.2741
GO:0001539 T2375610008112 0.2605
GO:0001539 T2375610008112 0.2642
GO:0001539 T2375610008113 0.2605
GO:0001539 T2375610008113 0.2642
GO:0001539 T2375610008333 0.2963
GO:0001539 T2375610008339 0.2963
GO:0001539 T2375610008372 0.41619999999999996
GO:0001539 T2375610008372 0.41619999999999996
GO:0001539 T2375610008374 0.41619999999999996
GO:0001539 T2375610008374 0.41619999999999996
GO:0001539 T2375610008393 0.4925
GO:0001539 T2375610008394 0.4925
GO:0001539 T2375610008445 0.2857
GO:0001539 T2375610008454 0.3169
GO:0001539 T2375610008455 0.3169
GO:0001539 T2375610008506 0.29760000000000003
GO:0001539 T2375610008506 0.29760000000000003
GO:0001539 T2375610008507 0.29760000000000003
GO:0001539 T2375610008507 0.29760000000000003
GO:0001539 T2375610008536 0.4058
GO:0001539 T2375610008537 0.4058
GO:0001539 T2375610008661 0.8537
GO:00

In [8]:
import synapseclient
from synapseclient import Project, Folder, File, Link

In [14]:
syn = synapseclient.Synapse()
syn.login('yotamfra@mail.tau.ac.il', 'tsiqly12')

Welcome,  !



In [15]:
# project = syn.get('syn11533497') 
project = Project('CAFA_PI')
project = syn.store(project)

In [16]:
folder = Folder('submissions', parent=project)
folder = syn.store(folder)

In [17]:
data = File(osp.join(PATH, 'submission1.zip'), parent=folder)
data = syn.store(data)


##################################################
 Uploading file to Synapse storage 
##################################################

