In [1]:
import pandas as pd
import numpy as np
import re
import networkx
from networkx.algorithms.components.connected import connected_components
from collections import defaultdict
import random
import glob
import os
from Bio import SeqIO
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import jaccard_score
import time
start = time.time()

# Loading MIBiG dataframes and creating dictionaries

In [2]:
bioactivity_df = pd.read_csv("./bioactivity_df-220203.csv",sep='\t',index_col=0)

bioactivity_dict = dict(zip(bioactivity_df['metabolite'],bioactivity_df['activity']))

bioactivity_dict

{'abyssomicin': 'antibacterial',
 'aurafuron': 'antifungal-cytotoxic',
 'aureothin': 'antibacterial-antifungal-cytotoxic',
 'avilamycin': 'antibacterial',
 'bafilomycin': 'antibacterial-antifungal-cytotoxic',
 'borrelidin': 'antibacterial-cytotoxic',
 'chlorothricin': 'antibacterial',
 'coelimycin': 'unknown',
 'dawenol': 'unknown',
 'erythromycin': 'antibacterial',
 'galbonolides': 'antifungal',
 'kedarcidin': 'antibacterial-cytotoxic',
 'lactimidomycin': 'antifungal-cytotoxic',
 'neoaureothin': 'antifungal-cytotoxic',
 'neocarzinostatin': 'antibacterial-cytotoxic',
 'nystatin': 'antifungal',
 'pactamycin': 'antibacterial-antifungal-cytotoxic',
 'salinomycin': 'antibacterial-cytotoxic',
 'soraphen': 'antifungal',
 'spirangien': 'antifungal-cytotoxic',
 'sporolide': 'unknown',
 'stambomycin': 'antibacterial-cytotoxic',
 'stigmatellin': 'antifungal',
 'tautomycetin': 'antifungal-cytotoxic',
 'tautomycin': 'antifungal-cytotoxic',
 'tiacumicin': 'antibacterial',
 '9methylstreptimidone': '

In [3]:
np.unique(list(bioactivity_dict.values()))

array(['antibacterial', 'antibacterial-antifungal',
       'antibacterial-antifungal-cytotoxic', 'antibacterial-cytotoxic',
       'antifungal', 'antifungal-cytotoxic', 'cytotoxic',
       'cytotoxic-unknown', 'nan', 'unknown'], dtype='<U34')

In [4]:
mibig_df = pd.read_csv("./All_MIBiG_compounds_with_CF_NPC_classes.txt",sep='\t')

mibig_df

Unnamed: 0,compound_name,smiles,inchi_key,cf_kingdom,cf_superclass,cf_class,cf_subclass,cf_direct_parent,npc_class,npc_superclass,npc_pathway,npc_isglycoside
0,BGC0000001_abyssomicin C,CC1C[C@]23OC(=O)C4=C2OC1C(O)C3\C=C/C(=O)[C@@H]...,FNEADFUPWHAVTA-UHFFFAOYSA-N,Organic compounds,Organoheterocyclic compounds,Oxanes,,Oxanes,Spirotetronate macrolides,Macrolides,Polyketides,0
1,BGC0000001_atrop-abyssomicin C,CC1CC23OC(=O)C4=C2OC1C(O)C3\C=C/C(=O)C(C)CC(C)...,FNEADFUPWHAVTA-UHFFFAOYSA-N,Organic compounds,Organoheterocyclic compounds,Oxanes,,Oxanes,Spirotetronate macrolides,Macrolides,Polyketides,0
2,BGC0000002_aculeximycin,CCCC(O[C@H]1C[C@](C)(N)[C@H](O)[C@H](C)O1)C(C)...,VJKZKLDZOAFAEE-UHFFFAOYSA-N,Organic compounds,Lipids and lipid-like molecules,Prenol lipids,Terpene glycosides,Diterpene glycosides,,,,1
3,BGC0000003_AF-toxin,CCC(C)C(C(=O)OC(/C=C/C=C/C=C/C(=O)O)C1(CO1)C)O...,ONOBRFRRMLDPES-UHFFFAOYSA-N,Organic compounds,Organic acids and derivatives,Peptidomimetics,Depsipeptides,Depsipeptides,,,,0
4,BGC0000004_aflatoxin G1,[H][C@@]12OC=C[C@]1([H])C1=C(O2)C=C(OC)C2=C1OC...,XWIYFDMXXLINPU-UHFFFAOYSA-N,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Furanocoumarins,Difurocoumarolactones,Aflatoxins; Simple coumarins,Chromanes; Coumarins,Polyketides; Shikimates and Phenylpropanoids,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2107,BGC0002034_perquinoline A,C1CC2(C(C3=C(C=C(C=C3O)O)C(N2C1=O)C(=O)NCCCC(=...,CDFQOHHGTNUFPU-UHFFFAOYSA-N,,,,,,,,Alkaloids,0
2108,BGC0002034_perquinoline B,C1CC2(C(C3=C(C=C(C=C3O)O)C(N2C1=O)C(=O)NCCC(=O...,XHJZZGRZEGUFCM-UHFFFAOYSA-N,,,,,,,,Alkaloids,0
2109,BGC0002034_perquinoline C,C1CC2(C(C3=C(C=C(C=C3O)O)C(N2C1=O)C(=O)NCCCC(=...,ZNSLKIGPUIPFNC-UHFFFAOYSA-N,,,,,,,,Alkaloids,0
2110,BGC0002035_ilicicolin H,CC=CC1C2CC(CCC2C(=CC1C(=O)C3=C(C(=CNC3=O)C4=CC...,BYVVOONSAAQMKI-UHFFFAOYSA-N,Organic compounds,Organoheterocyclic compounds,Pyridines and derivatives,Phenylpyridines,Phenylpyridines,Pyridine alkaloids,Nicotinic acid alkaloids,Alkaloids,0


In [5]:
mibig_activity_dict = {}
missing_mets_list = []

for i,r in mibig_df.iterrows():
    met_name = r['compound_name'].split('_')[1].replace(' ','_').lower().split('_')[0]
    for key in bioactivity_dict.keys():
        if type(key) != float:
            if met_name in key:
                print(r['compound_name'].split('_')[0],met_name,key,bioactivity_dict[key])
                if type(bioactivity_dict[key]) == float:
                    mibig_activity_dict[r['compound_name'].split('_')[0]] = 'unknown'
                else:
                    mibig_activity_dict[r['compound_name'].split('_')[0]] = bioactivity_dict[key]
            else:
                missing_mets_list.append(met_name)

BGC0000001 abyssomicin abyssomicin antibacterial
BGC0000002 aculeximycin aculeximycin antibacterial-antifungal
BGC0000014 ambruticin ambruticin antifungal
BGC0000016 amphotericin amphotericin antifungal
BGC0000020 ansamitocin ansamitocin cytotoxic
BGC0000021 apoptolidin apoptolidin cytotoxic
BGC0000023 aurafuron aurafuron antifungal-cytotoxic
BGC0000024 aureothin aureothin antibacterial-antifungal-cytotoxic
BGC0000024 aureothin neoaureothin antifungal-cytotoxic
BGC0000025 avermectin avermectin cytotoxic
BGC0000026 avilamycin avilamycin antibacterial
BGC0000026 avilamycin avilamycin antibacterial
BGC0000028 bafilomycin bafilomycin antibacterial-antifungal-cytotoxic
BGC0000031 borrelidin borrelidin antibacterial-cytotoxic
BGC0000033 calicheamicin calicheamicin antibacterial-antifungal-cytotoxic
BGC0000034 candicidin candicidin antifungal
BGC0000035 chalcomycin chalcomycin antibacterial
BGC0000035 chalcomycin dihyrdochalcomycin unknown
BGC0000036 chlorothricin chlorothricin antibacterial


BGC0000186 thailandamide thailandamide unknown
BGC0000187 asukamycin asukamycin antibacterial-antifungal-cytotoxic
BGC0000188 saxitoxin saxitoxin nan
BGC0000191 aclacinomycin aclacinomycin cytotoxic
BGC0000191 aclacinomycin aclacinomycin_2 cytotoxic
BGC0000191 aclacinomycin aclacinomycin_3 cytotoxic
BGC0000192 aclacinomycin aclacinomycin cytotoxic
BGC0000192 aclacinomycin aclacinomycin_2 cytotoxic
BGC0000192 aclacinomycin aclacinomycin_3 cytotoxic
BGC0000193 aclacinomycin aclacinomycin cytotoxic
BGC0000193 aclacinomycin aclacinomycin_2 cytotoxic
BGC0000193 aclacinomycin aclacinomycin_3 cytotoxic
BGC0000194 actinorhodin actinorhodin antibacterial
BGC0000195 alnumycin alnumycin antibacterial-cytotoxic
BGC0000195 alnumycin alnumycin antibacterial-cytotoxic
BGC0000195 alnumycin alnumycin antibacterial-cytotoxic
BGC0000195 alnumycin alnumycin antibacterial-cytotoxic
BGC0000197 aranciamycin aranciamycin antibacterial-cytotoxic
BGC0000198 arenimycin arenimycin antibacterial-cytotoxic
BGC00001

BGC0000335 cystomanamide cystomanamides unknown
BGC0000335 cystomanamide cystomanamides unknown
BGC0000335 cystomanamide cystomanamides unknown
BGC0000335 cystomanamide cystomanamides unknown
BGC0000336 daptomycin daptomycin antibacterial
BGC0000339 echinomycin echinomycin antibacterial-antifungal-cytotoxic
BGC0000340 echoside echosides cytotoxic-unknown
BGC0000340 echoside echosides cytotoxic-unknown
BGC0000340 echoside echosides cytotoxic-unknown
BGC0000340 echoside echosides cytotoxic-unknown
BGC0000340 echoside echosides cytotoxic-unknown
BGC0000341 enduracidin enduracidin antibacterial
BGC0000343 enterobactin enterobactin nan
BGC0000345 eponemycin eponemycin cytotoxic
BGC0000346 epoxomicin epoxomicin cytotoxic
BGC0000349 erythrochelin erythrochelin antibacterial
BGC0000352 fimsbactin fimsbactin nan
BGC0000354 friulimicin friulimicin antibacterial
BGC0000354 friulimicin friulimicin antibacterial
BGC0000354 friulimicin friulimicin antibacterial
BGC0000354 friulimicin friulimicin ant

BGC0000636 carotenoid pantoea_anantis_carotenoid nan
BGC0000636 carotenoid staph_aureus_carotenoid nan
BGC0000636 carotenoid carotenoid_2 unknown
BGC0000636 carotenoid carotenoid_3 nan
BGC0000636 carotenoid carotenoid_4 nan
BGC0000636 carotenoid carotenoid_5 nan
BGC0000636 carotenoid carotenoid_6 nan
BGC0000636 carotenoid carotenoid_7 nan
BGC0000636 carotenoid carotenoid_8 nan
BGC0000636 carotenoid carotenoid_9 nan
BGC0000636 carotenoid carotenoid_10 nan
BGC0000636 carotenoid carotenoid_11 nan
BGC0000636 carotenoid carotenoid_12 nan
BGC0000636 carotenoid carotenoid_13 nan
BGC0000636 carotenoid carotenoid_14 nan
BGC0000636 carotenoid carotenoid_15 nan
BGC0000636 carotenoid carotenoid_16 nan
BGC0000636 carotenoid carotenoid_17 nan
BGC0000637 carotenoid carotenoid nan
BGC0000637 carotenoid rubivivax_gelatnosus_carotenoid nan
BGC0000637 carotenoid rhobacter_sphaeroides_241_carotenoid nan
BGC0000637 carotenoid pantoea_anantis_carotenoid nan
BGC0000637 carotenoid staph_aureus_carotenoid nan


BGC0000647 carotenoid carotenoid_11 nan
BGC0000647 carotenoid carotenoid_12 nan
BGC0000647 carotenoid carotenoid_13 nan
BGC0000647 carotenoid carotenoid_14 nan
BGC0000647 carotenoid carotenoid_15 nan
BGC0000647 carotenoid carotenoid_16 nan
BGC0000647 carotenoid carotenoid_17 nan
BGC0000648 carotenoid carotenoid nan
BGC0000648 carotenoid rubivivax_gelatnosus_carotenoid nan
BGC0000648 carotenoid rhobacter_sphaeroides_241_carotenoid nan
BGC0000648 carotenoid pantoea_anantis_carotenoid nan
BGC0000648 carotenoid staph_aureus_carotenoid nan
BGC0000648 carotenoid carotenoid_2 unknown
BGC0000648 carotenoid carotenoid_3 nan
BGC0000648 carotenoid carotenoid_4 nan
BGC0000648 carotenoid carotenoid_5 nan
BGC0000648 carotenoid carotenoid_6 nan
BGC0000648 carotenoid carotenoid_7 nan
BGC0000648 carotenoid carotenoid_8 nan
BGC0000648 carotenoid carotenoid_9 nan
BGC0000648 carotenoid carotenoid_10 nan
BGC0000648 carotenoid carotenoid_11 nan
BGC0000648 carotenoid carotenoid_12 nan
BGC0000648 carotenoid c

BGC0000743 capsular capsular_polysaccharide nan
BGC0000743 capsular capsular_polysaccharide_2 nan
BGC0000743 capsular capsular_polysaccharide_3 nan
BGC0000743 capsular capsular_polysaccharide_4 nan
BGC0000743 capsular capsular_polysaccharide_5 nan
BGC0000743 capsular capsular_polysaccharide_6 nan
BGC0000743 capsular capsular_polysaccharide_7 nan
BGC0000743 capsular capsular_polysaccharide_8 nan
BGC0000743 capsular capsular_polysaccharide_9 nan
BGC0000743 capsular capsular_polysaccharide_10 nan
BGC0000743 capsular capsular_polysaccharide_11 nan
BGC0000743 capsular capsular_polysaccharide_12 nan
BGC0000743 capsular capsular_polysaccharide_13 nan
BGC0000743 capsular capsular_polysaccharide_14 nan
BGC0000743 capsular capsular_polysaccharide_15 nan
BGC0000743 capsular capsular_polysaccharide_16 nan
BGC0000743 capsular capsular_polysaccharide_17 nan
BGC0000743 capsular capsular_polysaccharide_18 nan
BGC0000743 capsular capsular_polysaccharide_19 nan
BGC0000743 capsular capsular_polysaccharid

BGC0000752 capsular capsular_polysaccharide_19 nan
BGC0000752 capsular capsular_polysaccharide_20 nan
BGC0000752 capsular capsular_polysaccharide_21 nan
BGC0000752 capsular capsular_polysaccharide_22 nan
BGC0000752 capsular capsular_polysaccharide_23 nan
BGC0000752 capsular capsular_polysaccharide_24 nan
BGC0000752 capsular capsular_polysaccharide_25 nan
BGC0000752 capsular capsular_polysaccharide_26 nan
BGC0000752 capsular capsular_polysaccharide_27 nan
BGC0000752 capsular capsular_polysaccharide_28 nan
BGC0000752 capsular capsular_polysaccharide_29 nan
BGC0000752 capsular capsular_polysaccharide_30 nan
BGC0000753 capsular capsular_polysaccharide nan
BGC0000753 capsular capsular_polysaccharide_2 nan
BGC0000753 capsular capsular_polysaccharide_3 nan
BGC0000753 capsular capsular_polysaccharide_4 nan
BGC0000753 capsular capsular_polysaccharide_5 nan
BGC0000753 capsular capsular_polysaccharide_6 nan
BGC0000753 capsular capsular_polysaccharide_7 nan
BGC0000753 capsular capsular_polysacchar

BGC0000879 toyocamycin toyocamycin antifungal-cytotoxic
BGC0000879 toyocamycin toyocamycin_2 antifungal-cytotoxic
BGC0000880 tunicamycin tunicamycin antibacterial-antifungal-cytotoxic
BGC0000881 toyocamycin toyocamycin antifungal-cytotoxic
BGC0000881 toyocamycin toyocamycin_2 antifungal-cytotoxic
BGC0000882 mildiomycin mildiomycin antifungal
BGC0000883 arginomycin arginomycin antibacterial-antifungal
BGC0000887 saxitoxin saxitoxin nan
BGC0000887 toxin saxitoxin nan
BGC0000887 toxin anatoxin nan
BGC0000887 toxin lyngbyatoxin cytotoxic
BGC0000887 toxin mangotoxin nan
BGC0000887 toxin trifolitoxin unknown
BGC0000887 toxin tabtoxin nan
BGC0000887 toxin phaseolotoxin nan
BGC0000887 toxin t3_toxin nan
BGC0000887 toxin saxitoxin nan
BGC0000887 toxin anatoxin nan
BGC0000887 toxin lyngbyatoxin cytotoxic
BGC0000887 toxin mangotoxin nan
BGC0000887 toxin trifolitoxin unknown
BGC0000887 toxin tabtoxin nan
BGC0000887 toxin phaseolotoxin nan
BGC0000887 toxin t3_toxin nan
BGC0000888 bacilysin bacilysi

BGC0001020 muraymycin muraymycin antibacterial
BGC0001021 mycobactin mycobactin nan
BGC0001022 myxalamid myxalamid antibacterial-antifungal
BGC0001023 myxochromide myxochromide nan
BGC0001023 myxochromide myxochromide_d unknown
BGC0001023 myxochromide myxochromide_d_2 unknown
BGC0001023 myxochromide myxochromide_d_3 unknown
BGC0001023 myxochromide myxochromide_d_4 unknown
BGC0001023 myxochromide myxochromide_b unknown
BGC0001023 myxochromide myxochromide_a unknown
BGC0001023 myxochromide myxochromide_c unknown
BGC0001023 myxochromide myxochromide_c_2 unknown
BGC0001023 myxochromide myxochromide_a_2 unknown
BGC0001023 myxochromide myxochromide_a_3 unknown
BGC0001023 myxochromide myxochromide_a_4 unknown
BGC0001023 myxochromide myxochromide_s unknown
BGC0001023 myxochromide myxochromide_s_2 unknown
BGC0001023 myxochromide myxochromide_s_3 unknown
BGC0001023 myxochromide myxochromide_d_5 unknown
BGC0001024 myxothiazol myxothiazol antifungal-cytotoxic
BGC0001025 myxovirescin myxovirescin a

BGC0001192 colistin colistin antibacterial
BGC0001192 colistin colistin antibacterial
BGC0001193 thiolutin thiolutin antibacterial-antifungal
BGC0001194 hitachimycin hitachimycin antibacterial-antifungal-cytotoxic
BGC0001195 nocardiopsin nocardiopsin unknown
BGC0001195 nocardiopsin nocardiopsin unknown
BGC0001195 nocardiopsin nocardiopsin unknown
BGC0001195 nocardiopsin nocardiopsin unknown
BGC0001196 desotamide desotamide antibacterial
BGC0001197 frankiamicin frankiamicin antibacterial
BGC0001199 akaeolide akaeolide antibacterial-cytotoxic
BGC0001201 eremophilene eremophilene unknown
BGC0001204 versipelostatin versipelostatin unknown
BGC0001206 indolmycin indolmycin antibacterial
BGC0001207 teixobactin teixobactin antibacterial
BGC0001209 streptide streptide unknown
BGC0001212 nannocystin nannocystin antifungal-cytotoxic
BGC0001213 nataxazole nataxazole cytotoxic
BGC0001214 marformycin marformycins antibacterial
BGC0001214 marformycin marformycins antibacterial
BGC0001214 marformycin 

BGC0001406 telomycin telomycin antibacterial
BGC0001409 dutomycin dutomycin antibacterial-cytotoxic
BGC0001414 griselimycin griselimycin antibacterial
BGC0001415 althiomycin althiomycin antibacterial
BGC0001415 althiomycin althiomycin_2 antibacterial
BGC0001417 myxochromide myxochromide nan
BGC0001417 myxochromide myxochromide_d unknown
BGC0001417 myxochromide myxochromide_d_2 unknown
BGC0001417 myxochromide myxochromide_d_3 unknown
BGC0001417 myxochromide myxochromide_d_4 unknown
BGC0001417 myxochromide myxochromide_b unknown
BGC0001417 myxochromide myxochromide_a unknown
BGC0001417 myxochromide myxochromide_c unknown
BGC0001417 myxochromide myxochromide_c_2 unknown
BGC0001417 myxochromide myxochromide_a_2 unknown
BGC0001417 myxochromide myxochromide_a_3 unknown
BGC0001417 myxochromide myxochromide_a_4 unknown
BGC0001417 myxochromide myxochromide_s unknown
BGC0001417 myxochromide myxochromide_s_2 unknown
BGC0001417 myxochromide myxochromide_s_3 unknown
BGC0001417 myxochromide myxochro

BGC0001441 belactosin belactosin cytotoxic
BGC0001441 belactosin belactosin cytotoxic
BGC0001442 actinonin actinonin antibacterial
BGC0001453 desferrioxamine desferrioxamine nan
BGC0001453 desferrioxamine desferrioxamine_2 nan
BGC0001453 desferrioxamine desferrioxamine_b nan
BGC0001455 antimycin antimycin antifungal-cytotoxic
BGC0001478 desferrioxamine desferrioxamine nan
BGC0001478 desferrioxamine desferrioxamine_2 nan
BGC0001478 desferrioxamine desferrioxamine_b nan
BGC0001479 anabaenopeptin anabaenopeptin nan
BGC0001479 anabaenopeptin anabaenopeptin_2 nan
BGC0001492 abyssomicin abyssomicin antibacterial
BGC0001492 abyssomicin abyssomicin antibacterial
BGC0001497 actinonin actinonin antibacterial
BGC0001498 aerobactin aerobactin nan
BGC0001499 aerobactin aerobactin nan
BGC0001501 ambiguine ambiguine antibacterial-antifungal-cytotoxic
BGC0001511 ansamitocin ansamitocin cytotoxic
BGC0001533 borrelidin borrelidin antibacterial-cytotoxic
BGC0001558 cosmomycin cosmomycin cytotoxic
BGC0001

In [6]:
len(mibig_activity_dict)

806

In [7]:
mibig_activity_dict

{'BGC0000001': 'antibacterial',
 'BGC0000002': 'antibacterial-antifungal',
 'BGC0000014': 'antifungal',
 'BGC0000016': 'antifungal',
 'BGC0000020': 'cytotoxic',
 'BGC0000021': 'cytotoxic',
 'BGC0000023': 'antifungal-cytotoxic',
 'BGC0000024': 'antifungal-cytotoxic',
 'BGC0000025': 'cytotoxic',
 'BGC0000026': 'antibacterial',
 'BGC0000028': 'antibacterial-antifungal-cytotoxic',
 'BGC0000031': 'antibacterial-cytotoxic',
 'BGC0000033': 'antibacterial-antifungal-cytotoxic',
 'BGC0000034': 'antifungal',
 'BGC0000035': 'unknown',
 'BGC0000036': 'antibacterial',
 'BGC0000038': 'unknown',
 'BGC0000040': 'cytotoxic',
 'BGC0000042': 'antibacterial',
 'BGC0000043': 'cytotoxic',
 'BGC0000044': 'unknown',
 'BGC0000051': 'antifungal',
 'BGC0000053': 'cytotoxic',
 'BGC0000054': 'antibacterial',
 'BGC0000055': 'antibacterial',
 'BGC0000059': 'antifungal',
 'BGC0000060': 'cytotoxic',
 'BGC0000061': 'antifungal',
 'BGC0000066': 'antifungal-cytotoxic',
 'BGC0000067': 'antifungal-cytotoxic',
 'BGC0000068'

In [8]:
mibig_bio_3_df = pd.read_csv('./mibig_3_bioactivity.csv',sep='\t',names=['MIBIG_ID','Activity'])

mibig_bio_3_dict = dict(zip(mibig_bio_3_df['MIBIG_ID'],mibig_bio_3_df['Activity']))

mibig_bio_3_dict

{'BGC0000018': 'Antibacterial',
 'BGC0000019': 'Antibacterial',
 'BGC0000025': 'Cytotoxic',
 'BGC0000032': 'Antibacterial',
 'BGC0000034': 'Antifungal',
 'BGC0000035': 'Antibacterial',
 'BGC0000040': 'Antiviral',
 'BGC0000042': 'Antibacterial',
 'BGC0000047': 'Antibacterial',
 'BGC0000050': 'Antibacterial',
 'BGC0000052': 'Antifungal',
 'BGC0000058': 'Cytotoxic',
 'BGC0000059': 'Antibacterial',
 'BGC0000060': 'Cytotoxic',
 'BGC0000061': 'Antifungal',
 'BGC0000066': 'Antibacterial',
 'BGC0000067': 'Antibacterial',
 'BGC0000068': 'Antibacterial',
 'BGC0000073': 'Antibacterial',
 'BGC0000074': 'Antibacterial',
 'BGC0000075': 'Antibacterial',
 'BGC0000078': 'Cytotoxic',
 'BGC0000079': 'Antibacterial',
 'BGC0000084': 'Antibacterial',
 'BGC0000085': 'Antibacterial',
 'BGC0000086': 'Antibacterial',
 'BGC0000087': 'Antibacterial',
 'BGC0000091': 'Cytotoxic',
 'BGC0000093': 'Antibacterial',
 'BGC0000096': 'Antibacterial',
 'BGC0000100': 'Antiprotozoa',
 'BGC0000105': 'Antibacterial',
 'BGC00001

In [9]:
for key in mibig_activity_dict:
    if mibig_activity_dict[key] == 'unknown':
        if key in mibig_bio_3_dict.keys():
            mibig_activity_dict[key] = mibig_bio_3_dict[key].lower()

for key in mibig_bio_3_dict:
    if key not in mibig_activity_dict:
        mibig_activity_dict[key] = mibig_bio_3_dict[key].lower()

In [10]:
bgc_subtype_df = pd.read_csv("./bgc_subtype_df.csv",'\t',names=['BGC','subtype'])

bgc_subtype_df

Unnamed: 0,BGC,subtype
0,BGC0000001,Modular type I polyketide
1,BGC0000002,Polyketide
2,BGC0000003,Polyketide
3,BGC0000004,Polyketide
4,BGC0000005,Polyketide
...,...,...
1921,BGC0002045,Type II polyketide
1922,BGC0002055,Trans-AT type I polyketide
1923,BGC0002056,Trans-AT type I polyketide
1924,BGC0002057,Trans-AT type I polyketide


In [11]:
subtype_type_dict = {}
with open("./subtype_type_df.csv") as f:
    for line in f:
        (key, val) = line.split(',')
        subtype_type_dict[key] = val.strip('\n')
        
subtype_type_dict

{'Modular type I polyketide': 'PKS',
 'Polyketide': 'PKS',
 'Other': 'Other',
 'Alkaloid Modular type I polyketide': 'Alkaloid-PKS',
 'Oligosaccharide': 'Oligosaccharide',
 'Iterative type I polyketide': 'PKS',
 'Modular type I polyketide Iterative type I polyketide Oligosaccharide': 'Oligosaccharide-PKS',
 'Modular type I polyketide Hybrid/tailoring saccharide': 'Oligosaccharide-PKS',
 'Iterative type I polyketide Enediyne type I polyketide': 'PKS',
 'Modular type I polyketide Trans-AT type I polyketide': 'PKS',
 'Terpene Iterative type I polyketide': 'PKS-Terpene',
 'Iterative type I polyketide Hybrid/tailoring saccharide': 'Oligosaccharide-PKS',
 'NRP Enediyne type I polyketide': 'NRPS-PKS',
 'NRP Modular type I polyketide': 'NRPS-PKS',
 'Trans-AT type I polyketide': 'PKS',
 'Polyketide NRP': 'NRPS-PKS',
 'Iterative type I polyketide Trans-AT type I polyketide': 'PKS',
 'Type II polyketide': 'PKS',
 'Alkaloid': 'Alkaloid',
 'Type III polyketide': 'PKS',
 'Type II polyketide Hybrid/t

In [12]:
type_col = []

for i,r in bgc_subtype_df.iterrows():
    type_col.append(subtype_type_dict[r['subtype']])
    
bgc_subtype_df['type'] = type_col

bgc_subtype_df

Unnamed: 0,BGC,subtype,type
0,BGC0000001,Modular type I polyketide,PKS
1,BGC0000002,Polyketide,PKS
2,BGC0000003,Polyketide,PKS
3,BGC0000004,Polyketide,PKS
4,BGC0000005,Polyketide,PKS
...,...,...,...
1921,BGC0002045,Type II polyketide,PKS
1922,BGC0002055,Trans-AT type I polyketide,PKS
1923,BGC0002056,Trans-AT type I polyketide,PKS
1924,BGC0002057,Trans-AT type I polyketide,PKS


In [13]:
bgc_type_dict = dict(zip(bgc_subtype_df.BGC,bgc_subtype_df.type))

bgc_type_dict

{'BGC0000001': 'PKS',
 'BGC0000002': 'PKS',
 'BGC0000003': 'PKS',
 'BGC0000004': 'PKS',
 'BGC0000005': 'PKS',
 'BGC0000006': 'PKS',
 'BGC0000007': 'PKS',
 'BGC0000008': 'PKS',
 'BGC0000009': 'PKS',
 'BGC0000010': 'PKS',
 'BGC0000011': 'PKS',
 'BGC0000012': 'PKS',
 'BGC0000013': 'PKS',
 'BGC0000014': 'PKS',
 'BGC0000016': 'Other',
 'BGC0000017': 'Alkaloid-PKS',
 'BGC0000018': 'PKS',
 'BGC0000019': 'PKS',
 'BGC0000020': 'PKS',
 'BGC0000021': 'PKS',
 'BGC0000022': 'PKS',
 'BGC0000023': 'PKS',
 'BGC0000024': 'PKS',
 'BGC0000025': 'PKS',
 'BGC0000026': 'Oligosaccharide',
 'BGC0000027': 'PKS',
 'BGC0000028': 'PKS',
 'BGC0000029': 'PKS',
 'BGC0000030': 'PKS',
 'BGC0000031': 'PKS',
 'BGC0000032': 'PKS',
 'BGC0000033': 'PKS',
 'BGC0000034': 'PKS',
 'BGC0000035': 'PKS',
 'BGC0000036': 'Oligosaccharide-PKS',
 'BGC0000037': 'PKS',
 'BGC0000038': 'PKS',
 'BGC0000039': 'PKS',
 'BGC0000040': 'PKS',
 'BGC0000041': 'PKS',
 'BGC0000042': 'PKS',
 'BGC0000043': 'PKS',
 'BGC0000044': 'PKS',
 'BGC0000045': 

# Creating class dataframe

In [14]:
combined_list,all_classes,type_list = [],[],[]

for key in bgc_type_dict:
    if 'tRNA' in bgc_type_dict[key]:
        type_list = ['tRNA_derived']
    else:
        type_list = bgc_type_dict[key].split('-')
    for item in type_list:
        if item not in all_classes:
            all_classes.append(item)
    combined_list.append(type_list)
    
combined_list

[['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Other'],
 ['Alkaloid', 'PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide', 'PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide', 'PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide', 'PKS'],
 ['PKS'],
 ['PKS'

In [15]:
class_df = pd.DataFrame(columns=(all_classes))

bgc_list = list(bgc_type_dict.keys())

for i,combined_classes in enumerate(combined_list):
    row_extension = []
    for final_class in all_classes:
        if final_class in combined_classes:
            row_extension.append(1)
        else:
            row_extension.append(0)
    class_df.loc[bgc_list[i]] = row_extension
    
class_df = class_df.sort_index(axis=0)
    
class_df

Unnamed: 0,PKS,Other,Alkaloid,Oligosaccharide,Terpene,NRPS,Cyclitol,Aminocoumarin,Betalactam,Siderophore,Pyrrolobenzodiazepine,RiPP,Butyrolactone,Nucleoside,Phenazine,Aminoglycoside,tRNA_derived,Phosphonate
BGC0000001,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000002,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000003,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000004,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000005,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BGC0002045,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0002055,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0002056,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0002057,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Creating similarity dataframe

In [18]:
### obtaining bigscape dataframe and bigscape dictionary

def parse_bigscape_df(input_file,no_orphans):
    bigscape_df = pd.read_csv(input_file,sep='\t')
    bigscape_df.rename(columns=lambda x: re.sub(" ","_",x), inplace=True)
    if no_orphans == 1:
        bigscape_df = bigscape_df[bigscape_df.Clustername_1 != bigscape_df.Clustername_2]
    bigscape_df.reset_index(inplace=True,drop=True)
    return bigscape_df

def get_neighbors(target,dataframe,column1,column2):
    subset1 = dataframe[(dataframe[column1]==target)]
    subcat = subset1.append(dataframe[(dataframe[column2]==target)])
    temp_list = []
    for index,row in subcat.iterrows():
        temp_list.append(subcat[column1][index])
        temp_list.append(subcat[column2][index])
    temp_list = list(np.unique(temp_list))
    return temp_list

def to_edges(l):
    it = iter(l)
    last = next(it)
    for current in it:
        yield last, current
        last = current

def to_graph(l):
    G = networkx.Graph()
    for part in l:
        G.add_nodes_from(part)
        G.add_edges_from(to_edges(part))
    return G

def get_family_dict(components_list,dataframe,dictionary,column1,column2,column3):
    count = 0
    for family in list(components_list):
        count += 1
        for fam_member in family:
            dictionary['GCF%s'%count].append(fam_member)
    return dictionary

def main_get_families(input_file):
    bigscape_df = parse_bigscape_df(input_file,1)
    strain_list = list(np.unique([bigscape_df['Clustername_1']]+[bigscape_df['Clustername_2']]))
    targets_list = np.unique([bigscape_df.Clustername_1,bigscape_df.Clustername_2])
    neighbors_list = []
    for target in targets_list:
        neighbors_list.append(get_neighbors(target,bigscape_df,'Clustername_1','Clustername_2'))
    G = to_graph(neighbors_list)
    C = connected_components(G)
    gcf_dict = defaultdict(list)
    gcf_dict = get_family_dict(C,bigscape_df,gcf_dict,'Clustername_1','Clustername_2','Raw_distance')
    return bigscape_df,gcf_dict,strain_list


bigscape_df,bigscape_dict,strain_list = main_get_families("./bigscape_all_c030.txt")
bigscape_df["Raw_distance"] = 1-bigscape_df["Raw_distance"]

bigscape_dict

defaultdict(list,
            {'GCF1': ['BGC0000004.1.region001',
              'BGC0000009.1.region001',
              'BGC0000008.1.region001',
              'BGC0000007.1.region001',
              'BGC0000006.1.region001'],
             'GCF2': ['BGC0001511.1.region001', 'BGC0000020.1.region001'],
             'GCF3': ['BGC0001349.1.region001',
              'BGC0000029.1.region001',
              'BGC0000097.1.region001'],
             'GCF4': ['BGC0000031.1.region001', 'BGC0001533.1.region001'],
             'GCF5': ['BGC0000061.1.region001', 'BGC0000034.1.region001'],
             'GCF6': ['BGC0000047.1.region001',
              'BGC0001396.1.region001',
              'BGC0000035.1.region001'],
             'GCF7': ['BGC0000098.1.region001', 'BGC0000039.1.region001'],
             'GCF8': ['BGC0000051.1.region001', 'BGC0001580.1.region001'],
             'GCF9': ['BGC0002032.1.region001', 'BGC0000059.1.region001'],
             'GCF10': ['BGC0000063.1.region001', 'BGC0000062.1.re

In [19]:
len(bigscape_dict)

192

In [20]:
bigscape_df2 = parse_bigscape_df("./bigscape_all_c030.txt",0)
all_clusters = list(np.unique([bigscape_df2['Clustername_1']]+[bigscape_df2['Clustername_2']]))

len(all_clusters)

1627

In [21]:
strain_list_renamed = []

for item in strain_list:
    strain_list_renamed.append(item.split('.')[0])
    
strain_list_renamed = list(np.unique(strain_list_renamed))

similarity_df = pd.DataFrame(columns=strain_list_renamed, index=range(0,len(strain_list_renamed)-1))
index_row = 0
row_names = []
for gcf in bigscape_dict:
    for cluster in bigscape_dict[gcf]:
        row_names.append(cluster.split('.')[0])
        temp_dict = {}
        self = cluster.split(".")[0]
        temp_dict[self] = [1]
        temp_df = bigscape_df[bigscape_df.Clustername_1.str.contains(cluster) | 
                              bigscape_df.Clustername_2.str.contains(cluster)]
        for i,r in temp_df.iterrows():
            if temp_df.Clustername_1.loc[i] == cluster:
                target = temp_df.Clustername_2.loc[i]
                target = str(target).split(".")[0]
                if target not in temp_dict:
                    temp_dict[target] = [temp_df.Raw_distance.loc[i]]
                else:
                    temp_dict[target] = temp_dict[target]+[temp_df.Raw_distance.loc[i]]
            else:
                target = temp_df.Clustername_1.loc[i]
                target = str(target).split(".")[0]
                if target not in temp_dict.keys():
                    temp_dict[target] = [temp_df.Raw_distance.loc[i]]
                else:
                    temp_dict[target] = temp_dict[target]+[temp_df.Raw_distance.loc[i]]
        for key in temp_dict:
            if len(temp_dict[key]) > 1:
                new_value = max(temp_dict[key])
                temp_dict[key] = new_value
            else:
                temp_dict[key] = temp_dict[key][0]
        similarity_df.loc[index_row] = pd.Series(temp_dict)
        index_row += 1
similarity_df.fillna(0,inplace=True)
similarity_df.index = row_names
similarity_df = similarity_df[~similarity_df.index.duplicated(keep='first')]
similarity_df = similarity_df.sort_index(axis=0)

len(strain_list_renamed),len(similarity_df)

(520, 520)

In [22]:
similarity_df

Unnamed: 0,BGC0000004,BGC0000006,BGC0000007,BGC0000008,BGC0000009,BGC0000020,BGC0000029,BGC0000031,BGC0000034,BGC0000035,...,BGC0002012,BGC0002015,BGC0002016,BGC0002022,BGC0002023,BGC0002027,BGC0002029,BGC0002030,BGC0002032,BGC0002033
BGC0000004,1.000000,0.976806,0.953003,0.976330,0.880436,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BGC0000006,0.976806,1.000000,0.972632,0.997759,0.895077,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BGC0000007,0.953003,0.972632,1.000000,0.972158,0.893131,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BGC0000008,0.976330,0.997759,0.972158,1.000000,0.894979,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BGC0000009,0.880436,0.895077,0.893131,0.894979,1.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BGC0002027,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
BGC0002029,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
BGC0002030,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
BGC0002032,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [23]:
cluster_list_renamed = []

for item in all_clusters:
    cluster_list_renamed.append(item.split('.')[0])
    
cluster_list_renamed = list(np.unique(cluster_list_renamed))

In [24]:
orphan_list = []

for cluster in cluster_list_renamed:
    if cluster not in similarity_df.index:
        orphan_list.append(cluster)
        new_col = []
        for index in similarity_df.index:
            new_col.append(0)
        similarity_df[cluster] = new_col
        
similarity_df = similarity_df.sort_index(axis=1)
        
similarity_df

Unnamed: 0,BGC0000001,BGC0000002,BGC0000003,BGC0000004,BGC0000006,BGC0000007,BGC0000008,BGC0000009,BGC0000011,BGC0000012,...,BGC0002026,BGC0002027,BGC0002028,BGC0002029,BGC0002030,BGC0002032,BGC0002033,BGC0002034,BGC0002035,BGC0002036
BGC0000004,0,0,0,1.000000,0.976806,0.953003,0.976330,0.880436,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0
BGC0000006,0,0,0,0.976806,1.000000,0.972632,0.997759,0.895077,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0
BGC0000007,0,0,0,0.953003,0.972632,1.000000,0.972158,0.893131,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0
BGC0000008,0,0,0,0.976330,0.997759,0.972158,1.000000,0.894979,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0
BGC0000009,0,0,0,0.880436,0.895077,0.893131,0.894979,1.000000,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BGC0002027,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,...,0,1.0,0,0.0,0.0,0.0,0.0,0,0,0
BGC0002029,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,...,0,0.0,0,1.0,0.0,0.0,0.0,0,0,0
BGC0002030,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,...,0,0.0,0,0.0,1.0,0.0,0.0,0,0,0
BGC0002032,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,...,0,0.0,0,0.0,0.0,1.0,0.0,0,0,0


In [25]:
for cluster in cluster_list_renamed:
    if cluster not in similarity_df.index:
        new_row = []
        for col in similarity_df.columns:
            if cluster == col:
                new_row.append(1)
            else:
                new_row.append(0)
        similarity_df.loc[cluster] = new_row
        
similarity_df

Unnamed: 0,BGC0000001,BGC0000002,BGC0000003,BGC0000004,BGC0000006,BGC0000007,BGC0000008,BGC0000009,BGC0000011,BGC0000012,...,BGC0002026,BGC0002027,BGC0002028,BGC0002029,BGC0002030,BGC0002032,BGC0002033,BGC0002034,BGC0002035,BGC0002036
BGC0000004,0,0,0,1.000000,0.976806,0.953003,0.976330,0.880436,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0
BGC0000006,0,0,0,0.976806,1.000000,0.972632,0.997759,0.895077,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0
BGC0000007,0,0,0,0.953003,0.972632,1.000000,0.972158,0.893131,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0
BGC0000008,0,0,0,0.976330,0.997759,0.972158,1.000000,0.894979,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0
BGC0000009,0,0,0,0.880436,0.895077,0.893131,0.894979,1.000000,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BGC0002026,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,...,1,0.0,0,0.0,0.0,0.0,0.0,0,0,0
BGC0002028,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,...,0,0.0,1,0.0,0.0,0.0,0.0,0,0,0
BGC0002034,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,1,0,0
BGC0002035,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,1,0


# Filtering and merging dataframes

In [26]:
filt_class_df = class_df.loc[similarity_df.index, :]

filt_class_df

Unnamed: 0,PKS,Other,Alkaloid,Oligosaccharide,Terpene,NRPS,Cyclitol,Aminocoumarin,Betalactam,Siderophore,Pyrrolobenzodiazepine,RiPP,Butyrolactone,Nucleoside,Phenazine,Aminoglycoside,tRNA_derived,Phosphonate
BGC0000004,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000006,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000007,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000008,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000009,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BGC0002026,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
BGC0002028,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0002034,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0002035,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
merged_training_df = filt_class_df.join(similarity_df, how='outer')

merged_training_df

Unnamed: 0,PKS,Other,Alkaloid,Oligosaccharide,Terpene,NRPS,Cyclitol,Aminocoumarin,Betalactam,Siderophore,...,BGC0002026,BGC0002027,BGC0002028,BGC0002029,BGC0002030,BGC0002032,BGC0002033,BGC0002034,BGC0002035,BGC0002036
BGC0000004,1,0,0,0,0,0,0,0,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0
BGC0000006,1,0,0,0,0,0,0,0,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0
BGC0000007,1,0,0,0,0,0,0,0,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0
BGC0000008,1,0,0,0,0,0,0,0,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0
BGC0000009,1,0,0,0,0,0,0,0,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BGC0002026,0,0,0,0,0,1,0,0,0,0,...,1,0.0,0,0.0,0.0,0.0,0.0,0,0,0
BGC0002028,1,0,0,0,0,0,0,0,0,0,...,0,0.0,1,0.0,0.0,0.0,0.0,0,0,0
BGC0002034,0,1,0,0,0,0,0,0,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,1,0,0
BGC0002035,1,0,0,0,0,1,0,0,0,0,...,0,0.0,0,0.0,0.0,0.0,0.0,0,1,0


In [28]:
label_col = []

for i,r in merged_training_df.iterrows():
    if i in mibig_activity_dict:
        label_col.append(mibig_activity_dict[i])
    else:
        label_col.append('unlabeled')
        
label_col

['unlabeled',
 'unlabeled',
 'unlabeled',
 'unlabeled',
 'unlabeled',
 'cytotoxic',
 'antibacterial',
 'antibacterial-cytotoxic',
 'antifungal',
 'antibacterial',
 'unlabeled',
 'antibacterial',
 'antifungal',
 'antifungal',
 'antifungal',
 'unlabeled',
 'unlabeled',
 'unlabeled',
 'antifungal-cytotoxic',
 'antifungal-cytotoxic',
 'cytotoxic',
 'unlabeled',
 'unlabeled',
 'antifungal-cytotoxic',
 'antibacterial-cytotoxic',
 'antibacterial',
 'antibacterial',
 'unlabeled',
 'cytotoxic',
 'antibacterial',
 'antifungal',
 'unlabeled',
 'antibacterial',
 'antifungal',
 'antibacterial',
 'antifungal',
 'unlabeled',
 'antibacterial-antifungal',
 'cytotoxic',
 'antifungal',
 'antibacterial',
 'antibacterial',
 'antibacterial',
 'antibacterial',
 'cytotoxic',
 'antifungal-cytotoxic',
 'antifungal-cytotoxic',
 'antifungal-cytotoxic',
 'antibacterial',
 'antiviral',
 'cytotoxic',
 'cytotoxic',
 'antibacterial',
 'unknown',
 'unlabeled',
 'antibacterial-cytotoxic',
 'antibacterial-cytotoxic',
 'a

In [29]:
merged_training_df['label'] = label_col

labeled_training_df = merged_training_df[merged_training_df['label'] != 'unlabeled']

labeled_training_df

Unnamed: 0,PKS,Other,Alkaloid,Oligosaccharide,Terpene,NRPS,Cyclitol,Aminocoumarin,Betalactam,Siderophore,...,BGC0002027,BGC0002028,BGC0002029,BGC0002030,BGC0002032,BGC0002033,BGC0002034,BGC0002035,BGC0002036,label
BGC0000020,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,cytotoxic
BGC0000029,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,antibacterial
BGC0000031,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,antibacterial-cytotoxic
BGC0000034,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,antifungal
BGC0000035,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,antibacterial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BGC0001979,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,cytotoxic
BGC0002010,1,0,0,0,0,1,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,antibacterial
BGC0002013,0,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,cytotoxic
BGC0002028,1,0,0,0,0,0,0,0,0,0,...,0.0,1,0.0,0.0,0.0,0.0,0,0,0,antibacterial


In [30]:
np.unique(labeled_training_df['label'])

array(['antibacterial', 'antibacterial-antifungal',
       'antibacterial-antifungal-cytotoxic', 'antibacterial-cytotoxic',
       'antifungal', 'antifungal-cytotoxic', 'antihelmintic',
       'antioxidant', 'antiprotozoa', 'antiviral', 'cytotoxic',
       'cytotoxic-unknown', 'herbicide', 'inhibitor', 'other', 'pigment',
       'regulatory', 'siderophore', 'unknown'], dtype=object)

In [31]:
list(labeled_training_df.columns)

['PKS',
 'Other',
 'Alkaloid',
 'Oligosaccharide',
 'Terpene',
 'NRPS',
 'Cyclitol',
 'Aminocoumarin',
 'Betalactam',
 'Siderophore',
 'Pyrrolobenzodiazepine',
 'RiPP',
 'Butyrolactone',
 'Nucleoside',
 'Phenazine',
 'Aminoglycoside',
 'tRNA_derived',
 'Phosphonate',
 'BGC0000001',
 'BGC0000002',
 'BGC0000003',
 'BGC0000004',
 'BGC0000006',
 'BGC0000007',
 'BGC0000008',
 'BGC0000009',
 'BGC0000011',
 'BGC0000012',
 'BGC0000013',
 'BGC0000014',
 'BGC0000017',
 'BGC0000018',
 'BGC0000020',
 'BGC0000021',
 'BGC0000022',
 'BGC0000023',
 'BGC0000024',
 'BGC0000026',
 'BGC0000027',
 'BGC0000028',
 'BGC0000029',
 'BGC0000030',
 'BGC0000031',
 'BGC0000032',
 'BGC0000033',
 'BGC0000034',
 'BGC0000035',
 'BGC0000036',
 'BGC0000037',
 'BGC0000038',
 'BGC0000039',
 'BGC0000040',
 'BGC0000041',
 'BGC0000042',
 'BGC0000043',
 'BGC0000044',
 'BGC0000045',
 'BGC0000046',
 'BGC0000047',
 'BGC0000048',
 'BGC0000050',
 'BGC0000051',
 'BGC0000052',
 'BGC0000053',
 'BGC0000054',
 'BGC0000055',
 'BGC0000056

# Running cross-validatin for class and similarity

In [32]:
from sklearn.model_selection import StratifiedKFold

def get_filt_prec(jaccard_df,precision_dict):
    for cutoff in [0.3,0.5,0.7,0.9]:
        filtered_jaccard_df = jaccard_df[jaccard_df['jaccard_index'] >= cutoff]
        if len(filtered_jaccard_df) > 0:
            count = 0
            for i,r in filtered_jaccard_df.iterrows():
                if r['predicted_class'][0] not in r['true_class']:
                    count += 1
            precision_filtered = ((len(filtered_jaccard_df)-count)/len(filtered_jaccard_df))*100
            precision_score = round(precision_filtered,2)
            print('cutoff=%s'%cutoff,len(filtered_jaccard_df)-count,count,len(filtered_jaccard_df),precision_score)
            if cutoff not in precision_dict.keys():
                precision_dict[cutoff] = [precision_score]
            else:
                precision_dict[cutoff] = precision_dict[cutoff] + [precision_score]
    return precision_dict

X = labeled_training_df.drop('label',axis=1)
y = labeled_training_df['label']

precision_dict = {}

skf = StratifiedKFold(n_splits=5, random_state=1066, shuffle=True)
for train_index, test_index in skf.split(X, y):
    training_df = labeled_training_df.iloc[train_index]
    testing_df = labeled_training_df.iloc[test_index]
    true_labels = testing_df['label']
    testing_df = testing_df.drop('label',axis=1)
    X_div = training_df.drop("label", axis=1)
    y_div = training_df["label"]
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(X_div,y_div)
    distances, indices = nbrs.kneighbors(testing_df)
    y_div = y_div.reset_index(drop=True)
    neighbors_array = []
    for item in indices:
        candidate_list = []
        for i in range(1):
            candidate_list.append(y_div[item[i]])
        neighbors_array.append(candidate_list)
    neighbors_array = np.asarray(neighbors_array)
    correct_count = 0
    incorrect_count = 0
    predict_col,true_col = [],[]
    for i,item in enumerate(true_labels):
        predict_col.append(neighbors_array[i])
        true_col.append(item)
        if item in neighbors_array[i]:
            correct_count += 1
        else:
            incorrect_count += 1
    precision_score = round(correct_count/len(true_labels)*100,2)
    print('cutoff=0.0',correct_count,incorrect_count,len(true_labels),precision_score)
    if 0.0 not in precision_dict.keys():
        precision_dict[0.0] = [precision_score]
    else:
        precision_dict[0.0] = precision_dict[0.0] + [precision_score]
    def get_binary(fingerprint):
        new_row = []
        for index,item in enumerate(fingerprint):
            if item != 0:
                new_row.append(1)
            else:
                new_row.append(0)
        fingerprint = new_row
        return fingerprint
    jaccard_col,bgc_col = [],[]
    for i,bgc_id in enumerate(testing_df.index):
        bgc_col.append(bgc_id)
        count = 0
        train_index = int(indices[i][0])
        for j,r in training_df.iterrows():
            if count == train_index:
                training_fp = training_df.loc[j].drop('label')
            count += 1
        testing_fp = testing_df.loc[bgc_id]
        training_binary = get_binary(training_fp)
        testing_binary = get_binary(testing_fp)
        jaccard_index = jaccard_score(training_binary,testing_binary)
        jaccard_col.append(round(jaccard_index,2))
    jaccard_dict = {'bgc_ID': bgc_col, 'predicted_class': predict_col, 'true_class': true_col, 'jaccard_index': jaccard_col}
    jaccard_df = pd.DataFrame(jaccard_dict)
    jaccard_df = jaccard_df.sort_values('jaccard_index',ascending = False)
    jaccard_df = jaccard_df.reset_index(drop=True)
    precision_dict = get_filt_prec(jaccard_df,precision_dict)



cutoff=0.0 66 116 182 36.26
cutoff=0.3 77 93 170 45.29
cutoff=0.5 43 31 74 58.11
cutoff=0.7 28 7 35 80.0
cutoff=0.9 18 4 22 81.82
cutoff=0.0 75 107 182 41.21
cutoff=0.3 92 81 173 53.18
cutoff=0.5 57 24 81 70.37
cutoff=0.7 41 7 48 85.42
cutoff=0.9 28 4 32 87.5
cutoff=0.0 75 107 182 41.21
cutoff=0.3 89 87 176 50.57
cutoff=0.5 55 23 78 70.51
cutoff=0.7 42 9 51 82.35
cutoff=0.9 26 2 28 92.86
cutoff=0.0 75 106 181 41.44
cutoff=0.3 98 77 175 56.0
cutoff=0.5 59 21 80 73.75
cutoff=0.7 45 10 55 81.82
cutoff=0.9 34 7 41 82.93
cutoff=0.0 77 104 181 42.54
cutoff=0.3 92 82 174 52.87
cutoff=0.5 53 30 83 63.86
cutoff=0.7 37 10 47 78.72
cutoff=0.9 30 8 38 78.95


In [33]:
for key in precision_dict:
    print(round(np.average(precision_dict[key]),2))

40.53
51.58
67.32
81.66
84.81


# Creating substructure dataframe

In [34]:
def get_spec_dict(genomeID):
    gbk_paths = glob.glob('/Users/tiagoferreiraleao/Dropbox/tiago-NAS/BGClassifier/antismash_results_all/%s/*region*.gbk'%genomeID)
    gbk_paths.sort()
    spec_dict = {}
    col_list = []
    for gbk_file in gbk_paths:
        input_handle = open(gbk_file)
        for seq_record in SeqIO.parse(input_handle,'genbank'):
            sub_list = []
            for feature in seq_record.features:
                if feature.type == 'aSDomain':
                    if 'specificity' in feature.qualifiers:
                        sub_list.append(feature.qualifiers['specificity'])
                        if feature.qualifiers['specificity'] not in col_list:
                            col_list.append(feature.qualifiers['specificity'])
            spec_dict[os.path.basename(gbk_file)] = sub_list
    return spec_dict,col_list

In [35]:
def split_string_at(s, c, n):
    words = s.split(c)
    return c.join(words[:n]), c.join(words[n:])

antismash_glob = glob.glob('/Users/tiagoferreiraleao/Dropbox/tiago-NAS/BGClassifier/antismash_results_all/*/')

for antismash_path in antismash_glob:
    genomeID = split_string_at(antismash_path,'/',7)[1].rstrip('/')
    if antismash_path == antismash_glob[0]:
        spec_dict_final,col_list_final = get_spec_dict(genomeID)
    else:
        spec_dict_temp,col_list_temp = get_spec_dict(genomeID)
        spec_dict_final.update(spec_dict_temp)
        for spec_name in col_list_temp:
            if spec_name not in col_list_final:
                col_list_final.append(spec_name)
    print(genomeID,len(spec_dict_final),len(col_list_final))

BGC0001102
BGC0001102 1 5
BGC0001330
BGC0001330 2 11
BGC0000686
BGC0000686 2 11
BGC0001754
BGC0001754 3 15
BGC0001566
BGC0001566 4 15
BGC0000672
BGC0000672 7 15
BGC0000440
BGC0000440 8 18
BGC0001592
BGC0001592 9 18
BGC0000818
BGC0000818 10 18
BGC0000024
BGC0000024 11 19
BGC0000216
BGC0000216 12 19
BGC0001559
BGC0001559 13 19
BGC0001901
BGC0001901 14 20
BGC0000229
BGC0000229 15 20
BGC0000827
BGC0000827 16 20
BGC0000211
BGC0000211 17 20
BGC0000023
BGC0000023 18 23
BGC0000447
BGC0000447 19 27
BGC0001595
BGC0001595 20 27
BGC0000675
BGC0000675 21 27
BGC0001561
BGC0001561 22 27
BGC0001753
BGC0001753 23 27
BGC0001337
BGC0001337 24 27
BGC0001939
BGC0001939 25 29
BGC0001105
BGC0001105 26 29
BGC0000820
BGC0000820 26 29
BGC0001906
BGC0001906 27 29
BGC0000471
BGC0000471 28 29
BGC0001791
BGC0001791 29 31
BGC0000643
BGC0000643 30 31
BGC0000227
BGC0000227 31 31
BGC0000829
BGC0000829 32 31
BGC0001301
BGC0001301 33 31
BGC0001133
BGC0001133 34 31
BGC0001557
BGC0001557 36 32
BGC0000485
BGC0000485 37 32
B

BGC0001288 296 72
BGC0001415
BGC0001415 297 72
BGC0001627
BGC0001627 298 72
BGC0000391
BGC0000391 299 72
BGC0001071
BGC0001071 300 72
BGC0000365
BGC0000365 301 73
BGC0001085
BGC0001085 302 73
BGC0000157
BGC0000157 303 73
BGC0000533
BGC0000533 304 73
BGC0000701
BGC0000701 305 73
BGC0001881
BGC0001881 306 73
BGC0000953
BGC0000953 307 73
BGC0000739
BGC0000739 307 73
BGC0001049
BGC0001049 308 73
BGC0000706
BGC0000706 308 73
BGC0000534
BGC0000534 309 73
BGC0001082
BGC0001082 310 73
BGC0000150
BGC0000150 311 73
BGC0000362
BGC0000362 312 73
BGC0000998
BGC0000998 313 73
BGC0001076
BGC0001076 314 73
BGC0001244
BGC0001244 315 73
BGC0000396
BGC0000396 316 73
BGC0001620
BGC0001620 317 73
BGC0001412
BGC0001412 317 73
BGC0000965
BGC0000965 318 73
BGC0000159
BGC0000159 319 73
BGC0000991
BGC0000991 320 73
BGC0001888
BGC0001888 320 73
BGC0000354
BGC0000354 321 74
BGC0001286
BGC0001286 321 74
BGC0000730
BGC0000730 321 74
BGC0000502
BGC0000502 322 74
BGC0001616
BGC0001616 322 74
BGC0001424
BGC0001424 323

BGC0001065 551 84
BGC0001257
BGC0001257 552 84
BGC0000385
BGC0000385 553 84
BGC0001091
BGC0001091 554 84
BGC0000143
BGC0000143 555 84
BGC0000371
BGC0000371 556 84
BGC0000715
BGC0000715 557 84
BGC0000527
BGC0000527 558 84
BGC0001268
BGC0001268 559 84
BGC0001866
BGC0001866 560 84
BGC0000188
BGC0000188 560 84
BGC0000518
BGC0000518 561 84
BGC0001892
BGC0001892 562 84
BGC0000940
BGC0000940 563 84
BGC0000520
BGC0000520 563 84
BGC0000712
BGC0000712 564 84
BGC0000376
BGC0000376 565 84
BGC0000978
BGC0000978 566 84
BGC0001096
BGC0001096 567 84
BGC0000144
BGC0000144 568 84
BGC0001250
BGC0001250 569 84
BGC0000382
BGC0000382 570 84
BGC0001062
BGC0001062 571 84
BGC0001406
BGC0001406 572 84
BGC0001634
BGC0001634 573 84
BGC0001895
BGC0001895 574 84
BGC0000947
BGC0000947 575 84
BGC0000349
BGC0000349 576 84
BGC0001439
BGC0001439 577 84
BGC0001861
BGC0001861 578 84
BGC0000340
BGC0000340 579 84
BGC0001292
BGC0001292 580 84
BGC0000172
BGC0000172 581 84
BGC0000516
BGC0000516 582 84
BGC0000724
BGC0000724 583

BGC0000090 791 92
BGC0001370
BGC0001370 792 92
BGC0001526
BGC0001526 793 92
BGC0000632
BGC0000632 794 92
BGC0000400
BGC0000400 795 92
BGC0000064
BGC0000064 796 92
BGC0000858
BGC0000858 797 92
BGC0001384
BGC0001384 798 92
BGC0000256
BGC0000256 799 92
BGC0001519
BGC0001519 800 92
BGC0001941
BGC0001941 801 93
BGC0000893
BGC0000893 802 93
BGC0000269
BGC0000269 803 93
BGC0001189
BGC0001189 804 93
BGC0000867
BGC0000867 804 93
BGC0001915
BGC0001915 804 93
BGC0000833
BGC0000833 805 93
BGC0000659
BGC0000659 806 93
BGC0001324
BGC0001324 807 93
BGC0001116
BGC0001116 808 93
BGC0001572
BGC0001572 809 93
BGC0000692
BGC0000692 810 93
BGC0001740
BGC0001740 811 93
BGC0000454
BGC0000454 812 93
BGC0001586
BGC0001586 812 93
BGC0000666
BGC0000666 812 93
BGC0000202
BGC0000202 813 93
BGC0000030
BGC0000030 814 93
BGC0000834
BGC0000834 815 93
BGC0000008
BGC0000008 816 93
BGC0001912
BGC0001912 817 93
BGC0000498
BGC0000498 818 93
BGC0001778
BGC0001778 819 93
BGC0000037
BGC0000037 820 93
BGC0000205
BGC0000205 821

BGC0001332 1086 94
BGC0001756
BGC0001756 1087 94
BGC0000684
BGC0000684 1088 94
BGC0001564
BGC0001564 1089 94
BGC0000019
BGC0000019 1089 94
BGC0000825
BGC0000825 1090 94
BGC0000489
BGC0000489 1091 94
BGC0001769
BGC0001769 1092 94
BGC0001903
BGC0001903 1093 94
BGC0001563
BGC0001563 1094 94
BGC0001751
BGC0001751 1095 94
BGC0000683
BGC0000683 1096 94
BGC0001335
BGC0001335 1097 94
BGC0000213
BGC0000213 1098 94
BGC0000021
BGC0000021 1099 94
BGC0001597
BGC0001597 1100 94
BGC0000445
BGC0000445 1101 94
BGC0000677
BGC0000677 1101 94
BGC0001138
BGC0001138 1101 94
BGC0001904
BGC0001904 1102 94
BGC0000648
BGC0000648 1103 94
BGC0000822
BGC0000822 1104 94
BGC0000950
BGC0000950 1105 94
BGC0001882
BGC0001882 1105 94
BGC0000508
BGC0000508 1106 94
BGC0000198
BGC0000198 1107 94
BGC0000537
BGC0000537 1108 94
BGC0000705
BGC0000705 1109 94
BGC0000361
BGC0000361 1110 94
BGC0000153
BGC0000153 1111 96
BGC0000395
BGC0000395 1112 96
BGC0001849
BGC0001849 1113 96
BGC0001075
BGC0001075 1114 96
BGC0001411
BGC0001411

BGC0000115 1312 101
BGC0000571
BGC0000571 1313 101
BGC0000743
BGC0000743 1313 101
BGC0002014
BGC0002014 1314 101
BGC0001457
BGC0001457 1315 101
BGC0000585
BGC0000585 1316 101
BGC0001665
BGC0001665 1317 101
BGC0001201
BGC0001201 1318 101
BGC0001033
BGC0001033 1319 101
BGC0000920
BGC0000920 1319 101
BGC0001698
BGC0001698 1320 101
BGC0000578
BGC0000578 1321 101
BGC0001806
BGC0001806 1322 101
BGC0000547
BGC0000547 1323 101
BGC0001495
BGC0001495 1324 101
BGC0002022
BGC0002022 1325 101
BGC0000775
BGC0000775 1325 101
BGC0000311
BGC0000311 1326 101
BGC0000123
BGC0000123 1327 102
BGC0001237
BGC0001237 1328 102
BGC0001839
BGC0001839 1329 102
BGC0001005
BGC0001005 1330 102
BGC0001461
BGC0001461 1331 102
BGC0000781
BGC0000781 1331 102
BGC0001653
BGC0001653 1332 102
BGC0001801
BGC0001801 1333 102
BGC0001459
BGC0001459 1334 102
BGC0000329
BGC0000329 1335 102
BGC0000786
BGC0000786 1335 102
BGC0001654
BGC0001654 1335 102
BGC0001002
BGC0001002 1336 102
BGC0001230
BGC0001230 1337 102
BGC0000918
BGC00009

BGC0001113 1594 110
BGC0001321
BGC0001321 1596 110
BGC0000035
BGC0000035 1597 110
BGC0000809
BGC0000809 1598 110
BGC0000207
BGC0000207 1598 110
BGC0000663
BGC0000663 1599 110
BGC0001583
BGC0001583 1600 110
BGC0000451
BGC0000451 1600 110
BGC0001523
BGC0001523 1601 110
BGC0001711
BGC0001711 1602 110
BGC0001375
BGC0001375 1603 110
BGC0000253
BGC0000253 1604 110
BGC0000061
BGC0000061 1605 110
BGC0000405
BGC0000405 1606 110
BGC0000637
BGC0000637 1607 110
BGC0000896
BGC0000896 1607 110
BGC0001944
BGC0001944 1608 110
BGC0001178
BGC0001178 1609 110
BGC0000298
BGC0000298 1610 110
BGC0000608
BGC0000608 1611 110
BGC0000862
BGC0000862 1612 110
BGC0000630
BGC0000630 1613 110
BGC0000402
BGC0000402 1614 110
BGC0001988
BGC0001988 1615 110
BGC0000066
BGC0000066 1616 110
BGC0000254
BGC0000254 1617 110
BGC0001386
BGC0001386 1618 110
BGC0000092
BGC0000092 1619 110
BGC0001140
BGC0001140 1619 110
BGC0001372
BGC0001372 1620 110
BGC0001716
BGC0001716 1621 110
BGC0001524
BGC0001524 1622 110
BGC0000865
BGC00008

In [36]:
def filter_col_list(col_list_final):
    for spec_col in col_list_final:
        if 'inactive' in str(spec_col):
            print(spec_col)
            col_list_final.remove(spec_col)

filter_col_list(col_list_final)
filter_col_list(col_list_final) ### I'm not sure why I need to run twice to remove all the inactive

col_list_final

['KR activity: inactive', 'KR stereochemistry: C2']
['KR activity: inactive', 'KR stereochemistry: C1']
['KR activity: inactive', 'KR stereochemistry: (unknown)']
['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: inactive']
['KR activity: inactive', 'KR stereochemistry: A1']
['consensus: pk', 'PKS signature: (unknown)', 'Minowa: inactive']


[['consensus: X'],
 ['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: mal'],
 ['KR activity: active', 'KR stereochemistry: B1'],
 ['Minowa: NH2'],
 ['consensus: mmal', 'PKS signature: Malonyl-CoA', 'Minowa: mmal'],
 ['KR activity: active', 'KR stereochemistry: A1'],
 ['consensus: mal', 'PKS signature: (unknown)', 'Minowa: mal'],
 ['consensus: gly'],
 ['consensus: ser'],
 ['KR activity: active', 'KR stereochemistry: (unknown)'],
 ['consensus: thr'],
 ['consensus: tyr'],
 ['consensus: val'],
 ['consensus: ala'],
 ['consensus: hpg'],
 ['consensus: dhpg'],
 ['consensus: bht'],
 ['consensus: mmal', 'PKS signature: Methylmalonyl-CoA', 'Minowa: mmal'],
 ['consensus: pk', 'PKS signature: Malonyl-CoA', 'Minowa: mxmal'],
 ['consensus: mxmal', 'PKS signature: (unknown)', 'Minowa: mxmal'],
 ['consensus: mmal', 'PKS signature: Methylmalonyl-CoA', 'Minowa: benz'],
 ['consensus: pro'],
 ['consensus: leu'],
 ['consensus: gln'],
 ['consensus: dab'],
 ['consensus: pk', 'PKS signature: Methylmalo

In [37]:
len(col_list_final)

105

In [38]:
presence_dict = {}

for key in spec_dict_final.keys():
    short_key = key.split('.1')[0]
    values_list = []
    for i,v in enumerate(col_list_final):
        if v in spec_dict_final[key]:
            values_list.append(1)
        else:
            values_list.append(0)
    presence_dict[short_key] = values_list

presence_df = pd.DataFrame.from_dict(presence_dict).T

name_dict = dict(zip(presence_df.columns,col_list_final))

presence_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
BGC0001102,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0001330,1,1,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
BGC0001754,1,1,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
BGC0001566,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000672,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BGC0000863,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000299,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0001179,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000897,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
for key in name_dict:
    name_dict[key] = str(name_dict[key])
    
presence_df = presence_df.rename(columns = name_dict, inplace = False)

presence_df = presence_df.sort_index()

presence_df

Unnamed: 0,['consensus: X'],"['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: mal']","['KR activity: active', 'KR stereochemistry: B1']",['Minowa: NH2'],"['consensus: mmal', 'PKS signature: Malonyl-CoA', 'Minowa: mmal']","['KR activity: active', 'KR stereochemistry: A1']","['consensus: mal', 'PKS signature: (unknown)', 'Minowa: mal']",['consensus: gly'],['consensus: ser'],"['KR activity: active', 'KR stereochemistry: (unknown)']",...,"['consensus: pk', 'PKS signature: CHC-CoA', 'Minowa: CHC-CoA']",['consensus: LDAP'],['consensus: aad'],"['consensus: pk', 'PKS signature: Isobutyryl-CoA', 'Minowa: 2metbut']",['consensus: his'],"['consensus: mal', 'PKS signature: 2-Rhyd-Malonyl-CoA', 'Minowa: mal']",['consensus: cap'],['consensus: alpha-hydroxy-isocaproic-acid'],"['consensus: mmal', 'PKS signature: 2-Methylbutyryl-CoA', 'Minowa: mmal']",['consensus: lys-b']
BGC0000001,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000002,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000003,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000004,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000006,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
main-chr.region008.gbk,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
main-chr.region009.gbk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
main-chr.region010.gbk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
main-chr.region011.gbk,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [40]:
presence_df = presence_df.loc[labeled_training_df.index]

presence_df

Unnamed: 0,['consensus: X'],"['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: mal']","['KR activity: active', 'KR stereochemistry: B1']",['Minowa: NH2'],"['consensus: mmal', 'PKS signature: Malonyl-CoA', 'Minowa: mmal']","['KR activity: active', 'KR stereochemistry: A1']","['consensus: mal', 'PKS signature: (unknown)', 'Minowa: mal']",['consensus: gly'],['consensus: ser'],"['KR activity: active', 'KR stereochemistry: (unknown)']",...,"['consensus: pk', 'PKS signature: CHC-CoA', 'Minowa: CHC-CoA']",['consensus: LDAP'],['consensus: aad'],"['consensus: pk', 'PKS signature: Isobutyryl-CoA', 'Minowa: 2metbut']",['consensus: his'],"['consensus: mal', 'PKS signature: 2-Rhyd-Malonyl-CoA', 'Minowa: mal']",['consensus: cap'],['consensus: alpha-hydroxy-isocaproic-acid'],"['consensus: mmal', 'PKS signature: 2-Methylbutyryl-CoA', 'Minowa: mmal']",['consensus: lys-b']
BGC0000020,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000029,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000031,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000034,1,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000035,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BGC0001979,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0002010,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0002013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0002028,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Creating final training dataframe

In [41]:
final_training_df = labeled_training_df.join(presence_df, how='outer')

final_training_df

Unnamed: 0,PKS,Other,Alkaloid,Oligosaccharide,Terpene,NRPS,Cyclitol,Aminocoumarin,Betalactam,Siderophore,...,"['consensus: pk', 'PKS signature: CHC-CoA', 'Minowa: CHC-CoA']",['consensus: LDAP'],['consensus: aad'],"['consensus: pk', 'PKS signature: Isobutyryl-CoA', 'Minowa: 2metbut']",['consensus: his'],"['consensus: mal', 'PKS signature: 2-Rhyd-Malonyl-CoA', 'Minowa: mal']",['consensus: cap'],['consensus: alpha-hydroxy-isocaproic-acid'],"['consensus: mmal', 'PKS signature: 2-Methylbutyryl-CoA', 'Minowa: mmal']",['consensus: lys-b']
BGC0000020,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000029,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000031,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000034,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000035,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BGC0001979,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0002010,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0002013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0002028,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Running cross-validation with all three features

In [42]:
from sklearn.model_selection import StratifiedKFold

def get_filt_prec(jaccard_df,precision_dict):
    for cutoff in [0.3,0.5,0.7,0.9]:
        filtered_jaccard_df = jaccard_df[jaccard_df['jaccard_index'] >= cutoff]
        if len(filtered_jaccard_df) > 0:
            count = 0
            for i,r in filtered_jaccard_df.iterrows():
                if r['predicted_class'][0] not in r['true_class']:
                    count += 1
            precision_filtered = ((len(filtered_jaccard_df)-count)/len(filtered_jaccard_df))*100
            precision_score = round(precision_filtered,2)
            print('cutoff=%s'%cutoff,len(filtered_jaccard_df)-count,count,len(filtered_jaccard_df),precision_score)
            if cutoff not in precision_dict.keys():
                precision_dict[cutoff] = [precision_score]
            else:
                precision_dict[cutoff] = precision_dict[cutoff] + [precision_score]
    return precision_dict

X = final_training_df.drop('label',axis=1)
y = final_training_df['label']

precision_dict = {}

skf = StratifiedKFold(n_splits=5, random_state=1066, shuffle=True)
for train_index, test_index in skf.split(X, y):
    training_df = final_training_df.iloc[train_index]
    testing_df = final_training_df.iloc[test_index]
    true_labels = testing_df['label']
    testing_df = testing_df.drop('label',axis=1)
    X_div = training_df.drop("label", axis=1)
    y_div = training_df["label"]
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(X_div,y_div)
    distances, indices = nbrs.kneighbors(testing_df)
    y_div = y_div.reset_index(drop=True)
    neighbors_array = []
    for item in indices:
        candidate_list = []
        for i in range(1):
            candidate_list.append(y_div[item[i]])
        neighbors_array.append(candidate_list)
    neighbors_array = np.asarray(neighbors_array)
    correct_count = 0
    incorrect_count = 0
    predict_col,true_col = [],[]
    for i,item in enumerate(true_labels):
        predict_col.append(neighbors_array[i])
        true_col.append(item)
        if item in neighbors_array[i]:
            correct_count += 1
        else:
            incorrect_count += 1
    precision_score = round(correct_count/len(true_labels)*100,2)
    print('cutoff=0.0',correct_count,incorrect_count,len(true_labels),precision_score)
    if 0.0 not in precision_dict.keys():
        precision_dict[0.0] = [precision_score]
    else:
        precision_dict[0.0] = precision_dict[0.0] + [precision_score]
    def get_binary(fingerprint):
        new_row = []
        for index,item in enumerate(fingerprint):
            if item != 0:
                new_row.append(1)
            else:
                new_row.append(0)
        fingerprint = new_row
        return fingerprint
    jaccard_col,bgc_col = [],[]
    for i,bgc_id in enumerate(testing_df.index):
        bgc_col.append(bgc_id)
        count = 0
        train_index = int(indices[i][0])
        for j,r in training_df.iterrows():
            if count == train_index:
                training_fp = training_df.loc[j].drop('label')
            count += 1
        testing_fp = testing_df.loc[bgc_id]
        training_binary = get_binary(training_fp)
        testing_binary = get_binary(testing_fp)
        jaccard_index = jaccard_score(training_binary,testing_binary)
        jaccard_col.append(round(jaccard_index,2))
    jaccard_dict = {'bgc_ID': bgc_col, 'predicted_class': predict_col, 'true_class': true_col, 'jaccard_index': jaccard_col}
    jaccard_df = pd.DataFrame(jaccard_dict)
    jaccard_df = jaccard_df.sort_values('jaccard_index',ascending = False)
    jaccard_df = jaccard_df.reset_index(drop=True)
    precision_dict = get_filt_prec(jaccard_df,precision_dict)



cutoff=0.0 73 109 182 40.11
cutoff=0.3 79 92 171 46.2
cutoff=0.5 64 61 125 51.2
cutoff=0.7 38 15 53 71.7
cutoff=0.9 22 2 24 91.67
cutoff=0.0 81 101 182 44.51
cutoff=0.3 90 86 176 51.14
cutoff=0.5 76 49 125 60.8
cutoff=0.7 45 13 58 77.59
cutoff=0.9 25 4 29 86.21
cutoff=0.0 92 90 182 50.55
cutoff=0.3 109 63 172 63.37
cutoff=0.5 87 38 125 69.6
cutoff=0.7 56 12 68 82.35
cutoff=0.9 25 2 27 92.59
cutoff=0.0 89 92 181 49.17
cutoff=0.3 101 71 172 58.72
cutoff=0.5 73 50 123 59.35
cutoff=0.7 49 14 63 77.78
cutoff=0.9 33 5 38 86.84
cutoff=0.0 79 102 181 43.65
cutoff=0.3 88 81 169 52.07
cutoff=0.5 73 48 121 60.33
cutoff=0.7 39 15 54 72.22
cutoff=0.9 27 8 35 77.14


In [43]:
for key in precision_dict:
    print(round(np.average(precision_dict[key]),2))

45.6
54.3
60.26
76.33
86.89


In [44]:
# textfile = open("./BGClassifier-training_list-220521.txt", "w")
# for element in list(training_df.index):
#     textfile.write(element + "\n")
# textfile.close()

In [45]:
# textfile = open("./BGClassifier-testing_list-220521.txt", "w")
# for element in list(testing_df.index):
#     textfile.write(element + "\n")
# textfile.close()

# Running cross-validation with substructure and similarity

In [46]:
cols_to_keep = []

for item in final_training_df.columns:
    if item not in all_classes:
        cols_to_keep.append(item)
        
substructure_training_df = final_training_df[cols_to_keep]

substructure_training_df

Unnamed: 0,BGC0000001,BGC0000002,BGC0000003,BGC0000004,BGC0000006,BGC0000007,BGC0000008,BGC0000009,BGC0000011,BGC0000012,...,"['consensus: pk', 'PKS signature: CHC-CoA', 'Minowa: CHC-CoA']",['consensus: LDAP'],['consensus: aad'],"['consensus: pk', 'PKS signature: Isobutyryl-CoA', 'Minowa: 2metbut']",['consensus: his'],"['consensus: mal', 'PKS signature: 2-Rhyd-Malonyl-CoA', 'Minowa: mal']",['consensus: cap'],['consensus: alpha-hydroxy-isocaproic-acid'],"['consensus: mmal', 'PKS signature: 2-Methylbutyryl-CoA', 'Minowa: mmal']",['consensus: lys-b']
BGC0000020,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000029,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000031,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000034,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000035,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BGC0001979,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0002010,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0002013,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0002028,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
from sklearn.model_selection import StratifiedKFold

def get_filt_prec(jaccard_df,precision_dict):
    for cutoff in [0.3,0.5,0.7,0.9]:
        filtered_jaccard_df = jaccard_df[jaccard_df['jaccard_index'] >= cutoff]
        if len(filtered_jaccard_df) > 0:
            count = 0
            for i,r in filtered_jaccard_df.iterrows():
                if r['predicted_class'][0] not in r['true_class']:
                    count += 1
            precision_filtered = ((len(filtered_jaccard_df)-count)/len(filtered_jaccard_df))*100
            precision_score = round(precision_filtered,2)
            print('cutoff=%s'%cutoff,len(filtered_jaccard_df)-count,count,len(filtered_jaccard_df),precision_score)
            if cutoff not in precision_dict.keys():
                precision_dict[cutoff] = [precision_score]
            else:
                precision_dict[cutoff] = precision_dict[cutoff] + [precision_score]
    return precision_dict

X = substructure_training_df.drop('label',axis=1)
y = substructure_training_df['label']

precision_dict = {}

skf = StratifiedKFold(n_splits=5, random_state=1066, shuffle=True)
for train_index, test_index in skf.split(X, y):
    training_df = substructure_training_df.iloc[train_index]
    testing_df = substructure_training_df.iloc[test_index]
    true_labels = testing_df['label']
    testing_df = testing_df.drop('label',axis=1)
    X_div = training_df.drop("label", axis=1)
    y_div = training_df["label"]
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(X_div,y_div)
    distances, indices = nbrs.kneighbors(testing_df)
    y_div = y_div.reset_index(drop=True)
    neighbors_array = []
    for item in indices:
        candidate_list = []
        for i in range(1):
            candidate_list.append(y_div[item[i]])
        neighbors_array.append(candidate_list)
    neighbors_array = np.asarray(neighbors_array)
    correct_count = 0
    incorrect_count = 0
    predict_col,true_col = [],[]
    for i,item in enumerate(true_labels):
        predict_col.append(neighbors_array[i])
        true_col.append(item)
        if item in neighbors_array[i]:
            correct_count += 1
        else:
            incorrect_count += 1
    precision_score = round(correct_count/len(true_labels)*100,2)
    print('cutoff=0.0',correct_count,incorrect_count,len(true_labels),precision_score)
    if 0.0 not in precision_dict.keys():
        precision_dict[0.0] = [precision_score]
    else:
        precision_dict[0.0] = precision_dict[0.0] + [precision_score]
    def get_binary(fingerprint):
        new_row = []
        for index,item in enumerate(fingerprint):
            if item != 0:
                new_row.append(1)
            else:
                new_row.append(0)
        fingerprint = new_row
        return fingerprint
    jaccard_col,bgc_col = [],[]
    for i,bgc_id in enumerate(testing_df.index):
        bgc_col.append(bgc_id)
        count = 0
        train_index = int(indices[i][0])
        for j,r in training_df.iterrows():
            if count == train_index:
                training_fp = training_df.loc[j].drop('label')
            count += 1
        testing_fp = testing_df.loc[bgc_id]
        training_binary = get_binary(training_fp)
        testing_binary = get_binary(testing_fp)
        jaccard_index = jaccard_score(training_binary,testing_binary)
        jaccard_col.append(round(jaccard_index,2))
    jaccard_dict = {'bgc_ID': bgc_col, 'predicted_class': predict_col, 'true_class': true_col, 'jaccard_index': jaccard_col}
    jaccard_df = pd.DataFrame(jaccard_dict)
    jaccard_df = jaccard_df.sort_values('jaccard_index',ascending = False)
    jaccard_df = jaccard_df.reset_index(drop=True)
    precision_dict = get_filt_prec(jaccard_df,precision_dict)



cutoff=0.0 77 105 182 42.31
cutoff=0.3 70 60 130 53.85
cutoff=0.5 54 31 85 63.53
cutoff=0.7 37 6 43 86.05
cutoff=0.9 26 2 28 92.86
cutoff=0.0 67 115 182 36.81
cutoff=0.3 69 67 136 50.74
cutoff=0.5 64 37 101 63.37
cutoff=0.7 40 6 46 86.96
cutoff=0.9 30 5 35 85.71
cutoff=0.0 84 98 182 46.15
cutoff=0.3 86 48 134 64.18
cutoff=0.5 74 32 106 69.81
cutoff=0.7 49 9 58 84.48
cutoff=0.9 31 5 36 86.11
cutoff=0.0 81 100 181 44.75
cutoff=0.3 70 57 127 55.12
cutoff=0.5 67 36 103 65.05
cutoff=0.7 48 10 58 82.76
cutoff=0.9 35 6 41 85.37
cutoff=0.0 75 106 181 41.44
cutoff=0.3 67 65 132 50.76
cutoff=0.5 55 36 91 60.44
cutoff=0.7 37 15 52 71.15
cutoff=0.9 27 8 35 77.14


In [48]:
for key in precision_dict:
    print(round(np.average(precision_dict[key]),2))

42.29
54.93
64.44
82.28
85.44


# Predicting the unknown

In [49]:
unlabeled_training_df = merged_training_df[merged_training_df['label'] == 'unlabeled']

unlabeled_training_df[:30]

Unnamed: 0,PKS,Other,Alkaloid,Oligosaccharide,Terpene,NRPS,Cyclitol,Aminocoumarin,Betalactam,Siderophore,...,BGC0002027,BGC0002028,BGC0002029,BGC0002030,BGC0002032,BGC0002033,BGC0002034,BGC0002035,BGC0002036,label
BGC0000004,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,unlabeled
BGC0000006,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,unlabeled
BGC0000007,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,unlabeled
BGC0000008,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,unlabeled
BGC0000009,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,unlabeled
BGC0000039,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,unlabeled
BGC0000062,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,unlabeled
BGC0000063,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,unlabeled
BGC0000064,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,unlabeled
BGC0000076,1,0,0,0,0,0,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,unlabeled


In [50]:
unknown_bgcs = unlabeled_training_df.index

In [51]:
def running_knn(training_df,testing_df,k_value):
    j_col = []
    X_div = training_df.drop("label", axis=1)
    y_div = training_df["label"]
    nbrs = NearestNeighbors(n_neighbors=k_value, algorithm='ball_tree').fit(X_div,y_div)
    distances, indices = nbrs.kneighbors(testing_df)
    y_div = y_div.reset_index(drop=True)
    neighbors_array = []
    for item in indices:
        candidate_list = []
        for i in range(k_value):
            candidate_list.append(y_div[item[i]])
        neighbors_array.append(candidate_list)
    neighbors_array = np.asarray(neighbors_array)
    return neighbors_array,indices
    

neighbors_array,indices = running_knn(labeled_training_df,unlabeled_training_df.drop('label',axis=1),1)

bgc_col,bioact_col = [],[]

for i,item in enumerate(neighbors_array):
    print(unlabeled_training_df.index[i],item)
    bgc_col.append(unlabeled_training_df.index[i])
    bioact_col.append(item)
    
print(len(neighbors_array),len(bgc_col),len(bioact_col))

BGC0000004 ['antibacterial']
BGC0000006 ['antibacterial']
BGC0000007 ['antibacterial']
BGC0000008 ['antibacterial']
BGC0000009 ['antibacterial']
BGC0000039 ['antibacterial']
BGC0000062 ['antibacterial']
BGC0000063 ['antibacterial']
BGC0000064 ['antibacterial']
BGC0000076 ['antibacterial']
BGC0000077 ['antibacterial']
BGC0000090 ['antifungal-cytotoxic']
BGC0000098 ['antibacterial']
BGC0000116 ['antifungal']
BGC0000203 ['antibacterial']
BGC0000271 ['antibacterial']
BGC0000272 ['antibacterial']
BGC0000280 ['antibacterial']
BGC0000281 ['antibacterial']
BGC0000331 ['unknown']
BGC0000332 ['unknown']
BGC0000363 ['unknown']
BGC0000364 ['unknown']
BGC0000402 ['antifungal']
BGC0000404 ['unknown']
BGC0000405 ['antifungal']
BGC0000416 ['unknown']
BGC0000435 ['antibacterial-antifungal']
BGC0000471 ['cytotoxic']
BGC0000473 ['antibacterial']
BGC0000474 ['antibacterial']
BGC0000485 ['unknown']
BGC0000491 ['unknown']
BGC0000497 ['antibacterial']
BGC0000498 ['antibacterial']
BGC0000508 ['antibacterial']

In [52]:
jaccard_col = []

for i,bgc_id in enumerate(unlabeled_training_df.index):
    count = 0
    train_index = int(indices[i][0])
    print(i,bgc_id,train_index,indices[i])
    for j,r in labeled_training_df.iterrows():
        if count == train_index:
            training_fp = labeled_training_df.loc[j].drop('label')
        count += 1
    testing_fp = unlabeled_training_df.drop('label',axis=1).loc[bgc_id]
    training_binary = get_binary(training_fp)
    testing_binary = get_binary(testing_fp)
    jaccard_index = jaccard_score(training_binary,testing_binary)
    jaccard_col.append(jaccard_index)
    print(jaccard_index)
    
print(len(jaccard_col))

0 BGC0000004 452 [452]
0.14285714285714285
1 BGC0000006 452 [452]
0.14285714285714285
2 BGC0000007 452 [452]
0.14285714285714285
3 BGC0000008 452 [452]
0.14285714285714285
4 BGC0000009 452 [452]
0.14285714285714285
5 BGC0000039 452 [452]
0.25
6 BGC0000062 452 [452]
0.25
7 BGC0000063 452 [452]
0.25
8 BGC0000064 452 [452]
0.25
9 BGC0000076 452 [452]
0.25
10 BGC0000077 452 [452]
0.25
11 BGC0000090 10 [10]
0.6
12 BGC0000098 452 [452]
0.25
13 BGC0000116 283 [283]
1.0
14 BGC0000203 51 [51]
0.5
15 BGC0000271 42 [42]
0.6
16 BGC0000272 452 [452]
0.25
17 BGC0000280 452 [452]
0.25
18 BGC0000281 452 [452]
0.25
19 BGC0000331 173 [173]
0.8
20 BGC0000332 173 [173]
0.8
21 BGC0000363 471 [471]
0.25
22 BGC0000364 471 [471]
0.25
23 BGC0000402 193 [193]
0.42857142857142855
24 BGC0000404 471 [471]
0.25
25 BGC0000405 544 [544]
0.25
26 BGC0000416 471 [471]
0.25
27 BGC0000435 63 [63]
0.75
28 BGC0000471 86 [86]
0.5
29 BGC0000473 85 [85]
0.8333333333333334
30 BGC0000474 85 [85]
0.8333333333333334
31 BGC0000485 

0.3333333333333333
249 BGC0000337 471 [471]
0.3333333333333333
250 BGC0000342 471 [471]
0.3333333333333333
251 BGC0000344 471 [471]
0.3333333333333333
252 BGC0000347 471 [471]
0.3333333333333333
253 BGC0000348 471 [471]
0.3333333333333333
254 BGC0000350 544 [544]
0.3333333333333333
255 BGC0000351 471 [471]
0.3333333333333333
256 BGC0000355 471 [471]
0.3333333333333333
257 BGC0000356 152 [152]
0.4
258 BGC0000357 471 [471]
0.3333333333333333
259 BGC0000361 471 [471]
0.3333333333333333
260 BGC0000365 471 [471]
0.3333333333333333
261 BGC0000369 460 [460]
0.5
262 BGC0000372 471 [471]
0.3333333333333333
263 BGC0000381 471 [471]
0.3333333333333333
264 BGC0000392 471 [471]
0.3333333333333333
265 BGC0000393 693 [693]
0.5
266 BGC0000401 471 [471]
0.3333333333333333
267 BGC0000403 471 [471]
0.3333333333333333
268 BGC0000411 471 [471]
0.3333333333333333
269 BGC0000420 471 [471]
0.3333333333333333
270 BGC0000426 471 [471]
0.3333333333333333
271 BGC0000442 471 [471]
0.3333333333333333
272 BGC0000449

0.3333333333333333
449 BGC0001375 601 [601]
0.3333333333333333
450 BGC0001389 471 [471]
0.3333333333333333
451 BGC0001391 567 [567]
0.3333333333333333
452 BGC0001392 567 [567]
0.3333333333333333
453 BGC0001399 471 [471]
0.3333333333333333
454 BGC0001400 452 [452]
0.3333333333333333
455 BGC0001402 471 [471]
0.3333333333333333
456 BGC0001403 452 [452]
0.3333333333333333
457 BGC0001404 452 [452]
0.3333333333333333
458 BGC0001405 452 [452]
0.3333333333333333
459 BGC0001407 567 [567]
0.3333333333333333
460 BGC0001408 907 [907]
0.3333333333333333
461 BGC0001413 471 [471]
0.3333333333333333
462 BGC0001416 471 [471]
0.3333333333333333
463 BGC0001443 693 [693]
0.5
464 BGC0001445 693 [693]
0.5
465 BGC0001446 452 [452]
0.3333333333333333
466 BGC0001447 452 [452]
0.3333333333333333
467 BGC0001448 693 [693]
0.5
468 BGC0001449 693 [693]
0.4
469 BGC0001451 471 [471]
0.3333333333333333
470 BGC0001457 471 [471]
0.3333333333333333
471 BGC0001458 678 [678]
0.3333333333333333
472 BGC0001465 907 [907]
0.33

0.5
646 BGC0001885 452 [452]
0.3333333333333333
647 BGC0001886 452 [452]
0.3333333333333333
648 BGC0001889 567 [567]
0.3333333333333333
649 BGC0001890 471 [471]
0.3333333333333333
650 BGC0001892 452 [452]
0.3333333333333333
651 BGC0001897 452 [452]
0.3333333333333333
652 BGC0001898 601 [601]
0.3333333333333333
653 BGC0001899 452 [452]
0.3333333333333333
654 BGC0001900 471 [471]
0.3333333333333333
655 BGC0001902 693 [693]
0.5
656 BGC0001903 907 [907]
0.3333333333333333
657 BGC0001907 452 [452]
0.3333333333333333
658 BGC0001909 452 [452]
0.3333333333333333
659 BGC0001911 601 [601]
0.3333333333333333
660 BGC0001913 452 [452]
0.3333333333333333
661 BGC0001923 743 [743]
0.5
662 BGC0001926 452 [452]
0.3333333333333333
663 BGC0001929 567 [567]
0.3333333333333333
664 BGC0001930 567 [567]
0.3333333333333333
665 BGC0001938 452 [452]
0.3333333333333333
666 BGC0001940 452 [452]
0.3333333333333333
667 BGC0001941 693 [693]
0.5
668 BGC0001942 693 [693]
0.5
669 BGC0001944 452 [452]
0.3333333333333333


In [53]:
frames = {'bgc':bgc_col,'bioactivity':bioact_col,'jaccard_score':jaccard_col}

valid_jaccard_df = pd.DataFrame(frames)

valid_jaccard_df = valid_jaccard_df[valid_jaccard_df['jaccard_score'] >= 0.7]

valid_jaccard_df = valid_jaccard_df.sort_values(by='jaccard_score',ascending=False)

valid_jaccard_df = valid_jaccard_df.reset_index(drop=True)

valid_jaccard_df

Unnamed: 0,bgc,bioactivity,jaccard_score
0,BGC0000116,[antifungal],1.0
1,BGC0001130,[antibacterial],1.0
2,BGC0001933,[unknown],1.0
3,BGC0001830,[unknown],1.0
4,BGC0001770,[antifungal],1.0
5,BGC0001748,[unknown],1.0
6,BGC0001719,[inhibitor],1.0
7,BGC0001634,[unknown],1.0
8,BGC0001595,[antibacterial-antifungal-cytotoxic],1.0
9,BGC0001594,[antibacterial-antifungal-cytotoxic],1.0


In [54]:
valid_jaccard_df[valid_jaccard_df['bioactivity'] != 'unknown'][12:]

Unnamed: 0,bgc,bioactivity,jaccard_score
20,BGC0000473,[antibacterial],0.833333
21,BGC0000474,[antibacterial],0.833333
22,BGC0001667,[cytotoxic],0.833333
25,BGC0001931,[antibacterial],0.8
27,BGC0001693,[antibacterial-cytotoxic],0.75
28,BGC0001759,[antibacterial],0.75
29,BGC0000435,[antibacterial-antifungal],0.75
30,BGC0001952,[cytotoxic],0.75


**BGC001931-antibacterial: "Repurposing carrimycin as an antiviral agent against human coronaviruses, including the currently pandemic SARS-CoV-2"**

In [55]:
for key in bigscape_dict:
    if 'BGC0001931' in str(bigscape_dict[key]):
        print(bigscape_dict[key])

['BGC0002033.1.region001', 'BGC0001931.1.region001', 'BGC0000096.1.region001', 'BGC0000113.1.region001']


In [56]:
for i,item in unlabeled_training_df.drop('label',axis=1).loc['BGC0001931'].iteritems():
    if item > 0:
        print(i,item)

PKS 1
BGC0000096 0.7352222204208374
BGC0000113 0.772016778588295
BGC0001931 1.0


In [57]:
for i,r in mibig_df.iterrows():
    if 'BGC0001931' in r['compound_name']:
        print(r)

In [58]:
from collections import Counter

In [59]:
c= Counter(labeled_training_df['label'])

c

Counter({'cytotoxic': 142,
         'antibacterial': 304,
         'antibacterial-cytotoxic': 69,
         'antifungal': 76,
         'antifungal-cytotoxic': 37,
         'antibacterial-antifungal': 42,
         'antiviral': 10,
         'unknown': 165,
         'antibacterial-antifungal-cytotoxic': 28,
         'other': 13,
         'cytotoxic-unknown': 3,
         'pigment': 1,
         'inhibitor': 4,
         'siderophore': 8,
         'antihelmintic': 1,
         'antioxidant': 1,
         'herbicide': 2,
         'antiprotozoa': 1,
         'regulatory': 1})

# Calculating random

In [60]:
from sklearn.model_selection import StratifiedKFold

def get_filt_prec(jaccard_df,precision_dict):
    for cutoff in [0.3,0.5,0.7,0.9]:
        filtered_jaccard_df = jaccard_df[jaccard_df['jaccard_index'] >= cutoff]
        if len(filtered_jaccard_df) > 0:
            count = 0
            for i,r in filtered_jaccard_df.iterrows():
                if r['predicted_class'][0] not in r['true_class']:
                    count += 1
            precision_filtered = ((len(filtered_jaccard_df)-count)/len(filtered_jaccard_df))*100
            precision_score = round(precision_filtered,2)
            print('cutoff=%s'%cutoff,len(filtered_jaccard_df)-count,count,len(filtered_jaccard_df),precision_score)
            if cutoff not in precision_dict.keys():
                precision_dict[cutoff] = [precision_score]
            else:
                precision_dict[cutoff] = precision_dict[cutoff] + [precision_score]
    return precision_dict

X = final_training_df.drop('label',axis=1)
y = final_training_df['label']

precision_dict = {}

skf = StratifiedKFold(n_splits=5, random_state=1066, shuffle=True)
for train_index, test_index in skf.split(X, y):
    training_df = final_training_df.iloc[train_index]
    testing_df = final_training_df.iloc[test_index]
    column_names = list(testing_df.columns)
    random.shuffle(column_names)
    testing_df = testing_df[column_names]
    true_labels = testing_df['label']
    testing_df = testing_df.drop('label',axis=1)
    X_div = training_df.drop("label", axis=1)
    y_div = training_df["label"]
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(X_div,y_div)
    distances, indices = nbrs.kneighbors(testing_df)
    y_div = y_div.reset_index(drop=True)
    neighbors_array = []
    for item in indices:
        candidate_list = []
        for i in range(1):
            candidate_list.append(y_div[item[i]])
        neighbors_array.append(candidate_list)
    neighbors_array = np.asarray(neighbors_array)
    correct_count = 0
    incorrect_count = 0
    predict_col,true_col = [],[]
    for i,item in enumerate(true_labels):
        predict_col.append(neighbors_array[i])
        true_col.append(item)
        if item in neighbors_array[i]:
            correct_count += 1
        else:
            incorrect_count += 1
    precision_score = round(correct_count/len(true_labels)*100,2)
    print('cutoff=0.0',correct_count,incorrect_count,len(true_labels),precision_score)
    if 0.0 not in precision_dict.keys():
        precision_dict[0.0] = [precision_score]
    else:
        precision_dict[0.0] = precision_dict[0.0] + [precision_score]
    def get_binary(fingerprint):
        new_row = []
        for index,item in enumerate(fingerprint):
            if item != 0:
                new_row.append(1)
            else:
                new_row.append(0)
        fingerprint = new_row
        return fingerprint
    jaccard_col,bgc_col = [],[]
    for i,bgc_id in enumerate(testing_df.index):
        bgc_col.append(bgc_id)
        count = 0
        train_index = int(indices[i][0])
        for j,r in training_df.iterrows():
            if count == train_index:
                training_fp = training_df.loc[j].drop('label')
            count += 1
        testing_fp = testing_df.loc[bgc_id]
        training_binary = get_binary(training_fp)
        testing_binary = get_binary(testing_fp)
        jaccard_index = jaccard_score(training_binary,testing_binary)
        jaccard_col.append(round(jaccard_index,2))
    jaccard_dict = {'bgc_ID': bgc_col, 'predicted_class': predict_col, 'true_class': true_col, 'jaccard_index': jaccard_col}
    jaccard_df = pd.DataFrame(jaccard_dict)
    jaccard_df = jaccard_df.sort_values('jaccard_index',ascending = False)
    jaccard_df = jaccard_df.reset_index(drop=True)
    precision_dict = get_filt_prec(jaccard_df,precision_dict)



cutoff=0.0 44 138 182 24.18
cutoff=0.3 1 1 2 50.0
cutoff=0.0 54 128 182 29.67
cutoff=0.3 1 2 3 33.33
cutoff=0.0 40 142 182 21.98
cutoff=0.3 3 11 14 21.43
cutoff=0.0 16 165 181 8.84
cutoff=0.3 2 5 7 28.57
cutoff=0.0 38 143 181 20.99
cutoff=0.3 0 2 2 0.0


In [61]:
for key in precision_dict:
    print(round(np.average(precision_dict[key]),2))

21.13
26.67


In [62]:
end = time.time()
hours, rem = divmod(end-start, 3600)
minutes, seconds = divmod(rem, 60)
run_time = "{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds)
print(run_time)

00:25:29.22
