In [1]:
import pandas as pd
import numpy as np
import re
import networkx
from networkx.algorithms.components.connected import connected_components
from collections import defaultdict
import random
import glob
import os
from Bio import SeqIO
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import jaccard_score

# Loading MIBiG dataframes and creating dictionaries

In [2]:
bioactivity_df = pd.read_csv("./bioactivity_df-220203.csv",sep='\t',index_col=0)

bioactivity_dict = dict(zip(bioactivity_df['metabolite'],bioactivity_df['activity']))

bioactivity_dict

{'abyssomicin': 'antibacterial',
 'aurafuron': 'antifungal-cytotoxic',
 'aureothin': 'antibacterial-antifungal-cytotoxic',
 'avilamycin': 'antibacterial',
 'bafilomycin': 'antibacterial-antifungal-cytotoxic',
 'borrelidin': 'antibacterial-cytotoxic',
 'chlorothricin': 'antibacterial',
 'coelimycin': 'unknown',
 'dawenol': 'unknown',
 'erythromycin': 'antibacterial',
 'galbonolides': 'antifungal',
 'kedarcidin': 'antibacterial-cytotoxic',
 'lactimidomycin': 'antifungal-cytotoxic',
 'neoaureothin': 'antifungal-cytotoxic',
 'neocarzinostatin': 'antibacterial-cytotoxic',
 'nystatin': 'antifungal',
 'pactamycin': 'antibacterial-antifungal-cytotoxic',
 'salinomycin': 'antibacterial-cytotoxic',
 'soraphen': 'antifungal',
 'spirangien': 'antifungal-cytotoxic',
 'sporolide': 'unknown',
 'stambomycin': 'antibacterial-cytotoxic',
 'stigmatellin': 'antifungal',
 'tautomycetin': 'antifungal-cytotoxic',
 'tautomycin': 'antifungal-cytotoxic',
 'tiacumicin': 'antibacterial',
 '9methylstreptimidone': '

In [3]:
mibig_df = pd.read_csv("./All_MIBiG_compounds_with_CF_NPC_classes.txt",sep='\t')

mibig_df

Unnamed: 0,compound_name,smiles,inchi_key,cf_kingdom,cf_superclass,cf_class,cf_subclass,cf_direct_parent,npc_class,npc_superclass,npc_pathway,npc_isglycoside
0,BGC0000001_abyssomicin C,CC1C[C@]23OC(=O)C4=C2OC1C(O)C3\C=C/C(=O)[C@@H]...,FNEADFUPWHAVTA-UHFFFAOYSA-N,Organic compounds,Organoheterocyclic compounds,Oxanes,,Oxanes,Spirotetronate macrolides,Macrolides,Polyketides,0
1,BGC0000001_atrop-abyssomicin C,CC1CC23OC(=O)C4=C2OC1C(O)C3\C=C/C(=O)C(C)CC(C)...,FNEADFUPWHAVTA-UHFFFAOYSA-N,Organic compounds,Organoheterocyclic compounds,Oxanes,,Oxanes,Spirotetronate macrolides,Macrolides,Polyketides,0
2,BGC0000002_aculeximycin,CCCC(O[C@H]1C[C@](C)(N)[C@H](O)[C@H](C)O1)C(C)...,VJKZKLDZOAFAEE-UHFFFAOYSA-N,Organic compounds,Lipids and lipid-like molecules,Prenol lipids,Terpene glycosides,Diterpene glycosides,,,,1
3,BGC0000003_AF-toxin,CCC(C)C(C(=O)OC(/C=C/C=C/C=C/C(=O)O)C1(CO1)C)O...,ONOBRFRRMLDPES-UHFFFAOYSA-N,Organic compounds,Organic acids and derivatives,Peptidomimetics,Depsipeptides,Depsipeptides,,,,0
4,BGC0000004_aflatoxin G1,[H][C@@]12OC=C[C@]1([H])C1=C(O2)C=C(OC)C2=C1OC...,XWIYFDMXXLINPU-UHFFFAOYSA-N,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Furanocoumarins,Difurocoumarolactones,Aflatoxins; Simple coumarins,Chromanes; Coumarins,Polyketides; Shikimates and Phenylpropanoids,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2107,BGC0002034_perquinoline A,C1CC2(C(C3=C(C=C(C=C3O)O)C(N2C1=O)C(=O)NCCCC(=...,CDFQOHHGTNUFPU-UHFFFAOYSA-N,,,,,,,,Alkaloids,0
2108,BGC0002034_perquinoline B,C1CC2(C(C3=C(C=C(C=C3O)O)C(N2C1=O)C(=O)NCCC(=O...,XHJZZGRZEGUFCM-UHFFFAOYSA-N,,,,,,,,Alkaloids,0
2109,BGC0002034_perquinoline C,C1CC2(C(C3=C(C=C(C=C3O)O)C(N2C1=O)C(=O)NCCCC(=...,ZNSLKIGPUIPFNC-UHFFFAOYSA-N,,,,,,,,Alkaloids,0
2110,BGC0002035_ilicicolin H,CC=CC1C2CC(CCC2C(=CC1C(=O)C3=C(C(=CNC3=O)C4=CC...,BYVVOONSAAQMKI-UHFFFAOYSA-N,Organic compounds,Organoheterocyclic compounds,Pyridines and derivatives,Phenylpyridines,Phenylpyridines,Pyridine alkaloids,Nicotinic acid alkaloids,Alkaloids,0


In [4]:
mibig_activity_dict = {}
missing_mets_list = []

for i,r in mibig_df.iterrows():
    met_name = r['compound_name'].split('_')[1].replace(' ','_').lower().split('_')[0]
    for key in bioactivity_dict.keys():
        if type(key) != float:
            if met_name in key:
                print(r['compound_name'].split('_')[0],met_name,key,bioactivity_dict[key])
                if type(bioactivity_dict[key]) == float:
                    mibig_activity_dict[r['compound_name'].split('_')[0]] = 'unknown'
                else:
                    mibig_activity_dict[r['compound_name'].split('_')[0]] = bioactivity_dict[key]
            else:
                missing_mets_list.append(met_name)

BGC0000001 abyssomicin abyssomicin antibacterial
BGC0000002 aculeximycin aculeximycin antibacterial-antifungal
BGC0000014 ambruticin ambruticin antifungal
BGC0000016 amphotericin amphotericin antifungal
BGC0000020 ansamitocin ansamitocin cytotoxic
BGC0000021 apoptolidin apoptolidin cytotoxic
BGC0000023 aurafuron aurafuron antifungal-cytotoxic
BGC0000024 aureothin aureothin antibacterial-antifungal-cytotoxic
BGC0000024 aureothin neoaureothin antifungal-cytotoxic
BGC0000025 avermectin avermectin cytotoxic
BGC0000026 avilamycin avilamycin antibacterial
BGC0000026 avilamycin avilamycin antibacterial
BGC0000028 bafilomycin bafilomycin antibacterial-antifungal-cytotoxic
BGC0000031 borrelidin borrelidin antibacterial-cytotoxic
BGC0000033 calicheamicin calicheamicin antibacterial-antifungal-cytotoxic
BGC0000034 candicidin candicidin antifungal
BGC0000035 chalcomycin chalcomycin antibacterial
BGC0000035 chalcomycin dihyrdochalcomycin unknown
BGC0000036 chlorothricin chlorothricin antibacterial


BGC0000220 enterocin enterocin antibacterial
BGC0000220 enterocin enterocin_AS_48 antibacterial
BGC0000220 enterocin enterocin_a antibacterial
BGC0000220 enterocin enterocin_nkr_5_3b antibacterial
BGC0000221 erdacin erdacin nan
BGC0000223 rabelomycin rabelomycin antibacterial-cytotoxic
BGC0000224 fredericamycin fredericamycin antibacterial-antifungal-cytotoxic
BGC0000225 frenolicin frenolicin antibacterial-antifungal
BGC0000227 granaticin granaticin antibacterial
BGC0000227 granaticin granaticin_2 antibacterial
BGC0000228 granaticin granaticin antibacterial
BGC0000228 granaticin granaticin_2 antibacterial
BGC0000229 grincamycin grincamycin cytotoxic
BGC0000232 hatomarubigin hatomarubigin cytotoxic
BGC0000232 hatomarubigin hatomarubigin cytotoxic
BGC0000232 hatomarubigin hatomarubigin cytotoxic
BGC0000232 hatomarubigin hatomarubigin cytotoxic
BGC0000233 hedamycin hedamycin antibacterial-cytotoxic
BGC0000234 jadomycin jadomycin antibacterial
BGC0000235 kiamycin kiamycin cytotoxic
BGC0000

BGC0000466 yatakemycin yatakemycin antifungal-cytotoxic
BGC0000467 yersiniabactin yersiniabactin nan
BGC0000467 yersiniabactin yersiniabactin_2 nan
BGC0000468 bottromycin bottromycin antibacterial
BGC0000468 bottromycin bottromycin_a2 antibacterial
BGC0000468 bottromycin bottromycin_a2_2 antibacterial
BGC0000468 bottromycin bottromycin_d antibacterial
BGC0000469 bottromycin bottromycin antibacterial
BGC0000469 bottromycin bottromycin_a2 antibacterial
BGC0000469 bottromycin bottromycin_a2_2 antibacterial
BGC0000469 bottromycin bottromycin_d antibacterial
BGC0000470 bottromycin bottromycin antibacterial
BGC0000470 bottromycin bottromycin_a2 antibacterial
BGC0000470 bottromycin bottromycin_a2_2 antibacterial
BGC0000470 bottromycin bottromycin_d antibacterial
BGC0000472 anacyclamide anacyclamide unknown
BGC0000475 patellamide patellamide_a cytotoxic
BGC0000475 patellamide patellamide_a cytotoxic
BGC0000476 patellamide patellamide_a cytotoxic
BGC0000477 patellin patellin_2_3 unknown
BGC0000

BGC0000649 carotenoid carotenoid nan
BGC0000649 carotenoid rubivivax_gelatnosus_carotenoid nan
BGC0000649 carotenoid rhobacter_sphaeroides_241_carotenoid nan
BGC0000649 carotenoid pantoea_anantis_carotenoid nan
BGC0000649 carotenoid staph_aureus_carotenoid nan
BGC0000649 carotenoid carotenoid_2 unknown
BGC0000649 carotenoid carotenoid_3 nan
BGC0000649 carotenoid carotenoid_4 nan
BGC0000649 carotenoid carotenoid_5 nan
BGC0000649 carotenoid carotenoid_6 nan
BGC0000649 carotenoid carotenoid_7 nan
BGC0000649 carotenoid carotenoid_8 nan
BGC0000649 carotenoid carotenoid_9 nan
BGC0000649 carotenoid carotenoid_10 nan
BGC0000649 carotenoid carotenoid_11 nan
BGC0000649 carotenoid carotenoid_12 nan
BGC0000649 carotenoid carotenoid_13 nan
BGC0000649 carotenoid carotenoid_14 nan
BGC0000649 carotenoid carotenoid_15 nan
BGC0000649 carotenoid carotenoid_16 nan
BGC0000649 carotenoid carotenoid_17 nan
BGC0000650 carotenoid carotenoid nan
BGC0000650 carotenoid rubivivax_gelatnosus_carotenoid nan
BGC00006

BGC0000752 capsular capsular_polysaccharide_29 nan
BGC0000752 capsular capsular_polysaccharide_30 nan
BGC0000753 capsular capsular_polysaccharide nan
BGC0000753 capsular capsular_polysaccharide_2 nan
BGC0000753 capsular capsular_polysaccharide_3 nan
BGC0000753 capsular capsular_polysaccharide_4 nan
BGC0000753 capsular capsular_polysaccharide_5 nan
BGC0000753 capsular capsular_polysaccharide_6 nan
BGC0000753 capsular capsular_polysaccharide_7 nan
BGC0000753 capsular capsular_polysaccharide_8 nan
BGC0000753 capsular capsular_polysaccharide_9 nan
BGC0000753 capsular capsular_polysaccharide_10 nan
BGC0000753 capsular capsular_polysaccharide_11 nan
BGC0000753 capsular capsular_polysaccharide_12 nan
BGC0000753 capsular capsular_polysaccharide_13 nan
BGC0000753 capsular capsular_polysaccharide_14 nan
BGC0000753 capsular capsular_polysaccharide_15 nan
BGC0000753 capsular capsular_polysaccharide_16 nan
BGC0000753 capsular capsular_polysaccharide_17 nan
BGC0000753 capsular capsular_polysaccharid

BGC0000969 chondramide chondramide cytotoxic
BGC0000970 chondrochloren chondrochloren antibacterial-antifungal
BGC0000971 cinnabaramide cinnabaramide antifungal-cytotoxic
BGC0000973 collismycin collismycin antibacterial-antifungal
BGC0000974 crocacin crocacin antifungal-cytotoxic
BGC0000976 curacin curacin cytotoxic
BGC0000976 curacin curacin_2 cytotoxic
BGC0000976 curacin curacin_a cytotoxic
BGC0000976 curacin curacin_3 cytotoxic
BGC0000978 cylindrospermopsin cylindrospermopsin cytotoxic
BGC0000978 cylindrospermopsin cylindrospermopsin_2 cytotoxic
BGC0000978 cylindrospermopsin cylindrospermopsin_3 cytotoxic
BGC0000978 cylindrospermopsin cylindrospermopsin_4 cytotoxic
BGC0000979 cylindrospermopsin cylindrospermopsin cytotoxic
BGC0000979 cylindrospermopsin cylindrospermopsin_2 cytotoxic
BGC0000979 cylindrospermopsin cylindrospermopsin_3 cytotoxic
BGC0000979 cylindrospermopsin cylindrospermopsin_4 cytotoxic
BGC0000980 cylindrospermopsin cylindrospermopsin cytotoxic
BGC0000980 cylindrospe

BGC0001265 melanin melanin unknown
BGC0001265 melanin melanin_2 unknown
BGC0001265 melanin melanin_3 unknown
BGC0001265 melanin melanin_4 unknown
BGC0001265 melanin melanin_5 unknown
BGC0001285 pseudopyronine pseudopyronine antibacterial-antifungal-cytotoxic
BGC0001285 pseudopyronine pseudopyronine antibacterial-antifungal-cytotoxic
BGC0001287 chaxamycin chaxamycin antibacterial-cytotoxic
BGC0001287 chaxamycin chaxamycin antibacterial-cytotoxic
BGC0001287 chaxamycin chaxamycin antibacterial-cytotoxic
BGC0001287 chaxamycin chaxamycin antibacterial-cytotoxic
BGC0001288 maklamicin maklamicin antibacterial
BGC0001293 cyclizidine cyclizidine cytotoxic
BGC0001295 cremeomycin cremeomycin antibacterial-antifungal-cytotoxic
BGC0001296 streptazone streptazone_e unknown
BGC0001299 chlorotonil chlorotonil antibacterial-antifungal
BGC0001300 anthracimycin anthracimycin antibacterial
BGC0001300 anthracimycin anthracimycin_2 antibacterial
BGC0001302 lomofungin lomofungin antibacterial-antifungal-cyto

BGC0001431 myxochromide myxochromide nan
BGC0001431 myxochromide myxochromide_d unknown
BGC0001431 myxochromide myxochromide_d_2 unknown
BGC0001431 myxochromide myxochromide_d_3 unknown
BGC0001431 myxochromide myxochromide_d_4 unknown
BGC0001431 myxochromide myxochromide_b unknown
BGC0001431 myxochromide myxochromide_a unknown
BGC0001431 myxochromide myxochromide_c unknown
BGC0001431 myxochromide myxochromide_c_2 unknown
BGC0001431 myxochromide myxochromide_a_2 unknown
BGC0001431 myxochromide myxochromide_a_3 unknown
BGC0001431 myxochromide myxochromide_a_4 unknown
BGC0001431 myxochromide myxochromide_s unknown
BGC0001431 myxochromide myxochromide_s_2 unknown
BGC0001431 myxochromide myxochromide_s_3 unknown
BGC0001431 myxochromide myxochromide_d_5 unknown
BGC0001432 myxochromide myxochromide nan
BGC0001432 myxochromide myxochromide_d unknown
BGC0001432 myxochromide myxochromide_d_2 unknown
BGC0001432 myxochromide myxochromide_d_3 unknown
BGC0001432 myxochromide myxochromide_d_4 unknown

In [5]:
len(mibig_activity_dict)

806

In [6]:
mibig_activity_dict

{'BGC0000001': 'antibacterial',
 'BGC0000002': 'antibacterial-antifungal',
 'BGC0000014': 'antifungal',
 'BGC0000016': 'antifungal',
 'BGC0000020': 'cytotoxic',
 'BGC0000021': 'cytotoxic',
 'BGC0000023': 'antifungal-cytotoxic',
 'BGC0000024': 'antifungal-cytotoxic',
 'BGC0000025': 'cytotoxic',
 'BGC0000026': 'antibacterial',
 'BGC0000028': 'antibacterial-antifungal-cytotoxic',
 'BGC0000031': 'antibacterial-cytotoxic',
 'BGC0000033': 'antibacterial-antifungal-cytotoxic',
 'BGC0000034': 'antifungal',
 'BGC0000035': 'unknown',
 'BGC0000036': 'antibacterial',
 'BGC0000038': 'unknown',
 'BGC0000040': 'cytotoxic',
 'BGC0000042': 'antibacterial',
 'BGC0000043': 'cytotoxic',
 'BGC0000044': 'unknown',
 'BGC0000051': 'antifungal',
 'BGC0000053': 'cytotoxic',
 'BGC0000054': 'antibacterial',
 'BGC0000055': 'antibacterial',
 'BGC0000059': 'antifungal',
 'BGC0000060': 'cytotoxic',
 'BGC0000061': 'antifungal',
 'BGC0000066': 'antifungal-cytotoxic',
 'BGC0000067': 'antifungal-cytotoxic',
 'BGC0000068'

In [7]:
mibig_bio_3_df = pd.read_csv('./mibig_3_bioactivity.csv',sep='\t',names=['MIBIG_ID','Activity'])

mibig_bio_3_dict = dict(zip(mibig_bio_3_df['MIBIG_ID'],mibig_bio_3_df['Activity']))

mibig_bio_3_dict

{'BGC0000018': 'Antibacterial',
 'BGC0000019': 'Antibacterial',
 'BGC0000025': 'Cytotoxic',
 'BGC0000032': 'Antibacterial',
 'BGC0000034': 'Antifungal',
 'BGC0000035': 'Antibacterial',
 'BGC0000040': 'Antiviral',
 'BGC0000042': 'Antibacterial',
 'BGC0000047': 'Antibacterial',
 'BGC0000050': 'Antibacterial',
 'BGC0000052': 'Antifungal',
 'BGC0000058': 'Cytotoxic',
 'BGC0000059': 'Antibacterial',
 'BGC0000060': 'Cytotoxic',
 'BGC0000061': 'Antifungal',
 'BGC0000066': 'Antibacterial',
 'BGC0000067': 'Antibacterial',
 'BGC0000068': 'Antibacterial',
 'BGC0000073': 'Antibacterial',
 'BGC0000074': 'Antibacterial',
 'BGC0000075': 'Antibacterial',
 'BGC0000078': 'Cytotoxic',
 'BGC0000079': 'Antibacterial',
 'BGC0000084': 'Antibacterial',
 'BGC0000085': 'Antibacterial',
 'BGC0000086': 'Antibacterial',
 'BGC0000087': 'Antibacterial',
 'BGC0000091': 'Cytotoxic',
 'BGC0000093': 'Antibacterial',
 'BGC0000096': 'Antibacterial',
 'BGC0000100': 'Antiprotozoa',
 'BGC0000105': 'Antibacterial',
 'BGC00001

In [8]:
for key in mibig_activity_dict:
    if mibig_activity_dict[key] == 'unknown':
        if key in mibig_bio_3_dict.keys():
            mibig_activity_dict[key] = mibig_bio_3_dict[key].lower()

for key in mibig_bio_3_dict:
    if key not in mibig_activity_dict:
        mibig_activity_dict[key] = mibig_bio_3_dict[key].lower()

In [9]:
bgc_subtype_df = pd.read_csv("./bgc_subtype_df.csv",'\t',names=['BGC','subtype'])

bgc_subtype_df

Unnamed: 0,BGC,subtype
0,BGC0000001,Modular type I polyketide
1,BGC0000002,Polyketide
2,BGC0000003,Polyketide
3,BGC0000004,Polyketide
4,BGC0000005,Polyketide
...,...,...
1921,BGC0002045,Type II polyketide
1922,BGC0002055,Trans-AT type I polyketide
1923,BGC0002056,Trans-AT type I polyketide
1924,BGC0002057,Trans-AT type I polyketide


In [10]:
subtype_type_dict = {}
with open("./subtype_type_df.csv") as f:
    for line in f:
        (key, val) = line.split(',')
        subtype_type_dict[key] = val.strip('\n')
        
subtype_type_dict

{'Modular type I polyketide': 'PKS',
 'Polyketide': 'PKS',
 'Other': 'Other',
 'Alkaloid Modular type I polyketide': 'Alkaloid-PKS',
 'Oligosaccharide': 'Oligosaccharide',
 'Iterative type I polyketide': 'PKS',
 'Modular type I polyketide Iterative type I polyketide Oligosaccharide': 'Oligosaccharide-PKS',
 'Modular type I polyketide Hybrid/tailoring saccharide': 'Oligosaccharide-PKS',
 'Iterative type I polyketide Enediyne type I polyketide': 'PKS',
 'Modular type I polyketide Trans-AT type I polyketide': 'PKS',
 'Terpene Iterative type I polyketide': 'PKS-Terpene',
 'Iterative type I polyketide Hybrid/tailoring saccharide': 'Oligosaccharide-PKS',
 'NRP Enediyne type I polyketide': 'NRPS-PKS',
 'NRP Modular type I polyketide': 'NRPS-PKS',
 'Trans-AT type I polyketide': 'PKS',
 'Polyketide NRP': 'NRPS-PKS',
 'Iterative type I polyketide Trans-AT type I polyketide': 'PKS',
 'Type II polyketide': 'PKS',
 'Alkaloid': 'Alkaloid',
 'Type III polyketide': 'PKS',
 'Type II polyketide Hybrid/t

In [11]:
type_col = []

for i,r in bgc_subtype_df.iterrows():
    type_col.append(subtype_type_dict[r['subtype']])
    
bgc_subtype_df['type'] = type_col

bgc_subtype_df

Unnamed: 0,BGC,subtype,type
0,BGC0000001,Modular type I polyketide,PKS
1,BGC0000002,Polyketide,PKS
2,BGC0000003,Polyketide,PKS
3,BGC0000004,Polyketide,PKS
4,BGC0000005,Polyketide,PKS
...,...,...,...
1921,BGC0002045,Type II polyketide,PKS
1922,BGC0002055,Trans-AT type I polyketide,PKS
1923,BGC0002056,Trans-AT type I polyketide,PKS
1924,BGC0002057,Trans-AT type I polyketide,PKS


In [12]:
bgc_type_dict = dict(zip(bgc_subtype_df.BGC,bgc_subtype_df.type))

bgc_type_dict

{'BGC0000001': 'PKS',
 'BGC0000002': 'PKS',
 'BGC0000003': 'PKS',
 'BGC0000004': 'PKS',
 'BGC0000005': 'PKS',
 'BGC0000006': 'PKS',
 'BGC0000007': 'PKS',
 'BGC0000008': 'PKS',
 'BGC0000009': 'PKS',
 'BGC0000010': 'PKS',
 'BGC0000011': 'PKS',
 'BGC0000012': 'PKS',
 'BGC0000013': 'PKS',
 'BGC0000014': 'PKS',
 'BGC0000016': 'Other',
 'BGC0000017': 'Alkaloid-PKS',
 'BGC0000018': 'PKS',
 'BGC0000019': 'PKS',
 'BGC0000020': 'PKS',
 'BGC0000021': 'PKS',
 'BGC0000022': 'PKS',
 'BGC0000023': 'PKS',
 'BGC0000024': 'PKS',
 'BGC0000025': 'PKS',
 'BGC0000026': 'Oligosaccharide',
 'BGC0000027': 'PKS',
 'BGC0000028': 'PKS',
 'BGC0000029': 'PKS',
 'BGC0000030': 'PKS',
 'BGC0000031': 'PKS',
 'BGC0000032': 'PKS',
 'BGC0000033': 'PKS',
 'BGC0000034': 'PKS',
 'BGC0000035': 'PKS',
 'BGC0000036': 'Oligosaccharide-PKS',
 'BGC0000037': 'PKS',
 'BGC0000038': 'PKS',
 'BGC0000039': 'PKS',
 'BGC0000040': 'PKS',
 'BGC0000041': 'PKS',
 'BGC0000042': 'PKS',
 'BGC0000043': 'PKS',
 'BGC0000044': 'PKS',
 'BGC0000045': 

# Getting classes from CENA's antiSMASH files

In [19]:
def get_feature_class(bgc_name,genome_name):
    filename = "./antismash_results_CENAs/%s/%s"%(genome_name,bgc_name)
    if os.path.exists(filename):
        input_handle = open(filename,'r')
        for seq_record in SeqIO.parse(input_handle,'genbank'):
            edge_list,type_list = [],[]
            for feature in seq_record.features:
                if feature.type == 'cand_cluster':
                    for qual in feature.qualifiers:
                        if qual == 'contig_edge':
                            ctg_edge = feature.qualifiers[qual]
                            edge_list.append(ctg_edge[0])
                        if qual == 'product':
                            bgc_type = feature.qualifiers[qual]
                            for item_type in bgc_type:
                                if item_type not in type_list:
                                    type_list.append(item_type)
            return edge_list,type_list
    else:
        return 'NA','NA'
            
all_classes = []

for root, dirs, files in os.walk("./antismash_results_CENAs/"):
    for file in files:
        if file.endswith(".gbk"):
            ctg_edge,class_list = get_feature_class(file,os.path.basename(root))
            for class_type in class_list:
                all_classes.append(class_type)
    
all_classes = np.unique(all_classes)

all_classes

array(['CDPS', 'LAP', 'NRPS', 'NRPS-like', 'RRE-containing', 'RiPP-like',
       'T1PKS', 'arylpolyene', 'betalactone', 'cyanobactin', 'hglE-KS',
       'ladderane', 'lanthipeptide-class-ii', 'lanthipeptide-class-v',
       'lassopeptide', 'microviridin', 'other', 'proteusin', 'resorcinol',
       'siderophore', 'spliceotide', 'terpene', 'thioamitides',
       'thiopeptide'], dtype='<U22')

In [20]:
common_dict = {"A":"NA",
"CDPS":"NRPS",
"LAP":"RiPP",
"N":"NA",
"NAGGN":"NRPS",
"NRPS":"NRPS",
"NRPS-like":"NRPS",
"PBDE":"Phenolic",
"PKS-like":"PKS",
"PUFA":"PKS",
"PpyS-KS":"PKS",
"T1PKS":"PKS",
"T2PKS":"PKS",
"T3PKS":"PKS",
"TfuA-related":"RiPP",
"amglyccycl":"Minor",
"arylpolyene":["PKS","Phenolic"],
"bacteriocin":"RiPP",
"betalactone":"Minor",
"blactam":"Minor",
"butyrolactone":"Minor",
"cyanobactin":"RiPP",
"ectoine":"Minor",
"furan":"Minor",
"fused":"Minor",
"hglE-KS":"PKS",
"hserlactone":["PKS","NRPS"],
"indole":"Minor",
"ladderane":"RiPP",
"lanthipeptide":"RiPP",
"lassopeptide":"RiPP",
"linaridin":"RiPP",
"melanin":"NRPS",
"microviridin":"RiPP",
"nucleoside":"Nucleoside",
"oligosaccharide":"Oligosaccharide",
"other":"Other",
"phenazine":"Minor",
"phosphonate":"Phosphonate",
"proteusin":"RiPP",
"resorcinol":"PKS",
"sactipeptide":"RiPP",
"siderophore":"Siderophore",
"terpene":"Terpene",
"thiopeptide":"RiPP",
"transAT-PKS":"PKS",
"transAT-PKS-like":"PKS",
"RRE-containing":"RiPP",
"RiPP-like":"RiPP",
"lanthipeptide-class-ii":"RiPP",
"lanthipeptide-class-v":"RiPP",
"spliceotide":"Minor",
"thioamitides":"RiPP"}

In [21]:
for item in all_classes:
    if item not in common_dict.keys():
        print(item)

# Creating class dataframe

In [22]:
combined_list1,all_classes,type_list = [],[],[]

for key in bgc_type_dict:
    if 'tRNA' in bgc_type_dict[key]:
        type_list = ['tRNA_derived']
    else:
        type_list = bgc_type_dict[key].split('-')
    for item in type_list:
        if item not in all_classes:
            all_classes.append(item)
    combined_list1.append(type_list)
    
combined_list1

[['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Other'],
 ['Alkaloid', 'PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide', 'PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide', 'PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide', 'PKS'],
 ['PKS'],
 ['PKS'

In [23]:
combined_list2 = []
cena_bgc_list = []

for root, dirs, files in os.walk("./antismash_results_CENAs/"):
    for file in files:
        if 'region' in file:
            if file.endswith(".gbk"):
                ctg_edge,class_list = get_feature_class(file,os.path.basename(root))
                combined_classes = []
                for bgc_class in class_list:
                    if type(common_dict[bgc_class]) != list:
                        combined_classes.append(common_dict[bgc_class])
                        if common_dict[bgc_class] not in all_classes:
                            all_classes.append(common_dict[bgc_class])
                    else:
                        for item in common_dict[bgc_class]:
                            combined_classes.append(item)
                            if item not in all_classes:
                                all_classes.append(item)
                cena_bgc_list.append(file)
                combined_list2.append(combined_classes)
    
combined_list2

[['PKS', 'PKS'],
 ['PKS', 'PKS'],
 ['NRPS'],
 ['PKS'],
 ['Terpene'],
 ['RiPP', 'Terpene'],
 ['PKS', 'NRPS'],
 ['NRPS'],
 ['PKS'],
 ['Terpene'],
 ['NRPS'],
 ['RiPP'],
 ['RiPP', 'RiPP'],
 ['Other'],
 ['NRPS'],
 ['PKS'],
 ['RiPP'],
 ['NRPS'],
 ['NRPS', 'PKS'],
 ['NRPS', 'Minor', 'NRPS'],
 ['RiPP'],
 ['NRPS'],
 ['RiPP'],
 ['NRPS'],
 ['RiPP'],
 ['RiPP'],
 ['PKS', 'Phenolic'],
 ['Terpene'],
 ['Terpene'],
 ['PKS'],
 ['Terpene'],
 ['PKS'],
 ['NRPS', 'PKS'],
 ['PKS', 'NRPS'],
 ['RiPP'],
 ['RiPP'],
 ['NRPS'],
 ['RiPP'],
 ['Terpene', 'NRPS'],
 ['PKS', 'PKS'],
 ['RiPP'],
 ['RiPP'],
 ['RiPP'],
 ['RiPP'],
 ['Terpene'],
 ['RiPP', 'RiPP'],
 ['NRPS'],
 ['Terpene'],
 ['NRPS', 'PKS', 'Terpene'],
 ['Terpene'],
 ['NRPS', 'PKS'],
 ['NRPS'],
 ['NRPS', 'NRPS'],
 ['PKS'],
 ['PKS', 'NRPS', 'NRPS', 'Other'],
 ['RiPP'],
 ['RiPP'],
 ['NRPS', 'PKS', 'NRPS'],
 ['RiPP'],
 ['NRPS'],
 ['Terpene'],
 ['NRPS', 'PKS', 'NRPS'],
 ['PKS', 'PKS', 'RiPP'],
 ['PKS', 'PKS'],
 ['PKS'],
 ['PKS', 'PKS'],
 ['NRPS'],
 ['Terpene'],
 ['

In [24]:
all_classes

['PKS',
 'Other',
 'Alkaloid',
 'Oligosaccharide',
 'Terpene',
 'NRPS',
 'Cyclitol',
 'Aminocoumarin',
 'Betalactam',
 'Siderophore',
 'Pyrrolobenzodiazepine',
 'RiPP',
 'Butyrolactone',
 'Nucleoside',
 'Phenazine',
 'Aminoglycoside',
 'tRNA_derived',
 'Phosphonate',
 'Minor',
 'Phenolic']

In [25]:
combined_list = combined_list1 + combined_list2

combined_list

[['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Other'],
 ['Alkaloid', 'PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide', 'PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide', 'PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide', 'PKS'],
 ['PKS'],
 ['PKS'

In [26]:
class_df = pd.DataFrame(columns=(all_classes))

bgc_list = list(bgc_type_dict.keys()) + cena_bgc_list

for i,combined_classes in enumerate(combined_list):
    row_extension = []
    for final_class in all_classes:
        if final_class in combined_classes:
            row_extension.append(1)
        else:
            row_extension.append(0)
    class_df.loc[bgc_list[i]] = row_extension
    
class_df = class_df.sort_index(axis=0)
    
class_df

Unnamed: 0,PKS,Other,Alkaloid,Oligosaccharide,Terpene,NRPS,Cyclitol,Aminocoumarin,Betalactam,Siderophore,Pyrrolobenzodiazepine,RiPP,Butyrolactone,Nucleoside,Phenazine,Aminoglycoside,tRNA_derived,Phosphonate,Minor,Phenolic
BGC0000001,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000002,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000003,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000004,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000005,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
main-chr.region008.gbk,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
main-chr.region009.gbk,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
main-chr.region010.gbk,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
main-chr.region011.gbk,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
class_df[class_df['Aminoglycoside'] == 1]

Unnamed: 0,PKS,Other,Alkaloid,Oligosaccharide,Terpene,NRPS,Cyclitol,Aminocoumarin,Betalactam,Siderophore,Pyrrolobenzodiazepine,RiPP,Butyrolactone,Nucleoside,Phenazine,Aminoglycoside,tRNA_derived,Phosphonate,Minor,Phenolic
BGC0000953,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


# Creating substructure dataframe

In [32]:
def get_spec_dict(genomeID):
    gbk_paths = glob.glob('./antismash_results_all/%s/*region*.gbk'%genomeID)
    gbk_paths.sort()
    spec_dict = {}
    col_list = []
    for gbk_file in gbk_paths:
        input_handle = open(gbk_file)
        for seq_record in SeqIO.parse(input_handle,'genbank'):
            sub_list = []
            for feature in seq_record.features:
                if feature.type == 'aSDomain':
                    if 'specificity' in feature.qualifiers:
                        sub_list.append(feature.qualifiers['specificity'])
                        if feature.qualifiers['specificity'] not in col_list:
                            col_list.append(feature.qualifiers['specificity'])
            spec_dict[os.path.basename(gbk_file)] = sub_list
    return spec_dict,col_list

In [38]:
def split_string_at(s, c, n):
    words = s.split(c)
    return c.join(words[:n]), c.join(words[n:])

antismash_glob = glob.glob('./antismash_results_all/*/')

for antismash_path in antismash_glob:
    genomeID = split_string_at(antismash_path,'/',2)[1].rstrip('/')
    if antismash_path == antismash_glob[0]:
        spec_dict_final,col_list_final = get_spec_dict(genomeID)
    else:
        spec_dict_temp,col_list_temp = get_spec_dict(genomeID)
        spec_dict_final.update(spec_dict_temp)
        for spec_name in col_list_temp:
            if spec_name not in col_list_final:
                col_list_final.append(spec_name)
    print(genomeID,len(spec_dict_final),len(col_list_final))

BGC0001102 1 5
BGC0001330 2 11
BGC0000686 2 11
BGC0001754 3 15
BGC0001566 4 15
BGC0000672 7 15
BGC0000440 8 18
BGC0001592 9 18
BGC0000818 10 18
BGC0000024 11 19
BGC0000216 12 19
BGC0001559 13 19
BGC0001901 14 20
BGC0000229 15 20
BGC0000827 16 20
BGC0000211 17 20
BGC0000023 18 23
BGC0000447 19 27
BGC0001595 20 27
BGC0000675 21 27
BGC0001561 22 27
BGC0001753 23 27
BGC0001337 24 27
BGC0001939 25 29
BGC0001105 26 29
BGC0000820 26 29
BGC0001906 27 29
BGC0000471 28 29
BGC0001791 29 31
BGC0000643 30 31
BGC0000227 31 31
BGC0000829 32 31
BGC0001301 33 31
BGC0001133 34 31
BGC0001557 36 32
BGC0000485 37 32
BGC0001765 37 32
BGC0000816 38 32
BGC0000218 39 32
BGC0000688 40 32
BGC0001568 41 32
BGC0001930 42 32
BGC0001762 43 32
BGC0001550 44 32
BGC0001908 45 33
BGC0001306 45 33
BGC0000012 46 33
BGC0000220 47 33
BGC0001796 48 33
BGC0000644 49 33
CENA71 71 40
BGC0001339 72 41
BGC0001937 73 41
BGC0000449 74 42
BGC0000811 75 42
BGC0001963 76 44
BGC0001709 77 44
BGC0000079 78 45
BGC0000845 79 45
BGC0001997

BGC0000700 441 82
BGC0000532 442 82
BGC0001626 443 82
BGC0001414 444 82
BGC0001070 445 82
BGC0001242 446 82
BGC0000510 446 82
BGC0000722 447 82
BGC0000346 448 82
BGC0001294 449 82
BGC0000174 450 82
BGC0000948 450 82
BGC0001260 451 82
BGC0000180 452 82
BGC0001052 453 82
BGC0001436 455 82
BGC0001604 456 82
BGC0001099 457 82
BGC0000977 458 82
BGC0000379 459 82
BGC0001409 460 82
BGC0000983 461 82
BGC0001851 462 82
BGC0001603 463 82
BGC0001431 464 82
BGC0000187 465 82
BGC0001055 466 82
BGC0001267 467 82
BGC0000173 468 82
BGC0000341 469 82
BGC0001293 470 82
BGC0000725 470 82
BGC0000517 471 82
BGC0001258 472 82
BGC0000984 473 82
BGC0001856 474 82
BGC0000528 475 82
BGC0000970 476 82
BGC0001063 477 82
BGC0001251 478 82
BGC0000383 479 82
BGC0001635 480 82
BGC0001407 481 82
BGC0000713 482 82
BGC0000145 483 82
BGC0000979 484 82
BGC0000377 485 82
BGC0001438 486 82
BGC0001860 487 82
BGC0000348 488 82
BGC0001894 488 82
BGC0000946 489 82
BGC0001090 490 82
BGC0000142 491 82
BGC0000526 492 82
BGC0000714

BGC0001543 836 93
BGC0000491 837 93
BGC0001923 838 93
BGC0001749 839 93
BGC0000805 839 93
BGC0000039 840 93
BGC0001544 841 93
BGC0000496 842 93
BGC0001776 843 93
BGC0001312 844 93
BGC0001120 844 93
BGC0000234 845 93
BGC0000006 846 93
BGC0001782 847 93
BGC0000650 848 93
BGC0001993 849 93
BGC0000841 850 93
BGC0000419 851 93
BGC0001967 852 93
BGC0001369 853 93
CENA21 869 93
BGC0000426 870 93
BGC0000614 871 93
BGC0000270 872 93
BGC0001190 873 93
BGC0000042 874 93
BGC0001356 875 93
BGC0000284 876 93
BGC0001164 877 93
BGC0001958 878 93
BGC0001500 879 93
BGC0001732 880 93
BGC0001960 881 93
BGC0001538 882 93
BGC0000248 883 93
BGC0001994 883 93
BGC0000846 884 93
BGC0001735 885 93
BGC0001507 886 93
BGC0001163 887 93
BGC0000283 888 93
BGC0001197 889 93
BGC0000045 890 93
BGC0000879 891 93
BGC0000277 892 93
BGC0000613 893 93
BGC0000421 894 93
BGC0001358 895 94
BGC0000884 895 94
BGC0001956 896 94
BGC0000870 896 94
BGC0000428 897 94
BGC0000087 898 94
BGC0001155 899 94
BGC0001969 900 94
BGC0001367 901

BGC0001443 1268 100
BGC0000591 1269 100
BGC0001027 1270 100
BGC0001215 1271 101
BGC0001488 1271 101
BGC0000768 1271 101
BGC0000902 1272 101
BGC0001824 1273 101
BGC0001018 1274 101
BGC0000761 1274 101
BGC0002036 1275 101
BGC0000553 1276 101
BGC0001481 1277 101
BGC0000137 1278 101
BGC0000305 1279 101
BGC0001011 1280 101
BGC0001223 1281 101
BGC0000795 1281 101
BGC0001647 1282 101
BGC0001475 1283 101
BGC0000934 1284 101
BGC0000108 1285 101
BGC0002009 1285 101
BGC0000598 1286 101
BGC0001678 1286 101
BGC0001812 1287 101
BGC0001472 1288 101
BGC0000792 1288 101
BGC0001640 1289 101
BGC0001224 1290 101
BGC0001016 1291 101
BGC0000302 1292 101
BGC0000130 1293 101
BGC0000554 1294 101
BGC0001486 1295 101
BGC0000766 1295 101
BGC0002031 1295 101
BGC0001815 1296 101
BGC0001029 1297 101
BGC0000759 1297 101
BGC0000933 1297 101
BGC0001239 1297 101
BGC0000911 1298 101
BGC0000549 1299 101
BGC0001808 1300 101
BGC0001034 1301 101
BGC0001206 1301 101
BGC0001662 1302 101
BGC0000582 1303 101
BGC0001696 1304 101


BGC0000691 1675 110
BGC0001571 1675 110
BGC0000665 1675 110
BGC0001585 1676 110
BGC0000457 1677 110
BGC0000033 1678 110
BGC0000201 1679 110
BGC0001920 1680 110
BGC0001578 1681 110
BGC0000698 1681 110
BGC0000208 1682 110
BGC0000806 1683 110
BGC0001775 1684 110
BGC0000495 1685 110
BGC0001547 1685 110
BGC0001123 1686 110
BGC0001311 1687 110
BGC0000839 1688 110
BGC0000005 1688 110
BGC0000237 1689 110
BGC0000653 1690 110
BGC0001781 1691 110
BGC0000461 1692 110
BGC0000801 1692 110
BGC0000459 1693 110
BGC0001927 1694 110
BGC0001329 1695 110
BGC0000466 1695 110
BGC0000654 1696 110
BGC0001786 1697 110
BGC0000230 1698 110
BGC0000002 1699 110
BGC0001316 1699 110
BGC0001918 1700 110
BGC0001124 1701 110
BGC0001540 1701 110
BGC0001772 1702 110
BGC0000600 1703 110
BGC0000432 1704 111
BGC0001184 1705 111
BGC0000056 1706 111
BGC0000264 1707 111
BGC0001170 1708 111
BGC0001342 1709 111
BGC0000290 1710 111
BGC0001726 1711 111
BGC0001514 1711 111
BGC0001389 1712 111
BGC0000069 1713 111
BGC0001987 1713 111


In [39]:
spec_dict_final

{'BGC0001102.1.region001.gbk': [['consensus: X'],
  ['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: mal'],
  ['KR activity: active', 'KR stereochemistry: B1'],
  ['KR activity: inactive', 'KR stereochemistry: C2'],
  ['consensus: X'],
  ['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: mal'],
  ['Minowa: NH2']],
 'BGC0001330.1.region001.gbk': [['consensus: X'],
  ['Minowa: NH2'],
  ['consensus: mmal', 'PKS signature: Malonyl-CoA', 'Minowa: mmal'],
  ['KR activity: active', 'KR stereochemistry: A1'],
  ['consensus: mal', 'PKS signature: (unknown)', 'Minowa: mal'],
  ['consensus: gly'],
  ['consensus: ser'],
  ['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: mal'],
  ['KR activity: active', 'KR stereochemistry: (unknown)'],
  ['consensus: gly']],
 'BGC0001754.1.region001.gbk': [['Minowa: NH2'],
  ['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: mal'],
  ['consensus: thr'],
  ['consensus: tyr'],
  ['consensus: ser'],
  ['consensus: X'],
  ['consensus: v

In [40]:
def filter_col_list(col_list_final):
    for spec_col in col_list_final:
        if 'inactive' in str(spec_col):
            print(spec_col)
            col_list_final.remove(spec_col)

filter_col_list(col_list_final)
filter_col_list(col_list_final) ### I'm not sure why I need to run twice to remove all the inactive

col_list_final

['KR activity: inactive', 'KR stereochemistry: C2']
['KR activity: inactive', 'KR stereochemistry: C1']
['KR activity: inactive', 'KR stereochemistry: (unknown)']
['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: inactive']
['KR activity: inactive', 'KR stereochemistry: A1']
['consensus: pk', 'PKS signature: (unknown)', 'Minowa: inactive']


[['consensus: X'],
 ['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: mal'],
 ['KR activity: active', 'KR stereochemistry: B1'],
 ['Minowa: NH2'],
 ['consensus: mmal', 'PKS signature: Malonyl-CoA', 'Minowa: mmal'],
 ['KR activity: active', 'KR stereochemistry: A1'],
 ['consensus: mal', 'PKS signature: (unknown)', 'Minowa: mal'],
 ['consensus: gly'],
 ['consensus: ser'],
 ['KR activity: active', 'KR stereochemistry: (unknown)'],
 ['consensus: thr'],
 ['consensus: tyr'],
 ['consensus: val'],
 ['consensus: ala'],
 ['consensus: hpg'],
 ['consensus: dhpg'],
 ['consensus: bht'],
 ['consensus: mmal', 'PKS signature: Methylmalonyl-CoA', 'Minowa: mmal'],
 ['consensus: pk', 'PKS signature: Malonyl-CoA', 'Minowa: mxmal'],
 ['consensus: mxmal', 'PKS signature: (unknown)', 'Minowa: mxmal'],
 ['consensus: mmal', 'PKS signature: Methylmalonyl-CoA', 'Minowa: benz'],
 ['consensus: pro'],
 ['consensus: leu'],
 ['consensus: gln'],
 ['consensus: dab'],
 ['consensus: pk', 'PKS signature: Methylmalo

In [41]:
len(col_list_final)

105

In [42]:
presence_dict = {}

for key in spec_dict_final.keys():
    short_key = key.split('.1')[0]
    values_list = []
    for i,v in enumerate(col_list_final):
        if v in spec_dict_final[key]:
            values_list.append(1)
        else:
            values_list.append(0)
    presence_dict[short_key] = values_list

presence_df = pd.DataFrame.from_dict(presence_dict).T

name_dict = dict(zip(presence_df.columns,col_list_final))

presence_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
BGC0001102,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0001330,1,1,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
BGC0001754,1,1,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
BGC0001566,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000672,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BGC0000863,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000299,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0001179,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000897,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
for key in name_dict:
    name_dict[key] = str(name_dict[key])
    
presence_df = presence_df.rename(columns = name_dict, inplace = False)

presence_df = presence_df.sort_index()

presence_df

Unnamed: 0,['consensus: X'],"['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: mal']","['KR activity: active', 'KR stereochemistry: B1']",['Minowa: NH2'],"['consensus: mmal', 'PKS signature: Malonyl-CoA', 'Minowa: mmal']","['KR activity: active', 'KR stereochemistry: A1']","['consensus: mal', 'PKS signature: (unknown)', 'Minowa: mal']",['consensus: gly'],['consensus: ser'],"['KR activity: active', 'KR stereochemistry: (unknown)']",...,"['consensus: pk', 'PKS signature: CHC-CoA', 'Minowa: CHC-CoA']",['consensus: LDAP'],['consensus: aad'],"['consensus: pk', 'PKS signature: Isobutyryl-CoA', 'Minowa: 2metbut']",['consensus: his'],"['consensus: mal', 'PKS signature: 2-Rhyd-Malonyl-CoA', 'Minowa: mal']",['consensus: cap'],['consensus: alpha-hydroxy-isocaproic-acid'],"['consensus: mmal', 'PKS signature: 2-Methylbutyryl-CoA', 'Minowa: mmal']",['consensus: lys-b']
BGC0000001,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000002,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000003,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000004,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000006,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
main-chr.region008.gbk,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
main-chr.region009.gbk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
main-chr.region010.gbk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
main-chr.region011.gbk,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Merging dataframes

In [44]:
indexes_to_keep = []

for index in presence_df.index:
    if index in class_df.index:
        indexes_to_keep.append(index)
        
print(len(indexes_to_keep))

class_df = class_df.loc[indexes_to_keep]

class_df

1706


Unnamed: 0,PKS,Other,Alkaloid,Oligosaccharide,Terpene,NRPS,Cyclitol,Aminocoumarin,Betalactam,Siderophore,Pyrrolobenzodiazepine,RiPP,Butyrolactone,Nucleoside,Phenazine,Aminoglycoside,tRNA_derived,Phosphonate,Minor,Phenolic
BGC0000001,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000002,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000003,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000004,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000006,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
main-chr.region008.gbk,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
main-chr.region009.gbk,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
main-chr.region010.gbk,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
main-chr.region011.gbk,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [45]:
presence_df = presence_df.loc[indexes_to_keep]

presence_df

Unnamed: 0,['consensus: X'],"['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: mal']","['KR activity: active', 'KR stereochemistry: B1']",['Minowa: NH2'],"['consensus: mmal', 'PKS signature: Malonyl-CoA', 'Minowa: mmal']","['KR activity: active', 'KR stereochemistry: A1']","['consensus: mal', 'PKS signature: (unknown)', 'Minowa: mal']",['consensus: gly'],['consensus: ser'],"['KR activity: active', 'KR stereochemistry: (unknown)']",...,"['consensus: pk', 'PKS signature: CHC-CoA', 'Minowa: CHC-CoA']",['consensus: LDAP'],['consensus: aad'],"['consensus: pk', 'PKS signature: Isobutyryl-CoA', 'Minowa: 2metbut']",['consensus: his'],"['consensus: mal', 'PKS signature: 2-Rhyd-Malonyl-CoA', 'Minowa: mal']",['consensus: cap'],['consensus: alpha-hydroxy-isocaproic-acid'],"['consensus: mmal', 'PKS signature: 2-Methylbutyryl-CoA', 'Minowa: mmal']",['consensus: lys-b']
BGC0000001,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000002,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000003,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000004,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000006,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
main-chr.region008.gbk,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
main-chr.region009.gbk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
main-chr.region010.gbk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
main-chr.region011.gbk,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [46]:
final_knn_df = presence_df.join(class_df, how='outer')

final_knn_df

Unnamed: 0,['consensus: X'],"['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: mal']","['KR activity: active', 'KR stereochemistry: B1']",['Minowa: NH2'],"['consensus: mmal', 'PKS signature: Malonyl-CoA', 'Minowa: mmal']","['KR activity: active', 'KR stereochemistry: A1']","['consensus: mal', 'PKS signature: (unknown)', 'Minowa: mal']",['consensus: gly'],['consensus: ser'],"['KR activity: active', 'KR stereochemistry: (unknown)']",...,Pyrrolobenzodiazepine,RiPP,Butyrolactone,Nucleoside,Phenazine,Aminoglycoside,tRNA_derived,Phosphonate,Minor,Phenolic
BGC0000001,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000002,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000003,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BGC0000004,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BGC0000006,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
main-chr.region008.gbk,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
main-chr.region009.gbk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
main-chr.region010.gbk,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
main-chr.region011.gbk,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [47]:
label_col = []

for i,r in final_knn_df.iterrows():
    if i in mibig_activity_dict:
        label_col.append(mibig_activity_dict[i])
    else:
        if 'region' in i:
            label_col.append('cena_bgc')
        else:
            label_col.append('unlabeled')
        
label_col

['antibacterial',
 'antibacterial-antifungal',
 'unlabeled',
 'unlabeled',
 'unlabeled',
 'unlabeled',
 'unlabeled',
 'unlabeled',
 'unlabeled',
 'unlabeled',
 'unlabeled',
 'antifungal',
 'unlabeled',
 'antibacterial',
 'cytotoxic',
 'cytotoxic',
 'unlabeled',
 'antifungal-cytotoxic',
 'antifungal-cytotoxic',
 'antibacterial',
 'unlabeled',
 'antibacterial-antifungal-cytotoxic',
 'antibacterial',
 'unlabeled',
 'antibacterial-cytotoxic',
 'antibacterial',
 'antibacterial-antifungal-cytotoxic',
 'antifungal',
 'antibacterial',
 'antibacterial',
 'unlabeled',
 'unknown',
 'unlabeled',
 'cytotoxic',
 'unlabeled',
 'antibacterial',
 'cytotoxic',
 'unknown',
 'unlabeled',
 'unlabeled',
 'antibacterial',
 'unlabeled',
 'antibacterial',
 'antifungal',
 'antifungal',
 'cytotoxic',
 'antibacterial',
 'antibacterial',
 'unlabeled',
 'unlabeled',
 'cytotoxic',
 'antifungal',
 'cytotoxic',
 'antifungal',
 'unlabeled',
 'unlabeled',
 'unlabeled',
 'antifungal',
 'antifungal-cytotoxic',
 'antifunga

In [48]:
final_knn_df['label'] = label_col

final_knn_df

Unnamed: 0,['consensus: X'],"['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: mal']","['KR activity: active', 'KR stereochemistry: B1']",['Minowa: NH2'],"['consensus: mmal', 'PKS signature: Malonyl-CoA', 'Minowa: mmal']","['KR activity: active', 'KR stereochemistry: A1']","['consensus: mal', 'PKS signature: (unknown)', 'Minowa: mal']",['consensus: gly'],['consensus: ser'],"['KR activity: active', 'KR stereochemistry: (unknown)']",...,RiPP,Butyrolactone,Nucleoside,Phenazine,Aminoglycoside,tRNA_derived,Phosphonate,Minor,Phenolic,label
BGC0000001,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,antibacterial
BGC0000002,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,antibacterial-antifungal
BGC0000003,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,unlabeled
BGC0000004,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,unlabeled
BGC0000006,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,unlabeled
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
main-chr.region008.gbk,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,cena_bgc
main-chr.region009.gbk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,cena_bgc
main-chr.region010.gbk,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,cena_bgc
main-chr.region011.gbk,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,cena_bgc


In [49]:
np.unique(final_knn_df['label'])

array(['antibacterial', 'antibacterial-antifungal',
       'antibacterial-antifungal-cytotoxic', 'antibacterial-cytotoxic',
       'antifungal', 'antifungal-cytotoxic', 'antihelmintic',
       'antioxidant', 'antiprotozoa', 'antiviral', 'cena_bgc',
       'cytotoxic', 'cytotoxic-unknown', 'herbicide', 'inhibitor',
       'other', 'pigment', 'regulatory', 'siderophore', 'unknown',
       'unlabeled'], dtype=object)

In [50]:
final_knn_df = final_knn_df[final_knn_df['label'] != 'unlabeled']

final_knn_df

Unnamed: 0,['consensus: X'],"['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: mal']","['KR activity: active', 'KR stereochemistry: B1']",['Minowa: NH2'],"['consensus: mmal', 'PKS signature: Malonyl-CoA', 'Minowa: mmal']","['KR activity: active', 'KR stereochemistry: A1']","['consensus: mal', 'PKS signature: (unknown)', 'Minowa: mal']",['consensus: gly'],['consensus: ser'],"['KR activity: active', 'KR stereochemistry: (unknown)']",...,RiPP,Butyrolactone,Nucleoside,Phenazine,Aminoglycoside,tRNA_derived,Phosphonate,Minor,Phenolic,label
BGC0000001,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,antibacterial
BGC0000002,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,antibacterial-antifungal
BGC0000014,0,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,antifungal
BGC0000018,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,antibacterial
BGC0000020,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,cytotoxic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
main-chr.region008.gbk,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,cena_bgc
main-chr.region009.gbk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,cena_bgc
main-chr.region010.gbk,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,cena_bgc
main-chr.region011.gbk,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,cena_bgc


# Running cross-validation

In [51]:
final_training_df = final_knn_df[final_knn_df['label'] != 'cena_bgc']

final_training_df

Unnamed: 0,['consensus: X'],"['consensus: mal', 'PKS signature: Malonyl-CoA', 'Minowa: mal']","['KR activity: active', 'KR stereochemistry: B1']",['Minowa: NH2'],"['consensus: mmal', 'PKS signature: Malonyl-CoA', 'Minowa: mmal']","['KR activity: active', 'KR stereochemistry: A1']","['consensus: mal', 'PKS signature: (unknown)', 'Minowa: mal']",['consensus: gly'],['consensus: ser'],"['KR activity: active', 'KR stereochemistry: (unknown)']",...,RiPP,Butyrolactone,Nucleoside,Phenazine,Aminoglycoside,tRNA_derived,Phosphonate,Minor,Phenolic,label
BGC0000001,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,antibacterial
BGC0000002,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,antibacterial-antifungal
BGC0000014,0,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,antifungal
BGC0000018,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,antibacterial
BGC0000020,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,cytotoxic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BGC0002028,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,antibacterial
BGC0002029,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,antibacterial-antifungal
BGC0002032,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,antifungal
BGC0002033,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,antibacterial


In [52]:
from sklearn.model_selection import StratifiedKFold

def get_filt_prec(jaccard_df):
    for cutoff in [0.3,0.5,0.7,0.9]:
        filtered_jaccard_df = jaccard_df[jaccard_df['jaccard_index'] >= cutoff]
        if len(filtered_jaccard_df) > 0:
            count = 0
            for i,r in filtered_jaccard_df.iterrows():
                if r['predicted_class'][0] not in r['true_class']:
                    count += 1
            precision_filtered = ((len(filtered_jaccard_df)-count)/len(filtered_jaccard_df))*100
            print('cutoff=%s'%cutoff,len(filtered_jaccard_df)-count,count,len(filtered_jaccard_df),round(precision_filtered,2))

X = final_training_df.drop('label',axis=1)
y = final_training_df['label']

skf = StratifiedKFold(n_splits=5, random_state=1066, shuffle=True)
for train_index, test_index in skf.split(X, y):
    training_df = final_training_df.iloc[train_index]
    testing_df = final_training_df.iloc[test_index]
    true_labels = testing_df['label']
    testing_df = testing_df.drop('label',axis=1)
    X_div = training_df.drop("label", axis=1)
    y_div = training_df["label"]
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(X_div,y_div)
    distances, indices = nbrs.kneighbors(testing_df)
    y_div = y_div.reset_index(drop=True)
    neighbors_array = []
    for item in indices:
        candidate_list = []
        for i in range(1):
            candidate_list.append(y_div[item[i]])
        neighbors_array.append(candidate_list)
    neighbors_array = np.asarray(neighbors_array)
    correct_count = 0
    incorrect_count = 0
    predict_col,true_col = [],[]
    for i,item in enumerate(true_labels):
        predict_col.append(neighbors_array[i])
        true_col.append(item)
        if item in neighbors_array[i]:
            correct_count += 1
        else:
            incorrect_count += 1
    precision_score = round(correct_count/len(true_labels)*100,2)
    print('cutoff=0.0',correct_count,incorrect_count,len(true_labels),precision_score)
    def get_binary(fingerprint):
        new_row = []
        for index,item in enumerate(fingerprint):
            if item != 0:
                new_row.append(1)
            else:
                new_row.append(0)
        fingerprint = new_row
        return fingerprint
    jaccard_col,bgc_col = [],[]
    for i,bgc_id in enumerate(testing_df.index):
        bgc_col.append(bgc_id)
        count = 0
        train_index = int(indices[i][0])
        for j,r in training_df.iterrows():
            if count == train_index:
                training_fp = training_df.loc[j].drop('label')
            count += 1
        testing_fp = testing_df.loc[bgc_id]
        training_binary = get_binary(training_fp)
        testing_binary = get_binary(testing_fp)
        jaccard_index = jaccard_score(training_binary,testing_binary)
        jaccard_col.append(round(jaccard_index,2))
    jaccard_dict = {'bgc_ID': bgc_col, 'predicted_class': predict_col, 'true_class': true_col, 'jaccard_index': jaccard_col}
    jaccard_df = pd.DataFrame(jaccard_dict)
    jaccard_df = jaccard_df.sort_values('jaccard_index',ascending = False)
    jaccard_df = jaccard_df.reset_index(drop=True)
    get_filt_prec(jaccard_df)



cutoff=0.0 69 113 182 37.91
cutoff=0.3 81 99 180 45.0
cutoff=0.5 81 95 176 46.02
cutoff=0.7 75 83 158 47.47
cutoff=0.9 53 58 111 47.75
cutoff=0.0 68 114 182 37.36
cutoff=0.3 83 96 179 46.37
cutoff=0.5 82 92 174 47.13
cutoff=0.7 69 75 144 47.92
cutoff=0.9 51 58 109 46.79
cutoff=0.0 73 109 182 40.11
cutoff=0.3 81 96 177 45.76
cutoff=0.5 80 93 173 46.24
cutoff=0.7 75 77 152 49.34
cutoff=0.9 51 54 105 48.57
cutoff=0.0 72 110 182 39.56
cutoff=0.3 86 95 181 47.51
cutoff=0.5 82 93 175 46.86
cutoff=0.7 81 70 151 53.64
cutoff=0.9 58 52 110 52.73
cutoff=0.0 80 102 182 43.96
cutoff=0.3 89 93 182 48.9
cutoff=0.5 89 89 178 50.0
cutoff=0.7 79 72 151 52.32
cutoff=0.9 58 49 107 54.21


# Conclusion: 
## - Just class and substructure is not enough to predict bioactivity
## - We must use some sort of gene similarity
## - Testing the CENA samples won't produce fruitful results since the cross-validation failed