In [1]:
import pandas as pd
import numpy as np
import re
import networkx
from networkx.algorithms.components.connected import connected_components
from collections import defaultdict
import random
import glob
import os
from Bio import SeqIO
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import jaccard_score
import time
start = time.time()

# Loading MIBiG dataframes and creating dictionaries

In [2]:
bioactivity_df = pd.read_csv("./bioactivity_df-220203.csv",sep='\t',index_col=0)

bioactivity_dict = dict(zip(bioactivity_df['metabolite'],bioactivity_df['activity']))

bioactivity_dict

{'abyssomicin': 'antibacterial',
 'aurafuron': 'antifungal-cytotoxic',
 'aureothin': 'antibacterial-antifungal-cytotoxic',
 'avilamycin': 'antibacterial',
 'bafilomycin': 'antibacterial-antifungal-cytotoxic',
 'borrelidin': 'antibacterial-cytotoxic',
 'chlorothricin': 'antibacterial',
 'coelimycin': 'unknown',
 'dawenol': 'unknown',
 'erythromycin': 'antibacterial',
 'galbonolides': 'antifungal',
 'kedarcidin': 'antibacterial-cytotoxic',
 'lactimidomycin': 'antifungal-cytotoxic',
 'neoaureothin': 'antifungal-cytotoxic',
 'neocarzinostatin': 'antibacterial-cytotoxic',
 'nystatin': 'antifungal',
 'pactamycin': 'antibacterial-antifungal-cytotoxic',
 'salinomycin': 'antibacterial-cytotoxic',
 'soraphen': 'antifungal',
 'spirangien': 'antifungal-cytotoxic',
 'sporolide': 'unknown',
 'stambomycin': 'antibacterial-cytotoxic',
 'stigmatellin': 'antifungal',
 'tautomycetin': 'antifungal-cytotoxic',
 'tautomycin': 'antifungal-cytotoxic',
 'tiacumicin': 'antibacterial',
 '9methylstreptimidone': '

In [3]:
np.unique(list(bioactivity_dict.values()))

array(['antibacterial', 'antibacterial-antifungal',
       'antibacterial-antifungal-cytotoxic', 'antibacterial-cytotoxic',
       'antifungal', 'antifungal-cytotoxic', 'cytotoxic',
       'cytotoxic-unknown', 'nan', 'unknown'], dtype='<U34')

In [4]:
mibig_df = pd.read_csv("/Users/tiagoferreiraleao/Dropbox/tiago-NAS/mibig_classifications/All_MIBiG_compounds_with_CF_NPC_classes.txt",sep='\t')

mibig_df

Unnamed: 0,compound_name,smiles,inchi_key,cf_kingdom,cf_superclass,cf_class,cf_subclass,cf_direct_parent,npc_class,npc_superclass,npc_pathway,npc_isglycoside
0,BGC0000001_abyssomicin C,CC1C[C@]23OC(=O)C4=C2OC1C(O)C3\C=C/C(=O)[C@@H]...,FNEADFUPWHAVTA-UHFFFAOYSA-N,Organic compounds,Organoheterocyclic compounds,Oxanes,,Oxanes,Spirotetronate macrolides,Macrolides,Polyketides,0
1,BGC0000001_atrop-abyssomicin C,CC1CC23OC(=O)C4=C2OC1C(O)C3\C=C/C(=O)C(C)CC(C)...,FNEADFUPWHAVTA-UHFFFAOYSA-N,Organic compounds,Organoheterocyclic compounds,Oxanes,,Oxanes,Spirotetronate macrolides,Macrolides,Polyketides,0
2,BGC0000002_aculeximycin,CCCC(O[C@H]1C[C@](C)(N)[C@H](O)[C@H](C)O1)C(C)...,VJKZKLDZOAFAEE-UHFFFAOYSA-N,Organic compounds,Lipids and lipid-like molecules,Prenol lipids,Terpene glycosides,Diterpene glycosides,,,,1
3,BGC0000003_AF-toxin,CCC(C)C(C(=O)OC(/C=C/C=C/C=C/C(=O)O)C1(CO1)C)O...,ONOBRFRRMLDPES-UHFFFAOYSA-N,Organic compounds,Organic acids and derivatives,Peptidomimetics,Depsipeptides,Depsipeptides,,,,0
4,BGC0000004_aflatoxin G1,[H][C@@]12OC=C[C@]1([H])C1=C(O2)C=C(OC)C2=C1OC...,XWIYFDMXXLINPU-UHFFFAOYSA-N,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Furanocoumarins,Difurocoumarolactones,Aflatoxins; Simple coumarins,Chromanes; Coumarins,Polyketides; Shikimates and Phenylpropanoids,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2107,BGC0002034_perquinoline A,C1CC2(C(C3=C(C=C(C=C3O)O)C(N2C1=O)C(=O)NCCCC(=...,CDFQOHHGTNUFPU-UHFFFAOYSA-N,,,,,,,,Alkaloids,0
2108,BGC0002034_perquinoline B,C1CC2(C(C3=C(C=C(C=C3O)O)C(N2C1=O)C(=O)NCCC(=O...,XHJZZGRZEGUFCM-UHFFFAOYSA-N,,,,,,,,Alkaloids,0
2109,BGC0002034_perquinoline C,C1CC2(C(C3=C(C=C(C=C3O)O)C(N2C1=O)C(=O)NCCCC(=...,ZNSLKIGPUIPFNC-UHFFFAOYSA-N,,,,,,,,Alkaloids,0
2110,BGC0002035_ilicicolin H,CC=CC1C2CC(CCC2C(=CC1C(=O)C3=C(C(=CNC3=O)C4=CC...,BYVVOONSAAQMKI-UHFFFAOYSA-N,Organic compounds,Organoheterocyclic compounds,Pyridines and derivatives,Phenylpyridines,Phenylpyridines,Pyridine alkaloids,Nicotinic acid alkaloids,Alkaloids,0


In [5]:
mibig_activity_dict = {}
missing_mets_list = []

for i,r in mibig_df.iterrows():
    met_name = r['compound_name'].split('_')[1].replace(' ','_').lower().split('_')[0]
    for key in bioactivity_dict.keys():
        if type(key) != float:
            if met_name in key:
                print(r['compound_name'].split('_')[0],met_name,key,bioactivity_dict[key])
                if type(bioactivity_dict[key]) == float:
                    mibig_activity_dict[r['compound_name'].split('_')[0]] = 'unknown'
                else:
                    mibig_activity_dict[r['compound_name'].split('_')[0]] = bioactivity_dict[key]
            else:
                missing_mets_list.append(met_name)

BGC0000001 abyssomicin abyssomicin antibacterial
BGC0000002 aculeximycin aculeximycin antibacterial-antifungal
BGC0000014 ambruticin ambruticin antifungal
BGC0000016 amphotericin amphotericin antifungal
BGC0000020 ansamitocin ansamitocin cytotoxic
BGC0000021 apoptolidin apoptolidin cytotoxic
BGC0000023 aurafuron aurafuron antifungal-cytotoxic
BGC0000024 aureothin aureothin antibacterial-antifungal-cytotoxic
BGC0000024 aureothin neoaureothin antifungal-cytotoxic
BGC0000025 avermectin avermectin cytotoxic
BGC0000026 avilamycin avilamycin antibacterial
BGC0000026 avilamycin avilamycin antibacterial
BGC0000028 bafilomycin bafilomycin antibacterial-antifungal-cytotoxic
BGC0000031 borrelidin borrelidin antibacterial-cytotoxic
BGC0000033 calicheamicin calicheamicin antibacterial-antifungal-cytotoxic
BGC0000034 candicidin candicidin antifungal
BGC0000035 chalcomycin chalcomycin antibacterial
BGC0000035 chalcomycin dihyrdochalcomycin unknown
BGC0000036 chlorothricin chlorothricin antibacterial


BGC0000193 aclacinomycin aclacinomycin cytotoxic
BGC0000193 aclacinomycin aclacinomycin_2 cytotoxic
BGC0000193 aclacinomycin aclacinomycin_3 cytotoxic
BGC0000194 actinorhodin actinorhodin antibacterial
BGC0000195 alnumycin alnumycin antibacterial-cytotoxic
BGC0000195 alnumycin alnumycin antibacterial-cytotoxic
BGC0000195 alnumycin alnumycin antibacterial-cytotoxic
BGC0000195 alnumycin alnumycin antibacterial-cytotoxic
BGC0000197 aranciamycin aranciamycin antibacterial-cytotoxic
BGC0000198 arenimycin arenimycin antibacterial-cytotoxic
BGC0000199 arimetamycin arimetamycin cytotoxic
BGC0000199 arimetamycin arimetamycin cytotoxic
BGC0000199 arimetamycin arimetamycin cytotoxic
BGC0000200 arixanthomycin arixanthomycin antibacterial-cytotoxic
BGC0000200 arixanthomycin arixanthomycin antibacterial-cytotoxic
BGC0000200 arixanthomycin arixanthomycin antibacterial-cytotoxic
BGC0000201 auricin auricin antibacterial
BGC0000201 auricin auricin_deoxysugar_moieties unknown
BGC0000202 azicemicin azicem

BGC0000359 fuscachelin fuscachelin nan
BGC0000362 glycopeptidolipid glycopeptidolipid nan
BGC0000362 glycopeptidolipid glycopeptidolipid_1 nan
BGC0000362 glycopeptidolipid glycopeptidolipid_2 nan
BGC0000362 glycopeptidolipid glycopeptidolipid_3 nan
BGC0000362 glycopeptidolipid glycopeptidolipid_4 nan
BGC0000366 gobichelin gobichelin nan
BGC0000366 gobichelin gobichelin nan
BGC0000367 gramicidin gramicidin antibacterial
BGC0000371 heterobactin heterobactin nan
BGC0000371 heterobactin heterobactin nan
BGC0000373 holomycin holomycin antibacterial-antifungal-cytotoxic
BGC0000374 hormaomycin hormaomycin antibacterial
BGC0000374 hormaomycin hormaomycin antibacterial
BGC0000374 hormaomycin hormaomycin antibacterial
BGC0000374 hormaomycin hormaomycin antibacterial
BGC0000374 hormaomycin hormaomycin antibacterial
BGC0000374 hormaomycin hormaomycin antibacterial
BGC0000375 indigoidine indigoidine antibacterial
BGC0000377 koranimine koranimine unknown
BGC0000378 kutzneride kutznerides antifungal


BGC0000609 nocathiacin nocathiacin antibacterial
BGC0000609 nocathiacin nocathiacin_2 antibacterial
BGC0000610 nosiheptide nosiheptide antibacterial
BGC0000611 siomycin siomycin antibacterial-cytotoxic
BGC0000611 siomycin siomycin_2 antibacterial-cytotoxic
BGC0000612 thiocillin thiocillin antibacterial
BGC0000613 thiomuracin thiomuracins antibacterial
BGC0000614 thiostrepton thiostrepton antibacterial-cytotoxic
BGC0000618 enterocin enterocin antibacterial
BGC0000618 enterocin enterocin_AS_48 antibacterial
BGC0000618 enterocin enterocin_a antibacterial
BGC0000618 enterocin enterocin_nkr_5_3b antibacterial
BGC0000625 thioviridamide thioviridamide cytotoxic-unknown
BGC0000628 lactocillin lactocillin antibacterial
BGC0000632 brasilicardin brasilicardin_a antifungal-cytotoxic
BGC0000633 carotenoid carotenoid nan
BGC0000633 carotenoid rubivivax_gelatnosus_carotenoid nan
BGC0000633 carotenoid rhobacter_sphaeroides_241_carotenoid nan
BGC0000633 carotenoid pantoea_anantis_carotenoid nan
BGC0000

BGC0000677 cyclooctatin cyclooctatin nan
BGC0000678 pentalenolactone pentalenolactone antibacterial-antifungal
BGC0000678 pentalenolactone pentalenolactone_2 antibacterial-antifungal
BGC0000679 diazepinomicin diazepinomicin antibacterial
BGC0000690 hydroxystreptomycin 5_hydroxystreptomycin unknown
BGC0000691 acarbose acarbose nan
BGC0000692 apramycin apramycin antibacterial
BGC0000693 butirosin butirosin antibacterial
BGC0000693 butirosin butirosin_2 antibacterial
BGC0000693 butirosin butirosin antibacterial
BGC0000693 butirosin butirosin_2 antibacterial
BGC0000694 butirosin butirosin antibacterial
BGC0000694 butirosin butirosin_2 antibacterial
BGC0000695 fortimicin fortimicin antibacterial
BGC0000696 gentamicin gentamicin antibacterial
BGC0000696 gentamicin gentamicin_2 antibacterial
BGC0000698 hygromycin hygromycin_a antibacterial
BGC0000698 hygromycin hygromycin_b antibacterial-antifungal
BGC0000699 hygromycin hygromycin_a antibacterial
BGC0000699 hygromycin hygromycin_b antibacteri

BGC0000752 capsular capsular_polysaccharide_16 nan
BGC0000752 capsular capsular_polysaccharide_17 nan
BGC0000752 capsular capsular_polysaccharide_18 nan
BGC0000752 capsular capsular_polysaccharide_19 nan
BGC0000752 capsular capsular_polysaccharide_20 nan
BGC0000752 capsular capsular_polysaccharide_21 nan
BGC0000752 capsular capsular_polysaccharide_22 nan
BGC0000752 capsular capsular_polysaccharide_23 nan
BGC0000752 capsular capsular_polysaccharide_24 nan
BGC0000752 capsular capsular_polysaccharide_25 nan
BGC0000752 capsular capsular_polysaccharide_26 nan
BGC0000752 capsular capsular_polysaccharide_27 nan
BGC0000752 capsular capsular_polysaccharide_28 nan
BGC0000752 capsular capsular_polysaccharide_29 nan
BGC0000752 capsular capsular_polysaccharide_30 nan
BGC0000753 capsular capsular_polysaccharide nan
BGC0000753 capsular capsular_polysaccharide_2 nan
BGC0000753 capsular capsular_polysaccharide_3 nan
BGC0000753 capsular capsular_polysaccharide_4 nan
BGC0000753 capsular capsular_polysacc

BGC0000950 napsamycin napsamycin antibacterial
BGC0000950 napsamycin napsamycin antibacterial
BGC0000950 napsamycin napsamycin antibacterial
BGC0000950 napsamycin napsamycin antibacterial
BGC0000951 pacidamycin pacidamycin antibacterial
BGC0000951 pacidamycin pacidamycin antibacterial
BGC0000951 pacidamycin pacidamycin antibacterial
BGC0000951 pacidamycin pacidamycin antibacterial
BGC0000951 pacidamycin pacidamycin antibacterial
BGC0000951 pacidamycin pacidamycin antibacterial
BGC0000951 pacidamycin pacidamycin antibacterial
BGC0000951 pacidamycin pacidamycin antibacterial
BGC0000953 amicetin amicetin antibacterial
BGC0000954 ajudazol ajudazol antifungal
BGC0000955 althiomycin althiomycin antibacterial
BGC0000955 althiomycin althiomycin_2 antibacterial
BGC0000956 andrimid andrimid antibacterial
BGC0000958 antimycin antimycin antifungal-cytotoxic
BGC0000960 azinomycin fosfazinomycin antifungal
BGC0000960 azinomycin azinomycin_b antibacterial-cytotoxic
BGC0000961 bactobolin bactobolin an

BGC0001126 ambiguine ambiguine antibacterial-antifungal-cytotoxic
BGC0001126 ambiguine ambiguine antibacterial-antifungal-cytotoxic
BGC0001126 ambiguine ambiguine antibacterial-antifungal-cytotoxic
BGC0001127 jagaricin jagaricin antifungal
BGC0001132 xenotetrapeptide xenotetrapeptide unknown
BGC0001133 taxlllaid taxlllaid nan
BGC0001137 marinacarboline marinacarboline cytotoxic
BGC0001137 marinacarboline marinacarboline cytotoxic
BGC0001137 marinacarboline marinacarboline cytotoxic
BGC0001137 marinacarboline marinacarboline cytotoxic
BGC0001140 platencin platencin antibacterial
BGC0001140 platensimycin platensimycin antibacterial
BGC0001145 cyclothiazomycin cyclothiazomycin antifungal
BGC0001145 cyclothiazomycin cyclothiazomycin_B antibacterial-antifungal
BGC0001145 cyclothiazomycin cyclothiazomycin_C antibacterial
BGC0001146 cyclothiazomycin cyclothiazomycin antifungal
BGC0001146 cyclothiazomycin cyclothiazomycin_B antibacterial-antifungal
BGC0001146 cyclothiazomycin cyclothiazomycin_

BGC0001393 lugdunin lugdunin antibacterial
BGC0001394 phenalamide phenalamide antibacterial-antifungal
BGC0001395 teichomycin teichomycin antibacterial
BGC0001396 aldgamycin aldgamycin antibacterial
BGC0001396 aldgamycin aldgamycin antibacterial
BGC0001396 aldgamycin aldgamycin antibacterial
BGC0001396 aldgamycin aldgamycin antibacterial
BGC0001397 lidamycin validamycin antibacterial-antifungal
BGC0001397 lidamycin validamycin_2 antibacterial-antifungal
BGC0001397 lidamycin lidamycin antibacterial-cytotoxic
BGC0001406 telomycin telomycin antibacterial
BGC0001409 dutomycin dutomycin antibacterial-cytotoxic
BGC0001414 griselimycin griselimycin antibacterial
BGC0001415 althiomycin althiomycin antibacterial
BGC0001415 althiomycin althiomycin_2 antibacterial
BGC0001417 myxochromide myxochromide nan
BGC0001417 myxochromide myxochromide_d unknown
BGC0001417 myxochromide myxochromide_d_2 unknown
BGC0001417 myxochromide myxochromide_d_3 unknown
BGC0001417 myxochromide myxochromide_d_4 unknown
B

BGC0001558 cosmomycin cosmomycin cytotoxic
BGC0001558 cosmomycin cosmomycin_d antibacterial-cytotoxic
BGC0001566 cylindrocyclophane cylindrocyclophane cytotoxic
BGC0001572 desferrioxamine desferrioxamine nan
BGC0001572 desferrioxamine desferrioxamine_2 nan
BGC0001572 desferrioxamine desferrioxamine_b nan
BGC0001580 ebelactone ebelactone antifungal
BGC0001603 gentamicin gentamicin antibacterial
BGC0001603 gentamicin gentamicin_2 antibacterial
BGC0001612 ambiguine ambiguine antibacterial-antifungal-cytotoxic
BGC0001614 hassallidin hassallidins antifungal
BGC0001648 lasalocid lasalocid antibacterial
BGC0001670 monensin monensin antibacterial
BGC0001690 natamycin natamycin antifungal
BGC0001705 nodularin nodularin nan
BGC0001707 nosiheptide nosiheptide antibacterial
BGC0001709 nystatin nystatin antifungal
BGC0001710 obafluorin obafluorin antibacterial
BGC0001723 oviedomycin oviedomycin cytotoxic
BGC0001737 phenalamide phenalamide antibacterial-antifungal
BGC0001742 piericidin piericidin_a1

In [6]:
len(mibig_activity_dict)

806

In [7]:
mibig_activity_dict

{'BGC0000001': 'antibacterial',
 'BGC0000002': 'antibacterial-antifungal',
 'BGC0000014': 'antifungal',
 'BGC0000016': 'antifungal',
 'BGC0000020': 'cytotoxic',
 'BGC0000021': 'cytotoxic',
 'BGC0000023': 'antifungal-cytotoxic',
 'BGC0000024': 'antifungal-cytotoxic',
 'BGC0000025': 'cytotoxic',
 'BGC0000026': 'antibacterial',
 'BGC0000028': 'antibacterial-antifungal-cytotoxic',
 'BGC0000031': 'antibacterial-cytotoxic',
 'BGC0000033': 'antibacterial-antifungal-cytotoxic',
 'BGC0000034': 'antifungal',
 'BGC0000035': 'unknown',
 'BGC0000036': 'antibacterial',
 'BGC0000038': 'unknown',
 'BGC0000040': 'cytotoxic',
 'BGC0000042': 'antibacterial',
 'BGC0000043': 'cytotoxic',
 'BGC0000044': 'unknown',
 'BGC0000051': 'antifungal',
 'BGC0000053': 'cytotoxic',
 'BGC0000054': 'antibacterial',
 'BGC0000055': 'antibacterial',
 'BGC0000059': 'antifungal',
 'BGC0000060': 'cytotoxic',
 'BGC0000061': 'antifungal',
 'BGC0000066': 'antifungal-cytotoxic',
 'BGC0000067': 'antifungal-cytotoxic',
 'BGC0000068'

In [8]:
mibig_bio_3_df = pd.read_csv('./mibig_3_bioactivity.csv',sep='\t',names=['MIBIG_ID','Activity'])

mibig_bio_3_dict = dict(zip(mibig_bio_3_df['MIBIG_ID'],mibig_bio_3_df['Activity']))

mibig_bio_3_dict

{'BGC0000018': 'Antibacterial',
 'BGC0000019': 'Antibacterial',
 'BGC0000025': 'Cytotoxic',
 'BGC0000032': 'Antibacterial',
 'BGC0000034': 'Antifungal',
 'BGC0000035': 'Antibacterial',
 'BGC0000040': 'Antiviral',
 'BGC0000042': 'Antibacterial',
 'BGC0000047': 'Antibacterial',
 'BGC0000050': 'Antibacterial',
 'BGC0000052': 'Antifungal',
 'BGC0000058': 'Cytotoxic',
 'BGC0000059': 'Antibacterial',
 'BGC0000060': 'Cytotoxic',
 'BGC0000061': 'Antifungal',
 'BGC0000066': 'Antibacterial',
 'BGC0000067': 'Antibacterial',
 'BGC0000068': 'Antibacterial',
 'BGC0000073': 'Antibacterial',
 'BGC0000074': 'Antibacterial',
 'BGC0000075': 'Antibacterial',
 'BGC0000078': 'Cytotoxic',
 'BGC0000079': 'Antibacterial',
 'BGC0000084': 'Antibacterial',
 'BGC0000085': 'Antibacterial',
 'BGC0000086': 'Antibacterial',
 'BGC0000087': 'Antibacterial',
 'BGC0000091': 'Cytotoxic',
 'BGC0000093': 'Antibacterial',
 'BGC0000096': 'Antibacterial',
 'BGC0000100': 'Antiprotozoa',
 'BGC0000105': 'Antibacterial',
 'BGC00001

In [9]:
for key in mibig_activity_dict:
    if mibig_activity_dict[key] == 'unknown':
        if key in mibig_bio_3_dict.keys():
            mibig_activity_dict[key] = mibig_bio_3_dict[key].lower()

for key in mibig_bio_3_dict:
    if key not in mibig_activity_dict:
        mibig_activity_dict[key] = mibig_bio_3_dict[key].lower()

In [10]:
bgc_subtype_df = pd.read_csv("/Users/tiagoferreiraleao/Dropbox/tiago-NAS/NRPOmix/bgc_subtype_df.csv",'\t',names=['BGC','subtype'])

bgc_subtype_df

Unnamed: 0,BGC,subtype
0,BGC0000001,Modular type I polyketide
1,BGC0000002,Polyketide
2,BGC0000003,Polyketide
3,BGC0000004,Polyketide
4,BGC0000005,Polyketide
...,...,...
1921,BGC0002045,Type II polyketide
1922,BGC0002055,Trans-AT type I polyketide
1923,BGC0002056,Trans-AT type I polyketide
1924,BGC0002057,Trans-AT type I polyketide


In [11]:
subtype_type_dict = {}
with open("/Users/tiagoferreiraleao/Dropbox/tiago-NAS/NRPOmix/subtype_type_df.csv") as f:
    for line in f:
        (key, val) = line.split(',')
        subtype_type_dict[key] = val.strip('\n')
        
subtype_type_dict

{'Modular type I polyketide': 'PKS',
 'Polyketide': 'PKS',
 'Other': 'Other',
 'Alkaloid Modular type I polyketide': 'Alkaloid-PKS',
 'Oligosaccharide': 'Oligosaccharide',
 'Iterative type I polyketide': 'PKS',
 'Modular type I polyketide Iterative type I polyketide Oligosaccharide': 'Oligosaccharide-PKS',
 'Modular type I polyketide Hybrid/tailoring saccharide': 'Oligosaccharide-PKS',
 'Iterative type I polyketide Enediyne type I polyketide': 'PKS',
 'Modular type I polyketide Trans-AT type I polyketide': 'PKS',
 'Terpene Iterative type I polyketide': 'PKS-Terpene',
 'Iterative type I polyketide Hybrid/tailoring saccharide': 'Oligosaccharide-PKS',
 'NRP Enediyne type I polyketide': 'NRPS-PKS',
 'NRP Modular type I polyketide': 'NRPS-PKS',
 'Trans-AT type I polyketide': 'PKS',
 'Polyketide NRP': 'NRPS-PKS',
 'Iterative type I polyketide Trans-AT type I polyketide': 'PKS',
 'Type II polyketide': 'PKS',
 'Alkaloid': 'Alkaloid',
 'Type III polyketide': 'PKS',
 'Type II polyketide Hybrid/t

In [12]:
type_col = []

for i,r in bgc_subtype_df.iterrows():
    type_col.append(subtype_type_dict[r['subtype']])
    
bgc_subtype_df['type'] = type_col

bgc_subtype_df

Unnamed: 0,BGC,subtype,type
0,BGC0000001,Modular type I polyketide,PKS
1,BGC0000002,Polyketide,PKS
2,BGC0000003,Polyketide,PKS
3,BGC0000004,Polyketide,PKS
4,BGC0000005,Polyketide,PKS
...,...,...,...
1921,BGC0002045,Type II polyketide,PKS
1922,BGC0002055,Trans-AT type I polyketide,PKS
1923,BGC0002056,Trans-AT type I polyketide,PKS
1924,BGC0002057,Trans-AT type I polyketide,PKS


In [13]:
bgc_type_dict = dict(zip(bgc_subtype_df.BGC,bgc_subtype_df.type))

bgc_type_dict

{'BGC0000001': 'PKS',
 'BGC0000002': 'PKS',
 'BGC0000003': 'PKS',
 'BGC0000004': 'PKS',
 'BGC0000005': 'PKS',
 'BGC0000006': 'PKS',
 'BGC0000007': 'PKS',
 'BGC0000008': 'PKS',
 'BGC0000009': 'PKS',
 'BGC0000010': 'PKS',
 'BGC0000011': 'PKS',
 'BGC0000012': 'PKS',
 'BGC0000013': 'PKS',
 'BGC0000014': 'PKS',
 'BGC0000016': 'Other',
 'BGC0000017': 'Alkaloid-PKS',
 'BGC0000018': 'PKS',
 'BGC0000019': 'PKS',
 'BGC0000020': 'PKS',
 'BGC0000021': 'PKS',
 'BGC0000022': 'PKS',
 'BGC0000023': 'PKS',
 'BGC0000024': 'PKS',
 'BGC0000025': 'PKS',
 'BGC0000026': 'Oligosaccharide',
 'BGC0000027': 'PKS',
 'BGC0000028': 'PKS',
 'BGC0000029': 'PKS',
 'BGC0000030': 'PKS',
 'BGC0000031': 'PKS',
 'BGC0000032': 'PKS',
 'BGC0000033': 'PKS',
 'BGC0000034': 'PKS',
 'BGC0000035': 'PKS',
 'BGC0000036': 'Oligosaccharide-PKS',
 'BGC0000037': 'PKS',
 'BGC0000038': 'PKS',
 'BGC0000039': 'PKS',
 'BGC0000040': 'PKS',
 'BGC0000041': 'PKS',
 'BGC0000042': 'PKS',
 'BGC0000043': 'PKS',
 'BGC0000044': 'PKS',
 'BGC0000045': 

# Getting classes for CENA samples

In [64]:
def get_feature_class(bgc_name,genome_name):
    filename = "/Users/tiagoferreiraleao/Dropbox/tiago-NAS/BioGeneClassifier/antismash_results_CENAs/%s/%s"%(genome_name,bgc_name)
    if os.path.exists(filename):
        input_handle = open(filename,'r')
        for seq_record in SeqIO.parse(input_handle,'genbank'):
            edge_list,type_list = [],[]
            for feature in seq_record.features:
                if feature.type == 'cand_cluster':
                    for qual in feature.qualifiers:
                        if qual == 'contig_edge':
                            ctg_edge = feature.qualifiers[qual]
                            edge_list.append(ctg_edge[0])
                        if qual == 'product':
                            bgc_type = feature.qualifiers[qual]
                            for item_type in bgc_type:
                                if item_type not in type_list:
                                    type_list.append(item_type)
            return edge_list,type_list
    else:
        return 'NA','NA'
            
all_classes = []

for root, dirs, files in os.walk("/Users/tiagoferreiraleao/Dropbox/tiago-NAS/BGClassifier/antismash_results_CENAs/"):
    for file in files:
        if file.endswith(".gbk"):
            ctg_edge,class_list = get_feature_class(file,os.path.basename(root))
            for class_type in class_list:
                all_classes.append(class_type)
    
all_classes = np.unique(all_classes)

all_classes

array(['A', 'N'], dtype='<U1')

In [15]:
common_dict = {"A":"NA",
"CDPS":"NRPS",
"LAP":"RiPP",
"N":"NA",
"NAGGN":"NRPS",
"NRPS":"NRPS",
"NRPS-like":"NRPS",
"PBDE":"Phenolic",
"PKS-like":"PKS",
"PUFA":"PKS",
"PpyS-KS":"PKS",
"T1PKS":"PKS",
"T2PKS":"PKS",
"T3PKS":"PKS",
"TfuA-related":"RiPP",
"amglyccycl":"Minor",
"arylpolyene":["PKS","Phenolic"],
"bacteriocin":"RiPP",
"betalactone":"Minor",
"blactam":"Minor",
"butyrolactone":"Minor",
"cyanobactin":"RiPP",
"ectoine":"Minor",
"furan":"Minor",
"fused":"Minor",
"hglE-KS":"PKS",
"hserlactone":["PKS","NRPS"],
"indole":"Minor",
"ladderane":"RiPP",
"lanthipeptide":"RiPP",
"lassopeptide":"RiPP",
"linaridin":"RiPP",
"melanin":"NRPS",
"microviridin":"RiPP",
"nucleoside":"Nucleoside",
"oligosaccharide":"Oligosaccharide",
"other":"Other",
"phenazine":"Minor",
"phosphonate":"Phosphonate",
"proteusin":"RiPP",
"resorcinol":"PKS",
"sactipeptide":"RiPP",
"siderophore":"Siderophore",
"terpene":"Terpene",
"thiopeptide":"RiPP",
"transAT-PKS":"PKS",
"transAT-PKS-like":"PKS",
"RRE-containing":"RiPP",
"RiPP-like":"RiPP",
"lanthipeptide-class-ii":"RiPP",
"lanthipeptide-class-v":"RiPP",
"spliceotide":"Minor",
"thioamitides":"RiPP"}

In [16]:
for item in all_classes:
    if item not in common_dict.keys():
        print(item)

# Creating class dataframe

In [17]:
combined_list1,all_classes,type_list = [],[],[]

for key in bgc_type_dict:
    if 'tRNA' in bgc_type_dict[key]:
        type_list = ['tRNA_derived']
    else:
        type_list = bgc_type_dict[key].split('-')
    for item in type_list:
        if item not in all_classes:
            all_classes.append(item)
    combined_list1.append(type_list)
    
combined_list1

[['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Other'],
 ['Alkaloid', 'PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide', 'PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide', 'PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide', 'PKS'],
 ['PKS'],
 ['PKS'

In [53]:
combined_list2 = []
cena_bgc_list = []

for root, dirs, files in os.walk("/Users/tiagoferreiraleao/Dropbox/tiago-NAS/BGClassifier/antismash_results_CENAs/"):
    for file in files:
        if 'region' in file:
            if file.endswith(".gbk"):
                ctg_edge,class_list = get_feature_class(file,os.path.basename(root))
                combined_classes = []
                for bgc_class in class_list.split(','):
                    if type(common_dict[bgc_class]) != list:
                        combined_classes.append(common_dict[bgc_class])
                        if common_dict[bgc_class] not in all_classes:
                            all_classes.append(common_dict[bgc_class])
                    else:
                        for item in common_dict[bgc_class]:
                            combined_classes.append(item)
                            if item not in all_classes:
                                all_classes.append(item)
                cena_bgc_list.append(file.split('.gbk')[0])
                combined_list2.append(combined_classes)
    
combined_list2

NA


KeyError: 'NA'

In [19]:
combined_list = combined_list1 + combined_list2

combined_list

[['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Other'],
 ['Alkaloid', 'PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide', 'PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide', 'PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['PKS'],
 ['Oligosaccharide', 'PKS'],
 ['PKS'],
 ['PKS'

In [20]:
class_df = pd.DataFrame(columns=(all_classes))

bgc_list = list(bgc_type_dict.keys()) + cena_bgc_list

for i,combined_classes in enumerate(combined_list):
    row_extension = []
    for final_class in all_classes:
        if final_class in combined_classes:
            row_extension.append(1)
        else:
            row_extension.append(0)
    class_df.loc[bgc_list[i]] = row_extension
    
class_df = class_df.sort_index(axis=0)
    
class_df

Unnamed: 0,PKS,Other,Alkaloid,Oligosaccharide,Terpene,NRPS,Cyclitol,Aminocoumarin,Betalactam,Siderophore,Pyrrolobenzodiazepine,RiPP,Butyrolactone,Nucleoside,Phenazine,Aminoglycoside,tRNA_derived,Phosphonate,NA
BGC0000001,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000002,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000003,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000004,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000005,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
main-chr.region008,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
main-chr.region009,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
main-chr.region010,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
main-chr.region011,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


# Creating similarity dataframe

In [21]:
### obtaining bigscape dataframe and bigscape dictionary

def parse_bigscape_df(input_file,no_orphans):
    bigscape_df = pd.read_csv(input_file,sep='\t')
    bigscape_df.rename(columns=lambda x: re.sub(" ","_",x), inplace=True)
    if no_orphans == 1:
        bigscape_df = bigscape_df[bigscape_df.Clustername_1 != bigscape_df.Clustername_2]
    bigscape_df.reset_index(inplace=True,drop=True)
    return bigscape_df

def get_neighbors(target,dataframe,column1,column2):
    subset1 = dataframe[(dataframe[column1]==target)]
    subcat = subset1.append(dataframe[(dataframe[column2]==target)])
    temp_list = []
    for index,row in subcat.iterrows():
        temp_list.append(subcat[column1][index])
        temp_list.append(subcat[column2][index])
    temp_list = list(np.unique(temp_list))
    return temp_list

def to_edges(l):
    it = iter(l)
    last = next(it)
    for current in it:
        yield last, current
        last = current

def to_graph(l):
    G = networkx.Graph()
    for part in l:
        G.add_nodes_from(part)
        G.add_edges_from(to_edges(part))
    return G

def get_family_dict(components_list,dataframe,dictionary,column1,column2,column3):
    count = 0
    for family in list(components_list):
        count += 1
        for fam_member in family:
            dictionary['GCF%s'%count].append(fam_member)
    return dictionary

def main_get_families(input_file):
    bigscape_df = parse_bigscape_df(input_file,1)
    strain_list = list(np.unique([bigscape_df['Clustername_1']]+[bigscape_df['Clustername_2']]))
    targets_list = np.unique([bigscape_df.Clustername_1,bigscape_df.Clustername_2])
    neighbors_list = []
    for target in targets_list:
        neighbors_list.append(get_neighbors(target,bigscape_df,'Clustername_1','Clustername_2'))
    G = to_graph(neighbors_list)
    C = connected_components(G)
    gcf_dict = defaultdict(list)
    gcf_dict = get_family_dict(C,bigscape_df,gcf_dict,'Clustername_1','Clustername_2','Raw_distance')
    return bigscape_df,gcf_dict,strain_list


bigscape_df,bigscape_dict,strain_list = main_get_families("./bigscape_all_c030_220521_1898samples.txt")
bigscape_df["Raw_distance"] = 1-bigscape_df["Raw_distance"]

bigscape_dict

defaultdict(list,
            {'GCF1': ['BGC0000008.1.region001',
              'BGC0000004.1.region001',
              'BGC0000007.1.region001',
              'BGC0000009.1.region001',
              'BGC0000006.1.region001'],
             'GCF2': ['BGC0000020.1.region001', 'BGC0001511.1.region001'],
             'GCF3': ['BGC0001349.1.region001',
              'BGC0000097.1.region001',
              'BGC0000029.1.region001'],
             'GCF4': ['BGC0000031.1.region001', 'BGC0001533.1.region001'],
             'GCF5': ['BGC0000034.1.region001', 'BGC0000061.1.region001'],
             'GCF6': ['BGC0000047.1.region001',
              'BGC0001396.1.region001',
              'BGC0000035.1.region001'],
             'GCF7': ['BGC0000098.1.region001', 'BGC0000039.1.region001'],
             'GCF8': ['BGC0000051.1.region001', 'BGC0001580.1.region001'],
             'GCF9': ['BGC0000059.1.region001', 'BGC0002032.1.region001'],
             'GCF10': ['BGC0000063.1.region001', 'BGC0000062.1.re

In [22]:
len(bigscape_dict)

195

In [23]:
bigscape_df2 = parse_bigscape_df("./bigscape_all_c030_220521_1898samples.txt",0)
all_clusters = list(np.unique([bigscape_df2['Clustername_1']]+[bigscape_df2['Clustername_2']]))

len(all_clusters)

1753

In [24]:
strain_list_renamed = []

for item in strain_list:
    if 'BGC' in item:
        strain_list_renamed.append(item.split('.')[0])
    else:
        strain_list_renamed.append(item)
    
strain_list_renamed = list(np.unique(strain_list_renamed))

similarity_df = pd.DataFrame(columns=strain_list_renamed, index=range(0,len(strain_list_renamed)-1))
index_row = 0
row_names = []
for gcf in bigscape_dict:
    for cluster in bigscape_dict[gcf]:
        if 'BGC' in cluster:
            row_names.append(cluster.split('.')[0])
        else:
            row_names.append(cluster)
        temp_dict = {}
        if 'BGC' in cluster:
            self = cluster.split(".")[0]
        else:
            self = cluster
        temp_dict[self] = [1]
        temp_df = bigscape_df[bigscape_df.Clustername_1.str.contains(cluster) | 
                              bigscape_df.Clustername_2.str.contains(cluster)]
        for i,r in temp_df.iterrows():
            if temp_df.Clustername_1.loc[i] == cluster:
                target = temp_df.Clustername_2.loc[i]
                if 'BGC' in target:
                    target = str(target).split(".")[0]
                if target not in temp_dict:
                    temp_dict[target] = [temp_df.Raw_distance.loc[i]]
                else:
                    temp_dict[target] = temp_dict[target]+[temp_df.Raw_distance.loc[i]]
            else:
                target = temp_df.Clustername_1.loc[i]
                if 'BGC' in target:
                    target = str(target).split(".")[0]
                if target not in temp_dict.keys():
                    temp_dict[target] = [temp_df.Raw_distance.loc[i]]
                else:
                    temp_dict[target] = temp_dict[target]+[temp_df.Raw_distance.loc[i]]
        for key in temp_dict:
            if len(temp_dict[key]) > 1:
                new_value = max(temp_dict[key])
                temp_dict[key] = new_value
            else:
                temp_dict[key] = temp_dict[key][0]
        similarity_df.loc[index_row] = pd.Series(temp_dict)
        index_row += 1
similarity_df.fillna(0,inplace=True)
similarity_df.index = row_names
similarity_df = similarity_df[~similarity_df.index.duplicated(keep='first')]
similarity_df = similarity_df.sort_index(axis=0)

len(strain_list_renamed),len(similarity_df)

(519, 519)

In [25]:
similarity_df

Unnamed: 0,BGC0000004,BGC0000006,BGC0000007,BGC0000008,BGC0000009,BGC0000020,BGC0000029,BGC0000031,BGC0000034,BGC0000035,...,NZ_MTPU01000053.region001,NZ_MTPU01000054.region001,c00017_NODE_19...region001,c00023_NZ_JAEC...region001,c00035_NODE_35...region001,c00071_NODE_64...region001,chrpls1.region003,chrpls1.region005,chrpls3.region001,main-chr.region003
BGC0000004,1.000000,0.976886,0.952463,0.975877,0.880091,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
BGC0000006,0.976886,1.000000,0.972055,0.997262,0.894729,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
BGC0000007,0.952463,0.972055,1.000000,0.971047,0.893030,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
BGC0000008,0.975877,0.997262,0.971047,1.000000,0.894099,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
BGC0000009,0.880091,0.894729,0.893030,0.894099,1.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
c00071_NODE_64...region001,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,1.000000,0.0,0.0,0.0,0.706209
chrpls1.region003,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,1.0,0.0,0.0,0.000000
chrpls1.region005,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.745586,0.0,0.0,0.000000,0.0,1.0,0.0,0.000000
chrpls3.region001,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,1.0,0.000000


In [26]:
# cluster_list_renamed = []

# for item in all_clusters:
#     if 'BGC' in item:
#         cluster_list_renamed.append(item.split('.')[0])
#     else:
#         cluster_list_renamed.append(item)
    
# cluster_list_renamed = list(np.unique(cluster_list_renamed))

In [27]:
# orphan_list = []

# for cluster in cluster_list_renamed:
#     if cluster not in similarity_df.index:
#         orphan_list.append(cluster)
#         new_col = []
#         for index in similarity_df.index:
#             new_col.append(0)
#         similarity_df[cluster] = new_col
        
# similarity_df = similarity_df.sort_index(axis=1)
        
# similarity_df

In [28]:
# for cluster in cluster_list_renamed:
#     if cluster not in similarity_df.index:
#         new_row = []
#         for col in similarity_df.columns:
#             if cluster == col:
#                 new_row.append(1)
#             else:
#                 new_row.append(0)
#         similarity_df.loc[cluster] = new_row
        
# similarity_df

# Filtering and merging dataframes

In [29]:
indexes_to_keep = []

for index in similarity_df.index:
    if index in class_df.index:
        indexes_to_keep.append(index)
        
print(len(indexes_to_keep))

class_df = class_df.loc[indexes_to_keep]

class_df

519


Unnamed: 0,PKS,Other,Alkaloid,Oligosaccharide,Terpene,NRPS,Cyclitol,Aminocoumarin,Betalactam,Siderophore,Pyrrolobenzodiazepine,RiPP,Butyrolactone,Nucleoside,Phenazine,Aminoglycoside,tRNA_derived,Phosphonate,NA
BGC0000004,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000006,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000007,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000008,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BGC0000009,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
c00071_NODE_64...region001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
chrpls1.region003,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
chrpls1.region005,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
chrpls3.region001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [30]:
similarity_df = similarity_df.loc[indexes_to_keep]

similarity_df

Unnamed: 0,BGC0000004,BGC0000006,BGC0000007,BGC0000008,BGC0000009,BGC0000020,BGC0000029,BGC0000031,BGC0000034,BGC0000035,...,NZ_MTPU01000053.region001,NZ_MTPU01000054.region001,c00017_NODE_19...region001,c00023_NZ_JAEC...region001,c00035_NODE_35...region001,c00071_NODE_64...region001,chrpls1.region003,chrpls1.region005,chrpls3.region001,main-chr.region003
BGC0000004,1.000000,0.976886,0.952463,0.975877,0.880091,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
BGC0000006,0.976886,1.000000,0.972055,0.997262,0.894729,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
BGC0000007,0.952463,0.972055,1.000000,0.971047,0.893030,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
BGC0000008,0.975877,0.997262,0.971047,1.000000,0.894099,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
BGC0000009,0.880091,0.894729,0.893030,0.894099,1.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
c00071_NODE_64...region001,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,1.000000,0.0,0.0,0.0,0.706209
chrpls1.region003,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,1.0,0.0,0.0,0.000000
chrpls1.region005,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.745586,0.0,0.0,0.000000,0.0,1.0,0.0,0.000000
chrpls3.region001,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,1.0,0.000000


In [31]:
merged_training_df = class_df.join(similarity_df, how='outer')

merged_training_df

Unnamed: 0,PKS,Other,Alkaloid,Oligosaccharide,Terpene,NRPS,Cyclitol,Aminocoumarin,Betalactam,Siderophore,...,NZ_MTPU01000053.region001,NZ_MTPU01000054.region001,c00017_NODE_19...region001,c00023_NZ_JAEC...region001,c00035_NODE_35...region001,c00071_NODE_64...region001,chrpls1.region003,chrpls1.region005,chrpls3.region001,main-chr.region003
BGC0000004,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
BGC0000006,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
BGC0000007,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
BGC0000008,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
BGC0000009,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
c00071_NODE_64...region001,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.000000,0.0,0.0,1.000000,0.0,0.0,0.0,0.706209
chrpls1.region003,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,1.0,0.0,0.0,0.000000
chrpls1.region005,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.745586,0.0,0.0,0.000000,0.0,1.0,0.0,0.000000
chrpls3.region001,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,1.0,0.000000


# Creating label column

In [32]:
label_col = []

for i,r in merged_training_df.iterrows():
    if i in mibig_activity_dict:
        label_col.append(mibig_activity_dict[i])
    else:
        if 'BGC' not in i:
            label_col.append('cena_bgc')
        else:
            label_col.append('unlabeled')
        
label_col

['unlabeled',
 'unlabeled',
 'unlabeled',
 'unlabeled',
 'unlabeled',
 'cytotoxic',
 'antibacterial',
 'antibacterial-cytotoxic',
 'antifungal',
 'antibacterial',
 'unlabeled',
 'antibacterial',
 'antifungal',
 'antifungal',
 'antifungal',
 'unlabeled',
 'unlabeled',
 'unlabeled',
 'antifungal-cytotoxic',
 'antifungal-cytotoxic',
 'cytotoxic',
 'unlabeled',
 'unlabeled',
 'antifungal-cytotoxic',
 'antibacterial-cytotoxic',
 'antibacterial',
 'antibacterial',
 'cytotoxic',
 'antibacterial',
 'antifungal',
 'unlabeled',
 'antibacterial',
 'antifungal',
 'antibacterial',
 'antifungal',
 'unlabeled',
 'antibacterial-antifungal',
 'cytotoxic',
 'antifungal',
 'antibacterial',
 'antibacterial',
 'antibacterial',
 'cytotoxic',
 'antifungal-cytotoxic',
 'antifungal-cytotoxic',
 'antifungal-cytotoxic',
 'antibacterial',
 'antiviral',
 'cytotoxic',
 'cytotoxic',
 'antibacterial',
 'unknown',
 'unlabeled',
 'antibacterial-cytotoxic',
 'antibacterial-cytotoxic',
 'antibacterial',
 'antibacterial-c

In [33]:
merged_training_df['label'] = label_col

labeled_training_df = merged_training_df[merged_training_df['label'] != 'unlabeled']

labeled_training_df

Unnamed: 0,PKS,Other,Alkaloid,Oligosaccharide,Terpene,NRPS,Cyclitol,Aminocoumarin,Betalactam,Siderophore,...,NZ_MTPU01000054.region001,c00017_NODE_19...region001,c00023_NZ_JAEC...region001,c00035_NODE_35...region001,c00071_NODE_64...region001,chrpls1.region003,chrpls1.region005,chrpls3.region001,main-chr.region003,label
BGC0000020,1,0,0,0,0,0,0,0,0,0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,cytotoxic
BGC0000029,1,0,0,0,0,0,0,0,0,0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,antibacterial
BGC0000031,1,0,0,0,0,0,0,0,0,0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,antibacterial-cytotoxic
BGC0000034,1,0,0,0,0,0,0,0,0,0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,antifungal
BGC0000035,1,0,0,0,0,0,0,0,0,0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,antibacterial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
c00071_NODE_64...region001,0,0,0,0,0,0,0,0,0,0,...,0.0,0.000000,0.0,0.0,1.000000,0.0,0.0,0.0,0.706209,cena_bgc
chrpls1.region003,0,0,0,0,0,0,0,0,0,0,...,0.0,0.000000,0.0,0.0,0.000000,1.0,0.0,0.0,0.000000,cena_bgc
chrpls1.region005,0,0,0,0,0,0,0,0,0,0,...,0.0,0.745586,0.0,0.0,0.000000,0.0,1.0,0.0,0.000000,cena_bgc
chrpls3.region001,0,0,0,0,0,0,0,0,0,0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,1.0,0.000000,cena_bgc


In [34]:
merged_training_df[merged_training_df['label'] == 'cena_bgc']

Unnamed: 0,PKS,Other,Alkaloid,Oligosaccharide,Terpene,NRPS,Cyclitol,Aminocoumarin,Betalactam,Siderophore,...,NZ_MTPU01000054.region001,c00017_NODE_19...region001,c00023_NZ_JAEC...region001,c00035_NODE_35...region001,c00071_NODE_64...region001,chrpls1.region003,chrpls1.region005,chrpls3.region001,main-chr.region003,label
NZ_CP012036.1.region006,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
NZ_CP012036.1.region008,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
NZ_CP023278.1.region001,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
NZ_CP023278.1.region010,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
NZ_CP023278.1.region020,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
NZ_CP023278.1.region022,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
NZ_MTPU01000053.region001,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
NZ_MTPU01000054.region001,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,0.794817,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
c00017_NODE_19...region001,0,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.745586,0.0,0.0,cena_bgc
c00023_NZ_JAEC...region001,0,0,0,0,0,0,0,0,0,0,...,0.794817,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc


In [35]:
np.unique(labeled_training_df['label'])

array(['antibacterial', 'antibacterial-antifungal',
       'antibacterial-antifungal-cytotoxic', 'antibacterial-cytotoxic',
       'antifungal', 'antifungal-cytotoxic', 'antiviral', 'cena_bgc',
       'cytotoxic', 'cytotoxic-unknown', 'inhibitor', 'other', 'pigment',
       'siderophore', 'unknown'], dtype=object)

# Predicting the unknown

In [36]:
unlabeled_training_df = merged_training_df[merged_training_df['label'] == 'cena_bgc']

unlabeled_training_df[:30]

Unnamed: 0,PKS,Other,Alkaloid,Oligosaccharide,Terpene,NRPS,Cyclitol,Aminocoumarin,Betalactam,Siderophore,...,NZ_MTPU01000054.region001,c00017_NODE_19...region001,c00023_NZ_JAEC...region001,c00035_NODE_35...region001,c00071_NODE_64...region001,chrpls1.region003,chrpls1.region005,chrpls3.region001,main-chr.region003,label
NZ_CP012036.1.region006,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
NZ_CP012036.1.region008,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
NZ_CP023278.1.region001,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
NZ_CP023278.1.region010,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
NZ_CP023278.1.region020,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
NZ_CP023278.1.region022,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
NZ_MTPU01000053.region001,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
NZ_MTPU01000054.region001,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,0.794817,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc
c00017_NODE_19...region001,0,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.745586,0.0,0.0,cena_bgc
c00023_NZ_JAEC...region001,0,0,0,0,0,0,0,0,0,0,...,0.794817,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,cena_bgc


In [37]:
unknown_bgcs = unlabeled_training_df.index

In [38]:
labeled_training_df = labeled_training_df[labeled_training_df['label'] != 'cena_bgc']

In [39]:
def running_knn(training_df,testing_df,k_value):
    j_col = []
    X_div = training_df.drop("label", axis=1)
    y_div = training_df["label"]
    nbrs = NearestNeighbors(n_neighbors=k_value, algorithm='ball_tree').fit(X_div,y_div)
    distances, indices = nbrs.kneighbors(testing_df)
    y_div = y_div.reset_index(drop=True)
    neighbors_array = []
    for item in indices:
        candidate_list = []
        for i in range(k_value):
            candidate_list.append(y_div[item[i]])
        neighbors_array.append(candidate_list)
    neighbors_array = np.asarray(neighbors_array)
    return neighbors_array,indices
    

neighbors_array,indices = running_knn(labeled_training_df,unlabeled_training_df.drop('label',axis=1),1)

bgc_col,bioact_col = [],[]

for i,item in enumerate(neighbors_array):
    print(unlabeled_training_df.index[i],item)
    bgc_col.append(unlabeled_training_df.index[i])
    bioact_col.append(item)
    
print(len(neighbors_array),len(bgc_col),len(bioact_col))

NZ_CP012036.1.region006 ['antibacterial']
NZ_CP012036.1.region008 ['unknown']
NZ_CP023278.1.region001 ['unknown']
NZ_CP023278.1.region010 ['antibacterial']
NZ_CP023278.1.region020 ['unknown']
NZ_CP023278.1.region022 ['antibacterial']
NZ_MTPU01000053.region001 ['antibacterial']
NZ_MTPU01000054.region001 ['antibacterial']
c00017_NODE_19...region001 ['antibacterial']
c00023_NZ_JAEC...region001 ['antibacterial']
c00035_NODE_35...region001 ['antibacterial-antifungal-cytotoxic']
c00071_NODE_64...region001 ['antibacterial']
chrpls1.region003 ['cytotoxic']
chrpls1.region005 ['antibacterial']
chrpls3.region001 ['antibacterial']
main-chr.region003 ['antibacterial']
16 16 16


In [40]:
jaccard_col,homol_col = [],[]

def get_binary(fingerprint):
    new_row = []
    for index,item in enumerate(fingerprint):
        if item != 0:
            new_row.append(1)
        else:
            new_row.append(0)
    fingerprint = new_row
    return fingerprint

for i,bgc_id in enumerate(unlabeled_training_df.index):
    count = 0
    train_index = int(indices[i][0])
    print(i,bgc_id,train_index,indices[i])
    for j,r in labeled_training_df.iterrows():
        if count == train_index:
            training_fp = labeled_training_df.loc[j].drop('label')
            homol_col.append(j)
        count += 1
    testing_fp = unlabeled_training_df.drop('label',axis=1).loc[bgc_id]
    training_binary = get_binary(training_fp)
    testing_binary = get_binary(testing_fp)
    jaccard_index = jaccard_score(training_binary,testing_binary)
    jaccard_col.append(jaccard_index)
    print(jaccard_index)
    
print(len(jaccard_col))

0 NZ_CP012036.1.region006 222 [222]
0.0
1 NZ_CP012036.1.region008 173 [173]
0.4
2 NZ_CP023278.1.region001 272 [272]
0.3333333333333333
3 NZ_CP023278.1.region010 222 [222]
0.0
4 NZ_CP023278.1.region020 72 [72]
0.5
5 NZ_CP023278.1.region022 222 [222]
0.0
6 NZ_MTPU01000053.region001 222 [222]
0.0
7 NZ_MTPU01000054.region001 222 [222]
0.0
8 c00017_NODE_19...region001 222 [222]
0.0
9 c00023_NZ_JAEC...region001 222 [222]
0.0
10 c00035_NODE_35...region001 263 [263]
0.7777777777777778
11 c00071_NODE_64...region001 222 [222]
0.0
12 chrpls1.region003 166 [166]
0.42857142857142855
13 chrpls1.region005 222 [222]
0.0
14 chrpls3.region001 222 [222]
0.0
15 main-chr.region003 222 [222]
0.0
16


In [41]:
frames = {'bgc':bgc_col,'bioactivity':bioact_col,'jaccard_score':jaccard_col,'homolog':homol_col}

valid_jaccard_df = pd.DataFrame(frames)

valid_jaccard_df = valid_jaccard_df[valid_jaccard_df['jaccard_score'] >= 0.3]

valid_jaccard_df = valid_jaccard_df.sort_values(by='jaccard_score',ascending=False)

valid_jaccard_df = valid_jaccard_df.reset_index(drop=True)

valid_jaccard_df

Unnamed: 0,bgc,bioactivity,jaccard_score,homolog
0,c00035_NODE_35...region001,[antibacterial-antifungal-cytotoxic],0.777778,BGC0001612
1,NZ_CP023278.1.region020,[unknown],0.5,BGC0000427
2,chrpls1.region003,[cytotoxic],0.428571,BGC0001016
3,NZ_CP012036.1.region008,[unknown],0.4,BGC0001028
4,NZ_CP023278.1.region001,[unknown],0.333333,BGC0001705


In [42]:
for key in bigscape_dict:
    if 'c00035_NODE_35...region001' in str(bigscape_dict[key]):
        print(bigscape_dict[key])

['BGC0001594.1.region001', 'c00035_NODE_35...region001', 'BGC0001126.1.region001', 'BGC0001612.1.region001', 'BGC0001501.1.region001', 'BGC0000668.1.region001', 'BGC0001595.1.region001']


In [43]:
for i,item in unlabeled_training_df.drop('label',axis=1).loc['c00035_NODE_35...region001'].iteritems():
    if item > 0:
        print(i,item)

NA 1
BGC0000668 0.846348911523819
BGC0001126 0.9291904419660569
BGC0001501 0.92709369212389
BGC0001594 0.7142514586448669
BGC0001595 0.7423691153526306
BGC0001612 0.9291904419660569
c00035_NODE_35...region001 1.0


In [44]:
for i,item in labeled_training_df.drop('label',axis=1).loc['BGC0001612'].iteritems():
    if item > 0:
        print(i,item)

Alkaloid 1
BGC0000668 0.8584925383329391
BGC0001126 1.0
BGC0001501 0.9679582305252552
BGC0001594 0.9150469452142715
BGC0001595 0.8034721612930298
BGC0001612 1.0
c00035_NODE_35...region001 0.9291904419660569


# BiG-SCAPE dereplication

In [45]:
for key in bigscape_dict:
    for item in bigscape_dict[key]:
        if 'BGC' not in item:
            print(key,bigscape_dict[key])

GCF52 ['NZ_CP023278.1.region020', 'BGC0000427.1.region001']
GCF81 ['BGC0001594.1.region001', 'c00035_NODE_35...region001', 'BGC0001126.1.region001', 'BGC0001612.1.region001', 'BGC0001501.1.region001', 'BGC0000668.1.region001', 'BGC0001595.1.region001']
GCF92 ['NZ_MTPU01000053.region001', 'NZ_CP012036.1.region006', 'BGC0000869.1.region001', 'NZ_CP023278.1.region022']
GCF92 ['NZ_MTPU01000053.region001', 'NZ_CP012036.1.region006', 'BGC0000869.1.region001', 'NZ_CP023278.1.region022']
GCF92 ['NZ_MTPU01000053.region001', 'NZ_CP012036.1.region006', 'BGC0000869.1.region001', 'NZ_CP023278.1.region022']
GCF117 ['BGC0001667.1.region001', 'BGC0001705.1.region001', 'chrpls1.region003', 'NZ_CP023278.1.region001', 'BGC0001016.1.region001']
GCF117 ['BGC0001667.1.region001', 'BGC0001705.1.region001', 'chrpls1.region003', 'NZ_CP023278.1.region001', 'BGC0001016.1.region001']
GCF121 ['NZ_CP012036.1.region008', 'BGC0001028.1.region001']
GCF177 ['NZ_CP023278.1.region010', 'BGC0001748.1.region001']
GCF188 ['

In [46]:
end = time.time()
hours, rem = divmod(end-start, 3600)
minutes, seconds = divmod(rem, 60)
run_time = "{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds)
print(run_time)

00:00:11.14


In [62]:
def get_feature_class(bgc_name,genome_name):
    filename = "/Users/tiagoferreiraleao/Dropbox/tiago-NAS/BGClassifier/antismash_results_CENA/%s/%s"%(genome_name,bgc_name)
    if os.path.exists(filename):
        input_handle = open(filename,'r')
        for seq_record in SeqIO.parse(input_handle,'genbank'):
            edge_list,type_list = [],[]
            for feature in seq_record.features:
                if feature.type == 'cand_cluster':
                    for qual in feature.qualifiers:
                        if qual == 'contig_edge':
                            ctg_edge = feature.qualifiers[qual]
                            edge_list.append(ctg_edge[0])
                        if qual == 'product':
                            bgc_type = feature.qualifiers[qual]
                            for item_type in bgc_type:
                                if item_type not in type_list:
                                    type_list.append(item_type)
            return edge_list,type_list
    else:
        return 'NA','NA'
            
all_classes = []

for root, dirs, files in os.walk("/Users/tiagoferreiraleao/Dropbox/tiago-NAS/BGClassifier/antismash_results_all/"):
    for file in files:
        if file.endswith(".gbk"):
#             print(file,os.path.basename(root))
            ctg_edge,class_list = get_feature_class(file,os.path.basename(root))
#             print(class_list)
            for class_type in class_list:
                all_classes.append(class_type)
    
all_classes = np.unique(all_classes)

all_classes

array(['A', 'N'], dtype='<U1')