In [1]:
%load_ext autoreload
%autoreload 2

from utils.prediction_models import *

# Organize datasets

- Dataset 1 Zeqian: strains from three sources: 
    - [Mario](https://www.nature.com/articles/s41396-019-0427-7): assembly downloaded from NCBI
    - [Karna](https://pubmed.ncbi.nlm.nih.gov/35085485/): assembly provided by Karna 
    - [Kaumudi](https://www.biorxiv.org/content/10.1101/2022.06.27.497809v1): newly sequenced. Trimmed with Trimmomatic and assembled by Unicycler. 
    
    Dataset 1 goes through a snakemake KO annotation + 16S annotation + CBM snakemake pipeline. Utilization data is from new experiments. 

- Dataset 2: [Matti](https://www.biorxiv.org/content/10.1101/2022.08.04.502823v1.abstract)
    
    Contigs and utilization data are collected from the supplementary materials. Genomes go through the same snakemake pipeline as dataset 1.
    
- Dataset 3: [BacDive](https://bacdive.dsmz.de/)

    Utilization data and accession numbers are downloaded from the BacDive website. Genome eggNOG annotations are retrieved from [ProGenomes](https://progenomes.embl.de/) by accession ID. 16S sequences are downloaded from the ENA database by accession ID.



All finalized datasets are stored in the same format:

- A Python dictionary with these keys:
    - 'ko_data': presence-absence KO matrix, with index being samples and columns being kos.
    - 'growth_data': binary growth matrix, with index being samples and columns being carbons. 
    - 'tree': an ete3.Tree object of the phylogenetic tree. Note that the tree should be cleaned (no empty leaves). 
    - 'samples': an array of samples. Note that ko_data, growth_data, and tree should have the same samples. Samples with missing data are eliminated. 
    - 'carbons': an array of carbons. 
- The dictionary is saved as a pickle object. 
- (Maybe): save a text version of all these objects. This is only for publishing and is not used in any data processing. 

In [2]:
# Utility functions for all three datasets
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO
from genomics_utils import IO


def concatenate_barrnap_output(DIR_barrnap, FILE_out,prefix=None, exclude=None):
    """Collect one 16S sequence per sample from barrnap outputs into a single FASTA file.

    Parameters
    ----------
    DIR_barrnap : str
        Directory holding one ``<sample>.16S.fna`` file per sample.
    FILE_out : str
        Path of the FASTA file to write.
    prefix : str, optional
        If given, record ids are written as ``<prefix>_<sample>`` instead of ``<sample>``.
    exclude : list of str, optional
        Files whose name contains any of these substrings are skipped.

    Notes
    -----
    Only the first record whose description contains '16S' is kept for each sample.
    Samples without such a record are reported on stdout and left out of the output.
    ``IO.read_fasta(..., df=True)`` is expected to return a DataFrame with
    'description' and 'seq' columns (inferred from how it is used below).
    """
    suffix=".16S.fna"
    if exclude is None:
        exclude=[]

    samples=[]
    for ff in os.listdir(DIR_barrnap):
        # Require the full suffix: a bare "fna" check would accept files that the
        # read below (sample + ".16S.fna") cannot find again.
        if not ff.endswith(suffix):
            continue
        if any(exc in ff for exc in exclude):
            print("Sample exluded:", ff)
            continue
        # Strip the suffix from the end only, never from the middle of the name.
        samples.append(ff[:-len(suffix)])

    missing_16S=[]
    records=[]
    for sample in tqdm(samples):
        df=IO.read_fasta(os.path.join(DIR_barrnap,sample+suffix),df=True)
        if len(df)<=0:
            missing_16S.append(sample)
            continue
        df_16S=df[df['description'].str.contains('16S')]
        if len(df_16S)<=0:
            missing_16S.append(sample)
            continue
        line=df_16S.iloc[0] # keep the first 16S hit only
        id_=sample if prefix is None else prefix+"_"+sample
        records.append(SeqRecord(seq=Seq(line['seq']), id=id_, description=line['description']))

    print(f"{len(missing_16S)} samples miss 16S sequences: ",missing_16S)

    with open(FILE_out, "w") as output_handle:
        SeqIO.write(records, output_handle, "fasta")

def generate_tree(INPUT, OUTPUT_ALIGNED, OUTPUT_CSV, TREE, threads=1,
        SILVA_DB="/home/zeqianli/scratch-midway3/bin/silva/SILVA_138.1_SSURef_opt.arb"):
    """Run SINA and then FastTree through IPython shell escapes.

    INPUT is passed to ``sina -i``; OUTPUT_ALIGNED and OUTPUT_CSV are both passed as ``-o``
    outputs; SILVA_DB is used for both ``-r`` and ``--search-db``; ``threads`` goes to ``--threads``.
    FastTree then reads OUTPUT_ALIGNED and writes the tree to TREE.

    NOTE(review): both executables are called through absolute paths of a specific conda
    environment, so this only runs on a machine with that layout. The ``!`` lines also
    require an IPython/Jupyter kernel.
    """
    # Step 1: SINA on the input sequences against the SILVA reference, with search/LCA fields.
    !/home/zeqianli/scratch-midway3/miniconda3/envs/sixteenS/bin/sina -i {INPUT} -o {OUTPUT_ALIGNED} -o {OUTPUT_CSV} -r {SILVA_DB} --search --search-db {SILVA_DB} --lca-fields tax_slv,tax_embl_ebi_ena,tax_embl_ebi_ena_name,tax_gtdb --threads {threads}
    # Step 2: FastTree on the aligned output (nucleotide input, GTR model, gamma option).
    !/home/zeqianli/scratch-midway3/miniconda3/envs/sixteenS/bin/fasttree -nt -gtr -gamma -out {TREE} {OUTPUT_ALIGNED}

# Dataset 1: Zeqian

In [3]:
# Root directory of dataset 1; later cells in this section derive their paths from it.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
DIR_zeqian="/home/zeqianli/project/zeqian/Carbon/data/zeqian"

## 1.1. Organize assemblies

In [None]:
# Mario 
# Copy the NCBI assemblies that were previously downloaded/unzipped into DIR_mario_raw
# to DIR_assembly as <name>.fasta.

ff_mario_ncbi="/home/zeqianli/Kuehn/Carbon/data/mario_ncbi_assembly.txt"
DIR_mario_raw="/home/zeqianli/scratch-midway2/Carbon/raw/Mario"
DIR_assembly="/home/zeqianli/scratch-midway2/Carbon/assembly"
df=pd.read_csv(ff_mario_ncbi,sep=",",header=None)

# itertuples() yields (index, col0, col1, col2); the 4-way unpacking therefore requires
# exactly three columns, of which the first two are used as strain name and accession.
for _, name, acc, __ in tqdm(df.itertuples()):
    # The two commented commands are the one-off download and unzip steps; they are kept
    # for provenance and must be re-enabled on a machine where DIR_mario_raw is empty.
    #!/home/zeqianli/project/zeqianli/miniconda3/envs/ncbi/bin/datasets download genome accession {acc} --filename {os.path.join(DIR_mario_raw, name+'.zip')}
    #!unzip -o -q {os.path.join(DIR_mario_raw, name+'.zip')} -d {os.path.join(DIR_mario_raw, name)}
    # NOTE(review): the glob {acc}* is presumably expected to match a single file; confirm,
    # since cp with several sources and a file target fails.
    !cp {os.path.join(DIR_mario_raw, name)}/ncbi_dataset/data/{acc}/{acc}* {os.path.join(DIR_assembly, name+'.fasta')}

In [None]:
# Karna
# Copy the assemblies provided by Karna into the shared assembly directory,
# renaming <strain>_contigs.fasta to <strain>.fasta.
import shutil, os
# Import the callable, not the module: `import tqdm` would rebind the name `tqdm`
# to the module and break every later cell that calls tqdm(...) directly.
from tqdm import tqdm

DIR_raw_karna="/home/zeqianli/project/zeqianli/Kuehn/Carbon/data/zeqian/raw/Karna"
DIR_assembly="/home/zeqianli/project/zeqianli/Kuehn/Carbon/data/zeqian/assembly"


for ff in tqdm([ff for ff in os.listdir(DIR_raw_karna) if ff.endswith(".fasta")],desc="Copying Karna assemblies"):
    shutil.copyfile(os.path.join(DIR_raw_karna, ff), os.path.join(DIR_assembly, ff.replace("_contigs.fasta",".fasta")))

In [None]:
# Kaumudi
# Every entry of the raw folder is expected to contain an `assembly.fasta`;
# it is copied to the assembly folder as <entry>.fasta.

import shutil
DIR_raw_kaumudi="/home/zeqianli/scratch-midway2/Carbon/raw/Kaumudi"
DIR_assembly="/home/zeqianli/scratch-midway2/Carbon/assembly"

for strain in tqdm(os.listdir(DIR_raw_kaumudi),desc="Copying Kaumudi assemblies"):
    source=os.path.join(DIR_raw_kaumudi, strain, 'assembly.fasta')
    target=os.path.join(DIR_assembly, strain+'.fasta')
    shutil.copyfile(source, target)

## 1.2 Snakemake pipeline

A snakemake pipeline does the following steps:
1. KO annotation: assemblies -> prodigal -> kofamscan -> ko files for each sample
2. 16S: assemblies -> barrnap -> rRNA sequences for each sample
3. FBA: assemblies -> prodigal -> carveme -> custom FBA simulation script -> FBA predicted growth rate on carbons for each sample

Then I concatenate kos, 16S, and FBA outputs here, format data, and pickle it. 

Dry run:

```
snakemake --cores all -n
```

Actual run:

```
snakemake --cores all --conda-frontend conda --use-conda
```

## 1.3 16S and tree

In [4]:
# Merge the per-sample barrnap outputs of dataset 1 into one FASTA; ids get the "zeqian" prefix.
concatenate_barrnap_output(f"{DIR_zeqian}/16S", f"{DIR_zeqian}/zeqian_16S.fna",prefix="zeqian",)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 140/140 [00:01<00:00, 112.39it/s]

4 samples miss 16S sequences:  ['HMWF031', 'HMWF011', 'HMWF007', 'HMWF030']





In [6]:
# Align the concatenated 16S sequences of dataset 1 and build the tree from the alignment.
INPUT=f"{DIR_zeqian}/zeqian_16S.fna"
OUTPUT_ALIGNED=f"{DIR_zeqian}/zeqian_16S.aligned.fna"
OUTPUT_CSV=f"{DIR_zeqian}/zeqian_16S.aligned.csv"
TREE=f"{DIR_zeqian}/zeqian_16S.tree"
# threads=48 is forwarded to SINA's --threads option.
generate_tree(INPUT=INPUT, OUTPUT_ALIGNED=OUTPUT_ALIGNED, OUTPUT_CSV=OUTPUT_CSV, TREE=TREE, threads=48)

[K10:10:35 [SINA] [33m[1mThis is SINA 1.7.2.[m
Processing: 0 [00:00:28]
Scanning:   0% |      | 1/2224740 [00:00:00 / 00:00:56]
[A[AProcessing: 0 [00:00:28]
Scanning:   0% |      | 1/2224740 [00:00:00 / 00:02:32]
[A[AProcessing: 0 [00:00:28]
Scanning:   0% |  | 10274/2224740 [00:00:00 / 00:00:02]
[A[AProcessing: 0 [00:00:28]
Scanning:   1% |  | 20754/2224740 [00:00:00 / 00:00:02]
[A[AProcessing: 0 [00:00:28]
Scanning:   2% |  | 34145/2224740 [00:00:00 / 00:00:02]
[A[AProcessing: 0 [00:00:28]
Scanning:   2% |  | 47534/2224740 [00:00:00 / 00:00:02]
[A[AProcessing: 0 [00:00:28]
Scanning:   3% |  | 60486/2224740 [00:00:00 / 00:00:02]
[A[AProcessing: 0 [00:00:28]
Scanning:   3% |  | 73583/2224740 [00:00:00 / 00:00:01]
[A[AProcessing: 0 [00:00:28]
Scanning:   4% |  | 86678/2224740 [00:00:00 / 00:00:02]
[A[AProcessing: 0 [00:00:28]
Scanning:   4% |  | 99251/2224740 [00:00:00 / 00:00:02]
[A[AProcessing: 0 [00:00:28]
Scanning:   5% | | 113005/2224740 [00:00:00 / 00:00:02

##  1.4 Final organization

In [7]:
# KO annotations: read one kofamscan table per file and count hits per KO.
DIR_KOFAMSCAN=f"{DIR_zeqian}/kofamscan"
ko_counts={}
for ko_file in tqdm(os.listdir(DIR_KOFAMSCAN)):
    ko_table=pd.read_csv(os.path.join(DIR_KOFAMSCAN,ko_file),sep='\t',header=None,names=['orf','ko'])
    ko_counts[ko_file.replace('.ko','')]=ko_table['ko'].value_counts()
# KOs missing from a sample become 0; the matrix is then binarized and transposed
# so that samples are rows and KOs are columns.
ko_data=pd.DataFrame(ko_counts).fillna(0).astype(int)
ko_data=ko_data.gt(0).astype(int).T

# Growth data: first CSV column is used as the index.
growth_data=pd.read_csv(f"{DIR_zeqian}/raw/zl_growth_data_20220921.csv",index_col=0)

# Phylogenetic tree built in section 1.3
from ete3 import Tree
tree=Tree(f"{DIR_zeqian}/zeqian_16S.tree",format=1)

# Combine the three objects and pickle the result
zeqian_final=finalize_data(ko_data, growth_data, tree,remove_prefix=True)
pickle_path=os.path.join(DIR_zeqian,"zeqian.pk")
with open(pickle_path,'wb') as f:
    pickle.dump(zeqian_final,f)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 140/140 [00:03<00:00, 38.08it/s]


96 samples:  ['HMWF001' 'HMWF003' 'HMWF005' 'HMWF006' 'HMWF008' 'HMWF009' 'HMWF010'
 'HMWF013' 'HMWF014' 'HMWF015' 'HMWF016' 'HMWF017' 'HMWF018' 'HMWF019'
 'HMWF021' 'HMWF022' 'HMWF023' 'HMWF025' 'HMWF026' 'HMWF028' 'HMWF029'
 'OTU2469' 'OTU3427' 'OTU3971' 'OTU4895' 'OTU4908' 'OTU5201' 'OTU530'
 'OTU672' 'OTU695' 'OTU736' 'pd19367' 'sia0102' 'sia0103' 'sia0703'
 'sia0901' 'sia0902' 'sia0905' 'sib0112' 'sib0115' 'sib0205' 'sib0402'
 'sib0404' 'sib0509' 'sib0612' 'sic0102' 'sic0104' 'sic0105' 'sic0204'
 'sic0402' 'sic0404' 'sic0604' 'sic0702' 'sic1001' 'sic1004' 'sic1005'
 'sic1102' 'sic1205' 'sic1506' 'sid0104' 'sid0204' 'sid0306' 'sid0307'
 'sid0404' 'sid0406' 'sid0505' 'sid0601' 'sid0605' 'sid0705' 'sid0801'
 'sid0901' 'sid1001' 'sid1006' 'sie0102' 'sie1401' 'sie1917' 'sif0513'
 'sif0514' 'sif0532' 'sif0611' 'sif0613' 'sif0617' 'sif0631' 'sif0715'
 'sif0732' 'sif1233' 'sif1732' 'sif1831' 'sif1833' 'sif2231' 'sif2232'
 'sif2233' 'sif2332' 'sif2416' 'sif2431' 'sif2433']


## 1.5 FBA results

In [None]:
import re

def find_sample_name(ff):
    """Return the sample name encoded in an FBA result file name.

    The sample name is the part of the file name (after any directory) that precedes the
    first underscore, or the ``.csv`` extension when there is no underscore, e.g.
    ``"dir/HMWF001_gapfill.csv" -> "HMWF001"``.

    Raises
    ------
    ValueError
        If ``ff`` does not end in ``.csv``.
    """
    # The dot is escaped and the pattern anchored at the end, so that only a real
    # ".csv" extension matches (an unescaped "." would accept e.g. "name_xcsv").
    match=re.search(r"\/?([^_/]*)[_\w]*\.csv$",ff)
    if match is None:
        raise ValueError(f"Cannot extract a sample name from {ff!r}: expected a .csv file name")
    return match.group(1)

def concatenate(ffs,output):
    """Read each file in ``ffs`` as one column and join the columns side by side.

    Each column is named after the sample extracted from its file name and the first
    CSV column is used as the index. ``output`` is accepted but not used here; callers
    write the returned matrix themselves.
    """
    columns=[]
    for path in ffs:
        sample=find_sample_name(path)
        columns.append(pd.read_csv(path,index_col=0,names=[sample]))
    return pd.concat(columns,axis=1)

DIR_fba=f"{DIR_zeqian}/fba_prediction"
DIR_fba_concat=f"{DIR_zeqian}/fba_concatenated"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(DIR_fba_concat,exist_ok=True)
# Only .csv files are FBA outputs; anything else in the folder is ignored.
samples=np.unique([find_sample_name(ff) for ff in os.listdir(DIR_fba) if ff.endswith('.csv')])
thresh=0.05 # predicted growth above this value counts as growth in the binarized matrix

# One concatenated matrix (raw + binarized) per FBA variant; the suffix selects the
# per-sample file <sample><suffix>.csv.
for suffix in ['','_gapfill','_force_uptake','_gapfill_force_uptake']:
    ffs=[os.path.join(DIR_fba,sample+suffix+'.csv') for sample in samples]
    growth_matrix=concatenate(ffs,os.path.join(DIR_zeqian,'fba_prediction'+suffix+'.csv'))
    growth_matrix.to_csv(os.path.join(DIR_fba_concat,'zeqian_fba'+suffix+'.csv'))
    (growth_matrix>thresh).astype(int).to_csv(os.path.join(DIR_fba_concat,'zeqian_fba'+suffix+f'_binarized_{thresh}.csv'))


# Dataset 2: Matti

Source: SI in https://www.biorxiv.org/content/10.1101/2022.08.04.502823v1.abstract

In [2]:
# Root directory of dataset 2; later cells in this section derive their paths from it.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
DIR_matti="/home/zeqianli/project/zeqian/Carbon/data/matti"

## 2.1 Organize assemblies

In [None]:
# Copy contigs
# Copy every .fsa contig file from the unpacked supplementary material into
# {DIR_matti}/contigs, keeping the file names.
import shutil

DIR_assembly_raw=f"{DIR_matti}/raw/Code & Data for Fundamental metabolic strategies of heterotrophic bacteria/Genomes & annotations/SC192 contigs"
DIR_assembly=f"{DIR_matti}/contigs"
os.makedirs(DIR_assembly,exist_ok=True) # shutil.copy does not create the target directory
for ff in os.listdir(DIR_assembly_raw):
    if ff.endswith('.fsa'):
        shutil.copy(os.path.join(DIR_assembly_raw,ff),os.path.join(DIR_assembly,ff))

In [3]:
# Copy assemblies 
import shutil, os
from tqdm import tqdm

# Copy each .fsa contig file to the assembly folder with a .fasta extension;
# other files in the contigs folder are skipped.
contig_dir=f"{DIR_matti}/contigs"
assembly_dir=f"{DIR_matti}/assembly"
for contig_file in tqdm(os.listdir(contig_dir),desc='Copying matti assemblies...'):
    if not contig_file.endswith('.fsa'):
        continue
    assembly_file=contig_file.replace('.fsa','.fasta')
    shutil.copyfile(os.path.join(contig_dir,contig_file), os.path.join(assembly_dir,assembly_file))

Copying matti assemblies...: 100%|██████████| 442/442 [00:12<00:00, 35.36it/s]


## 2.2. Snakemake pipeline

Dry run: 

```
snakemake --cores all -n --configfile "/home/zeqianli/Kuehn/Carbon/snakemake/config/config_matti.yml"
```

Actual run:

```
snakemake --cores all --conda-frontend conda --use-conda --configfile "/home/zeqianli/Kuehn/Carbon/snakemake/config/config_matti.yml"
```
 

## 2.3 16S and tree  

In [10]:
# Concatenate the per-sample barrnap outputs of dataset 2 into one FASTA; ids get the
# "matti" prefix. `exclude` matches by substring of the file name.
concatenate_barrnap_output(f"{DIR_matti}/16S", f"{DIR_matti}/matti_16S.fna",prefix='matti',exclude=['DSS-3']) # DSS-3 sample has 16S annotation error. 

Sample exluded: DSS-3.16S.fna


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 220/220 [00:03<00:00, 56.96it/s]

17 samples miss 16S sequences:  ['G2R10', '6C06', 'AS85', 'AS84', '3D05', '3B05', 'AS13', 'AS76', 'AS81', 'AS71', 'AS21', 'AS92', 'AS82', '1A01', 'AS80', '1A06', 'AS26']





In [11]:
# Build tree: align the concatenated 16S sequences of dataset 2 and build the tree
# from the alignment.
INPUT=f"{DIR_matti}/matti_16S.fna"
OUTPUT_ALIGNED=f"{DIR_matti}/matti_16S.aligned.fna"
OUTPUT_CSV=f"{DIR_matti}/matti_16S.aligned.csv"
TREE=f"{DIR_matti}/matti_16S.tree"
# threads=48 is forwarded to SINA's --threads option.
generate_tree(INPUT=INPUT, OUTPUT_ALIGNED=OUTPUT_ALIGNED, OUTPUT_CSV=OUTPUT_CSV, TREE=TREE,threads=48)

[K10:19:13 [SINA] [33m[1mThis is SINA 1.7.2.[m
Processing: 0 [00:00:28]
Scanning:   0% |      | 1/2224740 [00:00:00 / 00:01:21]
[A[AProcessing: 0 [00:00:28]
Scanning:   0% |      | 1/2224740 [00:00:00 / 00:03:02]
[A[AProcessing: 0 [00:00:28]
Scanning:   0% |   | 5088/2224740 [00:00:00 / 00:00:04]
[A[AProcessing: 0 [00:00:28]
Scanning:   0% |  | 10151/2224740 [00:00:00 / 00:00:04]
[A[AProcessing: 0 [00:00:28]
Scanning:   1% |  | 15211/2224740 [00:00:00 / 00:00:06]
[A[AProcessing: 0 [00:00:28]
Scanning:   1% |  | 17322/2224740 [00:00:00 / 00:00:07]
[A[AProcessing: 0 [00:00:28]
Scanning:   1% |  | 19298/2224740 [00:00:00 / 00:00:07]
[A[AProcessing: 0 [00:00:28]
Scanning:   1% |  | 21258/2224740 [00:00:00 / 00:00:08]
[A[AProcessing: 0 [00:00:28]
Scanning:   1% |  | 23255/2224740 [00:00:00 / 00:00:08]
[A[AProcessing: 0 [00:00:28]
Scanning:   1% |  | 29838/2224740 [00:00:00 / 00:00:07]
[A[AProcessing: 0 [00:00:28]
Scanning:   2% |  | 40292/2224740 [00:00:00 / 00:00:05

## 2.4 Final organization

In [3]:
# Preview of the raw growth table from the supplementary material (sheet 'growth data';
# header=1 takes the second spreadsheet row as column names). Display only: nothing is assigned.
pd.read_excel(f"{DIR_matti}/raw/Code & Data for Fundamental metabolic strategies of heterotrophic bacteria/Supplementary files/SI Table 3 (growth data).xlsx",sheet_name='growth data',header=1)

Unnamed: 0,strain ID,kingdom,phylum,class,order,family,genus,species,"sugar-acid-preference (averaged over 3 experiments, -1=acids, +1=sugars)",genome %GC,glucuronate,gluconate,galacturonate,gluconate lactone,glucuronate lactone,glucosamine,glcnac,mannosamine,galnac,galactosamine,melibiose,lactose,sucrose,maltose,cellobiose,lactulose,raffinose,fucose,rhamnose,arabinose,xylose,mannose,glucose,galactose,fructose,ribose,alpha-cyclodextrin,erythrose,proline,glutamine,glycine,arginine,aspartate,cysteine,lysine,glutamate,sarcosine,histidine,isoleucine,leucine,methionine,taurine,valine,tyrosine,phenylalanine,beta-alanine,hydroxyproline,betaine,tryptophan,alanine,asparagine,serine,threonine,homoserine,lactate,pyruvate,fumarate,acetate,propionate,succinate,malate,isocitrate,citrate,oxalacetate,butyrate,valerate,malonate,oxoglutarate,tartrate,glycolate,3m2-oxybutyrate,adenine,agarose,alginate,arabinan,arabinoxylan,benzoate,butanol,chondroitin,cytidine,cytosine,deoxyuridine,dextran,dna hmw,erythritol,ethanol,ethyl benzoate,galactan,galactomannan,galacturonate lmw,glucomannan,glycerol,glycogen,i-carrageenan,inositol,inulin,k-carrageenan,lambda-carrageenan,laminarin,maltitol,mannitol,mb,methyl benzoate,pectin,phba,porphyran,propanol,pullulan,sorbitol,starch,thymidine,thymine,ulvan,uracil,uridine,xylan,xylitol,xyloglucan
0,12B01,Bacteria,Proteobacteria,Gammaproteobacteria,Vibrionales,Vibrionaceae,Vibrio,Vibrio splendidus,0.387115,45.134365,0.00000,4.284763,0.000000,7.467442,0.000000,4.723077,3.096046,0.0,0.0,0.0,0.000000,1.912280,0.000000,3.873178,3.003433,0.000000,0.000000,0.0,0.000000,0.0,0.000000,4.502887,5.257096,2.585598,4.198539,3.128913,3.187303,0.0,2.701693,2.838122,2.448704,0.000000,1.995602,1.961662,0.0,2.694711,0.0,3.246137,0.000000,0.000000,0.0,0.277249,0.0,0.228526,0.0,0.0,0.000000,0.0,0.0,7.994511,1.909057,4.975520,14.994219,0.0,3.899723,3.616221,3.966907,1.898887,2.154002,3.825211,0.000000,0.000000,4.686823,8.152699,0.000000,0.000000,0.000000,6.451770,0.0,0.0,0.000000,0.0,0.0,14.997866,0.0,0.0,0.000000,0.000000,11.174209,4.566931,0.0,1.354267,0.00000,14.943537,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,1.858748,2.923706,0.0,0.000000,0.0,0.0,14.826257,0.000000,0.000000,3.188782,11.920130,0.0,12.282191,0.000000,0.0,0.000000,3.689943,0.0,2.986775,1.751500,0.0,0.000000,0.0,3.996350,0.0,0.000000,0.0
1,13B01,Bacteria,Proteobacteria,Gammaproteobacteria,Vibrionales,Vibrionaceae,Vibrio,Vibrio splendidus,-0.262048,45.231178,0.00000,4.779399,0.000000,5.874979,0.000000,4.816162,2.735344,0.0,0.0,0.0,0.000000,1.985814,0.000000,4.375111,4.559644,0.000000,0.000000,0.0,0.000000,0.0,0.000000,8.111099,5.009757,3.872793,8.391777,3.293969,14.920750,0.0,3.090551,4.352981,1.091184,0.000000,2.089573,1.790740,0.0,2.906032,0.0,2.915911,0.000000,0.771227,0.0,6.947360,0.0,0.293861,0.0,0.0,0.000000,0.0,0.0,3.145412,2.025119,3.706825,7.068026,0.0,3.674555,4.205921,3.294501,3.039669,3.073141,3.593015,0.000000,0.000000,14.989950,7.947605,0.000000,0.000000,0.000000,8.268066,0.0,0.0,0.000000,0.0,0.0,5.176302,0.0,0.0,0.000000,0.000000,14.918494,14.981055,0.0,2.073064,0.00000,11.208661,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,2.703160,2.999882,0.0,0.000000,0.0,0.0,14.778241,0.000000,0.000000,4.455986,12.327267,0.0,14.738816,0.000000,0.0,0.000000,7.996975,0.0,2.267510,1.459929,0.0,0.000000,0.0,14.971317,0.0,0.000000,0.0
2,1A01,Bacteria,Proteobacteria,Gammaproteobacteria,Vibrionales,Vibrionaceae,Vibrio,Vibrio lentus,0.235795,45.139398,0.00000,8.945088,0.000000,14.988878,14.978233,14.862683,5.303138,0.0,0.0,0.0,0.000000,1.948337,0.000000,3.512932,5.461269,0.000000,0.000000,0.0,0.000000,0.0,0.000000,7.879345,5.090288,2.967875,4.752317,3.460146,0.000000,0.0,1.697890,0.000000,1.944855,0.472948,2.202045,0.790644,0.0,2.466059,0.0,0.000000,0.000000,0.000000,0.0,7.952601,0.0,3.035368,0.0,0.0,0.000000,0.0,0.0,2.536726,2.928435,2.953249,0.000000,0.0,7.165133,4.838843,3.229778,0.000000,0.000000,3.988907,0.000000,0.000000,0.000000,6.458105,0.000000,0.000000,0.000000,10.037879,0.0,0.0,0.000000,0.0,0.0,14.992432,0.0,0.0,0.000000,0.000000,8.434360,3.987642,0.0,1.826967,0.00000,14.287000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,2.208053,3.273947,0.0,0.000000,0.0,0.0,14.898172,14.579869,0.000000,8.033570,11.022539,0.0,12.204413,0.000000,0.0,0.000000,3.524878,0.0,3.648574,0.856679,0.0,0.000000,0.0,3.428649,0.0,0.000000,0.0
3,1A06,Bacteria,Proteobacteria,Gammaproteobacteria,Vibrionales,Vibrionaceae,Vibrio,Vibrio breoganii,0.853738,46.428602,0.00000,0.000000,0.000000,0.000000,0.000000,14.783576,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,4.033903,1.793761,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,2.023462,0.000000,14.960031,1.977268,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.418512,0.000000,0.0,0.768931,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.692086,0.825323,0.882093,0.0,0.000000,0.841267,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.125101,0.000000,0.000000,0.000000,0.579295,0.0,0.0,0.000000,0.0,0.0,1.491146,0.0,0.0,0.000000,0.000000,13.953953,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.300005,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,1.832139,4.770863,0.0,2.573017,0.000000,0.0,0.000000,0.000000,0.0,1.769360,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
4,3B05,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Nitrincolaceae,Neptunomonas,Neptunomonas phycophila,-0.511795,46.004129,3.14315,5.254451,2.930394,14.061493,7.989875,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.738220,4.220125,0.000000,4.022708,0.000000,0.000000,0.0,4.319450,2.067187,0.000000,1.052590,1.912622,0.000000,0.0,1.560417,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,3.121574,3.903862,1.058876,0.000000,0.0,4.482367,8.236923,4.657778,4.233204,2.290385,4.624842,3.188789,7.270759,5.009391,10.280403,2.925499,2.326214,14.980253,5.963035,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,1.844583,1.568588,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.0,1.729252,0.0,0.000000,0.0,0.0,0.0,2.731052,0.000000,0.0,3.055279,0.0,0.0,14.814042,0.000000,0.000000,3.251595,3.959807,0.0,0.000000,5.072863,0.0,2.004678,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,1.138197,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,I2R16,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Psychrosphaera,,-0.019734,40.867696,0.00000,0.000000,0.000000,0.000000,2.973870,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.786246,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,2.394197,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.570447,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,10.769438,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
174,I3M07,Bacteria,Proteobacteria,Gammaproteobacteria,Vibrionales,Vibrionaceae,Vibrio,Vibrio cyclitrophicus,0.301999,44.755719,0.00000,5.907038,0.000000,14.958417,0.000000,4.159876,2.339897,0.0,0.0,0.0,0.000000,0.000000,4.327482,4.601682,4.439852,0.000000,0.000000,0.0,0.000000,0.0,0.000000,7.168942,4.011225,3.676473,5.686577,2.718742,0.000000,0.0,1.796661,2.741532,1.978313,0.000000,2.226309,1.432870,0.0,2.357139,0.0,0.000000,0.000000,0.000000,0.0,3.324931,0.0,2.033015,0.0,0.0,0.000000,0.0,0.0,1.866258,2.213197,2.462419,0.000000,0.0,0.000000,3.402176,4.056008,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5.804675,0.000000,0.000000,0.000000,14.992289,0.0,0.0,0.000000,0.0,0.0,14.990890,0.0,0.0,0.000000,0.000000,9.569520,0.000000,0.0,0.000000,0.00000,12.006979,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,2.135981,0.000000,0.0,0.000000,0.0,0.0,7.810808,0.000000,0.000000,3.429919,10.993864,0.0,6.212158,0.000000,0.0,0.000000,0.000000,0.0,2.250446,0.000000,0.0,3.207585,0.0,0.000000,0.0,0.000000,0.0
175,I3M17,Bacteria,Bacteroidota,Bacteroidia,Cytophagales,Cyclobacteriaceae,Cyclobacterium,,0.913059,39.516564,0.00000,0.000000,0.000000,0.000000,0.000000,1.097691,1.168855,0.0,0.0,0.0,0.915705,1.042495,1.384302,1.087059,1.134934,0.994844,1.281668,0.0,1.043393,0.0,0.460116,1.257355,1.090759,1.075238,0.519631,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,1.906134,0.000000,0.0,0.000000,0.64304,0.000000,0.0,0.000000,0.0,1.635587,0.0,0.0,0.0,0.864621,0.000000,0.0,0.000000,0.0,0.0,3.046262,0.320654,1.349295,14.985723,0.886371,0.0,1.204013,0.000000,0.0,0.000000,12.144200,0.0,0.000000,0.000000,0.0,2.940937,0.0,0.000000,0.0,0.000000,0.0
176,I3R07,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Nitrincolaceae,Amphritea,,-0.487280,48.590526,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,7.987684,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,3.500777,14.997840,2.629699,2.046098,0.000000,3.061268,0.000000,7.981679,0.000000,14.961015,0.000000,0.000000,0.000000,14.998863,0.0,0.0,0.597757,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,8.008653,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0


In [12]:
# KO annotations: read one kofamscan table per file and count hits per KO.
DIR_KOFAMSCAN=f"{DIR_matti}/kofamscan"
ko_counts={}
for ko_file in tqdm(os.listdir(DIR_KOFAMSCAN)):
    ko_table=pd.read_csv(os.path.join(DIR_KOFAMSCAN,ko_file),sep='\t',header=None,names=['orf','ko'])
    ko_counts[ko_file.replace('.ko','')]=ko_table['ko'].value_counts()
# KOs missing from a sample become 0; the matrix is then binarized and transposed
# so that samples are rows and KOs are columns.
ko_data=pd.DataFrame(ko_counts).fillna(0).astype(int)
ko_data=ko_data.gt(0).astype(int).T

# Growth data: drop the taxonomy/summary columns and index the rest by strain ID.
growth_xlsx=f"{DIR_matti}/raw/Code & Data for Fundamental metabolic strategies of heterotrophic bacteria/Supplementary files/SI Table 3 (growth data).xlsx"
growth_raw=pd.read_excel(growth_xlsx,sheet_name='growth data',header=1)
metadata_columns=["kingdom","phylum","class","order","family","genus","species",r"sugar-acid-preference (averaged over 3 experiments, -1=acids, +1=sugars)",r"genome %GC"]
growth_data=growth_raw.drop(columns=metadata_columns).rename(columns={"strain ID":"sample"})
if growth_data['sample'].duplicated().any():
    print("Warning: duplicated sample")
growth_data=growth_data.set_index('sample')
# Any strictly positive value counts as growth. Missing values compare as False and
# therefore become 0.
growth_data=(growth_data>0).astype(int)

# Phylogenetic tree built in the 16S section
from ete3 import Tree
tree=Tree(f"{DIR_matti}/matti_16S.tree",format=1)

# Combine the three objects and pickle the result.
# Keep data with at least 10 zeros and 10 ones. Note that DSS-3 sample is removed in a previous step.
matti_final=finalize_data(ko_data, growth_data, tree,remove_prefix=True, min_zeros=10, min_ones=10, min_growth_data_samples=None)
pickle_path=os.path.join(DIR_matti,"matti.pk")
with open(pickle_path,'wb') as f:
    pickle.dump(matti_final,f)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221/221 [00:05<00:00, 42.49it/s]


172 samples:  ['12B01' '13B01' '3C02' '3D04' '3F01' '4A09' '4A09_2' '4A10' '4B03' '4B04'
 '4C11' '4D01' '4D10' '4E07' '4F10' '4G03' '4G09' '4H09' '5C01' '5D01'
 '5F01' '5F06' '5G01' '6B07' '6C01' '6D02' '6D03' '6E01' '6E02' '6E03'
 '6G02' 'A1M10' 'A1R05' 'A1R06' 'A1R11' 'A2M03' 'A2M07' 'A2R01' 'A2R05'
 'A2R07' 'A2R16' 'A2R20' 'A3M03' 'A3M17' 'A3R04' 'A3R06' 'A3R12' 'A3R16'
 'AS25' 'AS40' 'AS56' 'AS69' 'AS88' 'AS94' 'B1R08' 'B1R10' 'B1R15' 'B2M06'
 'B2M13' 'B2M14' 'B2M17' 'B2R03' 'B2R04' 'B2R08' 'B2R09' 'B2R14' 'B2R14_2'
 'B2R22' 'B3M02' 'B3M03' 'B3M12' 'B3M18' 'B3R02' 'B3R02_2' 'B3R09' 'B3R10'
 'B3R15' 'B3R18' 'C1M14' 'C1R02' 'C1R06' 'C1R06_2' 'C1R08' 'C2M01' 'C2M04'
 'C2M04_2' 'C2M11' 'C2M19' 'C2R07' 'C2R07_2' 'C2R09' 'C2R11' 'C2R13'
 'C2R21' 'C3M06' 'C3M08' 'C3M10' 'C3M11' 'C3R04' 'C3R06' 'C3R06_2' 'C3R10'
 'C3R12' 'C3R14' 'C3R15' 'C3R17' 'C3R19' 'D2M02' 'D2M19' 'D2M19_2' 'D2R04'
 'D2R05' 'D2R18' 'D2R19' 'D3M06' 'D3M08' 'D3M17' 'D3M17_2' 'D3R04' 'D3R05'
 'D3R19' 'E2M05' 'E2M18' 'E2R1

# Dataset 3: BacDive

In [2]:
# Root directory of dataset 3; later cells in this section derive their paths from it.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
DIR_bacdive="/home/zeqianli/project/zeqian/Carbon/data/bacdive"

## Download raw data 

In [None]:
# Download raw data
# In BacDive search section (https://bacdive.dsmz.de/advsearch), filter by NCBI taxon ID and metabolite utilization being available. Download the spreadsheet.

import os
import bacdive # Note this section needs [BacDive Python API](https://pypi.org/project/bacdive/).

strains=pd.read_csv("download/20221024_bacdive.csv",dtype={'ID':str}).dropna(subset=['ID'])

# Credentials are read from the environment so they never end up in the notebook.
# Both fall back to None when the variables are unset.
EMAIL=os.environ.get("BACDIVE_EMAIL") # Put your own email
PASSWORD=os.environ.get("BACDIVE_PASSWORD") # Put your own password
client = bacdive.BacdiveClient(EMAIL,PASSWORD)

ids=strains['ID'].tolist()
raw=[]
failed_batches=[] # start offsets of the batches that could not be retrieved
MAX_QUERY=100 # number of BacDive IDs sent per search call

for i in range(0,len(ids),MAX_QUERY):
    batch=ids[i:i+MAX_QUERY]
    try:
        search=client.search(id=';'.join(batch))
        if search:
            result=client.retrieve()
            # Materialize first so that a failure half-way does not leave a partial batch in `raw`.
            raw.extend(list(result))
            print("Query success: ",i)
        else:
            print('No result for',batch)
            raise ValueError("search returned no result")
    except Exception as e:
        failed_batches.append(i)
        print(f"Query failed i={i}. {e}")

# Make incomplete downloads visible: failed batches are missing from the pickle below.
if failed_batches:
    print(f"Warning: {len(failed_batches)} batch(es) failed and are missing from raw.pk. Start offsets: {failed_batches}")

with open(f"{DIR_bacdive}/raw/raw.pk",'wb') as f:
    pickle.dump(raw, f)

# TODO: id, taxon_id, biosample_id, taxa lookup 

## Parse utilization data

In [None]:
def get_utilization_data(record,keys=['Chebi-ID','metabolite','utilization activity','kind of utilization tested']):
    """Flatten the metabolite-utilization entries of one BacDive record.

    Returns a list with one dict per utilization entry. Each dict holds the record's
    BacDive ID under 'id' plus one item per name in ``keys``: the entry's value
    converted to ``str``, or None when the entry lacks that key. An empty list is
    returned when the record has no 'metabolite utilization' section.
    """
    # TODO: specify if the data source is api-test or not
    strain_id=record['General']['BacDive-ID']
    try:
        entries=record['Physiology and metabolism']['metabolite utilization']
    except KeyError:
        return []
    # A single entry is stored as a bare dict; wrap it so both shapes are handled alike.
    if isinstance(entries,dict):
        entries=[entries]

    def _field(entry,name):
        try:
            return str(entry[name])
        except KeyError:
            return None

    return [{'id':strain_id, **{name:_field(entry,name) for name in keys}} for entry in entries]

# Load the pickled BacDive records and stack their utilization entries into one
# long-format table (one row per record/metabolite entry, all values as strings).
raw=pd.read_pickle(f"{DIR_bacdive}/raw/raw.pk")
utilization_rows=[row for record in raw for row in get_utilization_data(record)]
df_utilization=pd.DataFrame(utilization_rows,dtype=str)

In [None]:
# NOTE: this cell rewrites `df_utilization` in place and ends with a pivot, so it can only
# be run once per kernel; re-run the cell that builds `df_utilization` before re-running it.
df_utilization=df_utilization[df_utilization['kind of utilization tested'].isin(['energy source','carbon source'])] # Filter out other kinds of utilization.
df_utilization=df_utilization[df_utilization['utilization activity'].isin(['+','-'])].copy() # Remove uncertain records. copy() so the column assignments below act on an independent frame.

# Normalize metabolite names (raw strings avoid invalid escape sequences in the patterns).
df_utilization['metabolite']=df_utilization['metabolite'].str.replace('D-','',regex=False).str.replace('L-','',regex=False) # Remove every "D-" and "L-" occurrence, not only leading ones.
df_utilization['metabolite']=df_utilization['metabolite'].str.replace(r'\w*sodium\ +','',regex=True).str.replace(r'\w*potassium\ +','',regex=True) # Remove sodium/potassium salt prefixes.
df_utilization['metabolite']=df_utilization['metabolite'].str.replace(r'\(.*\)- *','',regex=True) # bracket prefixes. E.g., (-)-quinic acid -> quinic acid, methyl (R)-lactate -> methyl lactate
df_utilization['metabolite']=df_utilization['metabolite'].str.replace(r'\d+ *%','',regex=True) # Together with the salt removal above this changes "1 % sodium lactate" to " lactate"
# Strip the whitespace left behind by the removals so that e.g. " lactate" and "lactate"
# are the same metabolite. Names that previously kept a leading space are now merged.
df_utilization['metabolite']=df_utilization['metabolite'].str.strip()

df_utilization=df_utilization.groupby(['id','metabolite']).filter(lambda x: x['utilization activity'].nunique()==1) # Remove conflicting records
df_utilization=df_utilization.drop_duplicates(subset=['id','metabolite']) # Remove duplicated records.
df_utilization=df_utilization.groupby('metabolite').filter(lambda x: len(x)>=9) # Minimum 9 samples
df_utilization=df_utilization.pivot(index='id',columns='metabolite',values='utilization activity') # Convert to wide format
df_utilization=df_utilization.replace({'+':1,'-':0}) # Convert to binary

Note: I shortened metabolite names and merged records. Only <1% of records are affected by this change and these records are removed. See SI. 

In [None]:
# Save the wide utilization matrix (strains x metabolites) both as CSV and as a pickle.
df_utilization.to_csv(f'{DIR_bacdive}/bacdive_utilization.csv')
df_utilization.to_pickle(f'{DIR_bacdive}/bacdive_utilization.pk')

## Annotation data from progenomes

In [None]:
# Parse bacdive entries
def get_genome_info(record,keys=['description','accession','NCBI tax ID']):
    """Extract genome metadata from one BacDive record.

    Returns a dict with the record's BacDive ID under 'id' plus one item per name in
    ``keys``: the value from the genome entry converted to ``str``, or None when it is
    unavailable. When several genome sequences are listed, only the first one is used.
    A record without genome information (missing section, empty list or None) yields
    None for every key instead of raising.
    """
    out={'id':record['General']['BacDive-ID']}
    try:
        genome=record['Sequence information']['Genome sequences']
    except KeyError:
        genome=None
    if isinstance(genome,list):
        genome=genome[0] if genome else None # Use the first record for now.
    if genome is None:
        for k in keys:
            out[k]=None
        return out

    for k in keys:
        try:
            out[k]=str(genome[k])
        except KeyError:
            out[k]=None
    return out

# Load the pickled BacDive records and build one genome-metadata row per record.
raw=pd.read_pickle(f"{DIR_bacdive}/raw/raw.pk")
genome_rows=[get_genome_info(record) for record in raw]
df_genome=pd.DataFrame(genome_rows,dtype=str)
# This entry is required for progenomes
df_genome=df_genome.dropna(subset=['NCBI tax ID'])

In [None]:
from io import StringIO
import urllib.request

# Get biosample id
def get_ncbi_biosample_id(row):
    """Look up the NCBI BioSample accession for a genome assembly.

    Runs the `datasets` command-line tool
    (`datasets summary genome accession <accession>.1`) and reads
    ['assemblies'][0]['assembly']['biosample_accession'] from the first line
    of its JSON output.

    Parameters
    ----------
    row : mapping
        Must provide 'accession' (queried with a '.1' suffix appended) and
        'id' (only used in the error message).

    Returns
    -------
    str or None
        The BioSample accession, or None if the lookup fails for any reason.
        This is best-effort: the failure is printed, not raised.
    """
    import subprocess # local import so this cell does not depend on another cell's imports

    acc=row['accession']
    try:
        # Argument list instead of a shell line: the accession comes from
        # external data and must not be interpreted by a shell. Only stdout is
        # parsed, so messages the tool writes to stderr cannot end up in the JSON.
        result=subprocess.run(
            ['datasets','summary','genome','accession',f'{acc}.1'],
            capture_output=True,text=True,
        )
        summary=json.loads(result.stdout.splitlines()[0])
        return summary['assemblies'][0]['assembly']['biosample_accession']
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C still interrupts a long run.
        print("Error querying biosample ID for ",row['id'], acc)
        return None

def download_eggnog(row,overwrite=True,dir_out="/media/zeqian/cf24fe8e-c86d-4712-b58b-9b0f3a718ba91/CarbonUtilization/bacdive/raw/eggnog/"):
    """Download the progenomes eggNOG annotation table for one genome.

    The file is written to
    <dir_out>/<id>_<accession>_<NCBI tax ID>_<biosample>.csv.

    Parameters
    ----------
    row : mapping
        Must provide 'id', 'accession', 'NCBI tax ID' and 'biosample'.
    overwrite : bool
        If False and the output file already exists, do nothing.
    dir_out : str
        Destination directory. NOTE(review): the default is a machine-specific
        absolute path; pass dir_out explicitly on other machines.

    Returns
    -------
    None
        Always. Failures are printed, not raised (best-effort download).
    """
    ff_out=os.path.join(dir_out, f"{row['id']}_{row['accession']}_{row['NCBI tax ID']}_{row['biosample']}.csv")
    if os.path.exists(ff_out):
        print("File exists: ",row['id'])
        if not overwrite:
            return

    if row['biosample'] is None:
        # The biosample lookup failed upstream. Querying "<taxid>.None" would only
        # store a bogus response under a "..._None.csv" name, so skip instead.
        print("Missing biosample ID, skipping eggnog download for ",row['id'])
        return None

    progenomes_eggnog_temp="https://progenomes.embl.de/dumpAnnotation.cgi?p={NCBI_tax_ID}.{Biosample_ID}&t=e&a={NCBI_tax_ID}" 
    url=progenomes_eggnog_temp.format(NCBI_tax_ID=row['NCBI tax ID'],Biosample_ID=row['biosample'])

    try:
        urllib.request.urlretrieve(url,ff_out)
        print("Downloaded ",row['id'])
    except Exception as e:
        # `except Exception` (not a bare except) so Ctrl-C still works; show the cause.
        print("Error downloaing eggnog for ",row['id'])
        print(e)
        return None

In [None]:
# Download from progenomes
# Upper bound handed to Process.join() for each genome in the download loop.
timeout=30

dir_download=f"{DIR_bacdive}/raw/eggnog/"
# BacDive IDs that already have a file: the ID is the filename part before the first '_'.
downloaded={fname.split('_')[0] for fname in os.listdir(dir_download)}
from multiprocessing import Process

def retrieve_biosample_and_download_eggnog(row):
    """Resolve the BioSample ID for `row`, then download its eggNOG annotation.

    The caller's row is left untouched: the 'biosample' field is added to a
    copy, and that copy is what gets passed to download_eggnog.
    """
    row_with_biosample=row.copy()
    row_with_biosample['biosample']=get_ncbi_biosample_id(row)
    download_eggnog(row_with_biosample)


# Download one genome at a time, each in its own child process so that a
# stalled lookup or download can be abandoned after `timeout`.
for _, row in df_genome.iterrows():
    # Skip genomes that already have a file in the download directory.
    if row['id'] in downloaded:
        print("Already downloaded: ",row['id'])
        continue
    
    try:
        p=Process(target=retrieve_biosample_and_download_eggnog,args=(row,))
        p.start()
        p.join(timeout) # wait at most `timeout` for the child to finish

        # Still running after the wait: kill it and reap the process.
        if p.is_alive():
            print("Download timed out: ",row['id'])
            p.terminate()
            p.join()
    except Exception as e:
        # Problems starting/joining the process are reported and the loop moves on.
        print("Error downloading: ",row['id'])
        print(e)


In [None]:
# Remove invalid downloads

# A download with fewer than `min_line_number` lines is treated as invalid and deleted.
min_line_number=2
for fname in os.listdir(dir_download):
    ff=os.path.join(dir_download,fname)
    with open(ff) as f:
        n_lines=sum(1 for _line in f)
    if n_lines>=min_line_number:
        continue
    print("Removing invalid file: ",ff)
    os.remove(ff)

Some BacDive genomes are duplicated. However, within each BacDive genome, each query_name has a unique annotation.

In [None]:
# Convert eggnog records to a matrix

# Per-genome count dictionaries, keyed by BacDive ID:
#   ko_matrix[bacdive_id]     = {primary KEGG KO -> number of annotated rows}
#   eggnog_matrix[bacdive_id] = {root-level eggNOG OG -> number of annotated rows}
# If several files share a BacDive ID, the file processed last wins.
ko_matrix={}
eggnog_matrix={}

dir_eggnog=f"{DIR_bacdive}/raw/eggnog"
for ff in tqdm(os.listdir(dir_eggnog)):
    df_eggnog=pd.read_csv(os.path.join(dir_eggnog,ff),sep='\t')
    # Filenames are <id>_<accession>_<tax id>_<biosample>.csv; the unpacking below
    # expects the accession to contain exactly one '_'.
    bacdive_id, ncbi_acc0, ncbi_acc1, ncbi_tax_id, biosample_id=ff.replace('.csv','').split('_')
    ncbi_acc=ncbi_acc0+'_'+ncbi_acc1

    # A gene can list several comma-separated KOs; only the first one is counted.
    df_eggnog['KEGG_primary']=df_eggnog['KEGG_KO'].str.split(',').str[0]
    df_eggnog['root_og']=df_eggnog['EGGNOG_OGS'].str.extract(r'root,([\w\d]+)@')

    # na=False: rows without a KO give NaN from startswith(), and a mask that
    # contains NaN cannot be used for boolean indexing.
    has_ko=df_eggnog['KEGG_primary'].str.startswith('ko:',na=False)
    ko_matrix[bacdive_id]=df_eggnog[has_ko].groupby('KEGG_primary').size().to_dict()
    eggnog_matrix[bacdive_id]=df_eggnog.groupby('root_og').size().to_dict()

In [None]:
# Inspect the per-genome KO count dictionaries.
# NOTE(review): this displays the entire dict; consider previewing only a few entries.
ko_matrix

In [None]:
# Stack the per-genome KO counts into a table with genomes as rows and KOs as
# columns; a KO that is absent from a genome counts as 0.
ko_matrix=(
    pd.DataFrame(ko_matrix)
    .T
    .fillna(0)
    .astype(int)
)
# Column labels lose their 'ko:' prefix.
ko_matrix.columns=ko_matrix.columns.str.replace('ko:','')

ko_matrix.to_pickle(f"{DIR_bacdive}/bacdive_ko_data.pk")
# eggnog_matrix=pd.DataFrame(eggnog_matrix).T.fillna(0).astype(int)
# eggnog_matrix.to_pickle(os.path.join(dir_download,'eggnog_matrix.pk'))

## 16S data from ENA

#### Download 16S data from ENA database

In [None]:
# Recursively fetch the `con` and `std` directories of the latest ENA rRNA snapshot
# from the EBI FTP server into the local raw/ena directory.
# NOTE(review): the destination is a hard-coded absolute path, while other cells use
# DIR_bacdive; confirm that both point to the same place.
!wget -r ftp://ftp.ebi.ac.uk/pub/databases/ena/rRNA/snapshot_latest/con/* -P "/home/zeqianli/project/zeqianli/Kuehn/Carbon/data/bacdive/raw/ena"
!wget -r ftp://ftp.ebi.ac.uk/pub/databases/ena/rRNA/snapshot_latest/std/* -P "/home/zeqianli/project/zeqianli/Kuehn/Carbon/data/bacdive/raw/ena"

In [None]:
# Concatenate every file under raw/ena whose name ends with ".fasta.gz" into a single
# raw/ena_rRNA.fasta.gz (the output is written next to, not inside, the searched directory).
!cat $(find "/home/zeqianli/project/zeqianli/Kuehn/Carbon/data/bacdive/raw/ena" -name "*.fasta.gz") > "/home/zeqianli/project/zeqianli/Kuehn/Carbon/data/bacdive/raw/ena_rRNA.fasta.gz"

# Sanity check: count the lines containing ">" in the combined file.
print("Total number of fasta records:")
!zcat "/home/zeqianli/project/zeqianli/Kuehn/Carbon/data/bacdive/raw/ena_rRNA.fasta.gz" | grep -c ">"

In [None]:
# Process ENA database
from Bio import SeqIO
import gzip

def parse_16S_file(ff):
    """Read a gzipped FASTA file into a DataFrame.

    Parameters
    ----------
    ff : str
        Path to a gzip-compressed FASTA file (opened in text mode).

    Returns
    -------
    pandas.DataFrame
        One row per FASTA record with two columns: 'description' (the record's
        description) and 'seq' (the sequence as a plain string).
    """
    with gzip.open(ff,'rt') as handle:
        rows=[
            {'description':rec.description,'seq':str(rec.seq)}
            for rec in tqdm(SeqIO.parse(handle,'fasta'))
        ]

    return pd.DataFrame(rows)


# Parse the combined ENA rRNA FASTA into a table (columns: description, seq).
ena_full=parse_16S_file("/home/zeqianli/project/zeqianli/Kuehn/Carbon/data/bacdive/raw/ena_rRNA.fasta.gz")

# The accession is taken from the description: second '|'-separated field, part before the first ':'.
ena_full['ena_acc']=ena_full['description'].str.split('|').str[1].str.split(':').str[0] # Each ENA accession has a version number (e.g. 12456678.1). Keep both forms: with the version (12456678.1) and without it (12456678).
ena_full['ena_acc_trimmed']=ena_full['ena_acc'].str.split('.').str[0] # accession without the version suffix

# Cache the parsed table.
# NOTE(review): hard-coded absolute path; the matching cell reads f"{DIR_bacdive}/raw/ena_rRNA.pk" -- confirm both are the same file.
ena_full.to_pickle("/home/zeqianli/project/zeqianli/Kuehn/Carbon/data/bacdive/raw/ena_rRNA.pk")

#### Match BacDive 16S accessions with ENA database

In [None]:
# Collect every 16S accession listed by BacDive: one row per (BacDive record, 16S entry).
raw=pd.read_pickle(f"{DIR_bacdive}/raw/raw.pk")
sixteenS_keys=['description','accession','length','database','NCBI tax ID']
sixteenS_acc=[]
for record in tqdm(raw):
    bacdive_id=str(record['General']['BacDive-ID'])

    try:
        sixteenS_info=record['Sequence information']['16S sequences']
    except (KeyError,TypeError):
        print("16S data unavailable for ",bacdive_id)
        # Skip this record. Without the `continue`, the previous record's 16S
        # entries would be re-used and filed under this BacDive ID.
        continue

    # A single entry is stored as a dict, several entries as a list of dicts.
    if isinstance(sixteenS_info,dict):
        sixteenS_info=[sixteenS_info]

    for info in sixteenS_info:
        dic={'bacdive_id':bacdive_id}
        for key in sixteenS_keys:
            try:
                dic[key]=str(info[key])
            except KeyError:
                # Field not present in this entry; it becomes NaN in the DataFrame.
                continue
        sixteenS_acc.append(dic)

sixteenS_acc=pd.DataFrame(sixteenS_acc)
sixteenS_acc=sixteenS_acc[sixteenS_acc['database']=='ena'] # Use only ENA database for now. 
# only one record with a wrong accession ID. 
# Reassign (instead of inplace=True on a filtered slice) and take an explicit copy,
# so that columns can be added to this table later without chained-assignment warnings.
sixteenS_acc=sixteenS_acc.dropna(subset=['length']).copy()

# Each bacdive record can have multiple ENA accession IDs. Some are better and some are wrong (e.g. https://bacdive.dsmz.de/strain/5223). Select the best record after merging with ENA sequences. 

In [None]:
# Match accession IDs with ENA database
ena_full=pd.read_pickle(f"{DIR_bacdive}/raw/ena_rRNA.pk")
ena=ena_full[ena_full['description'].str.contains('16[sS]',regex=True)] # Keep only 16S sequences
# Keep ENA records whose accession (with or without version suffix) is listed by BacDive.
# .copy(): the columns added below must go to an independent frame, not to a filtered
# view of ena_full (avoids chained-assignment warnings).
ena=ena[(ena['ena_acc'].isin(sixteenS_acc['accession'])) | (ena['ena_acc_trimmed'].isin(sixteenS_acc['accession']))].copy()

ena['partial']=ena['description'].str.contains('partial',regex=True)
ena['ena_length']=ena['seq'].str.len()
ena['_ind']=ena.index.values # original row order, used as the final tie-breaker
# One sequence per trimmed accession. As written, the sort keeps the LONGEST sequence;
# ties in length go to partial=True first, then to the earliest original row.
# NOTE(review): the original comment states the intent as "keep the non-partial one, and
# then the longest one", which is not what this sort order does -- confirm which is intended.
# Author's notes: Manually checked and this is ok. If the length in bacdive and length in ENA are different, it's ok.
ena=ena.sort_values(by=['ena_acc_trimmed','ena_length','partial','_ind'],ascending=[True,False,False,True]).drop_duplicates(subset='ena_acc_trimmed',keep='first')
# Suffix every column with '_ena' to keep them apart from the BacDive columns after merging.
ena.columns=ena.columns.map(lambda x: x+'_ena')


In [None]:
# Accession without the version suffix (the part before the first '.').
sixteenS_acc['accession_trimmed']=sixteenS_acc['accession'].str.split('.').str[0]

# Merge with bacdive data
# Note that a BacDive ENA accession number can be either the full number or the trimmed number.
# I checked that in the ENA database each trimmed accession has only one full accession, so
# trimming the BacDive accession and merging on the trimmed ENA accession is safe.
bacdive_with_16S=pd.merge(
    sixteenS_acc,
    ena[['description_ena','seq_ena','ena_acc_trimmed_ena','partial_ena','ena_length_ena']],
    left_on='accession_trimmed',
    right_on='ena_acc_trimmed_ena',
    how='left',
)

# Drop records with no ENA sequences. Checked that there are indeed problems on BacDive (e.g. https://bacdive.dsmz.de/strain/160325)
bacdive_with_16S=bacdive_with_16S.dropna(subset=['seq_ena'])

# If multiple 16S sequences are present for a BacDive ID, keep the longest one. Probably ok.
# In some records (https://bacdive.dsmz.de/strain/9984), the longest record is not the same
# reference as other ones. I don't see why this is not ok.
bacdive_with_16S=(
    bacdive_with_16S
    .sort_values(by=['bacdive_id','ena_length_ena'],ascending=[True,False])
    .drop_duplicates(subset=['bacdive_id'],keep='first')
)


In [None]:
# Write to a fasta file
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# One FASTA record per selected BacDive strain; the record id is 'bacdive_' + BacDive ID
# and the description is the ENA description.
seqs=[
    SeqRecord(Seq(row['seq_ena']),id='bacdive_'+row['bacdive_id'],description=row['description_ena'])
    for _, row in bacdive_with_16S.iterrows()
]

SeqIO.write(seqs,f"{DIR_bacdive}/bacdive_16S.fna",'fasta')

#### Construct tree

In [8]:
# Build tree
# Input is the 16S FASTA written above; the aligned FASTA, alignment CSV and tree file
# are the outputs handed to generate_tree.
# NOTE(review): generate_tree is defined outside this section; judging from the cell
# output it presumably runs SINA for the alignment -- confirm in its definition.
INPUT=f"{DIR_bacdive}/bacdive_16S.fna"
OUTPUT_ALIGNED=f"{DIR_bacdive}/bacdive_16S.aligned.fna"
OUTPUT_CSV=f"{DIR_bacdive}/bacdive_16S.aligned.csv"
TREE=f"{DIR_bacdive}/bacdive_16S.tree"
generate_tree(INPUT=INPUT, OUTPUT_ALIGNED=OUTPUT_ALIGNED,OUTPUT_CSV=OUTPUT_CSV, TREE=TREE,threads=48)

[K17:36:48 [SINA] [33m[1mThis is SINA 1.7.2.[m
Processing: 0 [00:00:28]
Scanning:   0% |      | 1/2224740 [00:00:00 / 00:01:14]
[A[AProcessing: 0 [00:00:28]
Scanning:   0% |      | 1/2224740 [00:00:00 / 00:02:41]
[A[AProcessing: 0 [00:00:28]
Scanning:   0% |   | 8676/2224740 [00:00:00 / 00:00:02]
[A[AProcessing: 0 [00:00:28]
Scanning:   1% |  | 17350/2224740 [00:00:00 / 00:00:03]
[A[AProcessing: 0 [00:00:28]
Scanning:   1% |  | 22774/2224740 [00:00:00 / 00:00:03]
[A[AProcessing: 0 [00:00:28]
Scanning:   1% |  | 28500/2224740 [00:00:00 / 00:00:03]
[A[AProcessing: 0 [00:00:28]
Scanning:   2% |  | 38202/2224740 [00:00:00 / 00:00:03]
[A[AProcessing: 0 [00:00:28]
Scanning:   2% |  | 47903/2224740 [00:00:00 / 00:00:03]
[A[AProcessing: 0 [00:00:28]
Scanning:   3% |  | 59020/2224740 [00:00:00 / 00:00:03]
[A[AProcessing: 0 [00:00:28]
Scanning:   3% |  | 70135/2224740 [00:00:00 / 00:00:03]
[A[AProcessing: 0 [00:00:28]
Scanning:   4% |  | 78539/2224740 [00:00:00 / 00:00:03

## Final organize

In [3]:
# save all data
 
# Load the three ingredients of the final dataset: KO matrix, growth matrix and 16S tree.
ko_data=pd.read_pickle(f"{DIR_bacdive}/bacdive_ko_data.pk")
ko_data=(ko_data>0).astype(int) # count matrix -> presence-absence matrix 

growth_data=pd.read_pickle(f"{DIR_bacdive}/bacdive_utilization.pk")
tree=Tree(f"{DIR_bacdive}/bacdive_16S.tree",format=1)

# NOTE(review): finalize_data is defined outside this section; presumably it aligns the
# samples across the three inputs and applies the min_* thresholds -- confirm in its definition.
bacdive_final=finalize_data(ko_data,growth_data,tree,remove_prefix=True, min_zeros=10, min_ones=10, min_growth_data_samples=100)
# Save the finalized dataset as a pickle.
with open(f"{DIR_bacdive}/bacdive.pk",'wb') as f:
    pickle.dump(bacdive_final,f)


4349 samples:  ['10173' '10174' '10430' ... '8629' '98' '99']
