In [3]:
import os
import subprocess
import time
from itertools import islice
import pandas as pd
import glob

# Notebook 2 - Downloading NCBI Genomes, Running antiSMASH and Obtaining Metadata

This notebook helps to download all cyanobacterial genomes available at NCBI and to run antiSMASH on the downloaded genomes.


**Downloading NCBI Genomes**

- Search for cyanobacteria at [this link here](https://www.ncbi.nlm.nih.gov/Traces/wgs/?page=1&view=all&search=cyanobacteria)
- Download spreadsheet using the download button
- Upload the file to your server as `./inputs/wgs_selector.csv` (inside a folder named `inputs`), then run the cell below

In [5]:
# Build ./ncbi_genomes/download_cyanobacteria.sh from the NCBI WGS selector export.
# Each genome that is not yet present as ./ncbi_genomes/<prefix>.fasta gets two
# shell lines: a wget of the gzipped FASTA and a gunzip into <prefix>.fasta.
WGS_SELECTOR = "./inputs/wgs_selector.csv"
GENOME_DIR = "./ncbi_genomes"
WGS_BASE_URL = "https://sra-download.ncbi.nlm.nih.gov/traces"

if not os.path.exists(WGS_SELECTOR):
    raise ValueError("File ./inputs/wgs_selector.csv not found, please upload the file inside the folder ./inputs/")

# Idempotent: re-running the cell must not fail because the folder already exists.
os.makedirs(GENOME_DIR, exist_ok=True)

commands = []
count = 0
with open(WGS_SELECTOR) as selector:
    next(selector, None)  # the first line of the export is the column header
    for row in selector:
        # The ID is the first field, whether the export is tab- or comma-separated.
        prefix = row.split('\t')[0].split(',')[0].strip()
        if not prefix:
            continue  # blank line, nothing to download
        count += 1
        # IDs such as "NZ_XXXX01" are stored on the server (and on disk) without
        # the part before the underscore, so normalise BEFORE checking for the file.
        if '_' in prefix:
            prefix = prefix.split('_')[1]
        if os.path.isfile(os.path.join(GENOME_DIR, '%s.fasta' % prefix)):
            continue  # already downloaded
        if len(prefix) > 6:
            # Long prefixes: wgs01 host path with three two-character directory levels.
            url = "%s/wgs01/wgs_aux/%s/%s/%s/%s/%s.1.fsa_nt.gz" % (
                WGS_BASE_URL, prefix[0:2], prefix[2:4], prefix[4:6], prefix, prefix)
        else:
            # Short prefixes: wgs03 host path with two two-character directory levels.
            url = "%s/wgs03/wgs_aux/%s/%s/%s/%s.1.fsa_nt.gz" % (
                WGS_BASE_URL, prefix[0:2], prefix[2:4], prefix, prefix)
        commands.append("wget %s" % url)
        commands.append("gunzip -c %s.1.fsa_nt.gz > %s.fasta" % (prefix, prefix))

# One command per line; the script is meant to be run from inside ./ncbi_genomes/.
with open(os.path.join(GENOME_DIR, 'download_cyanobacteria.sh'), "w") as script:
    script.writelines(command + "\n" for command in commands)

# Number of genome IDs read from the selector file (downloaded or not).
print(count)

mkdir: cannot create directory ‘./ncbi_genomes/’: File exists
2540


To download the selected genomes, run:

```bash
cd ./ncbi_genomes/

sh ./download_cyanobacteria.sh

rm *.gz
```

In [21]:
# Count the FASTA files currently present in ./ncbi_genomes/ (shell pipeline via IPython).
!ls ./ncbi_genomes/*fasta | wc -l

2102


**Removing NCBI FASTA Files with Size Zero**

In [9]:
def remove_empty_files(paths):
    """Delete every zero-byte regular file in ``paths``.

    Parameters
    ----------
    paths : iterable of str
        Candidate file paths.

    Returns
    -------
    list of str
        The paths that were deleted, in input order.
    """
    removed = []
    for path in paths:
        if os.path.isfile(path) and os.stat(path).st_size == 0:
            # os.remove avoids the shell entirely, so file names with spaces or
            # shell metacharacters are handled and failures raise instead of
            # being silently ignored.
            os.remove(path)
            removed.append(path)
    return removed


# Failed downloads leave empty FASTA files behind; drop them before running antiSMASH.
glob_list = glob.glob('./ncbi_genomes/*fasta')
removed_fastas = remove_empty_files(glob_list)

In [10]:
# Count the FASTA files remaining in ./ncbi_genomes/ (shell pipeline via IPython).
!ls ./ncbi_genomes/*fasta | wc -l

1923


**Running antiSMASH**

https://github.com/mwang87/IOMEGA_Antismash_pipeline

To run, your dependencies will be:
```
docker
nextflow - you can install via conda
```

To actually run:
```
Put the files into the input_sequences folder
make run
```

**Obtaining Metadata for Table 1 and Dataset S1 (sheet 1)**

In [7]:
from Bio import SeqIO
from Bio.SeqUtils import GC
import numpy as np
import glob
import pandas as pd

def taxon_from_header(first_line):
    """Extract the taxon token from the first (header) line of a genome FASTA.

    The header is split on single spaces. When it contains 'TPA_asm:' or
    'uncultured', the third token is returned; otherwise the second token is.
    The trailing line break is removed first so it cannot end up in the taxon.

    Raises
    ------
    IndexError
        If the header has fewer space-separated tokens than required.
    """
    fields = first_line.rstrip('\r\n').split(' ')
    shifted = 'TPA_asm:' in first_line or 'uncultured' in first_line
    return fields[2] if shifted else fields[1]


def region_is_fragmented(gbk_path):
    """Classify one antiSMASH region GenBank file by its first cand_cluster feature.

    Returns True when that feature's 'contig_edge' qualifier is the string
    'True', False when it is anything else, and None when the file has no
    cand_cluster feature. The file handle is always closed.
    """
    with open(gbk_path, "r") as handle:
        for record in SeqIO.parse(handle, "genbank"):
            for feat in record.features:
                if feat.type == 'cand_cluster':
                    return str(feat.qualifiers['contig_edge'][0]) == 'True'
    return None


glob_AS = glob.glob('./ncbi_antismash/nf_output/*/')
genome_list,taxa_list,fragBGC,compBGC,totalBGC,seen = [],[],[],[],[],[]

for AS_path in glob_AS:
    # Output folder name up to the first underscore is the genome file name,
    # e.g. "<GenomeID>.fasta"; normpath drops the trailing slash from the glob.
    genome_file = os.path.basename(os.path.normpath(AS_path)).split('_')[0]
    genome_id = genome_file.split('.')[0]
    if genome_id == 'mibig':
        continue  # the mibig output folder is not one of the downloaded genomes

    genome_list.append(genome_id)
    with open("./ncbi_genomes/%s" % genome_file) as f:
        taxa_list.append(taxon_from_header(f.readline()))

    # Each region file is counted once, as fragmented or complete; genomes
    # without any region file get zeros in all three columns.
    frag_count = 0
    comp_count = 0
    for BGC in glob.glob('%s/*region*.gbk' % AS_path):
        fragmented = region_is_fragmented(BGC)
        if fragmented is None:
            continue  # no cand_cluster feature: not counted
        seen.append(BGC)  # record of the region files that were counted
        if fragmented:
            frag_count += 1
        else:
            comp_count += 1
    fragBGC.append(frag_count)
    compBGC.append(comp_count)
    totalBGC.append(frag_count + comp_count)
    

def get_draft_counts(query_genome):
    """Count the sequence records in a FASTA file and average their GC content.

    Parameters
    ----------
    query_genome : str
        Path to the FASTA file to parse.

    Returns
    -------
    tuple
        ``(node_count, gc_average)``: the number of records in the file and the
        plain mean of ``GC(record.seq)`` over those records. The mean is not
        weighted by sequence length. ``gc_average`` is NaN when the file holds
        no records.
    """
    gc_count = []
    # The context manager closes the file; the original close() call sat after
    # the return statement and was never reached.
    with open(query_genome, "r") as input_handle:
        for record in SeqIO.parse(input_handle, "fasta"):
            gc_count.append(GC(record.seq))
    node_count = len(gc_count)
    gc_average = np.average(gc_count) if gc_count else float('nan')
    return node_count, gc_average

# Scaffold count and mean GC content for every genome that has antiSMASH output.
draft_stats = [
    get_draft_counts('./ncbi_genomes/%s.fasta'%genome_id)
    for genome_id in genome_list
]
node_list = [stats[0] for stats in draft_stats]
gc_list = [stats[1] for stats in draft_stats]

# Sanity check: all seven columns must have the same length before building the table.
print(len(genome_list),len(taxa_list),len(node_list),len(gc_list),len(fragBGC),len(compBGC),len(totalBGC))

# Column name -> values, in the order the columns appear in the final table.
frames = dict([
    ('GenomeID', genome_list),
    ('Taxa', taxa_list),
    ('Scaffold_count', node_list),
    ('GC_content', gc_list),
    ('Fragmented_BGCs', fragBGC),
    ('Complete_BGCs', compBGC),
    ('Total_BGCs', totalBGC),
])

metadata_df = pd.DataFrame(data=frames)

# metadata_df.to_csv('cyanobiome_metadata_df-TFL200507.tsv',sep='\t')

1919 1919 1919 1919 1919 1919 1919


In [8]:
# Display the per-genome metadata table (pandas truncates the middle rows of long frames).
metadata_df

Unnamed: 0,GenomeID,Taxa,Scaffold_count,GC_content,Fragmented_BGCs,Complete_BGCs,Total_BGCs
0,QCLE01,Prochlorococcus,58,31.954075,0,3,3
1,DCSN01,Pseudanabaena,259,42.444255,1,2,3
2,CACKIV01,Synechococcaceae,49,31.173408,2,0,2
3,CACJNU01,Prochlorococcus,41,32.097832,3,1,4
4,QCFU01,Prochlorococcus,26,35.902717,1,4,5
...,...,...,...,...,...,...,...
1914,MWOR01,Prochlorococcus,281,30.968521,0,3,3
1915,JAADAL01,Microcystis,765,42.738184,17,0,17
1916,JFML01,Prochlorococcus,255,30.940778,0,4,4
1917,QCEH01,Prochlorococcus,15,32.569129,0,1,1


In [9]:
# Mean number of fragmented and of complete BGCs per genome, as a (fragmented, complete) tuple.
tuple(np.average(metadata_df[column]) for column in ('Fragmented_BGCs', 'Complete_BGCs'))

(3.9676915059927045, 2.5257946847316313)