`2019-09-07` `Tiago Ferreira Leao` **v1.0**

# Cyanobiome MASH

```
Requirements: packages below
              'master_df-TFL190425.txt' (run processing_master_df_datasetS1.ipynb)
```

**Importing General Dependencies**

In [3]:
import numpy as np
import pandas as pd
import glob
import subprocess
import os
import time
import re
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import islice
from sys import argv
from Bio import SeqIO
%matplotlib inline
sns.set()

**Installing MASH**

In [2]:
if os.path.exists("/home/tiago/Desktop/cyanet/cyanobiome/mash-Linux64-v2.0/mash"):
    print("MASH already downloaded")
else:
    !wget https://github.com/marbl/Mash/releases/download/v2.0/mash-Linux64-v2.0.tar
    !tar xopf mash-Linux64-v2.0.tar; rm mash-Linux64-v2.0.tar

MASH already downloaded


**Download NCBI genomes**

Search for cyanobacteria at [this link here](https://www.ncbi.nlm.nih.gov/Traces/wgs/?page=1&view=all&search=cyanobacteria)

Download file using the download button

Upload the file into your server and Jupyter interface and then run the cell below

In [None]:
start_time = time.time()

# Everything for this step lives in ./ods: the uploaded selector table, the
# generated download script and the downloaded FASTA files. The original cell
# mixed ./inputs, ./ and ./ods for these paths, so it could never run end to end.
ods_dir = "./ods"
selector_path = os.path.join(ods_dir, "wgs_selector.csv")
script_path = os.path.join(ods_dir, "download_cyanobacteria.sh")

if not os.path.exists(selector_path):
    # Create the folder (if needed) so there is somewhere to upload the table.
    if not os.path.isdir(ods_dir):
        os.makedirs(ods_dir)
    raise ValueError("File %s not found, please upload the file inside the folder %s" % (selector_path, ods_dir))

commands = []
with open(selector_path) as selector:
    next(selector, None)  # skip the header row
    for row in selector:
        # First field of the row, treating tab and "~" as delimiters (same as
        # the former `tr "\t" "~" | cut -d"~" -f1` pipeline); its first six
        # characters are used as the WGS prefix.
        code = row.rstrip("\n").replace("\t", "~").split("~")[0]
        prefix = code[:6]
        if not prefix:
            continue  # blank line: nothing to download
        line = "wget 'ftp://ftp.ncbi.nlm.nih.gov/sra/wgs_aux/%s/%s/%s/%s.1.fsa_nt.gz' -O - | gunzip > ./ods/%s.fasta"%(prefix[0:2],prefix[2:4],prefix,prefix,prefix)
        commands.append(line)

# One download command per line.
with open(script_path, "w") as script:
    script.write("\n".join(commands) + "\n")

# Run from the current directory because the commands write to ./ods/<prefix>.fasta.
# NOTE: child-process output goes to the kernel's stdout, not the cell output.
exit_code = subprocess.call(["sh", script_path])
if exit_code != 0:
    print("Download script exited with status %s" % exit_code)

print('\n' + "--- %s seconds ---" %(time.time()-start_time))

**Renaming NCBI**

In [None]:
def split_at(s, c, n):
    """Split ``s`` into two strings around its ``n``-th ``c``-separated field.

    Returns a 2-tuple: the first ``n`` fields re-joined with ``c``, and the
    remaining fields re-joined with ``c``. Either half is an empty string
    when it has no fields.
    """
    fields = s.split(c)
    head, tail = fields[:n], fields[n:]
    return (c.join(head), c.join(tail))

def rename(filename, outpath):
    """Write a copy of a FASTA file whose record ids are prefixed with the strain.

    The strain is the part of ``filename``'s basename before its first "_"
    (the whole basename, extension included, when there is no "_"). Every
    record gets the id and description ``<strain>_<first two "_"-separated
    fields of the original id>``. The result is written to
    ``<outpath>/<strain>.ren.fasta``.

    Parameters
    ----------
    filename : str
        Path of the FASTA file to read.
    outpath : str
        Existing directory that receives the renamed FASTA file.
    """
    strain = split_at(os.path.basename(filename), '_', 1)[0]
    renamed_records = []
    # Passing the path lets SeqIO open and close the file itself. This avoids
    # the "rU" mode (rejected by Python 3.11+) and the handle leak on errors.
    for record in SeqIO.parse(filename, "fasta"):
        substring = split_at(record.id, '_', 2)[0]
        new_id = "%s_%s"%(strain,substring)
        record.id = new_id
        # Overwrite the description too, so the header holds only the new id.
        record.description = new_id
        renamed_records.append(record)
    # Records are fully parsed before the output file is created, as before.
    SeqIO.write(renamed_records, os.path.join(outpath, "%s.ren.fasta"%strain), "fasta")

# Source FASTA files live in source_dir; renamed copies go to its "renamed/"
# sub-folder. The original loop listed the output folder while joining names
# onto the source folder, so the source files were never enumerated.
source_dir = "/home/gerwick-lab/Desktop/data/genomes/refseq_cyano/"
renamed_dir = os.path.join(source_dir, "renamed/")

# List first: a missing source folder should fail here, not be silently created.
fasta_names = sorted(name for name in os.listdir(source_dir) if name.endswith(".fasta"))

if not os.path.isdir(renamed_dir):
    os.makedirs(renamed_dir)

count = 0

for fasta_name in fasta_names:
    rename(os.path.join(source_dir, fasta_name), renamed_dir)
    count += 1

print("%s fasta files were renamed"%count)

**Sketching NCBI dataset**

This step creates the MinHash sketch of the NCBI reference genomes; the Cyanobiome query genomes are compared against this sketch in a later step.

In [2]:
start_time = time.time()

if os.path.exists("/home/tiago/Desktop/cyanet/cyanobiome/mash-Linux64-v2.0/ncbi_genomes.msh"):
    print("Sketch already exists")
else:
    !./mash-Linux64-v2.0/mash sketch -o /home/tiago/Desktop/cyanet/cyanobiome/mash-Linux64-v2.0/ncbi_genomes /home/gerwick-lab/Desktop/data/genomes/refseq_cyano/renamed/*.fasta
    
print('\n' + "--- %s seconds ---" %(time.time()-start_time))

Sketch already exists

--- 0.00106596946716 seconds ---


**Cataloging NCBI genomes into their respective genera**

In [3]:
from Bio import SeqIO

def split_string_at(s, c, n):
    """Cut ``s`` in two after its first ``n`` fields, using ``c`` as delimiter.

    Returns ``(before, after)``, where each part is the corresponding run of
    ``c``-separated fields joined back together with ``c``.
    """
    parts = s.split(c)
    before = c.join(parts[0:n])
    after = c.join(parts[n:len(parts)])
    return before, after

def extract_ncbi_taxa(inputs_folder):
    """Map every ``*.fasta`` file in ``inputs_folder`` to a genus name.

    The genus is read from the header of the first record of each file:
    the leading accession token is dropped, a ``TPA_asm:`` marker (if any)
    is skipped, a leading ``Candidatus`` is skipped, and surrounding square
    brackets are removed from the genus.

    Parameters
    ----------
    inputs_folder : str
        Folder prefix that is concatenated directly with ``*.fasta``, so it
        must end with a path separator.

    Returns
    -------
    dict
        ``{file name without extension: genus}``. Files without any record
        are left out.
    """
    taxa = {}
    for item in glob.glob("%s*.fasta"%inputs_folder):
        # Only the first header is needed, so stop after one record instead
        # of loading the whole genome into memory.
        first_record = next(iter(SeqIO.parse(item, "fasta")), None)
        if first_record is None:
            continue
        strain = os.path.splitext(os.path.basename(item))[0]
        description = first_record.description
        if "TPA_asm" in description:
            # Keep what follows the "TPA_asm:" marker. The original branch read
            # `genera` before assigning it, raising UnboundLocalError or reusing
            # the previous file's value.
            genera = description.partition("TPA_asm")[2].lstrip(": ")
        else:
            # Everything after the first space, i.e. after the accession token.
            genera = description.partition(" ")[2]
        if "Candidatus" in genera:
            # Assumes the form "Candidatus <Genus> ...": take the second word.
            taxa[strain] = genera.partition(' ')[2].partition(' ')[0]
        else:
            taxa[strain] = genera.partition(' ')[0].lstrip("[").rstrip("]")
    return taxa
            
# Build the mapping for the NCBI FASTA files in refseq_cyano/ (the trailing "/" is
# required because the folder string is concatenated directly with "*.fasta").
taxa_dict = extract_ncbi_taxa("/home/gerwick-lab/Desktop/data/genomes/refseq_cyano/")

In [4]:
# Display the mapping returned by extract_ncbi_taxa.
taxa_dict

{'AADV02': 'Crocosphaera',
 'AANO01': 'Synechococcus',
 'AANP01': 'Synechococcus',
 'AAOK01': 'Synechococcus',
 'AATZ01': 'Synechococcus',
 'AAUA01': 'Synechococcus',
 'AAVU01': 'Lyngbya',
 'AAVW01': 'Nodularia',
 'AAXW01': 'Cyanothece',
 'AAZV01': 'Leptolyngbya',
 'ABRS01': 'Coleofasciculus',
 'ABRV01': 'Synechococcus',
 'ABSE01': 'Cyanobium',
 'ABYK01': 'Arthrospira',
 'ACDW01': 'Prochlorococcus',
 'ACSK03': 'Arthrospira',
 'ACYA01': 'Cylindrospermopsis',
 'ACYB01': 'Raphidiopsis',
 'ADXL01': 'Synechococcus',
 'ADXM01': 'Synechococcus',
 'AEPQ01': 'Moorea',
 'AESD01': 'Crocosphaera',
 'AFEJ01': 'Acaryochloris',
 'AFJC01': 'Microcoleus',
 'AFXD01': 'Arthrospira',
 'AGCR01': 'Tolypothrix',
 'AGIK01': 'Synechococcus',
 'AGIZ01': 'Fischerella',
 'AGJC02': 'Cyanothece',
 'AHGV01': 'Thermanaerovibrio',
 'AJHB01': 'Microcystis',
 'AJLJ01': 'Fischerella',
 'AJLK01': 'Fischerella',
 'AJLL01': 'Fischerella',
 'AJLM01': 'Chlorogloeopsis',
 'AJLN01': 'Chlorogloeopsis',
 'AJTX02': 'Prochlorothrix

In [61]:
import csv

# Write one "key,value" row per entry of taxa_dict, replacing any existing file.
# Text mode ('w') is used because csv.writer emits str: with 'wb' the first
# writerow raises TypeError on Python 3. It also matches the text-mode append
# that later adds the Cyanobiome rows to this same file.
with open('/home/tiago/Desktop/cyanet/cyanobiome/mash-Linux64-v2.0/mash_taxonomy.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in taxa_dict.items():
        writer.writerow([key, value])

**Obtaining MASH distances from a given Cyanobiome finished genome to an NCBI reference**

In [4]:
def get_mash_df(inputs_folder,sketch):
    """Find, for every query FASTA, the closest entry of a MASH sketch.

    For each ``*.fasta`` file in ``inputs_folder`` this runs ``mash dist``
    against ``sketch`` and keeps the single row with the smallest distance.
    Rows whose reference path equals the query path are dropped first.

    Parameters
    ----------
    inputs_folder : str
        Folder prefix that is concatenated directly with ``*.fasta``, so it
        must end with a path separator.
    sketch : str
        Path of the ``.msh`` sketch to compare against.

    Returns
    -------
    pandas.DataFrame
        One row per query with the columns ``reference``, ``query``,
        ``distance``, ``pvalue`` and ``matching-hashes``. The frame is empty
        when no FASTA file is found.

    Raises
    ------
    RuntimeError
        If the ``mash dist`` command exits with a non-zero status.

    Notes
    -----
    The mash output goes through a fixed scratch file (``temp.txt``) that is
    overwritten for every query. Elapsed time is printed before returning.
    """
    start_time = time.time()
    input_list = glob.glob("%s*.fasta"%inputs_folder)
    best_hits = []
    for item in input_list:
        cmd = "/home/tiago/Desktop/cyanet/cyanobiome/mash-Linux64-v2.0/mash dist %s %s > /home/tiago/Desktop/cyanet/cyanobiome/mash-Linux64-v2.0/temp.txt"%(sketch,item)
        status = subprocess.call(cmd,shell=True)
        if status != 0:
            # Fail here rather than on an empty or stale temp.txt further down.
            raise RuntimeError("mash dist failed with exit status %s for %s" % (status, item))
        df = pd.read_csv("/home/tiago/Desktop/cyanet/cyanobiome/mash-Linux64-v2.0/temp.txt",sep="\t",names=["reference","query","distance","pvalue","matching-hashes"])
        # Never report a genome as its own closest reference.
        df = df[df.reference != item]
        row = df[df.distance == df.distance.min()]
        # Keep only the first of any tied minimum-distance rows.
        best_hits.append(row[:1])
    # Concatenate once: appending inside the loop is quadratic, and
    # DataFrame.append no longer exists in pandas 2.x.
    if best_hits:
        final_df = pd.concat(best_hits, ignore_index=True)
    else:
        final_df = pd.DataFrame()
    print('\n' + "--- %s seconds ---" %(time.time()-start_time))
    return final_df
        
# Closest NCBI reference (from the ncbi_genomes.msh sketch) for every FASTA in final_scaffolds/.
mash_df = get_mash_df("/home/tiago/Desktop/cyanet/cyanobiome/final_scaffolds/","/home/tiago/Desktop/cyanet/cyanobiome/mash-Linux64-v2.0/ncbi_genomes.msh")


--- 63.8576719761 seconds ---


In [5]:
# List the closest reference path found for each query, one per line.
# print() is called as a function, like everywhere else in this notebook;
# with a single argument it prints the same text on Python 2 and Python 3.
for item in mash_df['reference']:
    print(item)

/home/gerwick-lab/Desktop/data/genomes/refseq_cyano/renamed/JTHE01.ren.fasta
/home/gerwick-lab/Desktop/data/genomes/refseq_cyano/renamed/AEPQ01.ren.fasta
/home/gerwick-lab/Desktop/data/genomes/refseq_cyano/renamed/MKZS01.ren.fasta
/home/gerwick-lab/Desktop/data/genomes/refseq_cyano/renamed/MJGC01.ren.fasta
/home/gerwick-lab/Desktop/data/genomes/refseq_cyano/renamed/ABRS01.ren.fasta
/home/gerwick-lab/Desktop/data/genomes/refseq_cyano/renamed/ALVP01.ren.fasta
/home/gerwick-lab/Desktop/data/genomes/refseq_cyano/renamed/LGSU01.ren.fasta
/home/gerwick-lab/Desktop/data/genomes/refseq_cyano/renamed/LGSU01.ren.fasta
/home/gerwick-lab/Desktop/data/genomes/refseq_cyano/renamed/LUFH01.ren.fasta
/home/gerwick-lab/Desktop/data/genomes/refseq_cyano/renamed/LGSU01.ren.fasta
/home/gerwick-lab/Desktop/data/genomes/refseq_cyano/renamed/AEPQ01.ren.fasta
/home/gerwick-lab/Desktop/data/genomes/refseq_cyano/renamed/ABRS01.ren.fasta
/home/gerwick-lab/Desktop/data/genomes/refseq_cyano/renamed/MKZS01.ren.fasta

**Number of singletons using the MASH paper thresholds of D ≤ 0.05 and p-value ≤ $10^{-10}$**

In [6]:
# Count of genomes whose closest reference is farther than the 0.05 distance cut-off.
# NOTE(review): 75 is hard-coded — presumably the total number of query genomes;
# confirm it equals len(mash_df).
# NOTE(review): only the distance threshold is applied here; no p-value filter
# is part of this expression.
75-len(mash_df[mash_df["distance"] <= 0.05])

54

In [17]:
# p-values of the rows that fall within the 0.05 distance cut-off.
mash_df.loc[mash_df["distance"] <= 0.05, "pvalue"]

1     0.0
2     0.0
10    0.0
12    0.0
14    0.0
15    0.0
16    0.0
20    0.0
23    0.0
25    0.0
26    0.0
33    0.0
36    0.0
37    0.0
39    0.0
40    0.0
45    0.0
51    0.0
63    0.0
66    0.0
68    0.0
Name: pvalue, dtype: float64

**Obtaining closest taxa for Cyanobiome genomes**

In [39]:
# Give every Cyanobiome strain the genus of its closest NCBI reference.
taxa_dict2 = {}

# Columns are addressed by label: the former r[0] / r[1] relied on integer keys
# falling back to positions on a string-labelled Series (deprecated in recent
# pandas) and on the column order of mash_df.
for reference_path, query_path in zip(mash_df["reference"], mash_df["query"]):
    # "<ID>.ren.fasta" -> "<ID>": the part of the file name before the first ".".
    ncbi_taxa = os.path.splitext(os.path.basename(reference_path))[0].partition('.')[0]
    # Strain = query file name without extension, up to its first "_".
    strain = os.path.splitext(os.path.basename(query_path))[0].partition('_')[0]
    # Raises KeyError when the reference has no entry in taxa_dict.
    taxa_dict2[strain] = taxa_dict[ncbi_taxa]

In [40]:
# Display the mapping from Cyanobiome strain to the genus of its closest NCBI reference.
taxa_dict2

{'1A2': 'Moorea',
 '1A3': 'Moorea',
 '1A7': 'Nostoc',
 '1B1': 'Coleofasciculus',
 '1B6': 'Leptolyngbya',
 '1B8': 'Hydrocoleum',
 '1C1': 'Hydrocoleum',
 '1C2': 'Microcystis',
 '1C4': 'Cyanothece',
 '1C9': 'Moorea',
 '1D4': 'Moorea',
 '1D8': 'Kamptonema',
 '1D9': 'Microcystis',
 '1E1': 'Kamptonema',
 '1E3': 'Moorea',
 '1E4': 'Leptolyngbya',
 '1F2': 'Moorea',
 '1F9': 'Hydrocoleum',
 '1G1': 'Sphaerospermopsis',
 '1G2': 'Sphaerospermopsis',
 '1G6': 'Moorea',
 '1H2': 'Hydrocoleum',
 '1H4': 'Hydrocoleum',
 '1H5': 'Hydrocoleum',
 '1H6': 'Hydrocoleum',
 '1I2': 'Desertifilum',
 '1I4': 'Coleofasciculus',
 '1I7': 'Hydrocoleum',
 '2A8': 'Merismopedia',
 '2A9': 'Lyngbya',
 '2B3': 'Hydrocoleum',
 '2B6': 'Nostoc',
 '2B7': 'Moorea',
 '2B9': 'Hydrocoleum',
 '2C1': 'Coleofasciculus',
 '2C2': 'Hydrocoleum',
 '2C4': 'Moorea',
 '2C9': 'Hydrocoleum',
 '2D1': 'Hydrocoleum',
 '2D2': 'Microcystis',
 '2D8': 'Hydrocoleum',
 '2E6': 'Coleofasciculus',
 '2E9': 'Coleofasciculus',
 '2F4': 'Hydrocoleum',
 '2F5': 'Hydro

**Correcting nomenclature using genus assignments from manual online BLAST searches of the 16S rRNA**

In [56]:
# Copy taxa_dict2, relabelling every "Hydrocoleum" entry as "Okeania".
taxa_dict3 = dict(
    (strain, "Okeania" if genus == "Hydrocoleum" else genus)
    for strain, genus in taxa_dict2.items()
)

# Strain "ISBB", when present, is always set to "Leptolyngbya".
if "ISBB" in taxa_dict3:
    taxa_dict3["ISBB"] = "Leptolyngbya"

In [57]:
# Display the corrected strain -> genus mapping.
taxa_dict3

{'1A2': 'Moorea',
 '1A3': 'Moorea',
 '1A7': 'Nostoc',
 '1B1': 'Coleofasciculus',
 '1B6': 'Leptolyngbya',
 '1B8': 'Okeania',
 '1C1': 'Okeania',
 '1C2': 'Microcystis',
 '1C4': 'Cyanothece',
 '1C9': 'Moorea',
 '1D4': 'Moorea',
 '1D8': 'Kamptonema',
 '1D9': 'Microcystis',
 '1E1': 'Kamptonema',
 '1E3': 'Moorea',
 '1E4': 'Leptolyngbya',
 '1F2': 'Moorea',
 '1F9': 'Okeania',
 '1G1': 'Sphaerospermopsis',
 '1G2': 'Sphaerospermopsis',
 '1G6': 'Moorea',
 '1H2': 'Okeania',
 '1H4': 'Okeania',
 '1H5': 'Okeania',
 '1H6': 'Okeania',
 '1I2': 'Desertifilum',
 '1I4': 'Coleofasciculus',
 '1I7': 'Okeania',
 '2A8': 'Merismopedia',
 '2A9': 'Lyngbya',
 '2B3': 'Okeania',
 '2B6': 'Nostoc',
 '2B7': 'Moorea',
 '2B9': 'Okeania',
 '2C1': 'Coleofasciculus',
 '2C2': 'Okeania',
 '2C4': 'Moorea',
 '2C9': 'Okeania',
 '2D1': 'Okeania',
 '2D2': 'Microcystis',
 '2D8': 'Okeania',
 '2E6': 'Coleofasciculus',
 '2E9': 'Coleofasciculus',
 '2F4': 'Okeania',
 '2F5': 'Okeania',
 '2G3': 'Scytonema',
 '2G4': 'Okeania',
 '2G5': 'Okeani

In [62]:
import csv

# Append the taxa_dict3 entries as "key,value" rows to the existing taxonomy file.
# NOTE: append mode means running this cell again adds the same rows again.
with open('/home/tiago/Desktop/cyanet/cyanobiome/mash-Linux64-v2.0/mash_taxonomy.csv', 'a') as csv_file:
    csv.writer(csv_file).writerows(
        [strain, genus] for strain, genus in taxa_dict3.items()
    )

The final taxonomy in the MASH file ("mash_taxonomy_2.csv") was manually corrected using 16S assignments and phylogenomic clading.

**Attaching new column to master_df**

In [6]:
# Load the manually corrected taxonomy table: no header row, column 0 holds
# the identifier and column 1 the genus.
mash_df = pd.read_csv('/home/tiago/Desktop/cyanet/cyanobiome/mash-Linux64-v2.0/mash_taxonomy_2.csv',sep=',',header=None)

# Identifiers of four characters or fewer get the "SIO" prefix; longer ones
# are kept as they are.
new_names = [
    "SIO"+name if len(name) <= 4 else name
    for name in mash_df[0]
]

mash_df[0] = new_names

# Lookup table: identifier -> genus.
mash_dict = pd.Series(mash_df[1].values,index=mash_df[0]).to_dict()

In [7]:
# Preview the first five rows of the taxonomy table.
mash_df.head(5)

Unnamed: 0,0,1
0,NRIU01,Geitlerinema
1,LUPT01,Prochlorococcus
2,MWPC01,Prochlorococcus
3,AVFY01,Planktothrix
4,JFLU01,Prochlorococcus


In [5]:
# Load the tab-separated master table.
master_df = pd.read_csv("/home/tiago/Desktop/cyanet/cyanobiome/tables/master_df-TFL190425.txt",sep="\t")

# Display the loaded table.
master_df

Unnamed: 0,collectionID,genomeID,completeness,#scaffolds,GC,#fragBGCs,#completeBGCs,#allBGCs
0,ASG15JUL146CUL,SIO3F2,99.09,454,49.40,10,0,10
1,ASX22JUL142CUL,SIO1E4,99.73,659,52.29,12,18,30
2,CobbledownCUL,SIO4C1,99.18,849,53.63,21,4,25
3,numC11CUL,SIO4C4,94.38,1496,46.66,4,2,6
4,numC15BCUL,SIO4C5,98.64,1204,52.29,15,0,15
5,PAB16MAY116CUL,SIO3F4,98.91,1509,45.42,9,0,9
6,PAB18MAY119CUL,SIO1E1,99.18,1398,46.96,23,1,24
7,PAL23MAY131CUL,SIO1I2,98.51,1046,47.98,10,4,14
8,PAL24MAY135-contACUL,SIO1I4,99.46,245,53.06,11,0,11
9,PAP25JUN122CUL,SIO1D8,97.09,1787,48.24,9,0,9


In [8]:
# Look up the genus of every genome by its genomeID; an ID that is missing
# from mash_dict raises KeyError.
taxa_col = [mash_dict[genome_id] for genome_id in master_df["genomeID"]]

# Attach the genus as a new column of master_df.
master_df["MASH-taxa"] = taxa_col

master_df.head(5)

Unnamed: 0,collectionID,genomeID,completeness,#scaffolds,GC,#fragBGCs,#completeBGCs,#allBGCs,MASH-taxa
0,ASG15JUL146CUL,SIO3F2,99.09,454,49.4,10,0,10,Spirulina
1,ASX22JUL142CUL,SIO1E4,99.73,659,52.29,12,18,30,Leptolyngbya
2,CobbledownCUL,SIO4C1,99.18,849,53.63,21,4,25,Leptolyngbya
3,numC11CUL,SIO4C4,94.38,1496,46.66,4,2,6,Kamptonema
4,numC15BCUL,SIO4C5,98.64,1204,52.29,15,0,15,Leptolyngbya


In [9]:
# Save the table, now including the MASH-taxa column, as tab-separated text.
# index_label=False: no header field is written for the index column.
master_df.to_csv("/home/tiago/Desktop/cyanet/cyanobiome/tables/master_df-TFL190425-taxaID.txt",sep="\t",index_label=False)

In [1]:
# Copy the two taxonomy tables and the annotated master table into the
# genomics_pnas outputs folder.
!cp /home/tiago/Desktop/cyanet/cyanobiome/mash-Linux64-v2.0/mash_taxonomy.csv /home/tiago/Desktop/cyanet/genomics_pnas/outputs/
!cp /home/tiago/Desktop/cyanet/cyanobiome/mash-Linux64-v2.0/mash_taxonomy_2.csv /home/tiago/Desktop/cyanet/genomics_pnas/outputs/
!cp /home/tiago/Desktop/cyanet/cyanobiome/tables/master_df-TFL190425-taxaID.txt /home/tiago/Desktop/cyanet/genomics_pnas/outputs/

```
Outputs: mash_taxonomy.csv
         mash_taxonomy_2.csv
         master_df-TFL190425-taxaID.txt
```