# MASH Analysis for Finished and NCBI Cyanobacterial Genomes

### Outline



**Importing General Dependencies**

In [1]:
import numpy as np
import pandas as pd
import glob
import subprocess
import os
import time
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()

**Installing MASH**

In [2]:
if os.path.exists("./mash-OSX64-v2.2/mash"):
    print("MASH already downloaded")
else:
    !wget https://github.com/marbl/Mash/releases/download/v2.2/mash-OSX64-v2.2.tar
    !tar xopf mash-OSX64-v2.2.tar; rm mash-OSX64-v2.2.tar

MASH already downloaded


**Sketching Finished Cyanobiome and NCBI Datasets**

This step is going to create the minhashes from the references to be probed in the soon to be queried genomes

In [3]:
start_time = time.time()

if os.path.exists("./mash-OSX64-v2.2/ncbi_genomes.msh"):
    print("Sketch already exists")
else:
    !./mash-OSX64-v2.2/mash sketch -o ./mash-OSX64-v2.2/ncbi_genomes ./ncbi_genomes/*.fasta
    
print('\n' + "--- %s seconds ---" %(time.time()-start_time))

Sketch already exists

--- 0.00020003318786621094 seconds ---


In [4]:
query_genomes = ['JAAHII01','JAAHIH01','JAAHGF01','JAAHHC01','JAAHHD01','JAAHFN01','JAAHGM01','JAAHGB01','JAAHFW01','JAAHFZ01','JAAHFP01','JAAHGH01','JAAHGC01','JAAHGA01','JAAHHJ01','JAAHGO01','JAAHGU01','JAAHGW01','JAAHGS01','JAAHHM01','JAAHHP01','JAAHHN01','JAAHGL01','JAAHFU01']

!mkdir ./query_genomes/

mkdir: ./query_genomes/: File exists


In [15]:
glob_list = glob.glob('./ncbi_genomes/*fasta')

for item in glob_list:
    name = os.path.basename(item).split('.')[0]
    if name in query_genomes:
        cmd = 'cp %s ./query_genomes/'%item
        subprocess.call(cmd,shell=True)

In [5]:
def get_mash_df(inputs_folder,sketch):
    start_time = time.time()
    input_list = glob.glob("%s*.fasta"%inputs_folder)
    final_df = pd.DataFrame()
    for index,item in enumerate(input_list):
        scores = []
        ncbi_item = './ncbi_genomes/%s'%(os.path.basename(item))
        print(item,ncbi_item)
        cmd = "./mash-OSX64-v2.2/mash dist %s %s > ./mash-OSX64-v2.2/temp.txt"%(sketch,item)
        subprocess.call(cmd,shell=True)
        df = pd.read_csv("./mash-OSX64-v2.2/temp.txt",sep="\t",names=["reference","query","distance","pvalue","matching-hashes"])
        df = df[df.reference != ncbi_item]
        row = df[df.distance == df.distance.min()]
        final_df = final_df.append(row[:1], ignore_index = True)
    print('\n' + "--- %s seconds ---" %(time.time()-start_time))
    return final_df
        
mash_df = get_mash_df("./query_genomes/",
                      "./mash-OSX64-v2.2/ncbi_genomes.msh")

./query_genomes/JAAHHP01.fasta ./ncbi_genomes/JAAHHP01.fasta
./query_genomes/JAAHGA01.fasta ./ncbi_genomes/JAAHGA01.fasta
./query_genomes/JAAHGH01.fasta ./ncbi_genomes/JAAHGH01.fasta
./query_genomes/JAAHHM01.fasta ./ncbi_genomes/JAAHHM01.fasta
./query_genomes/JAAHHD01.fasta ./ncbi_genomes/JAAHHD01.fasta
./query_genomes/JAAHGU01.fasta ./ncbi_genomes/JAAHGU01.fasta
./query_genomes/JAAHFU01.fasta ./ncbi_genomes/JAAHFU01.fasta
./query_genomes/JAAHGB01.fasta ./ncbi_genomes/JAAHGB01.fasta
./query_genomes/JAAHFW01.fasta ./ncbi_genomes/JAAHFW01.fasta
./query_genomes/JAAHGW01.fasta ./ncbi_genomes/JAAHGW01.fasta
./query_genomes/JAAHGC01.fasta ./ncbi_genomes/JAAHGC01.fasta
./query_genomes/JAAHHN01.fasta ./ncbi_genomes/JAAHHN01.fasta
./query_genomes/JAAHFZ01.fasta ./ncbi_genomes/JAAHFZ01.fasta
./query_genomes/JAAHGO01.fasta ./ncbi_genomes/JAAHGO01.fasta
./query_genomes/JAAHGF01.fasta ./ncbi_genomes/JAAHGF01.fasta
./query_genomes/JAAHGS01.fasta ./ncbi_genomes/JAAHGS01.fasta
./query_genomes/JAAHHC01

In [6]:
mash_df

Unnamed: 0,reference,query,distance,pvalue,matching-hashes
0,./ncbi_genomes/JAAHHM01.fasta,./query_genomes/JAAHHP01.fasta,0.00024,0.0,990/1000
1,./ncbi_genomes/JAAHGB01.fasta,./query_genomes/JAAHGA01.fasta,0.00024,0.0,990/1000
2,./ncbi_genomes/JAAHGF01.fasta,./query_genomes/JAAHGH01.fasta,0.041285,0.0,266/1000
3,./ncbi_genomes/JAAHHP01.fasta,./query_genomes/JAAHHM01.fasta,0.00024,0.0,990/1000
4,./ncbi_genomes/AEPQ01.fasta,./query_genomes/JAAHHD01.fasta,0.000781,0.0,968/1000
5,./ncbi_genomes/JAAHGW01.fasta,./query_genomes/JAAHGU01.fasta,0.000192,0.0,992/1000
6,./ncbi_genomes/JAAHFR01.fasta,./query_genomes/JAAHFU01.fasta,0.219531,5.95595e-18,5/1000
7,./ncbi_genomes/JAAHGA01.fasta,./query_genomes/JAAHGB01.fasta,0.00024,0.0,990/1000
8,./ncbi_genomes/RCBZ01.fasta,./query_genomes/JAAHFW01.fasta,0.004959,0.0,820/1000
9,./ncbi_genomes/JAAHGU01.fasta,./query_genomes/JAAHGW01.fasta,0.000192,0.0,992/1000


In [8]:
mash_df.sort_values(by='distance')

Unnamed: 0,reference,query,distance,pvalue,matching-hashes
9,./ncbi_genomes/JAAHGU01.fasta,./query_genomes/JAAHGW01.fasta,0.000192,0.0,992/1000
5,./ncbi_genomes/JAAHGW01.fasta,./query_genomes/JAAHGU01.fasta,0.000192,0.0,992/1000
0,./ncbi_genomes/JAAHHM01.fasta,./query_genomes/JAAHHP01.fasta,0.00024,0.0,990/1000
1,./ncbi_genomes/JAAHGB01.fasta,./query_genomes/JAAHGA01.fasta,0.00024,0.0,990/1000
3,./ncbi_genomes/JAAHHP01.fasta,./query_genomes/JAAHHM01.fasta,0.00024,0.0,990/1000
7,./ncbi_genomes/JAAHGA01.fasta,./query_genomes/JAAHGB01.fasta,0.00024,0.0,990/1000
10,./ncbi_genomes/JAAHGB01.fasta,./query_genomes/JAAHGC01.fasta,0.00041,0.0,983/1000
4,./ncbi_genomes/AEPQ01.fasta,./query_genomes/JAAHHD01.fasta,0.000781,0.0,968/1000
8,./ncbi_genomes/RCBZ01.fasta,./query_genomes/JAAHFW01.fasta,0.004959,0.0,820/1000
16,./ncbi_genomes/JAAHHD01.fasta,./query_genomes/JAAHHC01.fasta,0.007596,0.0,743/1000


**Cateloging NCBI Genomes into Subsections**

In [3]:
from Bio import SeqIO
from cyanotools1.miscellaneous import split_string_at

def extract_ncbi_taxa(inputs_folder):
    input_list = glob.glob("%s*.fasta"%inputs_folder)
    taxa = {}
    for item in input_list:
        records = list(SeqIO.parse(item, "fasta"))
        strain = os.path.splitext(os.path.basename(item))[0]
        if records:
            if "TPA_asm" in str(records[0]):
                genera = split_string_at(genera,":",1)[0]
            else:
                genera = split_string_at(records[0].description," ",1)[1]
            if "Candidatus" in str(genera):
                taxa[strain] = str(genera).partition(' ')[2].partition(' ')[0]
            else:
                taxa[strain] = str(genera).partition(' ')[0].lstrip("[").rstrip("]")
    return taxa
            
taxa_dict = extract_ncbi_taxa("./refseq_cyano/")

In [4]:
taxa_dict

{'AADV02': 'Crocosphaera',
 'AANO01': 'Synechococcus',
 'AANP01': 'Synechococcus',
 'AAOK01': 'Synechococcus',
 'AATZ01': 'Synechococcus',
 'AAUA01': 'Synechococcus',
 'AAVU01': 'Lyngbya',
 'AAVW01': 'Nodularia',
 'AAXW01': 'Cyanothece',
 'AAZV01': 'Leptolyngbya',
 'ABRS01': 'Coleofasciculus',
 'ABRV01': 'Synechococcus',
 'ABSE01': 'Cyanobium',
 'ABYK01': 'Arthrospira',
 'ACDW01': 'Prochlorococcus',
 'ACSK03': 'Arthrospira',
 'ACYA01': 'Cylindrospermopsis',
 'ACYB01': 'Raphidiopsis',
 'ADXL01': 'Synechococcus',
 'ADXM01': 'Synechococcus',
 'AEPQ01': 'Moorea',
 'AESD01': 'Crocosphaera',
 'AFEJ01': 'Acaryochloris',
 'AFJC01': 'Microcoleus',
 'AFXD01': 'Arthrospira',
 'AGCR01': 'Tolypothrix',
 'AGIK01': 'Synechococcus',
 'AGIZ01': 'Fischerella',
 'AGJC02': 'Cyanothece',
 'AHGV01': 'Thermanaerovibrio',
 'AJHB01': 'Microcystis',
 'AJLJ01': 'Fischerella',
 'AJLK01': 'Fischerella',
 'AJLL01': 'Fischerella',
 'AJLM01': 'Chlorogloeopsis',
 'AJLN01': 'Chlorogloeopsis',
 'AJTX02': 'Prochlorothrix

In [6]:
import csv

with open('./mash-Linux64-v2.0/mash_taxonomy_1.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in taxa_dict.items():
       writer.writerow([key, value])

In [9]:
!cat ./mash-Linux64-v2.0/mash_taxonomy_1.csv >> ./mash-Linux64-v2.0/mash_taxonomy.csv

In [12]:
!tail ./mash-Linux64-v2.0/mash_taxonomy.csv

ADXL01,Synechococcus
BFAC01,Microcystis
PXQE01,Cyanobacteria
JNAR01,Prochlorococcus
ALPC01,Prochlorococcus
DLEF01,Phormidium
DEGZ01,Aphanizomenon
AOCI01,Microcystis
ASSJ01,Rubidibacter
LADK01,Trichodesmium


**Using MASH Distances to Catelog Cyanobiome Finished Genomes into Subsections**

In [5]:
def get_mash_df(inputs_folder,sketch):
    start_time = time.time()
    input_list = glob.glob("%s*.fasta"%inputs_folder)
    final_df = pd.DataFrame()
    for index,item in enumerate(input_list):
        scores = []
        cmd = "./mash-Linux64-v2.0/mash dist %s %s > ./mash-Linux64-v2.0/temp.txt"%(sketch,item)
        subprocess.call(cmd,shell=True)
        df = pd.read_csv("./mash-Linux64-v2.0/temp.txt",sep="\t",names=["reference","query","distance","pvalue","matching-hashes"])
        df = df[df.reference != item]
        row = df[df.distance == df.distance.min()]
        final_df = final_df.append(row[:1], ignore_index = True)
    print('\n' + "--- %s seconds ---" %(time.time()-start_time))
    return final_df
        
mash_df = get_mash_df("/home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/","./mash-Linux64-v2.0/ncbi_genomes.msh")

./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/2G6.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/3F7.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/2I2.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/3H3.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/2F4.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.ms

./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/2I4.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/2F8.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/1H6.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/4E2.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/1F2.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.ms

./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/3I9.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/4E8.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/2D8.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/1I2.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/1H4.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.ms

./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/3A7.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/1A2.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/4F4.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/3B3.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/3H1.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.ms

In [1]:
taxa_dict2 = {}

for i,r in mash_df.iterrows():
    ncbi_taxa = os.path.splitext(os.path.basename((r[0])))[0].partition('.')[0]
    strain = os.path.splitext(os.path.basename((r[1])))[0].partition('_')[0]
    taxa_dict2[strain] = taxa_dict[ncbi_taxa]

NameError: name 'mash_df' is not defined

In [1]:
taxa_dict2["3B5"]

NameError: name 'taxa_dict2' is not defined

In [11]:
import csv

with open('./mash-Linux64-v2.0/mash_taxonomy.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in taxa_dict2.items():
       writer.writerow([key, value])