**Importing General Dependencies**

In [1]:
import numpy as np
import pandas as pd
import glob
import subprocess
import os
import time
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()

# Notebook 6 - MASH Analysis for Marine Cyanobacterial Genomes

This notebook uses MASH to perform a whole genome comparison between our 24 marine cyanbacteria and the other published cyanobacteria from NCBI

**Installing MASH**

In [2]:
if os.path.exists("./mash-OSX64-v2.2/mash"):
    print("MASH already downloaded")
else:
    !wget https://github.com/marbl/Mash/releases/download/v2.2/mash-OSX64-v2.2.tar
    !tar xopf mash-OSX64-v2.2.tar; rm mash-OSX64-v2.2.tar

MASH already downloaded


**Sketching Finished Cyanobiome and NCBI Datasets**

This step is going to create the minhashes from the references to be probed in the soon to be queried genomes

In [3]:
start_time = time.time()

if os.path.exists("./mash-OSX64-v2.2/ncbi_genomes.msh"):
    print("Sketch already exists")
else:
    !./mash-OSX64-v2.2/mash sketch -o ./mash-OSX64-v2.2/ncbi_genomes ./ncbi_genomes/*.fasta
    
print('\n' + "--- %s seconds ---" %(time.time()-start_time))

Sketch already exists

--- 0.00020003318786621094 seconds ---


**Subsetting the 24 Marine Cyanobacterial Genomes**

In [4]:
query_genomes = ['JAAHII01','JAAHIH01','JAAHGF01','JAAHHC01','JAAHHD01','JAAHFN01','JAAHGM01','JAAHGB01','JAAHFW01','JAAHFZ01','JAAHFP01','JAAHGH01','JAAHGC01','JAAHGA01','JAAHHJ01','JAAHGO01','JAAHGU01','JAAHGW01','JAAHGS01','JAAHHM01','JAAHHP01','JAAHHN01','JAAHGL01','JAAHFU01']

!mkdir ./query_genomes/

mkdir: ./query_genomes/: File exists


In [15]:
glob_list = glob.glob('./ncbi_genomes/*fasta')

for item in glob_list:
    name = os.path.basename(item).split('.')[0]
    if name in query_genomes:
        cmd = 'cp %s ./query_genomes/'%item
        subprocess.call(cmd,shell=True)

**Running MASH and Creating the MASH Dataframe with Similarity Scores**

In [5]:
def get_mash_df(inputs_folder,sketch):
    start_time = time.time()
    input_list = glob.glob("%s*.fasta"%inputs_folder)
    final_df = pd.DataFrame()
    for index,item in enumerate(input_list):
        scores = []
        ncbi_item = './ncbi_genomes/%s'%(os.path.basename(item))
        print(item,ncbi_item)
        cmd = "./mash-OSX64-v2.2/mash dist %s %s > ./mash-OSX64-v2.2/temp.txt"%(sketch,item)
        subprocess.call(cmd,shell=True)
        df = pd.read_csv("./mash-OSX64-v2.2/temp.txt",sep="\t",names=["reference","query","distance","pvalue","matching-hashes"])
        df = df[df.reference != ncbi_item]
        row = df[df.distance == df.distance.min()]
        final_df = final_df.append(row[:1], ignore_index = True)
    print('\n' + "--- %s seconds ---" %(time.time()-start_time))
    return final_df
        
mash_df = get_mash_df("./query_genomes/",
                      "./mash-OSX64-v2.2/ncbi_genomes.msh")

./query_genomes/JAAHHP01.fasta ./ncbi_genomes/JAAHHP01.fasta
./query_genomes/JAAHGA01.fasta ./ncbi_genomes/JAAHGA01.fasta
./query_genomes/JAAHGH01.fasta ./ncbi_genomes/JAAHGH01.fasta
./query_genomes/JAAHHM01.fasta ./ncbi_genomes/JAAHHM01.fasta
./query_genomes/JAAHHD01.fasta ./ncbi_genomes/JAAHHD01.fasta
./query_genomes/JAAHGU01.fasta ./ncbi_genomes/JAAHGU01.fasta
./query_genomes/JAAHFU01.fasta ./ncbi_genomes/JAAHFU01.fasta
./query_genomes/JAAHGB01.fasta ./ncbi_genomes/JAAHGB01.fasta
./query_genomes/JAAHFW01.fasta ./ncbi_genomes/JAAHFW01.fasta
./query_genomes/JAAHGW01.fasta ./ncbi_genomes/JAAHGW01.fasta
./query_genomes/JAAHGC01.fasta ./ncbi_genomes/JAAHGC01.fasta
./query_genomes/JAAHHN01.fasta ./ncbi_genomes/JAAHHN01.fasta
./query_genomes/JAAHFZ01.fasta ./ncbi_genomes/JAAHFZ01.fasta
./query_genomes/JAAHGO01.fasta ./ncbi_genomes/JAAHGO01.fasta
./query_genomes/JAAHGF01.fasta ./ncbi_genomes/JAAHGF01.fasta
./query_genomes/JAAHGS01.fasta ./ncbi_genomes/JAAHGS01.fasta
./query_genomes/JAAHHC01

In [6]:
mash_df

Unnamed: 0,reference,query,distance,pvalue,matching-hashes
0,./ncbi_genomes/JAAHHM01.fasta,./query_genomes/JAAHHP01.fasta,0.00024,0.0,990/1000
1,./ncbi_genomes/JAAHGB01.fasta,./query_genomes/JAAHGA01.fasta,0.00024,0.0,990/1000
2,./ncbi_genomes/JAAHGF01.fasta,./query_genomes/JAAHGH01.fasta,0.041285,0.0,266/1000
3,./ncbi_genomes/JAAHHP01.fasta,./query_genomes/JAAHHM01.fasta,0.00024,0.0,990/1000
4,./ncbi_genomes/AEPQ01.fasta,./query_genomes/JAAHHD01.fasta,0.000781,0.0,968/1000
5,./ncbi_genomes/JAAHGW01.fasta,./query_genomes/JAAHGU01.fasta,0.000192,0.0,992/1000
6,./ncbi_genomes/JAAHFR01.fasta,./query_genomes/JAAHFU01.fasta,0.219531,5.95595e-18,5/1000
7,./ncbi_genomes/JAAHGA01.fasta,./query_genomes/JAAHGB01.fasta,0.00024,0.0,990/1000
8,./ncbi_genomes/RCBZ01.fasta,./query_genomes/JAAHFW01.fasta,0.004959,0.0,820/1000
9,./ncbi_genomes/JAAHGU01.fasta,./query_genomes/JAAHGW01.fasta,0.000192,0.0,992/1000


In [8]:
mash_df.sort_values(by='distance')

Unnamed: 0,reference,query,distance,pvalue,matching-hashes
9,./ncbi_genomes/JAAHGU01.fasta,./query_genomes/JAAHGW01.fasta,0.000192,0.0,992/1000
5,./ncbi_genomes/JAAHGW01.fasta,./query_genomes/JAAHGU01.fasta,0.000192,0.0,992/1000
0,./ncbi_genomes/JAAHHM01.fasta,./query_genomes/JAAHHP01.fasta,0.00024,0.0,990/1000
1,./ncbi_genomes/JAAHGB01.fasta,./query_genomes/JAAHGA01.fasta,0.00024,0.0,990/1000
3,./ncbi_genomes/JAAHHP01.fasta,./query_genomes/JAAHHM01.fasta,0.00024,0.0,990/1000
7,./ncbi_genomes/JAAHGA01.fasta,./query_genomes/JAAHGB01.fasta,0.00024,0.0,990/1000
10,./ncbi_genomes/JAAHGB01.fasta,./query_genomes/JAAHGC01.fasta,0.00041,0.0,983/1000
4,./ncbi_genomes/AEPQ01.fasta,./query_genomes/JAAHHD01.fasta,0.000781,0.0,968/1000
8,./ncbi_genomes/RCBZ01.fasta,./query_genomes/JAAHFW01.fasta,0.004959,0.0,820/1000
16,./ncbi_genomes/JAAHHD01.fasta,./query_genomes/JAAHHC01.fasta,0.007596,0.0,743/1000
