# MASH Analysis for Finished and NCBI Cyanobacterial Genomes

### Outline



**Importing General Dependencies**

In [1]:
import glob
import io
import os
import re
import shutil
import subprocess
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
sns.set()

**Installing MASH**

In [4]:
# Download and unpack the MASH v2.2 macOS (OSX64) release only when the binary
# is not already present next to the notebook.
if os.path.exists("./mash-OSX64-v2.2/mash"):
    print("MASH already downloaded")
else:
    # IPython shell escape: fetch the release tarball from GitHub.
    !wget https://github.com/marbl/Mash/releases/download/v2.2/mash-OSX64-v2.2.tar
    # Extract the archive, then delete the tarball. Because the two commands are
    # joined with `;`, the `rm` runs even if `tar` fails.
    !tar xopf mash-OSX64-v2.2.tar; rm mash-OSX64-v2.2.tar

--2020-08-25 11:17:17--  https://github.com/marbl/Mash/releases/download/v2.2/mash-OSX64-v2.2.tar
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/41320247/19024a80-afb6-11e9-8a80-08bb54253c74?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200825%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200825T181719Z&X-Amz-Expires=300&X-Amz-Signature=fcf3a94b0ff17fd2617a317f6d7b060e7f04b0e2769c30004ba55e206ed4c37b&X-Amz-SignedHeaders=host&actor_id=0&repo_id=41320247&response-content-disposition=attachment%3B%20filename%3Dmash-OSX64-v2.2.tar&response-content-type=application%2Foctet-stream [following]
--2020-08-25 11:17:19--  https://github-production-release-asset-2e65be.s3.amazonaws.com/41320247/19024a80-afb6-11e9-8a80-08bb54253c74?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Cre

**Sketching Finished Cyanobiome and NCBI Datasets**

This step creates the MinHash sketches of the reference genomes; the query genomes are compared against these sketches in the steps below.

In [5]:
# Record the wall-clock start so the elapsed time of this cell can be reported.
start_time = time.time()

# Skip the sketching step when the sketch file already exists; otherwise
# sketch every FASTA file in ./ncbi_genomes/ into a single output (the -o
# prefix corresponds to the ncbi_genomes.msh file checked for above).
if os.path.exists("./mash-OSX64-v2.2/ncbi_genomes.msh"):
    print("Sketch already exists")
else:
    !./mash-OSX64-v2.2/mash sketch -o ./mash-OSX64-v2.2/ncbi_genomes ./ncbi_genomes/*.fasta
    
# Elapsed seconds for this cell, including the case where the sketch was reused.
print('\n' + "--- %s seconds ---" %(time.time()-start_time))

Sketching ./ncbi_genomes/AADV02.fasta...
Sketching ./ncbi_genomes/AANO01.fasta...
Sketching ./ncbi_genomes/AANP01.fasta...
Sketching ./ncbi_genomes/AAOK01.fasta...
Sketching ./ncbi_genomes/AATZ01.fasta...
Sketching ./ncbi_genomes/AAUA01.fasta...
Sketching ./ncbi_genomes/AAVU01.fasta...
Sketching ./ncbi_genomes/AAVW01.fasta...
Sketching ./ncbi_genomes/AAXW01.fasta...
Sketching ./ncbi_genomes/AAZV01.fasta...
Sketching ./ncbi_genomes/ABRS01.fasta...
Sketching ./ncbi_genomes/ABRV01.fasta...
Sketching ./ncbi_genomes/ABSE01.fasta...
Sketching ./ncbi_genomes/ABYK01.fasta...
Sketching ./ncbi_genomes/ACDW01.fasta...
Sketching ./ncbi_genomes/ACSK03.fasta...
Sketching ./ncbi_genomes/ACYA01.fasta...
Sketching ./ncbi_genomes/ACYB01.fasta...
Sketching ./ncbi_genomes/ADXM01.fasta...
Sketching ./ncbi_genomes/AEPQ01.fasta...
Sketching ./ncbi_genomes/AESD01.fasta...
Sketching ./ncbi_genomes/AFEJ01.fasta...
Sketching ./ncbi_genomes/AFJC01.fasta...
Sketching ./ncbi_genomes/AFXD01.fasta...
Sketching ./ncbi

Sketching ./ncbi_genomes/CACARF01.fasta...
Sketching ./ncbi_genomes/CACARV01.fasta...
Sketching ./ncbi_genomes/CACASR01.fasta...
Sketching ./ncbi_genomes/CACAST01.fasta...
Sketching ./ncbi_genomes/CACATM01.fasta...
Sketching ./ncbi_genomes/CACATU01.fasta...
Sketching ./ncbi_genomes/CACATZ01.fasta...
Sketching ./ncbi_genomes/CACAUG01.fasta...
Sketching ./ncbi_genomes/CACAVZ01.fasta...
Sketching ./ncbi_genomes/CACAWW01.fasta...
Sketching ./ncbi_genomes/CACAXN01.fasta...
Sketching ./ncbi_genomes/CACAXR01.fasta...
Sketching ./ncbi_genomes/CACAXS01.fasta...
Sketching ./ncbi_genomes/CACAYO01.fasta...
Sketching ./ncbi_genomes/CACAYX01.fasta...
Sketching ./ncbi_genomes/CACAZF01.fasta...
Sketching ./ncbi_genomes/CACBAW01.fasta...
Sketching ./ncbi_genomes/CACBBE01.fasta...
Sketching ./ncbi_genomes/CACBBH01.fasta...
Sketching ./ncbi_genomes/CACBBI01.fasta...
Sketching ./ncbi_genomes/CACBBJ01.fasta...
Sketching ./ncbi_genomes/CACBCR01.fasta...
Sketching ./ncbi_genomes/CACBDL01.fasta...
Sketching .

Sketching ./ncbi_genomes/CACJOV01.fasta...
Sketching ./ncbi_genomes/CACJOW01.fasta...
Sketching ./ncbi_genomes/CACJOZ01.fasta...
Sketching ./ncbi_genomes/CACJPB01.fasta...
Sketching ./ncbi_genomes/CACJQY01.fasta...
Sketching ./ncbi_genomes/CACJSF01.fasta...
Sketching ./ncbi_genomes/CACJTA01.fasta...
Sketching ./ncbi_genomes/CACJTI01.fasta...
Sketching ./ncbi_genomes/CACJUL01.fasta...
Sketching ./ncbi_genomes/CACJVD01.fasta...
Sketching ./ncbi_genomes/CACJVV01.fasta...
Sketching ./ncbi_genomes/CACJWO01.fasta...
Sketching ./ncbi_genomes/CACJYM01.fasta...
Sketching ./ncbi_genomes/CACJYO01.fasta...
Sketching ./ncbi_genomes/CACJZI01.fasta...
Sketching ./ncbi_genomes/CACKAH01.fasta...
Sketching ./ncbi_genomes/CACKBB01.fasta...
Sketching ./ncbi_genomes/CACKBW01.fasta...
Sketching ./ncbi_genomes/CACKEF01.fasta...
Sketching ./ncbi_genomes/CACKEW01.fasta...
Sketching ./ncbi_genomes/CACKFN01.fasta...
Sketching ./ncbi_genomes/CACKFR01.fasta...
Sketching ./ncbi_genomes/CACKFT01.fasta...
Sketching .

Sketching ./ncbi_genomes/CVSY01.fasta...
Sketching ./ncbi_genomes/CVSZ01.fasta...
Sketching ./ncbi_genomes/CZCT02.fasta...
Sketching ./ncbi_genomes/CZCZ01.fasta...
Sketching ./ncbi_genomes/CZDF01.fasta...
Sketching ./ncbi_genomes/DBAX01.fasta...
Sketching ./ncbi_genomes/DBCX01.fasta...
Sketching ./ncbi_genomes/DBIX01.fasta...
Sketching ./ncbi_genomes/DBVD01.fasta...
Sketching ./ncbi_genomes/DBVH01.fasta...
Sketching ./ncbi_genomes/DBXD01.fasta...
Sketching ./ncbi_genomes/DBYM01.fasta...
Sketching ./ncbi_genomes/DCNZ01.fasta...
Sketching ./ncbi_genomes/DCQC01.fasta...
Sketching ./ncbi_genomes/DCSN01.fasta...
Sketching ./ncbi_genomes/DDLS01.fasta...
Sketching ./ncbi_genomes/DDVE01.fasta...
Sketching ./ncbi_genomes/DEDY01.fasta...
Sketching ./ncbi_genomes/DEEH01.fasta...
Sketching ./ncbi_genomes/DEEN01.fasta...
Sketching ./ncbi_genomes/DEFJ01.fasta...
Sketching ./ncbi_genomes/DEGT01.fasta...
Sketching ./ncbi_genomes/DEGZ01.fasta...
Sketching ./ncbi_genomes/DESS01.fasta...
Sketching ./ncbi

Sketching ./ncbi_genomes/JAAHGW01.fasta...
Sketching ./ncbi_genomes/JAAHGX01.fasta...
Sketching ./ncbi_genomes/JAAHGY01.fasta...
Sketching ./ncbi_genomes/JAAHGZ01.fasta...
Sketching ./ncbi_genomes/JAAHHA01.fasta...
Sketching ./ncbi_genomes/JAAHHC01.fasta...
Sketching ./ncbi_genomes/JAAHHD01.fasta...
Sketching ./ncbi_genomes/JAAHHE01.fasta...
Sketching ./ncbi_genomes/JAAHHF01.fasta...
Sketching ./ncbi_genomes/JAAHHG01.fasta...
Sketching ./ncbi_genomes/JAAHHH01.fasta...
Sketching ./ncbi_genomes/JAAHHI01.fasta...
Sketching ./ncbi_genomes/JAAHHJ01.fasta...
Sketching ./ncbi_genomes/JAAHHK01.fasta...
Sketching ./ncbi_genomes/JAAHHL01.fasta...
Sketching ./ncbi_genomes/JAAHHM01.fasta...
Sketching ./ncbi_genomes/JAAHHN01.fasta...
Sketching ./ncbi_genomes/JAAHHO01.fasta...
Sketching ./ncbi_genomes/JAAHHP01.fasta...
Sketching ./ncbi_genomes/JAAHHQ01.fasta...
Sketching ./ncbi_genomes/JAAHHR01.fasta...
Sketching ./ncbi_genomes/JAAHHS01.fasta...
Sketching ./ncbi_genomes/JAAHHT01.fasta...
Sketching .

Sketching ./ncbi_genomes/LIRN01.fasta...
Sketching ./ncbi_genomes/LIRO01.fasta...
Sketching ./ncbi_genomes/LIUQ01.fasta...
Sketching ./ncbi_genomes/LJOP01.fasta...
Sketching ./ncbi_genomes/LJOQ01.fasta...
Sketching ./ncbi_genomes/LJOS01.fasta...
Sketching ./ncbi_genomes/LJOT01.fasta...
Sketching ./ncbi_genomes/LJOU01.fasta...
Sketching ./ncbi_genomes/LJOV01.fasta...
Sketching ./ncbi_genomes/LJOW01.fasta...
Sketching ./ncbi_genomes/LJOX01.fasta...
Sketching ./ncbi_genomes/LJOY01.fasta...
Sketching ./ncbi_genomes/LJZR01.fasta...
Sketching ./ncbi_genomes/LJZT01.fasta...
Sketching ./ncbi_genomes/LMTZ01.fasta...
Sketching ./ncbi_genomes/LMVE01.fasta...
Sketching ./ncbi_genomes/LNAA02.fasta...
Sketching ./ncbi_genomes/LNDC01.fasta...
Sketching ./ncbi_genomes/LSSA01.fasta...
Sketching ./ncbi_genomes/LSYZ01.fasta...
Sketching ./ncbi_genomes/LSZA01.fasta...
Sketching ./ncbi_genomes/LTEC01.fasta...
Sketching ./ncbi_genomes/LUBZ01.fasta...
Sketching ./ncbi_genomes/LUFH01.fasta...
Sketching ./ncbi

Sketching ./ncbi_genomes/NQMD01.fasta...
Sketching ./ncbi_genomes/NQME01.fasta...
Sketching ./ncbi_genomes/NQMF01.fasta...
Sketching ./ncbi_genomes/NRIU01.fasta...
Sketching ./ncbi_genomes/NRQW01.fasta...
Sketching ./ncbi_genomes/NRTA01.fasta...
Sketching ./ncbi_genomes/NSHF01.fasta...
Sketching ./ncbi_genomes/NSHT01.fasta...
Sketching ./ncbi_genomes/NSII01.fasta...
Sketching ./ncbi_genomes/NTFS01.fasta...
Sketching ./ncbi_genomes/NXIB02.fasta...
Sketching ./ncbi_genomes/NYUU01.fasta...
Sketching ./ncbi_genomes/NYXI01.fasta...
Sketching ./ncbi_genomes/NYZI01.fasta...
Sketching ./ncbi_genomes/NZED01.fasta...
Sketching ./ncbi_genomes/NZHI01.fasta...
Sketching ./ncbi_genomes/NZHY01.fasta...
Sketching ./ncbi_genomes/NZJI01.fasta...
Sketching ./ncbi_genomes/NZKA01.fasta...
Sketching ./ncbi_genomes/NZPI01.fasta...
Sketching ./ncbi_genomes/NZVB01.fasta...
Sketching ./ncbi_genomes/NZXF01.fasta...
Sketching ./ncbi_genomes/PAEP01.fasta...
Sketching ./ncbi_genomes/PAEU01.fasta...
Sketching ./ncbi

Sketching ./ncbi_genomes/QBZT01.fasta...
Sketching ./ncbi_genomes/QBZU01.fasta...
Sketching ./ncbi_genomes/QBZV01.fasta...
Sketching ./ncbi_genomes/QCAE01.fasta...
Sketching ./ncbi_genomes/QCAF01.fasta...
Sketching ./ncbi_genomes/QCAG01.fasta...
Sketching ./ncbi_genomes/QCAH01.fasta...
Sketching ./ncbi_genomes/QCAJ01.fasta...
Sketching ./ncbi_genomes/QCAK01.fasta...
Sketching ./ncbi_genomes/QCAL01.fasta...
Sketching ./ncbi_genomes/QCAN01.fasta...
Sketching ./ncbi_genomes/QCAQ01.fasta...
Sketching ./ncbi_genomes/QCAR01.fasta...
Sketching ./ncbi_genomes/QCBD01.fasta...
Sketching ./ncbi_genomes/QCBE01.fasta...
Sketching ./ncbi_genomes/QCBF01.fasta...
Sketching ./ncbi_genomes/QCBG01.fasta...
Sketching ./ncbi_genomes/QCBH01.fasta...
Sketching ./ncbi_genomes/QCBI01.fasta...
Sketching ./ncbi_genomes/QCBJ01.fasta...
Sketching ./ncbi_genomes/QCBK01.fasta...
Sketching ./ncbi_genomes/QCBL01.fasta...
Sketching ./ncbi_genomes/QCBM01.fasta...
Sketching ./ncbi_genomes/QCBN01.fasta...
Sketching ./ncbi

Sketching ./ncbi_genomes/QCJN01.fasta...
Sketching ./ncbi_genomes/QCJO01.fasta...
Sketching ./ncbi_genomes/QCJP01.fasta...
Sketching ./ncbi_genomes/QCJQ01.fasta...
Sketching ./ncbi_genomes/QCJR01.fasta...
Sketching ./ncbi_genomes/QCJS01.fasta...
Sketching ./ncbi_genomes/QCJT01.fasta...
Sketching ./ncbi_genomes/QCJU01.fasta...
Sketching ./ncbi_genomes/QCJV01.fasta...
Sketching ./ncbi_genomes/QCJW01.fasta...
Sketching ./ncbi_genomes/QCJX01.fasta...
Sketching ./ncbi_genomes/QCJY01.fasta...
Sketching ./ncbi_genomes/QCJZ01.fasta...
Sketching ./ncbi_genomes/QCKA01.fasta...
Sketching ./ncbi_genomes/QCKB01.fasta...
Sketching ./ncbi_genomes/QCKC01.fasta...
Sketching ./ncbi_genomes/QCKD01.fasta...
Sketching ./ncbi_genomes/QCKE01.fasta...
Sketching ./ncbi_genomes/QCKF01.fasta...
Sketching ./ncbi_genomes/QCKG01.fasta...
Sketching ./ncbi_genomes/QCKH01.fasta...
Sketching ./ncbi_genomes/QCKI01.fasta...
Sketching ./ncbi_genomes/QCKJ01.fasta...
Sketching ./ncbi_genomes/QCKK01.fasta...
Sketching ./ncbi

Sketching ./ncbi_genomes/QCTX01.fasta...
Sketching ./ncbi_genomes/QCTY01.fasta...
Sketching ./ncbi_genomes/QCTZ01.fasta...
Sketching ./ncbi_genomes/QCUA01.fasta...
Sketching ./ncbi_genomes/QCUB01.fasta...
Sketching ./ncbi_genomes/QCUC01.fasta...
Sketching ./ncbi_genomes/QCUD01.fasta...
Sketching ./ncbi_genomes/QCUF01.fasta...
Sketching ./ncbi_genomes/QCUG01.fasta...
Sketching ./ncbi_genomes/QCUH01.fasta...
Sketching ./ncbi_genomes/QCUI01.fasta...
Sketching ./ncbi_genomes/QCUJ01.fasta...
Sketching ./ncbi_genomes/QCUK01.fasta...
Sketching ./ncbi_genomes/QCUL01.fasta...
Sketching ./ncbi_genomes/QCUM01.fasta...
Sketching ./ncbi_genomes/QCUN01.fasta...
Sketching ./ncbi_genomes/QCUO01.fasta...
Sketching ./ncbi_genomes/QCUP01.fasta...
Sketching ./ncbi_genomes/QCUQ01.fasta...
Sketching ./ncbi_genomes/QCUR01.fasta...
Sketching ./ncbi_genomes/QCUS01.fasta...
Sketching ./ncbi_genomes/QCUT01.fasta...
Sketching ./ncbi_genomes/QCUU01.fasta...
Sketching ./ncbi_genomes/QCUV01.fasta...
Sketching ./ncbi

In [14]:
# Accession prefixes (FASTA file stems) of the genomes that are pulled out of
# ./ncbi_genomes/ and used as queries against the NCBI sketch.
query_genomes = [
    'JAAHII01', 'JAAHIH01', 'JAAHGF01', 'JAAHHC01', 'JAAHHD01', 'JAAHFN01',
    'JAAHGM01', 'JAAHGB01', 'JAAHFW01', 'JAAHFZ01', 'JAAHFP01', 'JAAHGH01',
    'JAAHGC01', 'JAAHGA01', 'JAAHHJ01', 'JAAHGO01', 'JAAHGU01', 'JAAHGW01',
    'JAAHGS01', 'JAAHHM01', 'JAAHHP01', 'JAAHHN01', 'JAAHGL01', 'JAAHFU01',
]

# `!mkdir` errors out when the folder already exists, which breaks
# "Restart & Run All"; makedirs with exist_ok=True makes the cell re-runnable.
os.makedirs('./query_genomes/', exist_ok=True)

In [15]:
# Copy every NCBI FASTA whose file stem is listed in `query_genomes` into
# ./query_genomes/.
glob_list = glob.glob('./ncbi_genomes/*fasta')

for item in glob_list:
    # File stem up to the first dot, e.g. "JAAHII01" for "JAAHII01.fasta".
    name = os.path.basename(item).split('.')[0]
    if name in query_genomes:
        # shutil.copy replaces the shell `cp` string: it needs no quoting for
        # unusual file names and raises on failure instead of silently
        # returning a non-zero exit status.
        shutil.copy(item, './query_genomes/')

In [22]:
def get_mash_df(inputs_folder, sketch, mash_bin="./mash-OSX64-v2.2/mash",
                reference_dir="./ncbi_genomes/"):
    """Return the closest non-self reference in a Mash sketch for each query FASTA.

    For every ``*.fasta`` file found under ``inputs_folder`` this runs
    ``mash dist <sketch> <query>``, drops the reference row that has the same
    file name as the query inside ``reference_dir`` (the query's own copy in
    the reference set), and keeps the first row with the minimum distance.

    Parameters
    ----------
    inputs_folder : str
        Folder prefix (including the trailing slash) globbed for ``*.fasta``.
    sketch : str
        Path to the Mash sketch (``.msh``) holding the references.
    mash_bin : str, optional
        Path to the ``mash`` executable.
    reference_dir : str, optional
        Folder prefix (including the trailing slash) under which the
        references were sketched; used to recognise the query's own entry.

    Returns
    -------
    pandas.DataFrame
        One row per query with columns ``reference``, ``query``, ``distance``,
        ``pvalue`` and ``matching-hashes``; empty (same columns) when nothing
        was found.

    Raises
    ------
    subprocess.CalledProcessError
        If ``mash dist`` exits with a non-zero status.
    """
    columns = ["reference", "query", "distance", "pvalue", "matching-hashes"]
    start_time = time.time()
    best_hits = []
    for item in glob.glob("%s*.fasta" % inputs_folder):
        ncbi_item = "%s%s" % (reference_dir, os.path.basename(item))
        print(item, ncbi_item)
        # Capture stdout directly instead of redirecting through a shared temp
        # file, and pass arguments as a list so no shell quoting is involved.
        result = subprocess.run([mash_bin, "dist", sketch, item],
                                stdout=subprocess.PIPE,
                                universal_newlines=True,
                                check=True)
        if not result.stdout.strip():
            continue
        df = pd.read_csv(io.StringIO(result.stdout), sep="\t", names=columns)
        df = df[df.reference != ncbi_item]
        # First row at the minimum distance; empty when no reference remains.
        nearest = df[df.distance == df.distance.min()].head(1)
        if not nearest.empty:
            best_hits.append(nearest)
    # DataFrame.append was removed in pandas 2.0; concatenate once at the end.
    if best_hits:
        final_df = pd.concat(best_hits, ignore_index=True)
    else:
        final_df = pd.DataFrame(columns=columns)
    print('\n' + "--- %s seconds ---" % (time.time() - start_time))
    return final_df
        
# Nearest NCBI reference for each genome in ./query_genomes/.
mash_df = get_mash_df(inputs_folder="./query_genomes/", sketch="./mash-OSX64-v2.2/ncbi_genomes.msh")

./query_genomes/JAAHHP01.fasta ./ncbi_genomes/JAAHHP01.fasta
./query_genomes/JAAHGA01.fasta ./ncbi_genomes/JAAHGA01.fasta
./query_genomes/JAAHGH01.fasta ./ncbi_genomes/JAAHGH01.fasta
./query_genomes/JAAHHM01.fasta ./ncbi_genomes/JAAHHM01.fasta
./query_genomes/JAAHHD01.fasta ./ncbi_genomes/JAAHHD01.fasta
./query_genomes/JAAHGU01.fasta ./ncbi_genomes/JAAHGU01.fasta
./query_genomes/JAAHFU01.fasta ./ncbi_genomes/JAAHFU01.fasta
./query_genomes/JAAHGB01.fasta ./ncbi_genomes/JAAHGB01.fasta
./query_genomes/JAAHFW01.fasta ./ncbi_genomes/JAAHFW01.fasta
./query_genomes/JAAHGW01.fasta ./ncbi_genomes/JAAHGW01.fasta
./query_genomes/JAAHGC01.fasta ./ncbi_genomes/JAAHGC01.fasta
./query_genomes/JAAHHN01.fasta ./ncbi_genomes/JAAHHN01.fasta
./query_genomes/JAAHFZ01.fasta ./ncbi_genomes/JAAHFZ01.fasta
./query_genomes/JAAHGO01.fasta ./ncbi_genomes/JAAHGO01.fasta
./query_genomes/JAAHGF01.fasta ./ncbi_genomes/JAAHGF01.fasta
./query_genomes/JAAHGS01.fasta ./ncbi_genomes/JAAHGS01.fasta
./query_genomes/JAAHHC01

In [34]:
# Keep only the queries whose nearest reference lies beyond a Mash distance of 0.05.
beyond_cutoff = mash_df['distance'] > 0.05
mash_df_cutoffs = mash_df.loc[beyond_cutoff]

mash_df_cutoffs

Unnamed: 0,reference,query,distance,pvalue,matching-hashes
6,./ncbi_genomes/JAAHFR01.fasta,./query_genomes/JAAHFU01.fasta,0.219531,5.95595e-18,5/1000
11,./ncbi_genomes/ALVQ01.fasta,./query_genomes/JAAHHN01.fasta,0.23011,8.77756e-15,4/1000
13,./ncbi_genomes/JAAHGP01.fasta,./query_genomes/JAAHGO01.fasta,0.05153,0.0,204/1000
15,./ncbi_genomes/JAAHFQ01.fasta,./query_genomes/JAAHGS01.fasta,0.098138,1.15775e-306,68/1000
17,./ncbi_genomes/RFFC01.fasta,./query_genomes/JAAHFN01.fasta,0.23011,2.06374e-14,4/1000
20,./ncbi_genomes/JAAHFM01.fasta,./query_genomes/JAAHGL01.fasta,0.074432,0.0,117/1000
23,./ncbi_genomes/QVFV01.fasta,./query_genomes/JAAHII01.fasta,0.243761,7.61738e-11,3/1000


In [37]:
# Fraction of query genomes above the 0.05 distance cutoff: 7 rows in the
# filtered table out of the 24 accessions listed in `query_genomes`.
7/24

0.2916666666666667

**Cataloging NCBI Genomes into Subsections**

In [3]:
from Bio import SeqIO
from cyanotools1.miscellaneous import split_string_at

def extract_ncbi_taxa(inputs_folder):
    """Map each FASTA file stem to the genus named in its first record header.

    Parameters
    ----------
    inputs_folder : str
        Folder prefix (including the trailing slash) globbed for ``*.fasta``.

    Returns
    -------
    dict
        ``{file stem: genus}``. Files that contain no record are skipped.
    """
    tpa_marker = "TPA_asm:"
    taxa = {}
    for item in glob.glob("%s*.fasta" % inputs_folder):
        strain = os.path.splitext(os.path.basename(item))[0]
        # Only the first header is inspected, so stop after one record instead
        # of materialising every sequence of the assembly in memory.
        with open(item) as handle:
            first_record = next(SeqIO.parse(handle, "fasta"), None)
        if first_record is None:
            continue
        description = first_record.description
        if tpa_marker in description:
            # The previous code split `genera` here before it had been
            # assigned, raising UnboundLocalError on the first file (or reusing
            # the previous file's value afterwards).
            # NOTE(review): assumes headers look like
            # "<id> TPA_asm: <Genus> <species> ..." - confirm against the data.
            genera = description.partition(tpa_marker)[2].strip()
        else:
            # Drop the leading sequence identifier, keep the organism text.
            genera = split_string_at(description, " ", 1)[1]
        if "Candidatus" in str(genera):
            # "Candidatus <Genus> ..." -> take the second word.
            taxa[strain] = str(genera).partition(' ')[2].partition(' ')[0]
        else:
            # First word, without the square brackets some names are wrapped in.
            taxa[strain] = str(genera).partition(' ')[0].lstrip("[").rstrip("]")
    return taxa
            
# Genus lookup for every FASTA file under ./refseq_cyano/.
taxa_dict = extract_ncbi_taxa(inputs_folder="./refseq_cyano/")

In [4]:
# Display the full file-stem -> genus mapping built by extract_ncbi_taxa.
taxa_dict

{'AADV02': 'Crocosphaera',
 'AANO01': 'Synechococcus',
 'AANP01': 'Synechococcus',
 'AAOK01': 'Synechococcus',
 'AATZ01': 'Synechococcus',
 'AAUA01': 'Synechococcus',
 'AAVU01': 'Lyngbya',
 'AAVW01': 'Nodularia',
 'AAXW01': 'Cyanothece',
 'AAZV01': 'Leptolyngbya',
 'ABRS01': 'Coleofasciculus',
 'ABRV01': 'Synechococcus',
 'ABSE01': 'Cyanobium',
 'ABYK01': 'Arthrospira',
 'ACDW01': 'Prochlorococcus',
 'ACSK03': 'Arthrospira',
 'ACYA01': 'Cylindrospermopsis',
 'ACYB01': 'Raphidiopsis',
 'ADXL01': 'Synechococcus',
 'ADXM01': 'Synechococcus',
 'AEPQ01': 'Moorea',
 'AESD01': 'Crocosphaera',
 'AFEJ01': 'Acaryochloris',
 'AFJC01': 'Microcoleus',
 'AFXD01': 'Arthrospira',
 'AGCR01': 'Tolypothrix',
 'AGIK01': 'Synechococcus',
 'AGIZ01': 'Fischerella',
 'AGJC02': 'Cyanothece',
 'AHGV01': 'Thermanaerovibrio',
 'AJHB01': 'Microcystis',
 'AJLJ01': 'Fischerella',
 'AJLK01': 'Fischerella',
 'AJLL01': 'Fischerella',
 'AJLM01': 'Chlorogloeopsis',
 'AJLN01': 'Chlorogloeopsis',
 'AJTX02': 'Prochlorothrix

In [6]:
import csv

# Write the file-stem -> genus mapping as two-column CSV rows.
# csv.writer emits str, so on Python 3 the file must be opened in text mode
# ('wb' raises TypeError); newline='' lets the csv module control line endings.
with open('./mash-Linux64-v2.0/mash_taxonomy_1.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(taxa_dict.items())

In [9]:
# Append the rows of mash_taxonomy_1.csv to the end of mash_taxonomy.csv.
# `>>` appends, so running this cell again adds the same rows a second time.
!cat ./mash-Linux64-v2.0/mash_taxonomy_1.csv >> ./mash-Linux64-v2.0/mash_taxonomy.csv

In [12]:
# Show the last lines of the combined taxonomy file as a quick check of the append.
!tail ./mash-Linux64-v2.0/mash_taxonomy.csv

ADXL01,Synechococcus
BFAC01,Microcystis
PXQE01,Cyanobacteria
JNAR01,Prochlorococcus
ALPC01,Prochlorococcus
DLEF01,Phormidium
DEGZ01,Aphanizomenon
AOCI01,Microcystis
ASSJ01,Rubidibacter
LADK01,Trichodesmium


**Using MASH Distances to Catalog Cyanobiome Finished Genomes into Subsections**

In [5]:
def get_mash_df(inputs_folder, sketch, mash_bin="./mash-Linux64-v2.0/mash"):
    """Return the closest reference in a Mash sketch for each query FASTA.

    For every ``*.fasta`` file found under ``inputs_folder`` this runs
    ``mash dist <sketch> <query>``, drops any reference row whose path equals
    the query path, and keeps the first row with the minimum distance.

    Parameters
    ----------
    inputs_folder : str
        Folder prefix (including the trailing slash) globbed for ``*.fasta``.
    sketch : str
        Path to the Mash sketch (``.msh``) holding the references.
    mash_bin : str, optional
        Path to the ``mash`` executable.

    Returns
    -------
    pandas.DataFrame
        One row per query with columns ``reference``, ``query``, ``distance``,
        ``pvalue`` and ``matching-hashes``; empty (same columns) when nothing
        was found.

    Raises
    ------
    subprocess.CalledProcessError
        If ``mash dist`` exits with a non-zero status.
    """
    columns = ["reference", "query", "distance", "pvalue", "matching-hashes"]
    start_time = time.time()
    best_hits = []
    for item in glob.glob("%s*.fasta" % inputs_folder):
        # Capture stdout directly instead of redirecting through a shared temp
        # file, and pass arguments as a list so no shell quoting is involved.
        result = subprocess.run([mash_bin, "dist", sketch, item],
                                stdout=subprocess.PIPE,
                                universal_newlines=True,
                                check=True)
        if not result.stdout.strip():
            continue
        df = pd.read_csv(io.StringIO(result.stdout), sep="\t", names=columns)
        df = df[df.reference != item]
        # First row at the minimum distance; empty when no reference remains.
        nearest = df[df.distance == df.distance.min()].head(1)
        if not nearest.empty:
            best_hits.append(nearest)
    # DataFrame.append was removed in pandas 2.0; concatenate once at the end.
    if best_hits:
        final_df = pd.concat(best_hits, ignore_index=True)
    else:
        final_df = pd.DataFrame(columns=columns)
    print('\n' + "--- %s seconds ---" % (time.time() - start_time))
    return final_df
        
# Nearest NCBI reference for each FASTA in the scaffolded_drafts folder.
mash_df = get_mash_df(
    inputs_folder="/home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/",
    sketch="./mash-Linux64-v2.0/ncbi_genomes.msh",
)

./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/2G6.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/3F7.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/2I2.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/3H3.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/2F4.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.ms

./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/2I4.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/2F8.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/1H6.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/4E2.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/1F2.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.ms

./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/3I9.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/4E8.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/2D8.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/1I2.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/1H4.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.ms

./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/3A7.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/1A2.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/4F4.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/3B3.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.msh /home/gerwick-lab/Desktop/data/genomes/drafts/kappahyperplus/scaffolded_drafts/3H1.fasta > ./mash-Linux64-v2.0/temp.txt
./mash-Linux64-v2.0/mash dist ./mash-Linux64-v2.0/ncbi_genomes.ms

In [1]:
# Assign each query strain the genus of its nearest NCBI reference.
taxa_dict2 = {}

# Read the columns by name: positional `r[0]` / `r[1]` on a string-labelled
# row is deprecated in pandas and will be treated as a label lookup.
for reference_path, query_path in zip(mash_df['reference'], mash_df['query']):
    # Reference key: file stem, truncated at the first remaining dot.
    ncbi_taxa = os.path.splitext(os.path.basename(reference_path))[0].partition('.')[0]
    # Strain key: file stem up to the first underscore.
    strain = os.path.splitext(os.path.basename(query_path))[0].partition('_')[0]
    # A KeyError here means the reference has no entry in `taxa_dict`.
    taxa_dict2[strain] = taxa_dict[ncbi_taxa]

NameError: name 'mash_df' is not defined

In [1]:
# Spot-check: genus assigned to strain "3B5".
taxa_dict2["3B5"]

NameError: name 'taxa_dict2' is not defined

In [11]:
import csv

# Write the strain -> genus mapping as two-column CSV rows. Mode 'w' replaces
# any existing mash_taxonomy.csv.
# csv.writer emits str, so on Python 3 the file must be opened in text mode
# ('wb' raises TypeError); newline='' lets the csv module control line endings.
with open('./mash-Linux64-v2.0/mash_taxonomy.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(taxa_dict2.items())