`Last update at 2021-08-11`

# Outline

```
    1) Downloading (meta)genomes and metagenome-assembled genomes (MAGs) that contain a paired untargeted 
    metabolomics (LC-MS/MS and in the future GC-MS/MS)(metabolomic files also going to be downloaded):
    1.1) Downloading all JSON files from PoDP;
    1.2) Parsing JSON files and creating podp_merged_df;
    1.3) Obtaining uniform Genbank code from BioSample ID; (estimated runtime of 02:46:30)
    1.4) Adding new column with the downloaded NCBI IDs;
    1.5) Downloading and unzipping the NCBI genomes;
    1.6) Copying previously downloaded genomes;
    1.7) Downloading metagenomic reads;
    1.8) Downloading JGI (meta)genomes;
    1.9) Separating Genomes without antiSMASH;
    1.10) Running antiSMASH;
    1.11) Filtering links for (meta)genomes with antiSMASH and downloading LC-MS/MS;
    1.12) Download missing Gerwick cyanoibacterial MS/MS and selecting Gerwick mgf files.
```

In [1]:
import pandas as pd
import time
import glob
import numpy as np
import csv
import subprocess
import glob
import os
import re
import networkx
from networkx.algorithms.components.connected import connected_components
from collections import defaultdict
import matplotlib.pyplot as plt
from Bio import SeqIO
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
import pickle
import json
from pyteomics import mgf, auxiliary
import csv

In [2]:
# !mkdir ./inputs
# !mkdir ./outputs/
# !mkdir ./temp_files/

# 1. Downloading genomes from the Paired omics Data Platform (PoDP)

## 1.1. Downloading all JSON files from PoDP

```
Go to PoDP (https://pairedomicsdata.bioinformatics.nl) and download the database JSON files, move files to 
input folder (path/to/NPOmix/inputs/databases/).
```

## 1.2. Parsing JSON files and creating podp_merged_df

In [2]:
def get_strain_dicts(filename):
    strain_gbk_dict = {}
    strain_biosample_dict = {}
    strain_db_dict = {}
    strain_to_shotgun = {}
    with open(filename) as f:
        data1 = json.load(f)
        if 'message' not in data1.keys():
            data1 = data1['genomes']
            json_df1 = pd.DataFrame(data1)
            for i,r in json_df1.iterrows():
                genome_line = dict(json_df1['genome_ID'].loc[i])
                if 'ENA_NCBI_accession' in genome_line.keys():
                    genomeID = genome_line['ENA_NCBI_accession']
                    db_id = 'ENA_NCBI_accession'
                if 'GenBank_accession' in genome_line.keys():
                    genomeID = genome_line['GenBank_accession']
                    db_id = 'GenBank_accession'
                if 'RefSeq_accession' in genome_line.keys():
                    genomeID = genome_line['RefSeq_accession']
                    db_id = 'RefSeq_accession'
                if 'JGI_Genome_ID' in genome_line.keys():
                    genomeID = genome_line['JGI_Genome_ID']
                    db_id = 'JGI_Genome_ID'
                if 'JGI_ID' in genome_line.keys():
                    genomeID = genome_line['JGI_ID']
                    db_id = 'JGI_ID'
                if 'JGI_IMG_genome_ID'in genome_line.keys():
                    genomeID = genome_line['JGI_IMG_genome_ID']
                    db_id = 'JGI_IMG_genome_ID'
                strain_to_shotgun[json_df1['genome_label'].loc[i]] = genome_line['genome_type']
                strain_gbk_dict[json_df1['genome_label'].loc[i]] = genomeID
                strain_db_dict[json_df1['genome_label'].loc[i]] = db_id
                if 'BioSample_accession' in json_df1.columns:
                    strain_biosample_dict[json_df1['genome_label'].loc[i]] = json_df1['BioSample_accession'].loc[i]
                else:
                    strain_biosample_dict[json_df1['genome_label'].loc[i]] = 'No BioSample'
    return strain_gbk_dict,strain_biosample_dict,strain_db_dict,strain_to_shotgun

def get_paired_df(filename,strain_gbk_dict,strain_biosample_dict,strain_db_dict,strain_to_shotgun):
    col1,col2,col3,col4,col5,col6 = [],[],[],[],[],[]
    with open(filename) as f:
        data2 = json.load(f)['genome_metabolome_links']
        json_df2 = pd.DataFrame(data2)
        for i,r in json_df2.iterrows():
            col1.append(strain_gbk_dict[json_df2['genome_label'].loc[i]])
            col2.append(strain_db_dict[json_df2['genome_label'].loc[i]])
            col3.append(strain_biosample_dict[json_df2['genome_label'].loc[i]])
            col4.append(json_df2['metabolomics_file'].loc[i])
            col5.append(json_df2['genome_label'].loc[i])
            col6.append(strain_to_shotgun[json_df2['genome_label'].loc[i]])
    frames = {'NCBI_ID':col1, 'Database':col2, 'BioSample':col3, 
              'LCMS_file':col4, 'Genome_label':col5, 'Method':col6}
    paired_df = pd.DataFrame(frames)
    return paired_df

In [3]:
table_list = glob.glob('./inputs/PoDP_datasets/*.json')

for filename in table_list:
    strain_gbk_dict,strain_biosample_dict,strain_db_dict,strain_to_shotgun = get_strain_dicts(filename)
    if strain_gbk_dict:
        if filename == table_list[0]:
            podp_merged_df = get_paired_df(filename,strain_gbk_dict,strain_biosample_dict,strain_db_dict,strain_to_shotgun)
        else:
            paired_df = get_paired_df(filename,strain_gbk_dict,strain_biosample_dict,strain_db_dict,strain_to_shotgun)
            frames = [podp_merged_df,paired_df]
            podp_merged_df = pd.concat(frames)
        
podp_merged_df = podp_merged_df.reset_index(drop=True)

podp_merged_df

Unnamed: 0,NCBI_ID,Database,BioSample,LCMS_file,Genome_label,Method
0,GCA_000240165.1,RefSeq_accession,SAMN02603879,ftp://massive.ucsd.edu/MSV000078850/results/S....,S. cattleya,genome
1,GCA_000240165.1,RefSeq_accession,SAMN02603879,ftp://massive.ucsd.edu/MSV000078850/results/S....,S. cattleya,genome
2,GCA_000240165.1,RefSeq_accession,SAMN02603879,ftp://massive.ucsd.edu/MSV000078850/results/S....,S. cattleya,genome
3,GCA_000240165.1,RefSeq_accession,SAMN02603879,ftp://massive.ucsd.edu/MSV000078850/results/S....,S. cattleya,genome
4,GCA_000240165.1,RefSeq_accession,SAMN02603879,ftp://massive.ucsd.edu/MSV000078850/results/S....,S. cattleya,genome
...,...,...,...,...,...,...
4891,GCA_000286575.1,GenBank_accession,SAMN00255227,ftp://massive.ucsd.edu/MSV000084945/ccms_peak/...,Burkholderia multivorans CF2,genome
4892,GCA_000959505.1,GenBank_accession,SAMN03144971,ftp://massive.ucsd.edu/MSV000084945/ccms_peak/...,Burkholderia dolosa AU0158,genome
4893,GCA_000959505.1,GenBank_accession,SAMN03144971,ftp://massive.ucsd.edu/MSV000084945/ccms_peak/...,Burkholderia dolosa AU0158,genome
4894,GCA_003568605.1,GenBank_accession,SAMN02866398,ftp://massive.ucsd.edu/MSV000084945/ccms_peak/...,Burkholderia thailandensis E264,genome


In [4]:
len(table_list) ### contain all online datasets

71

In [5]:
online_massive = ['MTBLS1606','MSV000086050','MSV000085589','MSV000085586','MSV000085376','MSV000085214','MSV000085210','MSV000085192','MSV000085180','MSV000085179','MSV000085159','MSV000085158','MSV000085141','MSV000085123','MSV000085085','MSV000085038','MSV000085032','MSV000085031','MSV000085027','MSV000085026','MSV000085023','MSV000085021','MSV000085018','MSV000085003','MSV000085000','MSV000084989','MSV000084954','MSV000084950','MSV000084950','MSV000084945','MSV000084884','MSV000084781','MSV000084771','MSV000084723','MSV000084674','MSV000084475','MSV000084475','MSV000084117','MSV000083835','MSV000083734','MSV000083648','MSV000083387','MSV000083302','MSV000083295','MSV000083268','MSV000083081','MSV000082988','MSV000082969','MSV000082891','MSV000082831','MSV000082285','MSV000082045','MSV000081832','MSV000081504','MSV000081318','MSV000081063','MSV000080427','MSV000080251','MSV000080179','MSV000079519','MSV000079284','MSV000079139','MSV000079015','MSV000078995','MSV000078891','MSV000078850','MSV000078847','MSV000078839','MSV000078836','MSV000078667','MSV000078556']

In [6]:
downloaded_massive = []

for item in podp_merged_df['LCMS_file']:
    if 'MSV' in item:
        if 'ftp://maftp' in item:
            downloaded_massive.append(item[31:43])
        else:
            downloaded_massive.append(item[23:35])
    if 'MTBLS' in item:
        downloaded_massive.append(item[35:])
    
downloaded_massive = list(np.unique(downloaded_massive))

for item in online_massive:
    if item not in downloaded_massive:
        print(item)
### we downloaded and processed all datasets (69 unique and 71 total MassIVE/Metabolights IDs)

In [7]:
len(np.unique(podp_merged_df['NCBI_ID'])),len(podp_merged_df)
### unique genomes and total number of links (the second matches the online record)

(2289, 4896)

In [8]:
len(podp_merged_df[podp_merged_df['BioSample'] == 'No BioSample'])

799

In [9]:
len(np.unique(podp_merged_df[podp_merged_df['BioSample'] != 'No BioSample']['BioSample']))

1677

In [10]:
799+1677

2476

In [11]:
for item in np.unique(podp_merged_df['Database']):
    print(item, len(podp_merged_df[podp_merged_df['Database'] == item]['NCBI_ID']))

ENA_NCBI_accession 1298
GenBank_accession 2107
JGI_Genome_ID 765
JGI_ID 2
JGI_IMG_genome_ID 612
RefSeq_accession 112


In [12]:
for item in np.unique(podp_merged_df['Method']):
    print(item, len(podp_merged_df[podp_merged_df['Method'] == item]['NCBI_ID']))

genome 3456
metagenome 1306
metagenome-assembled genome 134


## 1.3. Obtaining uniform Genbank code from BioSample ID

```
Separating JGI/ENA IDs from NCBI genomes, these will be downloaded in another step, for now we'll focus on NCBI genomes
```

In [13]:
filtered_df = podp_merged_df[podp_merged_df['Database'] != 'JGI_IMG_genome_ID']
filtered_df = filtered_df[filtered_df['Database'] != 'JGI_Genome_ID']
filtered_df = filtered_df[filtered_df['Database'] != 'JGI_ID']
filtered_df = filtered_df[filtered_df['Database'] != 'ENA_NCBI_accession']
filtered_df = filtered_df[filtered_df['BioSample'] != 'No BioSample']
filtered_df = filtered_df.reset_index(drop=True)

filtered_df

Unnamed: 0,NCBI_ID,Database,BioSample,LCMS_file,Genome_label,Method
0,GCA_000240165.1,RefSeq_accession,SAMN02603879,ftp://massive.ucsd.edu/MSV000078850/results/S....,S. cattleya,genome
1,GCA_000240165.1,RefSeq_accession,SAMN02603879,ftp://massive.ucsd.edu/MSV000078850/results/S....,S. cattleya,genome
2,GCA_000240165.1,RefSeq_accession,SAMN02603879,ftp://massive.ucsd.edu/MSV000078850/results/S....,S. cattleya,genome
3,GCA_000240165.1,RefSeq_accession,SAMN02603879,ftp://massive.ucsd.edu/MSV000078850/results/S....,S. cattleya,genome
4,GCA_000240165.1,RefSeq_accession,SAMN02603879,ftp://massive.ucsd.edu/MSV000078850/results/S....,S. cattleya,genome
...,...,...,...,...,...,...
2115,GCA_000286575.1,GenBank_accession,SAMN00255227,ftp://massive.ucsd.edu/MSV000084945/ccms_peak/...,Burkholderia multivorans CF2,genome
2116,GCA_000959505.1,GenBank_accession,SAMN03144971,ftp://massive.ucsd.edu/MSV000084945/ccms_peak/...,Burkholderia dolosa AU0158,genome
2117,GCA_000959505.1,GenBank_accession,SAMN03144971,ftp://massive.ucsd.edu/MSV000084945/ccms_peak/...,Burkholderia dolosa AU0158,genome
2118,GCA_003568605.1,GenBank_accession,SAMN02866398,ftp://massive.ucsd.edu/MSV000084945/ccms_peak/...,Burkholderia thailandensis E264,genome


In [14]:
### only run once

# with open('./temp_files/genome_list_temp.txt', "w") as output:
#     writer = csv.writer(output, lineterminator='\n')
#     for val in list(filtered_df['NCBI_ID']):
#         writer.writerow([val])
        
# with open('./temp_files/biosample_list_temp.txt', "w") as output:
#     writer = csv.writer(output, lineterminator='\n')
#     for val in list(filtered_df['BioSample']):
#         writer.writerow([val])

In [15]:
### only run once

# start = time.time()

# subprocess.call('python NCBI_getGenBankID.py --input ./temp_files/biosample_list_temp.txt --output ./outputs/biosample_list-round3-210220.out.csv', shell=True)

# end = time.time()
# hours, rem = divmod(end-start, 3600)
# minutes, seconds = divmod(rem, 60)
# print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
### 02:46:30.68

In [16]:
biosample_df = pd.read_csv('./outputs/biosample_list-round3-210220.out.csv',sep='\t')
biosample_df = biosample_df[biosample_df['GenBank_id'] != '-']
biosample_df = biosample_df.reset_index(drop=True)


biosample_df

Unnamed: 0,Accession_id,Assembly_id,GenBank_id,Refseq_id
0,SAMN02603879,376848,GCA_000240165.1,GCF_000240165.1
1,SAMN14411170,6465981,GCA_011765705.1,
2,SAMN14411171,6466001,GCA_011765735.1,
3,SAMD00106498,2168011,GCA_003945305.1,GCF_003945305.1
4,SAMN05710194,1533101,GCA_002897315.1,GCF_002897315.1
...,...,...,...,...
232,SAMN03144971,315111,GCA_000959505.1,GCF_000959505.1
233,SAMN00623032,39588,GCA_000018505.1,GCF_000018505.1
234,SAMN03140189,315121,GCA_000959525.1,GCF_000959525.1
235,SAMN00255227,492528,GCA_000286575.1,GCF_000286575.1


## 1.4. Adding new column with the downloaded NCBI IDs

In [17]:
len(podp_merged_df),len(filtered_df),len(np.unique(filtered_df['BioSample'])),len(np.unique(biosample_df['GenBank_id']))
### notice that we downloaded 237 out of the 259 unique BioSamples

(4896, 2120, 259, 237)

In [18]:
biosample_dict = dict(zip(biosample_df.Accession_id, biosample_df.GenBank_id))
new_col = []

for i,r in podp_merged_df.iterrows():
    biosample_id = podp_merged_df['BioSample'].loc[i]
    if biosample_id in biosample_dict.keys():
        new_col.append(biosample_dict[biosample_id])
    else:
        if 'GCA' in biosample_id or 'ERX' in biosample_id or 'ERS' in biosample_id:
            new_col.append(biosample_id)
        else:
            new_col.append('NaN')
        
podp_merged_df['New_NCBI_ID'] = new_col

podp_merged_df

Unnamed: 0,NCBI_ID,Database,BioSample,LCMS_file,Genome_label,Method,New_NCBI_ID
0,GCA_000240165.1,RefSeq_accession,SAMN02603879,ftp://massive.ucsd.edu/MSV000078850/results/S....,S. cattleya,genome,GCA_000240165.1
1,GCA_000240165.1,RefSeq_accession,SAMN02603879,ftp://massive.ucsd.edu/MSV000078850/results/S....,S. cattleya,genome,GCA_000240165.1
2,GCA_000240165.1,RefSeq_accession,SAMN02603879,ftp://massive.ucsd.edu/MSV000078850/results/S....,S. cattleya,genome,GCA_000240165.1
3,GCA_000240165.1,RefSeq_accession,SAMN02603879,ftp://massive.ucsd.edu/MSV000078850/results/S....,S. cattleya,genome,GCA_000240165.1
4,GCA_000240165.1,RefSeq_accession,SAMN02603879,ftp://massive.ucsd.edu/MSV000078850/results/S....,S. cattleya,genome,GCA_000240165.1
...,...,...,...,...,...,...,...
4891,GCA_000286575.1,GenBank_accession,SAMN00255227,ftp://massive.ucsd.edu/MSV000084945/ccms_peak/...,Burkholderia multivorans CF2,genome,GCA_000286575.1
4892,GCA_000959505.1,GenBank_accession,SAMN03144971,ftp://massive.ucsd.edu/MSV000084945/ccms_peak/...,Burkholderia dolosa AU0158,genome,GCA_000959505.1
4893,GCA_000959505.1,GenBank_accession,SAMN03144971,ftp://massive.ucsd.edu/MSV000084945/ccms_peak/...,Burkholderia dolosa AU0158,genome,GCA_000959505.1
4894,GCA_003568605.1,GenBank_accession,SAMN02866398,ftp://massive.ucsd.edu/MSV000084945/ccms_peak/...,Burkholderia thailandensis E264,genome,GCA_003568605.1


In [19]:
count = 0
gca_list = []

for item in podp_merged_df.New_NCBI_ID:
    if 'GCA' in item:
        count += 1
        gca_list.append(item)
        
print(count,len(podp_merged_df[podp_merged_df['New_NCBI_ID'] == 'NaN']),
      len(np.unique(podp_merged_df[podp_merged_df['New_NCBI_ID'] == 'NaN']['NCBI_ID'])),len(podp_merged_df))
print(len(np.unique(gca_list)))
### notice that all 237 unique BioSamples generate a GCA code and there were 490/727 other genomes that can't be downloaded

1383 2308 727 4896
237


## 1.5. Downloading and unzipping the NCBI genomes

In [19]:
# !mkdir -p /Volumes/TFL190831/NPOmix_round4/podp_ncbi/

In [20]:
cmd_list,genomes_failed = [],[]

for i,r in podp_merged_df.iterrows():
    if type(r['New_NCBI_ID']) == float:
        n = re.match(r'^\D{4}\d{2}$',r['NCBI_ID'])
        if n:
            cmd = 'wget https://sra-download.ncbi.nlm.nih.gov/traces/wgs01/wgs_aux/%s/%s/%s/%s.1.fsa_nt.gz'%(r['NCBI_ID'][0:2],r['NCBI_ID'][2:4],r['NCBI_ID'],r['NCBI_ID'])
        if 'GCA' in str(r['NCBI_ID']):
            cmd = 'wget -nd -r --no-parent -A "genomic.fna.gz"  -R "*_from_genomic*" "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/%s/%s/%s/"'%(r['NCBI_ID'][4:7],r['NCBI_ID'][7:10],r['NCBI_ID'][10:13])
        if 'GCA' not in str(r['NCBI_ID']) and not n:
            cmd = None
    else:
        if 'GCA' in str(r['New_NCBI_ID']):
            cmd = 'wget -nd -r --no-parent -A "genomic.fna.gz"  -R "*_from_genomic*" "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/%s/%s/%s/"'%(r['New_NCBI_ID'][4:7],r['New_NCBI_ID'][7:10],r['New_NCBI_ID'][10:13])
        else:
            cmd = None
    if cmd != None:
        if cmd not in cmd_list:
            cmd_list.append(cmd)
#             try: #only run once
#                 subprocess.check_output(cmd, shell=True, cwd='/Volumes/TFL190831/NPOmix_round4/podp_ncbi/')
#             except subprocess.CalledProcessError as e:
#                 genomes_failed.append(cmd)

print(len(cmd_list),len(genomes_failed))

237 0


In [21]:
!ls /Volumes/TFL190831/NPOmix_round4_genomes/podp_ncbi/ | wc -l

     237


In [22]:
### only run once

# genome_list = glob.glob("/Volumes/TFL190831/podp_round4_ncbi/*.gz")

# for item in genome_list:
#     genome = os.path.basename(item).split('.')[0]
#     print(os.path.basename(item))
#     cmd = "gunzip -c %s > %s.fasta"%(os.path.basename(item),genome)
#     subprocess.check_output(cmd, shell=True, cwd='/Volumes/TFL190831/NPOmix_round4_genomes/podp_ncbi/')

In [23]:
# !rm /Volumes/TFL190831/NPOmix_round4_genomes/podp_ncbi/*.gz

## 1.6. Copying previously downloaded genomes

In [22]:
glob_round3 = glob.glob('/Volumes/TFL190831/podp_round3_ncbi/*fasta')

for file_path in glob_round3:
    fasta_file = os.path.basename(file_path)
    if not os.path.exists('/Volumes/TFL190831/NPOmix_round4_genomes/podp_ncbi/%s'%fasta_file):
        cmd = 'cp %s /Volumes/TFL190831/NPOmix_round4/previous_genomes/'%file_path
#         subprocess.call(cmd, shell=True)

In [23]:
!ls /Volumes/TFL190831/NPOmix_round4_genomes/podp_ncbi/*.fasta | wc -l

     237


In [24]:
!ls /Volumes/TFL190831/NPOmix_round4_genomes/previous_genomes/*.fasta | wc -l

     174


In [25]:
AS_previous_list = glob.glob('/Volumes/TFL190831/nf_output-iomega/*')
count = 0

for file_path in AS_previous_list:
    AS_file = os.path.basename(file_path).split('_output')[0]
    if os.path.exists('/Volumes/TFL190831/NPOmix_round4_genomes/previous_genomes/%s'%AS_file):
        count += 1
    else:
        if os.path.exists('/Volumes/TFL190831/NPOmix_round4_genomes/podp_ncbi/%s'%AS_file):
            count += 1
            
count,len(AS_previous_list)

(279, 279)

In [26]:
previous_genomes = glob.glob('/Volumes/TFL190831/NPOmix_round4_genomes/previous_genomes/*fasta')
missing_from_podp = []

for item in previous_genomes:
    sum_item = os.path.basename(item).split('.')[0]
    temp_df = podp_merged_df[podp_merged_df['NCBI_ID'].str.contains(sum_item)]
    if len(temp_df) == 0:
        missing_from_podp.append(item)

len(missing_from_podp)        
### for now, eventhough we don't have these genomes in the current podp_list, we already have LCMS for those

161

## 1.7. Downloading metagenomic reads

In [29]:
# !mkdir /Volumes/TFL190831/NPOmix_round4/ERX_reads/
# !mkdir /Volumes/TFL190831/NPOmix_round4/ERS_reads/

In [27]:
len(np.unique(podp_merged_df[podp_merged_df['New_NCBI_ID'].str.contains('ERS')]['New_NCBI_ID']))

724

In [28]:
len(np.unique(podp_merged_df[podp_merged_df['New_NCBI_ID'].str.contains('ERX')]['New_NCBI_ID']))

481

In [32]:
### only run once
# !mkdir /Volumes/TFL190831/NPOmix_round4_genomes/ERX_fileroports/
# !mkdir /Volumes/TFL190831/NPOmix_round4_genomes/ERS_fileroports/

In [33]:
### only run once for ERS codes and one more time to ERX

# !mkdir /Volumes/TFL190831/NPOmix_round4_genomes/ERS_fileroports/

# for i,r in podp_merged_df.iterrows():
#     if 'ERS' in podp_merged_df['New_NCBI_ID'].loc[i]:
#         ena_code = podp_merged_df['New_NCBI_ID'].loc[i]
#         https_cmd = "wget \"https://www.ebi.ac.uk/ena/data/warehouse/filereport?accession=%s&result=read_run&fields=fastq_ftp\""%ena_code
#         subprocess.call(https_cmd, shell=True, cwd='/Volumes/TFL190831/NPOmix_round4_genomes/ERS_fileroports/')

In [29]:
https_list = glob.glob('/Volumes/TFL190831/NPOmix_round4_genomes/ERX_fileroports/filereport?accession=*')
cmd_list = []
ena_dict1 = {}

for item in https_list:
    https_df = pd.read_csv(item,sep='\t')
    read_1 = https_df['fastq_ftp'].loc[0].split(';')[0]
    cmd_list.append('wget ftp://%s'%read_1)
    ena_dict1[os.path.basename(read_1).split('.')[0]] = item[78:88]
    
# with open('./temp_files/ftp_ERX_list.txt', 'w') as filehandle:
#     filehandle.writelines("%s\n" % command for command in cmd_list)

In [30]:
ena_dict1['ERR2239508']

'ERX2291668'

In [31]:
https_list = glob.glob('/Volumes/TFL190831/NPOmix_round4_genomes/ERS_fileroports/filereport?accession=*')
cmd_list = []
ena_dict2 = {}

for item in https_list:
    https_df = pd.read_csv(item,sep='\t')
    read_1 = https_df['fastq_ftp'].loc[0].split(';')[0]
    read_2 = https_df['fastq_ftp'].loc[0].split(';')[1]
    cmd_list.append('wget ftp://%s'%read_1)
    cmd_list.append('wget ftp://%s'%read_2)
    ena_dict2[os.path.basename(read_1).split('.')[0].split('_')[0]] = item[78:88]
    
# with open('./temp_files/ftp_ERS_list.txt', 'w') as filehandle:
#     filehandle.writelines("%s\n" % command for command in cmd_list)

In [32]:
ena_dict2['ERR3946693']

'ERS4346398'

In [33]:
def Merge(dict1, dict2):
    res = {**dict1, **dict2}
    return res

ena_dict3 = Merge(ena_dict1, ena_dict2)

# with open('./temp_files/ena_dict-round3-210315.csv', 'w') as f:
#     for key in ena_dict3.keys():
#         f.write("%s,%s\n"%(key,ena_dict3[key]))

```bash
cd /Volumes/TFL190831/NPOmix_round4_genomes/ERX_reads/

sh ./ftp_ERX_list.txt
```
and alternatively

```bash
cd /Volumes/TFL190831/NPOmix_round4_genomes/ERS_reads/

sh ./ftp_ERS_list.txt
```

In [34]:
!ls /Volumes/TFL190831/NPOmix_round4_genomes/ERS_reads/*fastq.gz | wc -l

    1448


In [35]:
!ls /Volumes/TFL190831/NPOmix_round4_genomes/ERX_reads/*fastq.gz | wc -l

     481


## 1.8. Downloading JGI (meta)genomes

`Salinispora_genomes_faa` with 119 Salinisopra was provided by the Jensen lab

In [36]:
len(np.unique(podp_merged_df[podp_merged_df['Database'].str.contains('JGI')]['NCBI_ID']))

309

In [42]:
# !mkdir /Volumes/TFL190831/NPOmix_round4_genomes/manual_genomes

In [37]:
download_col,download_list,sal_filt = [],[],[]
sal_glob = glob.glob('/Volumes/TFL190831/NPOmix_round4_genomes/Salinispora_genomes_faa/*.faa')

for item in sal_glob:
    sal_filt.append(os.path.basename(item).split('_')[0])

for i,r in podp_merged_df.iterrows():
    if 'JGI' in r['Database']:
        if 'Salinispora' in r['Genome_label']:
            sal_id = 'S' + r['Genome_label'][12] + r['Genome_label'].split(' ')[2]
            if sal_id in sal_filt:
                download_col.append('yes')
                download_list.append(sal_id)
            else:
                download_col.append('no')
        else:
            download_col.append('no')
    else:
        if r['New_NCBI_ID'] != 'NaN':
            download_col.append('yes')
        else:
            download_col.append('no')
            
podp_merged_df['Downloaded'] = download_col

In [38]:
len(np.unique(download_list)),len(download_col),len(podp_merged_df)

(112, 4896, 4896)

In [39]:
len(np.unique(podp_merged_df[podp_merged_df['Downloaded'] == 'yes']['New_NCBI_ID']))

1443

In [40]:
podp_merged_df[podp_merged_df['Downloaded'] == 'no']

Unnamed: 0,NCBI_ID,Database,BioSample,LCMS_file,Genome_label,Method,New_NCBI_ID,Downloaded
46,2821485673,JGI_Genome_ID,SAMN13061032,ftp://massive.ucsd.edu/MSV000086050/updates/20...,GUM007,metagenome-assembled genome,,no
47,CP022686.1,GenBank_accession,No BioSample,ftp://massive.ucsd.edu/MSV000083387/peak/mzXML...,Ecoli_Nissle_genome,genome,,no
48,CP022686.1,GenBank_accession,No BioSample,ftp://massive.ucsd.edu/MSV000083387/peak/mzXML...,Ecoli_Nissle_genome,genome,,no
49,CP022686.1,GenBank_accession,No BioSample,ftp://massive.ucsd.edu/MSV000083387/peak/mzXML...,Ecoli_Nissle_genome,genome,,no
50,MDRX01000000,GenBank_accession,No BioSample,ftp://massive.ucsd.edu/MSV000085021/peak/DA2-8...,WMMB235,genome,,no
...,...,...,...,...,...,...,...,...
3970,"651717039, 651716642",JGI_Genome_ID,SAMN11997568,ftp://massive.ucsd.edu/MSV000083648/raw/detoxi...,Streptomyces spectabilis Dietz NRRL 2792 (ATCC...,genome,,no
3971,"651717039, 651716642",JGI_Genome_ID,SAMN11997568,ftp://massive.ucsd.edu/MSV000083648/raw/detoxi...,Streptomyces spectabilis Dietz NRRL 2792 (ATCC...,genome,,no
3972,"651717039, 651716642",JGI_Genome_ID,SAMN11997568,ftp://massive.ucsd.edu/MSV000083648/raw/detoxi...,Streptomyces spectabilis Dietz NRRL 2792 (ATCC...,genome,,no
3973,2767802005,JGI_Genome_ID,SAMN02645517,ftp://massive.ucsd.edu/MSV000083648/raw/detoxi...,Streptomyces sp. NRRL S-325,genome,,no


Use the dataframe above to manually download these genomes searching at [NCBI database](https://www.ncbi.nlm.nih.gov/Traces/wgs/?page=1&view=all&search=GCF_000196155.1) and place them a the manual_genomes folder

In [41]:
manual_glob = glob.glob('/Volumes/TFL190831/NPOmix_round4_genomes/manual_genomes/*')

manual_list,indexes_to_keep = [],[]

for item in manual_glob:
    manual_list.append(os.path.basename(item).split('.fasta')[0])

for i,r in podp_merged_df.iterrows():
    if r['Downloaded'] == 'no':
        if r['NCBI_ID'] not in manual_list:
            indexes_to_keep.append(i)

# podp_merged_df.loc[indexes_to_keep].to_csv('./outputs/cant_download_podp_merged_df-round3-210315.tsv',sep='\t')

In [42]:
len(podp_merged_df[podp_merged_df['Downloaded'] == 'yes']),len(podp_merged_df[podp_merged_df['Downloaded'] == 'no']),len(podp_merged_df)

(3047, 1849, 4896)

In [43]:
len(np.unique(podp_merged_df.loc[indexes_to_keep]['NCBI_ID']))

308

## 1.9. Separating Genomes without antiSMASH (Optional)

In [53]:
# !mkdir /Volumes/TFL190831/genomes_to_antiSMASH/

In [44]:
AS_all_list = glob.glob('/Volumes/TFL190831/ming_output/antismash/*')
AS_list = []
count = 0

for file_path in AS_all_list:
    AS_file = file_path.split('/')[5].split('.')[0].split('_contigs')[0]
    if AS_file in ena_dict1:
        AS_list.append(ena_dict1[AS_file])
    if AS_file in ena_dict2:
        AS_list.append(ena_dict2[AS_file])
    if AS_file not in ena_dict1 and AS_file not in ena_dict2:
        AS_list.append(AS_file)

manual_genomes = glob.glob('/Volumes/TFL190831/NPOmix_round4_genomes/manual_genomes/*')
ncbi_genomes = glob.glob('/Volumes/TFL190831/NPOmix_round4_genomes/podp_ncbi/*')
sal_genomes = glob.glob('/Volumes/TFL190831/NPOmix_round4_genomes/Salinispora_genomes_faa/*')

for item in list(manual_genomes+ncbi_genomes+sal_genomes):
    if os.path.basename(item) not in AS_list:
        count += 1
        cmd = 'cp %s /Volumes/TFL190831/genomes_to_antiSMASH/'%item
#         subprocess.call(cmd,shell=True)
    
len(AS_list),count

(1766, 679)

## 1.10. Running antiSMASH

```
Ming will add code
```

## 1.11. Filtering links for (meta)genomes with antiSMASH and downloading LC-MS/MS (need update)

In [45]:
!ls /Volumes/TFL190831/ming_output/antismash/ | wc -l

    1766


In [46]:
updated_col = []

for i,r in podp_merged_df.iterrows():
    if r['New_NCBI_ID'] in AS_list:
        updated_col.append('yes')
    else:
        if r['NCBI_ID'] in AS_list:
            updated_col.append('yes')
        else:
            updated_col.append('no')
        
print(len(updated_col))

4896


In [47]:
podp_merged_df['Downloaded'] = updated_col
podp_final_df = podp_merged_df[podp_merged_df['Downloaded'] == 'yes']
podp_final_df = podp_final_df.reset_index(drop=True)
# podp_final_df.to_csv('./outputs/podp_final_df-round4-210426.tsv',sep='\t')

In [48]:
podp_final_df

Unnamed: 0,NCBI_ID,Database,BioSample,LCMS_file,Genome_label,Method,New_NCBI_ID,Downloaded
0,2821485673,JGI_Genome_ID,SAMN13061032,ftp://massive.ucsd.edu/MSV000086050/updates/20...,GUM007,metagenome-assembled genome,,yes
1,MDRX01000000,GenBank_accession,No BioSample,ftp://massive.ucsd.edu/MSV000085021/peak/DA2-8...,WMMB235,genome,,yes
2,2518285561,JGI_Genome_ID,No BioSample,ftp://massive.ucsd.edu/MSV000079284//raw/Dunca...,Salinispora pacifica CNR942,genome,,yes
3,2518285561,JGI_Genome_ID,No BioSample,ftp://massive.ucsd.edu/MSV000079284//raw/Dunca...,Salinispora pacifica CNR942,genome,,yes
4,2518285562,JGI_Genome_ID,No BioSample,ftp://massive.ucsd.edu/MSV000079284//raw/Dunca...,Salinispora pacifica CNS055,genome,,yes
...,...,...,...,...,...,...,...,...
2108,ERS4356320,ENA_NCBI_accession,ERS4356320,ftp://massive.ucsd.edu/MSV000083302/peak/mzxml...,ERS4356320,metagenome,ERS4356320,yes
2109,ERS4356321,ENA_NCBI_accession,ERS4356321,ftp://massive.ucsd.edu/MSV000083302/peak/mzxml...,ERS4356321,metagenome,ERS4356321,yes
2110,ERS4356322,ENA_NCBI_accession,ERS4356322,ftp://massive.ucsd.edu/MSV000083302/peak/mzxml...,ERS4356322,metagenome,ERS4356322,yes
2111,ERS4356323,ENA_NCBI_accession,ERS4356323,ftp://massive.ucsd.edu/MSV000083302/peak/mzxml...,ERS4356323,metagenome,ERS4356323,yes


In [50]:
podp_merged_df.shape,podp_final_df.shape

((4896, 8), (2113, 8))

In [51]:
lcms_list = []

lcms_glob = glob.glob('/Volumes/TFL210426/podp_LCMS/*')

for lcms_file in lcms_glob:
    lcms_list.append(os.path.basename(lcms_file.split('.')[0]))

lcms_list = list(np.unique(lcms_list))

len(lcms_list)

1610

In [52]:
rows_wo_file,seen,commands = [],[],[]

for i,r in podp_final_df.iterrows():
    if podp_final_df['New_NCBI_ID'].loc[i] != 'NaN':
        genomeID = podp_final_df['New_NCBI_ID'].loc[i]
    else:
        genomeID = podp_final_df['NCBI_ID'].loc[i]
    seen.append(genomeID)
    file_count = seen.count(genomeID)
    extension = podp_final_df['LCMS_file'].loc[i].rsplit('.', 1)[1]
    if genomeID not in lcms_list:
        cmd = 'wget -O %s %s'%(genomeID+'.'+extension+'.'+str(file_count),podp_final_df['LCMS_file'].loc[i])
        commands.append(cmd)

len(commands)

22

In [72]:
### only run once

# with open('./temp_files/PODP_LCMS_list_commands-round4.txt', "w") as output:
#     writer = csv.writer(output, lineterminator='\n')
#     for val in commands:
#         writer.writerow([val])

```bash
mkdir /Volumes/TFL190831/podp_LCMS

cd /Volumes/TFL190831/podp_LCMS

sh PODP_LCMS_list_commands-round3.txt
```

**Rename names containing ".1."**

```
cd podp_LCMS/

ls *.1.* | cat > rename.txt

Edit rename.txt in a text editor

sh rename.txt
```

## 1.12 Download missing Gerwick cyanoibacterial MS/MS and selecting Gerwick mgf files

In [1]:
import re
import pandas as pd
import numpy as np
from pyteomics import mgf, auxiliary
import csv

```
Download all GNPS database from the link: https://gnps.ucsd.edu/ProteoSAFe/gnpslibrary.jsp?library=all
```

In [2]:
count = 0
colA = []
colB = []

with open('/Users/tiagoferreiraleao/Dropbox/tiago-NAS/NPOmix/inputs/ALL_GNPS.mgf') as fp:
    for line in fp:
        count += 1
        a = re.match(r'BEGIN IONS',line)
        if a:
            colA.append(count)
        b = re.match(r'END IONS',line)
        if b:
            colB.append(count)

print(len(colA),len(colB))

448698 448698


In [3]:
col1 = []
col2 = []
col3 = []
col4 = []
col5 = []

with open('/Users/tiagoferreiraleao/Dropbox/tiago-NAS/NPOmix/inputs/ALL_GNPS.mgf') as fp:
    lines=fp.readlines()
    for index,item in enumerate(colA):
        pepmass = "N/A"
        name = "N/A"
        pi = "N/A"
        smiles = "N/A"
        gnps_id = "N/A"
        chunk = lines[item:colB[index]]
        for line in chunk:
            a = re.match(r'PEPMASS=(.*)',line)
            if a:
                pepmass = a.group(1)
            b = re.match(r'NAME=(.*)',line)
            if b:
                name = b.group(1)
            c = re.match(r'PI=(.*)',line)
            if c:
                pi = c.group(1)
            d = re.match(r'SMILES=(.*)',line)
            if d:
                smiles = d.group(1)
            e = re.match(r'SPECTRUMID=(.*)',line)
            if e:
                gnps_id = e.group(1)
        col1.append(pepmass)
        col2.append(name)
        col3.append(pi)
        col4.append(smiles)
        col5.append(gnps_id)
                
len(col1),len(col2),len(col3),len(col4),len(col5)

(448698, 448698, 448698, 448698, 448698)

In [5]:
gnps_df = pd.DataFrame(
    {'PEPMASS': col1,
     'NAME': col2,
     'PI': col3,
     'SMILES': col4,
     'SPECTRUMID': col5
    })

gnps_df

Unnamed: 0,PEPMASS,NAME,PI,SMILES,SPECTRUMID
0,981.54,3-Des-Microcystein_LR M+H,Gerwick,CC(C)CC1NC(=O)C(C)NC(=O)C(=C)N(C)C(=O)CCC(NC(=...,CCMSLIB00000001547
1,940.25,Hoiamide B M+H,Gerwick,CCC[C@@H](C)[C@@H]([C@H](C)[C@@H]1[C@H]([C@H](...,CCMSLIB00000001548
2,456.1,Malyngamide C M+H,Gerwick,CCCCCCC[C@@H](C/C=C/CCC(=O)NC/C(=C/Cl)/[C@@]12...,CCMSLIB00000001549
3,545.0,Scytonemin M+H,Gerwick,OC1=CC=C(\C=C2\C(=O)C(C3=C4C5=C(C=CC=C5)N=C4\C...,CCMSLIB00000001550
4,314.116,Salinisporamide A M+H,Fenical-Jensen-Moore,,CCMSLIB00000001551
...,...,...,...,...,...
448693,277.218,Pinolenic acid - 20eV M-H,Sumner,CCCCC/C=C\C/C=C\CC/C=C\CCCC(=O)O,CCMSLIB00004684135
448694,277.218,Pinolenic acid - 30eV M-H,Sumner,CCCCC/C=C\C/C=C\CC/C=C\CCCC(=O)O,CCMSLIB00004684136
448695,279.234,"9Z, 11E-Linoleic acid - 10eV M-H",Sumner,CCCCCC/C=C/C=C\CCCCCCCC(=O)O,CCMSLIB00004684137
448696,279.232,"9Z, 11E-Linoleic acid - 30eV M-H",Sumner,CCCCCC/C=C/C=C\CCCCCCCC(=O)O,CCMSLIB00004684138


In [6]:
gerwick_df = gnps_df[gnps_df["PI"].isin(["Gerwick","Dr. Gerwick","W. Gerwick","Dorrestein/Gerwick","Pieter Dorrestein &amp; Lena Gerwick","William Gerwick"])]

gerwick_df

Unnamed: 0,PEPMASS,NAME,PI,SMILES,SPECTRUMID
0,981.54,3-Des-Microcystein_LR M+H,Gerwick,CC(C)CC1NC(=O)C(C)NC(=O)C(=C)N(C)C(=O)CCC(NC(=...,CCMSLIB00000001547
1,940.25,Hoiamide B M+H,Gerwick,CCC[C@@H](C)[C@@H]([C@H](C)[C@@H]1[C@H]([C@H](...,CCMSLIB00000001548
2,456.1,Malyngamide C M+H,Gerwick,CCCCCCC[C@@H](C/C=C/CCC(=O)NC/C(=C/Cl)/[C@@]12...,CCMSLIB00000001549
3,545.0,Scytonemin M+H,Gerwick,OC1=CC=C(\C=C2\C(=O)C(C3=C4C5=C(C=CC=C5)N=C4\C...,CCMSLIB00000001550
5,667.115,Hectochlorin M+H,Gerwick,C[C@H]1[C@@H](OC(C2=CSC([C@H](C(C)(OC(C3=CSC([...,CCMSLIB00000001552
...,...,...,...,...,...
6142,1201.5,Kurahamide B M+NH4,Gerwick,CCCC(N[C@H](C(N[C@H](C(N[C@H]1[C@@H](C)OC([C@H...,CCMSLIB00005723420
6702,1156.59,Rivulariapeptolide 1155 M+H,Gerwick,CC(C)[C@@H](NC([C@@H](CC1=CC=C(O)C=C1)N(C)C([C...,CCMSLIB00005723986
6714,400.25,Cryptomaldamice M+H,Gerwick,NC(N[C@@H](CO)C(N[C@@H](C(C)C)C(N(C)[C@@H](C(C...,CCMSLIB00005724004
6726,400.257,Cryptomaldamide M+H,Gerwick,CC(C)[C@@H](/C=C(\C)/C(=O)O)N(C)C(=O)[C@H](C(C...,CCMSLIB00005724017


In [7]:
gnps_df2 = gerwick_df[gerwick_df["SMILES"] != "N/A"]
gnps_df2 = gnps_df2[gnps_df2["SMILES"] != ' ']
gnps_df2 = gnps_df2[gnps_df2["SMILES"] != 'NaN']
gnps_df2 = gnps_df2.fillna(0)
gnps_df2 = gnps_df2[gnps_df2["SMILES"] != 0]
gnps_df2 = gnps_df2[gnps_df2["SMILES"] != 'c']
gnps_df2 = gnps_df2[gnps_df2["SMILES"] != 'no data']

gnps_df2.reset_index(inplace=True,drop=True)

gnps_df2

Unnamed: 0,PEPMASS,NAME,PI,SMILES,SPECTRUMID
0,981.54,3-Des-Microcystein_LR M+H,Gerwick,CC(C)CC1NC(=O)C(C)NC(=O)C(=C)N(C)C(=O)CCC(NC(=...,CCMSLIB00000001547
1,940.25,Hoiamide B M+H,Gerwick,CCC[C@@H](C)[C@@H]([C@H](C)[C@@H]1[C@H]([C@H](...,CCMSLIB00000001548
2,456.1,Malyngamide C M+H,Gerwick,CCCCCCC[C@@H](C/C=C/CCC(=O)NC/C(=C/Cl)/[C@@]12...,CCMSLIB00000001549
3,545.0,Scytonemin M+H,Gerwick,OC1=CC=C(\C=C2\C(=O)C(C3=C4C5=C(C=CC=C5)N=C4\C...,CCMSLIB00000001550
4,667.115,Hectochlorin M+H,Gerwick,C[C@H]1[C@@H](OC(C2=CSC([C@H](C(C)(OC(C3=CSC([...,CCMSLIB00000001552
...,...,...,...,...,...
401,1201.5,Kurahamide B M+NH4,Gerwick,CCCC(N[C@H](C(N[C@H](C(N[C@H]1[C@@H](C)OC([C@H...,CCMSLIB00005723420
402,1156.59,Rivulariapeptolide 1155 M+H,Gerwick,CC(C)[C@@H](NC([C@@H](CC1=CC=C(O)C=C1)N(C)C([C...,CCMSLIB00005723986
403,400.25,Cryptomaldamice M+H,Gerwick,NC(N[C@@H](CO)C(N[C@@H](C(C)C)C(N(C)[C@@H](C(C...,CCMSLIB00005724004
404,400.257,Cryptomaldamide M+H,Gerwick,CC(C)[C@@H](/C=C(\C)/C(=O)O)N(C)C(=O)[C@H](C(C...,CCMSLIB00005724017


In [8]:
select_list = []

for item in gnps_df2['SPECTRUMID']:
    select_list.append(item)

select_list

['CCMSLIB00000001547',
 'CCMSLIB00000001548',
 'CCMSLIB00000001549',
 'CCMSLIB00000001550',
 'CCMSLIB00000001552',
 'CCMSLIB00000001553',
 'CCMSLIB00000001555',
 'CCMSLIB00000001558',
 'CCMSLIB00000001560',
 'CCMSLIB00000001562',
 'CCMSLIB00000001564',
 'CCMSLIB00000001565',
 'CCMSLIB00000001567',
 'CCMSLIB00000001569',
 'CCMSLIB00000001571',
 'CCMSLIB00000001573',
 'CCMSLIB00000001575',
 'CCMSLIB00000001577',
 'CCMSLIB00000001579',
 'CCMSLIB00000001581',
 'CCMSLIB00000001583',
 'CCMSLIB00000001585',
 'CCMSLIB00000001587',
 'CCMSLIB00000001589',
 'CCMSLIB00000001591',
 'CCMSLIB00000001595',
 'CCMSLIB00000001597',
 'CCMSLIB00000001601',
 'CCMSLIB00000001603',
 'CCMSLIB00000001605',
 'CCMSLIB00000001608',
 'CCMSLIB00000001626',
 'CCMSLIB00000001628',
 'CCMSLIB00000001630',
 'CCMSLIB00000001632',
 'CCMSLIB00000001634',
 'CCMSLIB00000001636',
 'CCMSLIB00000001638',
 'CCMSLIB00000001640',
 'CCMSLIB00000001642',
 'CCMSLIB00000001644',
 'CCMSLIB00000001646',
 'CCMSLIB00000001648',
 'CCMSLIB00

In [9]:
with mgf.MGF('/Users/tiagoferreiraleao/Dropbox/tiago-NAS/NPOmix/inputs/ALL_GNPS.mgf') as reader:
    for spectrum in reader:
        if spectrum['params']['spectrumid'] in select_list:
            individual_spec = []
            individual_spec.append(spectrum)
            output_file = '/Users/tiagoferreiraleao/Dropbox/tiago-NAS/NPOmix/NPOmix_mgf_spectra/%s.mgf'%(spectrum['params']['spectrumid'])
#             mgf.write(spectra=individual_spec, header='', output=output_file)

In [10]:
!ls /Users/tiagoferreiraleao/Dropbox/tiago-NAS/NPOmix/NPOmix_mgf_spectra/ | wc -l

     406


In [11]:
#downloaded file from the Gerwick dataset with 
podp_lena_df = pd.read_csv('/Users/tiagoferreiraleao/Dropbox/tiago-NAS/NPOmix/inputs/paired-864909ec-e716-4c5a-bfe3-ce3a169b8844.2-genome-metabolome.csv',sep=',')

podp_lena_df

Unnamed: 0,Genome/Metagenome,Location of metabolomics data file,NCBI ID,Sample Growth Conditions,Extraction Method,Instrumentation Method
0,Symploca sp. SIO1B1,ftp://massive.ucsd.edu/MSV000085210/raw/iomega...,GCA_010672945,SW-BG-11,2:1 DCM:MeOH,Maxis LCMS
1,Cyanothece sp. SIO1E1,ftp://massive.ucsd.edu/MSV000085210/raw/iomega...,GCA_010672835,SW-BG-11,2:1 DCM:MeOH,Maxis LCMS
2,Moorea sp. SIO1F2,ftp://massive.ucsd.edu/MSV000085210/raw/iomega...,GCA_010672755,SW-BG-11,2:1 DCM:MeOH,Maxis LCMS
3,Okeania sp. SIO1F9,ftp://massive.ucsd.edu/MSV000085210/raw/iomega...,GCA_010672745,SW-BG-11,2:1 DCM:MeOH,Maxis LCMS
4,Okeania sp. SIO1F9,ftp://massive.ucsd.edu/MSV000085210/raw/iomega...,GCA_010672745,SW-BG-11,2:1 DCM:MeOH,Maxis LCMS
...,...,...,...,...,...,...
109,Leptolyngbya sp. SIO4C5,ftp://massive.ucsd.edu/MSV000085210/raw/iomega...,GCA_010671965,SW-BG-11,2:1 DCM:MeOH,Maxis LCMS
110,Moorea sp. SIO4G2,ftp://massive.ucsd.edu/MSV000085210/raw/iomega...,GCA_010692345,SW-BG-11,2:1 DCM:MeOH,Maxis LCMS
111,Moorea sp. SIO4G3,ftp://massive.ucsd.edu/MSV000085210/raw/iomega...,GCA_010692305,SW-BG-11,2:1 DCM:MeOH,Maxis LCMS
112,Moorea sp. SIOASIH,ftp://massive.ucsd.edu/MSV000085210/raw/iomega...,GCA_010671925,SW-BG-11,2:1 DCM:MeOH,Maxis LCMS


In [12]:
rows_wo_file,seen,commands = [],[],[]

for i,r in podp_lena_df.iterrows():
    genomeID = podp_lena_df['NCBI ID'].loc[i]
    seen.append(genomeID)
    file_count = seen.count(genomeID)
    extension = podp_lena_df['Location of metabolomics data file'].loc[i].rsplit('.', 1)[1]
    cmd = 'wget -O %s %s'%(genomeID+'.'+extension+'.'+str(file_count),podp_lena_df['Location of metabolomics data file'].loc[i])
    commands.append(cmd)

len(commands)

114

In [13]:
commands

['wget -O GCA_010672945.mzXML.1 ftp://massive.ucsd.edu/MSV000085210/raw/iomega/2248%20cr_GC6_01_37750.mzXML',
 'wget -O GCA_010672835.mzXML.1 ftp://massive.ucsd.edu/MSV000085210/raw/iomega/PAB18MAY11_9_C_GA10_01_37730.mzXML',
 'wget -O GCA_010672755.mzXML.1 ftp://massive.ucsd.edu/MSV000085210/raw/iomega/2239%20cr_GG1_01_37793.mzXML',
 'wget -O GCA_010672745.mzXML.1 ftp://massive.ucsd.edu/MSV000085210/raw/iomega/2058%20D_GD8_01_37764.mzXML',
 'wget -O GCA_010672745.mzXML.2 ftp://massive.ucsd.edu/MSV000085210/raw/iomega/2058%20H_GD9_01_37765.mzXML',
 'wget -O GCA_010672795.mzXML.1 ftp://massive.ucsd.edu/MSV000085210/raw/iomega/2035%20cr_GD1_01_37757.mzXML',
 'wget -O GCA_010672695.mzXML.1 ftp://massive.ucsd.edu/MSV000085210/raw/iomega/A1765_015E11_RE11_01_13970.mzXML',
 'wget -O GCA_010672695.mzXML.2 ftp://massive.ucsd.edu/MSV000085210/raw/iomega/A1766_015F11_RF11_01_13982.mzXML',
 'wget -O GCA_010672695.mzXML.3 ftp://massive.ucsd.edu/MSV000085210/raw/iomega/A1765_015F2_RF2_01_13972.mzXM

In [14]:
### only run once

# with open('./temp_files/PODP_LCMS_list_commands-round5.txt', "w") as output:
#     writer = csv.writer(output, lineterminator='\n')
#     for val in commands:
#         writer.writerow([val])

# !mkdir ./inputs/cyanos_mzML

In [16]:
!ls ./inputs/cyanos_mzML/ | wc -l

     118


In [None]:
#!rm ./temp_files/PODP_LCMS_list_commands-round5.txt