# Imports, settings and paths

In [1]:
import pandas as pd
import os
import csv
import traceback

In [2]:
from Bio import SeqIO
from tqdm import tqdm_notebook as tqdm
from ete3 import NCBITaxa

In [3]:
path_ncbi = "/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq"
path_output = "/home/ubuntu/Disks/HDD1000/NCBI/ncbi.csv"
path_scanning_id = "/home/ubuntu/Disks/HDD1000/NCBI/ncbi_scanning_id.txt"
path_ncbi_csv = "/home/ubuntu/Disks/HDD1000/NCBI/ncbi.csv"
path_ncbi_pd = "/home/ubuntu/Disks/HDD1000/NCBI/ncbi.pd"

In [4]:
pd.set_option('display.max_columns', 50)

In [5]:
ncbi = NCBITaxa()

In [6]:
desired_ranks = ['superkingdom', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

In [7]:
def get_desired_ranks(taxid, desired_ranks, tolist=False):
    """ From stackoverflow
        https://stackoverflow.com/questions/36503042/how-to-get-taxonomic-specific-ids-for-kingdom-phylum-class-order-family-gen
    """
    try:
        lineage = ncbi.get_lineage(taxid)
        lineage2ranks = ncbi.get_rank(lineage)
        ranks2lineage = dict((rank, taxid) for (taxid, rank) in lineage2ranks.items())
        if tolist: return [ranks2lineage.get(rank, 0) for rank in desired_ranks]
        else:      return {f'{rank}_id': ranks2lineage.get(rank, 0) for rank in desired_ranks}
    except:
        print(f"{taxid} failed")
        if tolist: return [0 for rank in desired_ranks]
        else:      return {f'{rank}_id': 0 for rank in desired_ranks}

# Loop through NCBI genomes to collect data into csv

In [283]:
i = 0
imax = 200000
can_be = ["plasmid", "chloroplaste", "scaffold", "contig", 
          "chromosome", "complete genome", "whole genome shotgun sequence", ]  # to extend
headers = ["taxon_id", "taxon_name", "rank", "type", "length", 
           "record_id", "record_description", "path_folder", "path_file", ]
[headers.extend([f"id_{v}", f"name_{v}"]) for v in desired_ranks]

with open(path_scanning_id, 'r') as f:
    scanning_id = int(f.read())

with open(path_ncbi_csv, 'a', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t')
    if scanning_id == 0: 
        writer.writerow(headers)

    for dir_path, dir_names, files in tqdm(os.walk(path_ncbi)):
        for file in files:
            if file.endswith("fna"):
                i += 1
                if i < scanning_id: continue
                    
                # Read record / .fna file
                path_file = os.path.join(dir_path, file)
                records_to_csv = []

                for record in SeqIO.parse(path_file, "fasta"):
                    try:
                        # Is it a genome, chromosome, plasmid ?
                        what = "undefined"
                        for v in can_be:
                            if v in record.description.lower():
                                what = v
                                break

                        # Read taxo file if exists
                        path_taxo_file = path_file.replace(".fna", ".taxon")
                        assert os.path.isfile(path_taxo_file), f"taxo file hasn't been found: {path_taxo_file}"
                        with open(path_taxo_file, "r") as f:
                            taxon_id = int(f.read().split()[0])

                        # Get the lineage for main ranks
                        try:
                            rank = ncbi.get_rank([taxon_id])[taxon_id]
                        except:
                            rank = "no rank"
        #                     ranks_id = get_desired_ranks(taxon_id, desired_ranks, tolist=True)
        #                     rank_str = ncbi.translate_to_names(ranks_id)
                        ranks_dic = ncbi.get_taxid_translator(get_desired_ranks(taxon_id, desired_ranks, tolist=True))
                        ranks_list = []
                        for k,v in ranks_dic.items():
                            ranks_list.extend([k,v])

                        # The file path (take advantage of pandas categorical)
                        path_base, path_rec_relative = path_file.split("refseq/")
                        path_base += "refseq/"

                        # Gather the data in a tuple
                        record_data = (taxon_id, ncbi.translate_to_names([taxon_id])[0], 
                                       rank, what, len(record.seq), record.id, record.description, 
                                       path_base, path_rec_relative, *ranks_list)
                        # add to tuple ?  str(record.seq.upper())
                        records_to_csv.append(record_data)
                    
                    except Exception as e:
                        print(path_file)
                        traceback.print_exc()
                        raise NotImplementedError("Need to check file: " + path_file)
                        
                for record in records_to_csv:
                    writer.writerow(record)
                # Keep track which file has been processed
                with open(path_scanning_id, 'w') as f:
                    f.write(str(i))
                if i > imax: break
        if i > imax: break

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/vertebrate_mammalian/GCF_000001895.5/GCF_000001895.5_Rnor_6.0_genomic.fna


Traceback (most recent call last):
  File "<ipython-input-283-f7f5c1001da7>", line 37, in <module>
    assert os.path.isfile(path_taxo_file), f"taxo file hasn't been found: {path_taxo_file}"
AssertionError: taxo file hasn't been found: /home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/vertebrate_mammalian/GCF_000001895.5/GCF_000001895.5_Rnor_6.0_genomic.taxon


NotImplementedError: Need to check file: /home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/vertebrate_mammalian/GCF_000001895.5/GCF_000001895.5_Rnor_6.0_genomic.fna

# Read the csv and transform to pandas, plus column dtype conversion

Might take a while if file is big...

In [8]:
df = pd.read_csv(path_ncbi_csv, sep="\t", )

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# df.info(memory_usage='deep')
f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB"

'741.5 MB'

In [10]:
df.shape

(578756, 25)

In [11]:
for col in df.columns:
    if "id_" in col:
        print(col)
        df[col].fillna(0, inplace=True)
        df[col] = df[col].astype(int)

id_superkingdom
id_kingdom
id_phylum
id_class
id_order
id_family
id_genus
id_species


In [12]:
col_categories = ['rank', 'type', 'path_folder', 
                  'id_superkingdom', 'name_superkingdom', 'id_kingdom', 'name_kingdom', 'id_phylum', 'name_phylum', 
                  'id_class', 'name_class', 'id_order', 'name_order', 'id_family', 'name_family', 
                  'id_genus', 'name_genus', 'id_species', 'name_species']

for col in tqdm(col_categories):
    df[col] = df[col].astype('category')

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




In [13]:
f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB"

'289.9 MB'

In [14]:
df

Unnamed: 0,taxon_id,taxon_name,rank,type,length,record_id,record_description,path_folder,path_file,id_superkingdom,name_superkingdom,id_kingdom,name_kingdom,id_phylum,name_phylum,id_class,name_class,id_order,name_order,id_family,name_family,id_genus,name_genus,id_species,name_species
0,456320,Methanococcus voltae A3,no rank,complete genome,1936387,NC_014222.1,"NC_014222.1 Methanococcus voltae A3, complete ...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,archaea/GCF_000006175.1/GCF_000006175.1_ASM617...,2157,Archaea,2182,Methanococcales,2183,Methanococcaceae,2184,Methanococcus,2188,Methanococcus voltae,28890,Euryarchaeota,183939,Methanococci,0,
1,64091,Halobacterium salinarum NRC-1,no rank,complete genome,2014239,NC_002607.1,"NC_002607.1 Halobacterium sp. NRC-1, complete ...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,archaea/GCF_000006805.1/GCF_000006805.1_ASM680...,2157,Archaea,2235,Halobacteriales,2236,Halobacteriaceae,2239,Halobacterium,2242,Halobacterium salinarum,28890,Euryarchaeota,183963,Halobacteria,0,
2,64091,Halobacterium salinarum NRC-1,no rank,plasmid,191346,NC_001869.1,NC_001869.1 Halobacterium sp. NRC-1 plasmid pN...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,archaea/GCF_000006805.1/GCF_000006805.1_ASM680...,2157,Archaea,2235,Halobacteriales,2236,Halobacteriaceae,2239,Halobacterium,2242,Halobacterium salinarum,28890,Euryarchaeota,183963,Halobacteria,0,
3,64091,Halobacterium salinarum NRC-1,no rank,complete genome,365425,NC_002608.1,NC_002608.1 Halobacterium sp. NRC-1 plasmid pN...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,archaea/GCF_000006805.1/GCF_000006805.1_ASM680...,2157,Archaea,2235,Halobacteriales,2236,Halobacteriaceae,2239,Halobacterium,2242,Halobacterium salinarum,28890,Euryarchaeota,183963,Halobacteria,0,
4,273057,Saccharolobus solfataricus P2,no rank,complete genome,2992245,NC_002754.1,"NC_002754.1 Sulfolobus solfataricus P2, comple...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,archaea/GCF_000007005.1/GCF_000007005.1_ASM700...,2157,Archaea,2281,Sulfolobales,2287,Saccharolobus solfataricus,28889,Crenarchaeota,118883,Sulfolobaceae,183924,Thermoprotei,2100760,Saccharolobus,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
578751,10090,Mus musculus,species,contig,264822,NT_166293.1,NT_166293.1 Mus musculus strain 129S7/SvEvBrd-...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,vertebrate_mammalian/GCF_000001635.26/GCF_0000...,2759,Eukaryota,7711,Chordata,9989,Rodentia,10066,Muridae,10088,Mus,10090,Mus musculus,33208,Metazoa,40674,Mammalia
578752,10090,Mus musculus,species,contig,191695,NT_166294.1,NT_166294.1 Mus musculus strain 129S7/SvEvBrd-...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,vertebrate_mammalian/GCF_000001635.26/GCF_0000...,2759,Eukaryota,7711,Chordata,9989,Rodentia,10066,Muridae,10088,Mus,10090,Mus musculus,33208,Metazoa,40674,Mammalia
578753,10090,Mus musculus,species,contig,99963,NT_166322.1,NT_166322.1 Mus musculus strain 129S7/SvEvBrd-...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,vertebrate_mammalian/GCF_000001635.26/GCF_0000...,2759,Eukaryota,7711,Chordata,9989,Rodentia,10066,Muridae,10088,Mus,10090,Mus musculus,33208,Metazoa,40674,Mammalia
578754,10090,Mus musculus,species,contig,89177,NT_187001.1,NT_187001.1 Mus musculus strain 129S7/SvEvBrd-...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,vertebrate_mammalian/GCF_000001635.26/GCF_0000...,2759,Eukaryota,7711,Chordata,9989,Rodentia,10066,Muridae,10088,Mus,10090,Mus musculus,33208,Metazoa,40674,Mammalia


In [15]:
df.to_pickle(path_ncbi_pd)

# Need to fix the species name

In [None]:
from ete3 import NCBITaxa
ncbi = NCBITaxa()

ncbi.get_fuzzy_name_translation("Clostridioides difficile")

ncbi.get_name_translator(["Clostridioides difficile"])

# End

In [206]:
headers = ["taxon_id", "taxon_name", "rank", "type", "length", "record_id", "record_description",
           "path_folder", "path_file", ]
[headers.extend([f"id_{v}", f"name_{v}"]) for v in desired_ranks]

In [234]:
"['" + "', '".join("taxon_name	rank	type	length	record_id	record_description	path_folder	path_file	id_superkingdom	name_superkingdom	id_kingdom	name_kingdom	id_phylum	name_phylum	id_class	name_class	id_order	name_order	id_family	name_family	id_genus	name_genus	id_species	name_species".split("\t")) + "']"

"['taxon_name', 'rank', 'type', 'length', 'record_id', 'record_description', 'path_folder', 'path_file', 'id_superkingdom', 'name_superkingdom', 'id_kingdom', 'name_kingdom', 'id_phylum', 'name_phylum', 'id_class', 'name_class', 'id_order', 'name_order', 'id_family', 'name_family', 'id_genus', 'name_genus', 'id_species', 'name_species']"

In [213]:
record_data

(272569,
 'Haloarcula marismortui ATCC 43049',
 'no rank',
 'plasmid',
 410554,
 'NC_006395.1',
 'NC_006395.1 Haloarcula marismortui ATCC 43049 plasmid pNG700, complete sequence',
 '/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/',
 'archaea/GCF_000011085.1/GCF_000011085.1_ASM1108v1_genomic.fna',
 2157,
 'Archaea',
 2235,
 'Halobacteriales',
 2237,
 'Haloarcula',
 2238,
 'Haloarcula marismortui',
 28890,
 'Euryarchaeota',
 183963,
 'Halobacteria',
 1963268,
 'Haloarculaceae')

In [114]:
path_file = "/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/bacteria/GCF_000006845.1/GCF_000006845.1_ASM684v1_genomic.fna"

In [56]:
record

SeqRecord(seq=Seq('AATTTAAAGATTAAAATTAGTAGACTGTCGATTTACAATATCATATTTATGAGT...TAA', SingleLetterAlphabet()), id='NC_014222.1', name='NC_014222.1', description='NC_014222.1 Methanococcus voltae A3, complete genome', dbxrefs=[])

In [59]:
len(record.seq)

1936387

In [85]:
record.seq.upper()

Seq('AATTTAAAGATTAAAATTAGTAGACTGTCGATTTACAATATCATATTTATGAGT...TAA', SingleLetterAlphabet())

In [73]:
print(record)

ID: NC_014222.1
Name: NC_014222.1
Description: NC_014222.1 Methanococcus voltae A3, complete genome
Number of features: 0
Seq('AATTTAAAGATTAAAATTAGTAGACTGTCGATTTACAATATCATATTTATGAGT...TAA', SingleLetterAlphabet())


In [62]:
dir(record)

['__add__',
 '__bool__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__le___',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_per_letter_annotations',
 '_seq',
 '_set_per_letter_annotations',
 '_set_seq',
 'annotations',
 'dbxrefs',
 'description',
 'features',
 'format',
 'id',
 'letter_annotations',
 'lower',
 'name',
 'reverse_complement',
 'seq',
 'translate',
 'upper']

In [86]:
record.description

'NC_014222.1 Methanococcus voltae A3, complete genome'

In [176]:
get_desired_ranks(2, desired_ranks)

{'superkingdom_id': 2,
 'kingdom_id': 'NaN',
 'phylum_id': 'NaN',
 'class_id': 'NaN',
 'order_id': 'NaN',
 'family_id': 'NaN',
 'genus_id': 'NaN',
 'species_id': 'NaN'}

In [154]:
get_desired_ranks(9606, desired_ranks, tolist=True)

[2759, 33208, 7711, 40674, 9443, 9604, 9605, 9606]

In [155]:
ncbi.translate_to_names(get_desired_ranks(9606, desired_ranks, tolist=True))

['Eukaryota',
 'Metazoa',
 'Chordata',
 'Mammalia',
 'Primates',
 'Hominidae',
 'Homo',
 'Homo sapiens']

In [184]:
ranks_id_str = ncbi.get_taxid_translator(get_desired_ranks(6, desired_ranks, tolist=True))
ranks_id_str

{2: 'Bacteria',
 6: 'Azorhizobium',
 356: 'Rhizobiales',
 1224: 'Proteobacteria',
 28211: 'Alphaproteobacteria',
 335928: 'Xanthobacteraceae'}

In [185]:
l=[]
[l.extend([k,v]) for k,v in ranks_id_str.items()]

[None, None, None, None, None, None]

In [186]:
l

[2,
 'Bacteria',
 6,
 'Azorhizobium',
 356,
 'Rhizobiales',
 1224,
 'Proteobacteria',
 28211,
 'Alphaproteobacteria',
 335928,
 'Xanthobacteraceae']