# Imports, settings and paths

In [1]:
import pandas as pd
import os
from copy import deepcopy
from random import randint

In [2]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import SingleLetterAlphabet
from tqdm import tqdm_notebook as tqdm
from ete3 import NCBITaxa

In [3]:
# path_ncbi = "/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq"
path_ncbi_csv = "/home/ubuntu/Disks/HDD1000/NCBI/ncbi.csv"
path_ncbi_pd = "/home/ubuntu/Disks/HDD1000/NCBI/ncbi_2019-11-26.pd"

In [4]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 500)

In [5]:
# Settings
nb_species = 20
nb_reads = 100000
read_length = (1000, 10000)

In [6]:
ncbi = NCBITaxa()

# SyntRead Class

In [7]:
class SyntRead:
    count = 0
    
    def __init__(self, record_id, taxon, genome, what, s_kingdom, path_complete, description):  # folder, file,
        self.object_id    = SyntRead.counter()
        self.record_id    = record_id
        self.taxon        = taxon
        self.genome       = genome
        self.gen_chr_plas = what
        self.s_kingdom    = s_kingdom
        self.path         = path_complete  # os.path.join(folder, file)
        self.description  = description
        self.description_original = description
        
        self.seq          = None
        self.seq_len      = None
        self.start        = None
        self.stop         = None
        self.read_len     = None
        self.read         = None 
        
    def counter():
        SyntRead.count += 1
        return SyntRead.count - 1
        
    def synthetize_read(self, seq="", synt_length=0):
        if synt_length == 0:
            synt_length = randint(read_length[0], read_length[1])
        if seq == "":
            records = list(SeqIO.parse(self.path, "fasta"))
            for r in records:
                if r.id == self.record_id:
                    seq = str(r.seq)
                    break
                else:
                    raise FileNotFoundError("File error, couldn't find the right record")
            self.seq_len = len(seq)
            
        self.read_len  = synt_length
        self.start     = randint(0, self.seq_len - self.read_len)
        self.stop      = self.start + self.read_len
        self.read      = seq[self.start:self.stop]
        self.update_description()
        
    def deep_random_copy(self, n):
        seq = self.fetch_seq()
        l = []
        for i in range(n):
            new_item = deepcopy(self)
            new_item.object_id = SyntRead.counter()
            new_item.synthetize_read(seq)
            l.append(new_item)
        return l
    
    def update_description(self):
        self.description = self.description_original \
            + f", synthetic read (sr_id={self.object_id}) from genome (tax_id={self.taxon}), "\
              f"from nucleotide {self.start} to {self.stop} (len={self.read_len})"
        
    def fetch_seq(self):
        records = list(SeqIO.parse(self.path, "fasta"))
        for r in records:
            if r.id == self.record_id:
                seq = str(r.seq)
                break
            else:
                raise FileNotFoundError("File error, couldn't find the right record")
        return seq
    
    def __repr__(self):
        return f"Synthetic read <{self.object_id}> from {self.genome}, len={self.read_len}, [{self.start}:{self.stop}]"
        
    def save_ground_truth(list_sr, path_pandas):
        if os.path.isfile(path_pandas): input(f"The file {path_pandas} already exists, continue ? ")
        rows = []
        for sr in tqdm(list_sr):
            rows.append([sr.taxon, sr.genome, sr.gen_chr_plas, sr.read_len, sr.start, sr.stop,
                         sr.record_id, sr.object_id, sr.description, sr.path, ])
        df = pd.DataFrame(rows, columns=["taxon", "genome", "gen_chr_plas", "read_len", "start", "stop", 
                                    "record_id", "object_id", "description", "path"])
        for col in ["taxon", "genome", "gen_chr_plas", "record_id", "description", "path"]:
            df[col] = df[col].astype('category')
        df.to_pickle(path_pandas)
        return df
        
    def to_fastq(list_sr, path_fastq):
        if os.path.isfile(path_fastq): input(f"The file {path_fastq} already exists, continue ? ")
        records = []
        for sr in tqdm(list_sr):
            records.append(SeqRecord(Seq(sr.read, SingleLetterAlphabet), 
                                     id=sr.record_id, name=sr.genome, description=sr.description))
        with open(path_fastq, "w") as f:
            SeqIO.write(records, f, "fasta")

# Create List of SyntReads

In [8]:
df = pd.read_pickle(path_ncbi_pd)

In [9]:
print(df.shape, df.taxon_id.unique().shape)

(1923707, 25) (15545,)


In [10]:
filtered = df[(df.type == "complete genome") 
               & ( ((df.name_superkingdom == "Bacteria") & (df.length > 130000))
                 | ((df.name_superkingdom == "Archea") & (df.length > 800000))
                 | ((df.name_superkingdom == "Virus") & (df.length > 800)))]
filtered.shape

(4424, 25)

## Select particular species for gut microbia
Based on https://en.wikipedia.org/wiki/Human_gastrointestinal_microbiota#Composition <br>


In [11]:
filtered[filtered.id_genus == 286    ]["id_species"].value_counts()[:3]

287    72
303    19
294    12
Name: id_species, dtype: int64

In [12]:
filtered[filtered.id_species == 823]

Unnamed: 0,taxon_id,taxon_name,rank,type,length,record_id,record_description,path_folder,path_file,id_superkingdom,id_kingdom,id_phylum,id_class,id_order,id_family,id_genus,id_species,name_superkingdom,name_kingdom,name_phylum,name_class,name_order,name_family,name_genus,name_species
978,435591,Parabacteroides distasonis ATCC 8503,no rank,complete genome,4811379,NC_009615.1,NC_009615.1 Parabacteroides distasonis ATCC 85...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000012845.1/GCF_000012845.1_ASM12...,2,0,976,200643,171549,2005525,375288,823,Bacteria,0,Bacteroidetes,Bacteroidia,Bacteroidales,Tannerellaceae,Parabacteroides,Parabacteroides distasonis


In [13]:
gut_ids_always = [1351, 562, 1280, 573 ]  # Take 2 strains each
gut_ids_often = [817, 550, 216816, 1590]   # 1 strain each
gut_ids_rare = [1502, 584, 287, 823]

rows = []
for gut_id in gut_ids_always:
    print(ncbi.get_taxid_translator([gut_id]), filtered[filtered.id_species == gut_id].shape[0])
    rows.append(filtered[filtered.id_species == gut_id].sample(3))
for gut_id in gut_ids_often + gut_ids_rare:
    print(ncbi.get_taxid_translator([gut_id]), filtered[filtered.id_species == gut_id].shape[0])
    rows.append(filtered[filtered.id_species == gut_id].sample(1))

selected_species = pd.concat(rows)
selected_species

{1351: 'Enterococcus faecalis'} 11
{562: 'Escherichia coli'} 264
{1280: 'Staphylococcus aureus'} 144
{573: 'Klebsiella pneumoniae'} 94
{817: 'Bacteroides fragilis'} 3
{550: 'Enterobacter cloacae'} 16
{216816: 'Bifidobacterium longum'} 17
{1590: 'Lactobacillus plantarum'} 21
{1502: 'Clostridium perfringens'} 7
{584: 'Proteus mirabilis'} 8
{287: 'Pseudomonas aeruginosa'} 72
{823: 'Parabacteroides distasonis'} 1


Unnamed: 0,taxon_id,taxon_name,rank,type,length,record_id,record_description,path_folder,path_file,id_superkingdom,id_kingdom,id_phylum,id_class,id_order,id_family,id_genus,id_species,name_superkingdom,name_kingdom,name_phylum,name_class,name_order,name_family,name_genus,name_species
4100,1261557,Enterococcus faecalis str. Symbioflor 1,no rank,complete genome,2810675,NC_019770.1,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000317915.1/GCF_000317915.1_ASM31...,2,0,1239,91061,186826,81852,1350,1351,Bacteria,0,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus faecalis
6635,1201292,Enterococcus faecalis ATCC 29212,no rank,complete genome,2939973,NZ_CP008816.1,NZ_CP008816.1 Enterococcus faecalis ATCC 29212...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000742975.1/GCF_000742975.1_ASM74...,2,0,1239,91061,186826,81852,1350,1351,Bacteria,0,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus faecalis
22108,1351,Enterococcus faecalis,species,complete genome,2893216,NZ_AP018538.1,NZ_AP018538.1 Enterococcus faecalis KUB3006 DN...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_003966385.1/GCF_003966385.1_ASM39...,2,0,1239,91061,186826,81852,1350,1351,Bacteria,0,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus faecalis
12034,562,Escherichia coli,species,complete genome,4740150,NZ_CP010242.1,"NZ_CP010242.1 Escherichia coli strain S56, com...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001901445.1/GCF_001901445.1_ASM19...,2,0,1224,1236,91347,543,561,562,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
11981,562,Escherichia coli,species,complete genome,4940434,NZ_CP010183.1,"NZ_CP010183.1 Escherichia coli strain M3, comp...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001901045.1/GCF_001901045.1_ASM19...,2,0,1224,1236,91347,543,561,562,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
6222,1248903,Escherichia coli O145:H28 str. RM12761,no rank,complete genome,5402281,NZ_CP007133.1,NZ_CP007133.1 Escherichia coli O145:H28 str. R...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000662395.1/GCF_000662395.1_ASM66...,2,0,1224,1236,91347,543,561,562,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
3002,1006543,Staphylococcus aureus subsp. aureus T0131,no rank,complete genome,2913900,NC_017347.1,NC_017347.1 Staphylococcus aureus subsp. aureu...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000204665.1/GCF_000204665.1_ASM20...,2,0,1239,91061,1385,90964,1279,1280,Bacteria,0,Firmicutes,Bacilli,Bacillales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus
13063,1280,Staphylococcus aureus,species,complete genome,2919720,NZ_CP014409.1,NZ_CP014409.1 Staphylococcus aureus strain USA...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_002000645.1/GCF_002000645.1_ASM20...,2,0,1239,91061,1385,90964,1279,1280,Bacteria,0,Firmicutes,Bacilli,Bacillales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus
9541,1280,Staphylococcus aureus,species,complete genome,2815416,NZ_CP010998.1,NZ_CP010998.1 Staphylococcus aureus strain FOR...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001580495.1/GCF_001580495.1_ASM15...,2,0,1239,91061,1385,90964,1279,1280,Bacteria,0,Firmicutes,Bacilli,Bacillales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus
14416,573,Klebsiella pneumoniae,species,complete genome,5382888,NZ_CP021549.1,NZ_CP021549.1 Klebsiella pneumoniae strain AR_...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_002180155.1/GCF_002180155.1_ASM21...,2,0,1224,1236,91347,543,570,573,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Klebsiella,Klebsiella pneumoniae


In [14]:
selected_species["path"] = selected_species.path_folder.str.cat(selected_species.path_file)
selected_species.drop(columns=["path_folder", "path_file"], inplace=True)
# selected_species

In [15]:
seeds = []
for row in tqdm(selected_species.itertuples()):
#     print(row.taxon_id)
    #  record_id, taxon, what, s_kingdom, folder, file    row["type"]
    #  record_id, taxon, genome, what, s_kingdom, path_complete, description
    sr = SyntRead(row.record_id, row.taxon_id, row.taxon_name, row.type, row.name_superkingdom, row.path, row.record_description)
    sr.synthetize_read()
    print(sr)
    seeds.append(sr)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Synthetic read <0> from Enterococcus faecalis str. Symbioflor 1, len=9676, [2588160:2597836]
Synthetic read <1> from Enterococcus faecalis ATCC 29212, len=9959, [445317:455276]
Synthetic read <2> from Enterococcus faecalis, len=3232, [415174:418406]
Synthetic read <3> from Escherichia coli, len=1093, [1893159:1894252]
Synthetic read <4> from Escherichia coli, len=2099, [4032453:4034552]
Synthetic read <5> from Escherichia coli O145:H28 str. RM12761, len=9719, [5378000:5387719]
Synthetic read <6> from Staphylococcus aureus subsp. aureus T0131, len=7474, [1420251:1427725]
Synthetic read <7> from Staphylococcus aureus, len=9130, [2821215:2830345]
Synthetic read <8> from Staphylococcus aureus, len=7359, [763710:771069]
Synthetic read <9> from Klebsiella pneumoniae, len=2631, [1733773:1736404]
Synthetic read <10> from Klebsiella pneumoniae subsp. pneumoniae 1158, len=3132, [2084278:2087410]
Synthetic read <11> from Klebsiella pneumoniae, len=9174, [3949877:3959051]
Synthetic read <12> from 

In [16]:
synt_reads = []
for seed in tqdm(seeds):
    synt_reads.extend([seed] + seed.deep_random_copy(int(nb_reads/len(seeds)) -1))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))




In [17]:
path_out_syntreads = f"/home/ubuntu/Data/Segmentation/Test-Data/Synthetic_from_Genomes/" \
                     f"2019-11-26_{nb_reads}-SyntReads_{len(seeds)}-BacGut.fastq"
SyntRead.to_fastq(synt_reads, path_out_syntreads)
print("output at: ", path_out_syntreads)

The file /home/ubuntu/Data/Segmentation/Test-Data/Synthetic_from_Genomes/2019-11-26_100000-SyntReads_20-BacGut.fastq already exists, continue ? 


HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


output at:  /home/ubuntu/Data/Segmentation/Test-Data/Synthetic_from_Genomes/2019-11-26_100000-SyntReads_20-BacGut.fastq


In [18]:
path_out_ground_truth = f"/home/ubuntu/Data/Segmentation/Test-Data/Synthetic_from_Genomes/" \
                     f"2019-11-26_{nb_reads}-SyntReads_{len(seeds)}-BacGut.pd"
df = SyntRead.save_ground_truth(synt_reads, path_out_ground_truth)

The file /home/ubuntu/Data/Segmentation/Test-Data/Synthetic_from_Genomes/2019-11-26_100000-SyntReads_20-BacGut.pd already exists, continue ? 


HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




In [19]:
f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB"

'29.3 MB'

In [20]:
df

Unnamed: 0,taxon,genome,gen_chr_plas,read_len,start,stop,record_id,object_id,description,path
0,1261557,Enterococcus faecalis str. Symbioflor 1,complete genome,9676,2588160,2597836,NC_019770.1,0,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
1,1261557,Enterococcus faecalis str. Symbioflor 1,complete genome,6270,940429,946699,NC_019770.1,20,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
2,1261557,Enterococcus faecalis str. Symbioflor 1,complete genome,7470,1466599,1474069,NC_019770.1,21,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
3,1261557,Enterococcus faecalis str. Symbioflor 1,complete genome,3294,184431,187725,NC_019770.1,22,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
4,1261557,Enterococcus faecalis str. Symbioflor 1,complete genome,2879,844312,847191,NC_019770.1,23,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
...,...,...,...,...,...,...,...,...,...,...
99995,435591,Parabacteroides distasonis ATCC 8503,complete genome,2287,98603,100890,NC_009615.1,99995,NC_009615.1 Parabacteroides distasonis ATCC 85...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
99996,435591,Parabacteroides distasonis ATCC 8503,complete genome,7327,3373009,3380336,NC_009615.1,99996,NC_009615.1 Parabacteroides distasonis ATCC 85...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
99997,435591,Parabacteroides distasonis ATCC 8503,complete genome,9824,3633633,3643457,NC_009615.1,99997,NC_009615.1 Parabacteroides distasonis ATCC 85...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
99998,435591,Parabacteroides distasonis ATCC 8503,complete genome,4971,2241208,2246179,NC_009615.1,99998,NC_009615.1 Parabacteroides distasonis ATCC 85...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...


# End

In [79]:
# df.info(memory_usage='deep')
f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB"

'461.5 MB'

In [39]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 500)

In [76]:
df[(df["type"] == "undefined") 
   & ~(df.record_description.str.lower().str.contains("drosophila melanogaster")) 
   & ~(df.record_description.str.lower().str.contains("plasmid")) 
   & ~(df.record_description.str.lower().str.contains("scaffold")) 
   & ~(df.record_description.str.lower().str.contains("whole genome shotgun sequence")) 
   & ~(df.record_description.str.lower().str.contains("contig"))][["record_description", "length"]] 

Unnamed: 0,record_description,length
27,"NC_005791.1 Methanococcus maripaludis strain S2, complete sequence",1661137
236,NC_023044.1 Methanobacterium sp. MB1 complete sequence,2029766
249,NZ_CP007551.1 Haloferax mediterranei ATCC 33500 genome,2946877
257,NZ_CP008822.1 Metallosphaera sedula strain CuR1 genome,2191492
327,NZ_CP013695.1 Sulfolobus acidocaldarius strain NG05B_CO5_07 genome,2217426
356,NZ_CP019470.1 Methanopyrus sp. KOL6 genome,1430309
368,NZ_CP015193.1 Complete genome sequence of Thermococcus chitonophagus type strain GC74,1961979
379,NZ_CP017881.1 Methanohalophilus portucalensis strain FDF-1T genome,2084975
525,NC_002655.2 Escherichia coli O157:H7 str. EDL933 genome,5528445
595,"NC_002940.2 [Haemophilus] ducreyi 35000HP, complete sequence",1698955


PD from CSV

In [76]:
# path_ncbi_csv
df = pd.read_csv("/home/ubuntu/Disks/HDD1000/NCBI/ncbi_2019-11-26.csv", sep="\t", )

  interactivity=interactivity, compiler=compiler, result=result)


In [77]:
# df.info(memory_usage='deep')
f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB"

'1137.2 MB'

In [78]:
df.shape

(877820, 25)

In [79]:
for col in df.columns:
    if "id_" in col:
        print(col)
        df[col].fillna(0, inplace=True)
        df[col] = df[col].astype(int)

id_superkingdom
id_kingdom
id_phylum
id_class
id_order
id_family
id_genus
id_species


In [80]:
col_categories = ['rank', 'type', 'path_folder', 
                  'id_superkingdom', 'name_superkingdom', 'id_kingdom', 'name_kingdom', 'id_phylum', 'name_phylum', 
                  'id_class', 'name_class', 'id_order', 'name_order', 'id_family', 'name_family', 
                  'id_genus', 'name_genus', 'id_species', 'name_species']

for col in tqdm(col_categories):
    df[col] = df[col].astype('category')
print(f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB")

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))


438.1 MB


In [81]:
# path_ncbi_pd
df.to_pickle('/home/ubuntu/Disks/HDD1000/NCBI/ncbi_2019-11-26.pd')

In [6]:
desired_ranks = ['superkingdom', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
headers = ["taxon_id", "taxon_name", "rank", "type", "length", 
           "record_id", "record_description", "path_folder", "path_file", ]
[headers.extend([f"id_{v}", f"name_{v}"]) for v in desired_ranks]
headers

['taxon_id',
 'taxon_name',
 'rank',
 'type',
 'length',
 'record_id',
 'record_description',
 'path_folder',
 'path_file',
 'id_superkingdom',
 'name_superkingdom',
 'id_kingdom',
 'name_kingdom',
 'id_phylum',
 'name_phylum',
 'id_class',
 'name_class',
 'id_order',
 'name_order',
 'id_family',
 'name_family',
 'id_genus',
 'name_genus',
 'id_species',
 'name_species']

# Read the csv and transform to pandas, plus column dtype conversion

Might take a while if file is big...

# End

In [144]:
path_file = "/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/bacteria/GCF_000006845.1/GCF_000006845.1_ASM684v1_genomic.fna"

In [150]:
record = SeqIO.parse(path_file, "fasta")

In [151]:
r = next(record)
r

SeqRecord(seq=Seq('ATAAATTTTTGCACGGGTTGTGGATAAAATATCGGCGAGTCGGTATAATCGGTT...TGG', SingleLetterAlphabet()), id='NC_002946.2', name='NC_002946.2', description='NC_002946.2 Neisseria gonorrhoeae FA 1090 chromosome, complete genome', dbxrefs=[])

In [59]:
len(record.seq)

1936387

In [85]:
record.seq.upper()

Seq('AATTTAAAGATTAAAATTAGTAGACTGTCGATTTACAATATCATATTTATGAGT...TAA', SingleLetterAlphabet())

In [73]:
print(record)

ID: NC_014222.1
Name: NC_014222.1
Description: NC_014222.1 Methanococcus voltae A3, complete genome
Number of features: 0
Seq('AATTTAAAGATTAAAATTAGTAGACTGTCGATTTACAATATCATATTTATGAGT...TAA', SingleLetterAlphabet())


In [62]:
dir(record)

['__add__',
 '__bool__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__le___',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_per_letter_annotations',
 '_seq',
 '_set_per_letter_annotations',
 '_set_seq',
 'annotations',
 'dbxrefs',
 'description',
 'features',
 'format',
 'id',
 'letter_annotations',
 'lower',
 'name',
 'reverse_complement',
 'seq',
 'translate',
 'upper']

In [86]:
record.description

'NC_014222.1 Methanococcus voltae A3, complete genome'

In [176]:
get_desired_ranks(2, desired_ranks)

{'superkingdom_id': 2,
 'kingdom_id': 'NaN',
 'phylum_id': 'NaN',
 'class_id': 'NaN',
 'order_id': 'NaN',
 'family_id': 'NaN',
 'genus_id': 'NaN',
 'species_id': 'NaN'}

In [154]:
get_desired_ranks(9606, desired_ranks, tolist=True)

[2759, 33208, 7711, 40674, 9443, 9604, 9605, 9606]

In [155]:
ncbi.translate_to_names(get_desired_ranks(9606, desired_ranks, tolist=True))

['Eukaryota',
 'Metazoa',
 'Chordata',
 'Mammalia',
 'Primates',
 'Hominidae',
 'Homo',
 'Homo sapiens']

In [184]:
ranks_id_str = ncbi.get_taxid_translator(get_desired_ranks(6, desired_ranks, tolist=True))
ranks_id_str

{2: 'Bacteria',
 6: 'Azorhizobium',
 356: 'Rhizobiales',
 1224: 'Proteobacteria',
 28211: 'Alphaproteobacteria',
 335928: 'Xanthobacteraceae'}

In [185]:
l=[]
[l.extend([k,v]) for k,v in ranks_id_str.items()]

[None, None, None, None, None, None]

In [186]:
l

[2,
 'Bacteria',
 6,
 'Azorhizobium',
 356,
 'Rhizobiales',
 1224,
 'Proteobacteria',
 28211,
 'Alphaproteobacteria',
 335928,
 'Xanthobacteraceae']