In [22]:
import os
import pandas as pd
import numpy as np
import glob
import re

In [23]:
os.chdir('/fs/scratch/PAS0439/Ming/databases/gtdb_R207_repre_genomes/taxdump')

In [24]:
names = pd.read_csv('names.dmp', sep='\t', header=None, names = ['col' + str(f) for f in range(8)])

In [25]:
names.rename(columns = {'col0': 'taxid', 'col2':'name'}, inplace = True)

In [26]:
nodes = pd.read_csv('nodes.dmp', sep='\t', header=None, names = ['col' + str(f) for f in range(26)])

In [27]:
nodes.rename(columns = {'col0':'taxid', 'col2':'parent_id', 'col4':'rank'}, inplace = True)

In [28]:
taxa = pd.read_csv('taxa_taxid_mapping.txt', sep='\t', header = None, names = ['taxid', 'taxa', 'rank'])

In [29]:
rumen_mags = pd.read_csv('/fs/scratch/PAS0439/Ming/databases/rumen_mags_high_quality/mags_taxa.tsv', sep='\t')

In [30]:
def rename_unclassfied(x):
    renamed = re.search(r'[a-z]__(.*)', str(x))
    if renamed == None:
        return 'Unclassified'
    elif renamed.group(1) == '':
        return "Unclassified"
    else:
        return renamed.group(1)

In [31]:
rumen_mags[['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genera', 'Specie']] = rumen_mags[['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genera', 'Specie']].applymap(rename_unclassfied)

In [32]:
def rename_taxa(x):
    taxa = x.split(';')[-1]
    return taxa
    

In [33]:
taxa.taxa = taxa.taxa.apply(rename_taxa)

In [34]:
Unclassified_class = [f for f in set(rumen_mags['Class']) - set(taxa.query('rank == "class"')['taxa']) if f != "Unclassified"]
Unclassified_order = [f for f in set(rumen_mags['Order']) - set(taxa.query('rank == "order"')['taxa']) if f != "Unclassified"]
Unclassified_family = [f for f in set(rumen_mags['Family']) - set(taxa.query('rank == "family"')['taxa']) if f != "Unclassified"]
Unclassified_genera = [f for f in set(rumen_mags['Genera']) - set(taxa.query('rank == "genus"')['taxa']) if f != "Unclassified"]
Unclassified_specie = [f for f in set(rumen_mags['Specie']) - set(taxa.query('rank == "species"')['taxa']) if f != "Unclassified"]

In [35]:
## rumen mags include MAGs which do not have representative taxa in the representative GTDB R207 representative genomes
#rumen_mags.loc[list(rumen_mags[rumen_mags['Class'].isin(Unclassified_class)].index), 'Class'] = "Unclassified"
#rumen_mags.loc[list(rumen_mags[rumen_mags['Order'].isin(Unclassified_order)].index), 'Order'] = "Unclassified"
#rumen_mags.loc[list(rumen_mags[rumen_mags['Family'].isin(Unclassified_family)].index), 'Family'] = "Unclassified"
#rumen_mags.loc[list(rumen_mags[rumen_mags['Genera'].isin(Unclassified_genera)].index), 'Genera'] = "Unclassified"
#rumen_mags.loc[list(rumen_mags[rumen_mags['Specie'].isin(Unclassified_specie)].index), 'Specie'] = "Unclassified"


In [36]:
unclassified_class_index = rumen_mags[rumen_mags.Class.isin(Unclassified_class)].index.to_list()
unclassified_order_index = rumen_mags[rumen_mags.Order.isin(Unclassified_order)].index.to_list()
unclassified_family_index = rumen_mags[rumen_mags.Family.isin(Unclassified_family)].index.to_list()
unclassified_genera_index = rumen_mags[rumen_mags.Genera.isin(Unclassified_genera)].index.to_list()
unclassified_specie_index = rumen_mags[rumen_mags.Specie.isin(Unclassified_specie)].index.to_list()
unclassified_index = unclassified_class_index + unclassified_order_index + unclassified_family_index + unclassified_genera_index + unclassified_specie_index

In [37]:
unclassified_index = list(set(unclassified_index))
rumen_mags_under_represented = rumen_mags.loc[unclassified_index,]

In [38]:
new_taxa = len(rumen_mags.query('Specie == "Unclassified"'))
print(f'The number of new taxid to assign is {new_taxa}')

The number of new taxid to assign is 2175


In [39]:
max_taxid = max(list(taxa['taxid']))
print(f'The largest existing taxid is {max_taxid}')
print(f'New taxid increment by 1 from {max_taxid}')

The largest existing taxid is 4294860253
New taxid increment by 1 from 4294860253


In [40]:
# rumen mags include MAGs which do not have representative taxa in the representative GTDB R207 representative genomes
# add them to the taxa list
def add_taxid():
    new_taxid = int(4294860254) 
    index = int(84273)
    
   
        
    for Class in Unclassified_class:
        taxid = new_taxid
        assigned_taxa = Class
        rank = "class"
        taxa.loc[index, "taxid"] = int(4294860254)
        taxa.loc[index, "taxa"] = assigned_taxa
        taxa.loc[index, "rank"] = rank
        new_taxid += 1
        index += 1
        
    for Order in Unclassified_order:
        taxid = new_taxid
        assigned_taxa = Order
        rank = "order"
        taxa.loc[index, "taxid"] = int(new_taxid)
        taxa.loc[index, "taxa"] = assigned_taxa
        taxa.loc[index, "rank"] = rank
        new_taxid += 1
        index += 1
    
    for Family in Unclassified_family:
        taxid = new_taxid
        assigned_taxa = Family
        rank = "family"
        taxa.loc[index, "taxid"] = int(new_taxid)
        taxa.loc[index, "taxa"] = assigned_taxa
        taxa.loc[index, "rank"] = rank
        new_taxid += 1
        index += 1
        
    for Genera in Unclassified_genera:
        taxid = new_taxid
        assigned_taxa = Genera
        rank = "genus"
        taxa.loc[index, "taxid"] = int(new_taxid)
        taxa.loc[index, "taxa"] = assigned_taxa
        taxa.loc[index, "rank"] = rank
        new_taxid += 1
        index += 1
        
    for Specie in Unclassified_specie:
        taxid = new_taxid
        assigned_taxa = Specie
        rank = "geus"
        taxa.loc[index, "taxid"] = int(new_taxid)
        taxa.loc[index, "taxa"] = assigned_taxa
        taxa.loc[index, "rank"] = rank
        new_taxid += 1
        index += 1
        
    print("the next taxid is " + str(new_taxid))
    taxa.taxid = [int(f) for f in taxa.taxid]           
    #taxa.to_csv("/fs/scratch/PAS0439/Ming/databases/rumen_mags_high_quality/taxadump/taxa_supplement.txt", sep="\t", index = None)
    
add_taxid()                     
            
                    
                    
            

the next taxid is 4294860313


In [41]:
rumen_mags_taxid = {}
def get_taxid():
   
    
    new_taxid = 4294860313    
    names_dmp = open('/fs/scratch/PAS0439/Ming/databases/rumen_mags_high_quality/taxadump/names_supplement.txt', 'w') 
    nodes_dmp = open('/fs/scratch/PAS0439/Ming/databases/rumen_mags_high_quality/taxadump/nodes_supplement.txt', 'w')
    
    for index, row in rumen_mags.iterrows():
        genome = row['user_genome']
        specie = row['Specie']
        genera = row['Genera']
        family = row['Family']
        order = row['Order']
        myclass = row['Class']
        phylum = row['Phylum']
        domain = row['Domain']
        if phylum == "Unclassified":
            if domain == "Bacteria":
                parent_taxid = 609216830
            else:
                parent_taxid = 2587168575

            new_taxid += 1
            new_taxid = int(new_taxid)
            rumen_mags_taxid[genome] = new_taxid
            names_dmp.write(f'{new_taxid}\t|\t{genome}\t|\t\t|\tscientific name\t|' + '\n')
            nodes_dmp.write(f'{new_taxid}\t|\t{parent_taxid}\t|\tphylum\t|\tXX\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t1\t|\t0\t|\t\t|' + '\n')

        elif myclass == "Unclassified":
            new_taxid += 1
            new_taxid = int(new_taxid)
            phylum_id = list(taxa.query('rank == "phylum"').query('taxa == @phylum')['taxid'])[0]
            parent_taxid = int(phylum_id)
            rumen_mags_taxid[genome] = new_taxid
            names_dmp.write(f'{new_taxid}\t|\t{genome}\t|\t\t|\tscientific name\t|' + '\n')
            nodes_dmp.write(f'{new_taxid}\t|\t{parent_taxid}\t|\tclass\t|\tXX\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t1\t|\t0\t|\t\t|' + '\n')

        elif order == "Unclassified":
            new_taxid += 1
            new_taxid = int(new_taxid)
            class_id = list(taxa.query('rank == "class"').query('taxa == @myclass')['taxid'])[0]
            parent_taxid = int(class_id)
            rumen_mags_taxid[genome] = new_taxid
            names_dmp.write(f'{new_taxid}\t|\t{genome}\t|\t\t|\tscientific name\t|' + '\n')
            nodes_dmp.write(f'{new_taxid}\t|\t{parent_taxid}\t|\torder\t|\tXX\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t1\t|\t0\t|\t\t|' + '\n')

        elif family == "Unclassified":
            new_taxid += 1
            new_taxid = int(new_taxid)
            order_id = list(taxa.query('rank == "order"').query('taxa == @order')['taxid'])[0]
            parent_taxid = int(order_id)
            rumen_mags_taxid[genome] = new_taxid
            names_dmp.write(f'{new_taxid}\t|\t{genome}\t|\t\t|\tscientific name\t|' + '\n')
            nodes_dmp.write(f'{new_taxid}\t|\t{parent_taxid}\t|\tfamily\t|\tXX\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t1\t|\t0\t|\t\t|' + '\n')

        elif genera == "Unclassified":
            new_taxid += 1
            new_taxid = int(new_taxid)
            family_id = list(taxa.query('rank == "family"').query('taxa == @family')['taxid'])[0]
            parent_taxid = int(family_id)
            rumen_mags_taxid[genome] = new_taxid
            names_dmp.write(f'{new_taxid}\t|\t{genome}\t|\t\t|\tscientific name\t|' + '\n')
            nodes_dmp.write(f'{new_taxid}\t|\t{parent_taxid}\t|\tgenus\t|\tXX\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t1\t|\t0\t|\t\t|' + '\n')

        elif specie == "Unclassified":
            new_taxid += 1
            new_taxid = int(new_taxid)
            rumen_mags_taxid[genome] = new_taxid
            genera_id = list(taxa.query('rank == "genus"').query('taxa == @genera')['taxid'])[0]
            parent_taxid = int(genera_id)
            rumen_mags_taxid[genome] = new_taxid
            names_dmp.write(f'{new_taxid}\t|\t{genome}\t|\t\t|\tscientific name\t|' + '\n')
            nodes_dmp.write(f'{new_taxid}\t|\t{parent_taxid}\t|\tspecies\t|\tXX\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t1\t|\t0\t|\t\t|' + '\n')
            
        else:
            specie_id = list(taxa.query('rank == "species"').query('taxa == @specie')['taxid'])[0]
            rumen_mags_taxid[genome] = specie_id
        
            
    names_dmp.close()
    nodes_dmp.close()
    
get_taxid()                     
            
                    
                    
            

In [42]:
## rename genome sequence names
os.chdir('/fs/scratch/PAS0439/Ming/databases/rumen_mags_high_quality/dereplicated_genomes')
samples = glob.glob('*.fasta')

from Bio import SeqIO
for sample in samples:
    
    with open(f'../dereplicated_genomes_with_taxid/{sample}', 'w') as outfile:
        genome = sample.split('.fasta')[0]
        
        records = SeqIO.parse(sample, 'fasta')
        for record in records:
            old_id = record.id
            new_id = old_id + '|kraken:taxid|' + str(rumen_mags_taxid[genome])
            record.id = new_id
            SeqIO.write(record, outfile, 'fasta')

In [43]:
with open('/fs/scratch/PAS0439/Ming/databases/rumen_mags_high_quality/taxadump/nodes_supplement.txt', 'r') as sup:
    with open("/fs/scratch/PAS0439/Ming/databases/gtdb_R207_repre_genomes/taxdump/nodes.dmp", 'r') as gtdb:
        with open('/fs/scratch/PAS0439/Ming/databases/kraken2_gtdb207_rumen_mags_7176/taxonomy/nodes_pre.dmp', 'w') as nodes:
            nodes.write(gtdb.read())
            nodes.write(sup.read())

with open('/fs/scratch/PAS0439/Ming/databases/rumen_mags_high_quality/taxadump/names_supplement.txt', 'r') as names_sup:
    with open("/fs/scratch/PAS0439/Ming/databases/gtdb_R207_repre_genomes/taxdump/names.dmp", 'r') as names_gtdb:
        with open('/fs/scratch/PAS0439/Ming/databases/kraken2_gtdb207_rumen_mags_7176/taxonomy/names_pre.dmp', 'w') as names:
            names.write(names_gtdb.read())
            names.write(names_sup.read())

In [44]:
## solve bracken-build error. See https://github.com/jenniferlu717/Bracken/issues/135 for reasons.
nodes_pre = pd.read_csv('/fs/scratch/PAS0439/Ming/databases/kraken2_gtdb207_rumen_mags_7176/taxonomy/nodes_pre.dmp', sep = "\t", header = None)
nodes_pre.rename(columns = {0:'taxid', 2:'parent_id', 4:'rank'}, inplace = True)
#nodes_pre_list = [f for f in set(nodes_pre["parent_id"]) if f not in set(nodes_pre["taxid"])]

In [45]:
parentid = set(nodes_pre['parent_id'])
taxaid = set(nodes_pre['taxid'])
taxasup = [f for f in parentid if f not in taxaid]

In [46]:
taxasup_df = taxa[taxa.taxid.isin(taxasup)]

In [47]:
taxasup_df.loc[taxasup_df.query('rank == "family"').index, "rank"] = "Family" ## to match with rumen_mags
taxasup_df.loc[taxasup_df.query('rank == "genus"').index, "rank"] = "Genera"
next_taxa_index = taxa.index[-1] + 1
next_taxid = max(taxa.taxid) + 1
print(f"next taxa index is {next_taxa_index}") 
print(f"next taxid is {next_taxid}") 

next taxa index is 84332
next taxid is 4294860313


In [48]:
phylum_list = []
class_list = []
order_list = []
family_list = []
genera_list = []
for index, row in taxasup_df.iterrows():
    
    
    taxa_name = row['taxa']
    rank = row["rank"]
    if rank == "Family":
        family_list.append(taxa_name)
        order = rumen_mags[rumen_mags["Family"] == taxa_name]['Order'].to_list()[0]
        order_list.append(order)
        Class = rumen_mags[rumen_mags["Family"] == taxa_name]['Class'].to_list()[0]
        class_list.append(Class)
        phylum = rumen_mags[rumen_mags["Family"] == taxa_name]['Phylum'].to_list()[0]
        phylum_list.append(phylum)
    
    else:
        genera_list.append(taxa_name)
        family = rumen_mags[rumen_mags["Genera"] == taxa_name]['Family'].to_list()[0]
        family_list.append(family)
        order = rumen_mags[rumen_mags["Genera"] == taxa_name]['Order'].to_list()[0]
        order_list.append(order)
        Class = rumen_mags[rumen_mags["Genera"] == taxa_name]['Class'].to_list()[0]
        class_list.append(Class)
        phylum = rumen_mags[rumen_mags["Genera"] == taxa_name]['Phylum'].to_list()[0]
        phylum_list.append(phylum)
        
    
  
        
    

In [49]:
for phylum in phylum_list:
    if  len(taxa.query('rank == "phylum"').query('taxa == @phylum')["taxid"].to_list()) > 0:
        continue
    else:
        taxa.loc[next_taxa_index, "taxid"] = next_taxid
        taxa.loc[next_taxa_index, "taxa"] = phylum
        taxa.loc[next_taxa_index, "rank"] = "phylum"
        next_taxa_index += 1
        next_taxid += 1
        
for Class in class_list:
    if  len(taxa.query('rank == "class"').query('taxa == @Class')["taxid"].to_list()) > 0:
        continue
    else:
        taxa.loc[next_taxa_index, "taxid"] = next_taxid
        taxa.loc[next_taxa_index, "taxa"] = Class
        taxa.loc[next_taxa_index, "rank"] = "class"
        next_taxa_index += 1
        next_taxid += 1
        
for order in order_list:
    if  len(taxa.query('rank == "order"').query('taxa == @order')["taxid"].to_list()) > 0:
        continue
    else:
        taxa.loc[next_taxa_index, "taxid"] = next_taxid
        taxa.loc[next_taxa_index, "taxa"] = order
        taxa.loc[next_taxa_index, "rank"] = "order"
        next_taxa_index += 1
        next_taxid += 1
        
for family in family_list:
    if  len(taxa.query('rank == "family"').query('taxa == @family')["taxid"].to_list()) > 0:
        continue
    else:
        taxa.loc[next_taxa_index, "taxid"] = next_taxid
        taxa.loc[next_taxa_index, "taxa"] = family
        taxa.loc[next_taxa_index, "rank"] = "family"
        next_taxa_index += 1
        next_taxid += 1
        
for genera in genera_list:
    if  len(taxa.query('rank == "genus"').query('taxa == @genera')["taxid"].to_list()) > 0:
        continue
    else:
        print(genera)
        taxa.loc[next_taxa_index, "taxid"] = next_taxid
        taxa.loc[next_taxa_index, "taxa"] = genera
        taxa.loc[next_taxa_index, "rank"] = "genus"
        next_taxa_index += 1
        next_taxid += 1

    
taxa.taxid = [int(f) for f in taxa.taxid]

In [50]:
with open('/fs/scratch/PAS0439/Ming/databases/kraken2_gtdb207_rumen_mags_7176/taxonomy/nodes_sup.dmp', 'w') as nodes:
    with open('/fs/scratch/PAS0439/Ming/databases/kraken2_gtdb207_rumen_mags_7176/taxonomy/names_sup.dmp', 'w') as names:
        for phylum in phylum_list:
            new_index = 403990
            taxid =  taxa.query('rank == "phylum"').query('taxa == @phylum')["taxid"].to_list()[0]
            if len(nodes_pre.query('taxid == @taxid')['taxid'].to_list()) == 0:
                parent_id = 609216830
                nodes.write(f'{taxid}\t|\t{parent_taxid}\t|\tphylum\t|\tXX\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t1\t|\t0\t|\t\t|' + '\n')
                names.write(f'{taxid}\t|\t{phylum}\t|\t\t|\tscientific name\t|' + '\n')

        for Class in class_list:
            taxid =  taxa.query('rank == "class"').query('taxa == @Class')["taxid"].to_list()[0]
            if len(nodes_pre.query('taxid == @taxid')['taxid'].to_list()) == 0:
                parent_id = rumen_mags.query('Class == @Class')['Phylum'].to_list()[0]
                parent_taxid = taxa.query('rank == "phylum"').query('taxa == @parent_id')['taxid'].to_list()[0]
                nodes.write(f'{taxid}\t|\t{parent_taxid}\t|\tclass\t|\tXX\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t1\t|\t0\t|\t\t|' + '\n')
                names.write(f'{taxid}\t|\t{Class}\t|\t\t|\tscientific name\t|' + '\n')


        for order in order_list:
            taxid =  taxa.query('rank == "order"').query('taxa == @order')["taxid"].to_list()[0]
            if len(nodes_pre.query('taxid == @taxid')['taxid'].to_list()) == 0:
                parent_id = rumen_mags.query('Order == @order')['Class'].to_list()[0]
                parent_taxid = taxa.query('rank == "class"').query('taxa == @parent_id')['taxid'].to_list()[0]
                nodes.write(f'{taxid}\t|\t{parent_taxid}\t|\torder\t|\tXX\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t1\t|\t0\t|\t\t|' + '\n')
                names.write(f'{taxid}\t|\t{order}\t|\t\t|\tscientific name\t|' + '\n')

        for family in family_list:
            taxid =  taxa.query('rank == "family"').query('taxa == @family')["taxid"].to_list()[0]
            if len(nodes_pre.query('taxid == @taxid')['taxid'].to_list()) == 0:
                parent_id = rumen_mags.query('Family == @family')['Order'].to_list()[0]
                parent_taxid = taxa.query('rank == "order"').query('taxa == @parent_id')['taxid'].to_list()[0]
                nodes.write(f'{taxid}\t|\t{parent_taxid}\t|\tfamily\t|\tXX\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t1\t|\t0\t|\t\t|' + '\n')
                names.write(f'{taxid}\t|\t{family}\t|\t\t|\tscientific name\t|' + '\n')

        for genera in genera_list:
            taxid =  taxa.query('rank == "genus"').query('taxa == @genera')["taxid"].to_list()[0]
            if len(nodes_pre.query('taxid == @taxid')['taxid'].to_list()) == 0:
                parent_id = rumen_mags.query('Genera == @genera')['Family'].to_list()[0]
                parent_taxid = taxa.query('rank == "family"').query('taxa == @parent_id')['taxid'].to_list()[0]
                nodes.write(f'{taxid}\t|\t{parent_taxid}\t|\tgenus\t|\tXX\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t1\t|\t0\t|\t\t|' + '\n')
                names.write(f'{taxid}\t|\t{genera}\t|\t\t|\tscientific name\t|' + '\n')


In [51]:
with open("/fs/scratch/PAS0439/Ming/databases/kraken2_gtdb207_rumen_mags_7176/taxonomy/nodes_pre.dmp", 'r') as pre:
    with open('/fs/scratch/PAS0439/Ming/databases/kraken2_gtdb207_rumen_mags_7176/taxonomy/nodes_sup.dmp', 'r') as sup:
        with open('/fs/scratch/PAS0439/Ming/databases/kraken2_gtdb207_rumen_mags_7176/taxonomy/nodes.dmp', 'w') as nodes:
            nodes.write(pre.read())
            nodes.write(sup.read())

In [52]:
with open("/fs/scratch/PAS0439/Ming/databases/kraken2_gtdb207_rumen_mags_7176/taxonomy/names_pre.dmp", 'r') as pre:
    with open('/fs/scratch/PAS0439/Ming/databases/kraken2_gtdb207_rumen_mags_7176/taxonomy/names_sup.dmp', 'r') as sup:
        with open('/fs/scratch/PAS0439/Ming/databases/kraken2_gtdb207_rumen_mags_7176/taxonomy/names.dmp', 'w') as names:
            names.write(pre.read())
            names.write(sup.read())