In [15]:
import pandas as pd
import numpy as np
def _kaiju_kingdom(taxonomy: pd.Series) -> pd.Series:
    '''Pick the kingdom entry out of each kaiju lineage list.'''
    first = taxonomy.str[0]
    # When the lineage starts with the generic 'cellular organisms' entry,
    # the second entry is used instead of the first.
    return first.where(first != 'cellular organisms', taxonomy.str[1])


def _strip_cat_score(values: pd.Series) -> pd.Series:
    '''Remove a trailing ": <number>" suffix; values without one are kept as-is.'''
    # A fixed slice ([:-6]) would also truncate unscored values such as
    # 'no support' (to 'no s'), so only a real numeric suffix is removed.
    return values.str.replace(r':\s*\d+(?:\.\d+)?$', '', regex=True)


def wrangle_kaiju_megahit_cat(kaiju: str, 
                              cat: str, 
                              megahit: str, 
                              name: str) -> pd.DataFrame:
    '''
    Wrangles and merges the CAT, kaiju and MEGAHIT output from the MEGAHIT contigs.

    The kaiju table is read without a header; columns 1, 2, 6 and 7 are kept
    as ``name``, ``taxon_id``, ``aa_match`` and ``taxonomy``. The ``taxonomy``
    string is split on ``';'`` (the last piece is discarded) and the last
    three entries plus a kingdom entry are exposed as ``*_kaiju`` columns.
    The CAT table has ``# contig``, ``species``, ``genus`` and ``family``
    renamed to ``name`` and ``*_level_cat``, missing cells replaced by ``""``,
    the ``lineage`` columns dropped and ``superkingdom`` turned into
    ``kingdom_cat``. All three tables are outer-joined on ``name``.

    :param str kaiju: The kaiju out name path (tab-separated, no header row;
        assumed to be per-contig output with at least 8 columns - TODO confirm)
    :param str cat: The CAT out txt path (tab-separated, with a header row)
    :param str megahit: The path to the megahit csv file. Must provide the
        ``name`` and ``length`` columns used for merging and sorting.
    :param str name: Name of the new csv file. Currently unused: nothing is
        written to disk; kept so existing calls keep working.
    :returns: The merged table, sorted by ``length`` in descending order.
    :rtype: pandas.DataFrame
    '''

    kaiju_raw = pd.read_csv(kaiju,
                            sep='\t',
                            header=None,
                            usecols=[1, 2, 6, 7],
                            names=['name', 'taxon_id', 'aa_match', 'taxonomy'])

    # Separate local names: the path arguments are not overwritten by frames.
    megahit_df = pd.read_csv(megahit)

    kaiju_df = (
        kaiju_raw
        .merge(megahit_df, on='name', how='outer')
        .sort_values('length', ascending=False)
        # astype(object) keeps the .str accessor usable even when the column
        # was parsed as all-NaN floats (no lineage string in any row).
        .assign(taxonomy=lambda df: (df['taxonomy']
                                     .astype(object)
                                     .str.split(';')
                                     .str[:-1]))
        .assign(last_level_kaiju=lambda df: df['taxonomy'].str[-1],
                second_level_kaiju=lambda df: df['taxonomy'].str[-2],
                third_level_kaiju=lambda df: df['taxonomy'].str[-3],
                kingdom_kaiju=lambda df: _kaiju_kingdom(df['taxonomy']))
    )

    cat_raw = pd.read_csv(cat, sep='\t')

    cat_df = (
        cat_raw
        .rename(columns={'# contig': 'name',
                         'species': 'last_level_cat',
                         'genus': 'second_level_cat',
                         'family': 'third_level_cat'})
        .fillna("")
        .drop(columns=['lineage', 'lineage scores'])
        .assign(kingdom_cat=lambda df: _strip_cat_score(df['superkingdom']))
        .drop(columns='superkingdom')
    )

    # Outer join keeps contigs that appear in only one of the tables.
    merged = (kaiju_df
              .merge(cat_df, on='name', how='outer')
              .sort_values('length', ascending=False))

    return merged

In [13]:
# Input files for one sample, all located under the same results directory.
SAMPLE_ID = "sample_01_S8"
RESULTS_DIR = f"~/clinical-genomics/pandemic-preparedness/virusclassification_nextflow/results/{SAMPLE_ID}"

kaiju = f"{RESULTS_DIR}/kaiju/megahit/{SAMPLE_ID}_table_megahit.tsv"
cat = f"{RESULTS_DIR}/cat/CAT_{SAMPLE_ID}_contigs_names.txt"
megahit = f"{RESULTS_DIR}/megahit/{SAMPLE_ID}.csv"

In [16]:
# Read the tab-separated CAT table, then replace missing cells with a single
# space so the preview below shows blanks instead of NaN.
cat_table = pd.read_csv(cat, sep="\t")
cat_df = cat_table.fillna(" ")

cat_df

Unnamed: 0,# contig,classification,reason,lineage,lineage scores,superkingdom,phylum,class,order,family,genus,species
0,k141_0,no taxid assigned,no ORFs found,,,,,,,,,
1,k141_1,no taxid assigned,no ORFs found,,,,,,,,,
2,k141_2,no taxid assigned,no hits to database,,,,,,,,,
3,k141_3,no taxid assigned,no ORFs found,,,,,,,,,
4,k141_4,no taxid assigned,no hits to database,,,,,,,,,


In [17]:
# Merge the three per-sample tables; the returned frame is displayed as the cell output.
# NOTE(review): the recorded output contains rows whose name is 'percent',
# '0.000000' and '100.000000' - confirm that `kaiju` points at per-contig
# kaiju output rather than a summary table.
wrangle_kaiju_megahit_cat(kaiju=kaiju, cat=cat, megahit=megahit, name="hej")

  kaiju_raw = pd.read_csv(kaiju,


Unnamed: 0,name,taxon_id,aa_match,taxonomy,length,sequence,last_level_kaiju,second_level_kaiju,third_level_kaiju,kingdom_kaiju,classification,reason,phylum,class,order,third_level_cat,second_level_cat,last_level_cat,kingdom_cat
0,k141_0,,,,498.0,GTCGCCTCTACATATAAATCTTTCAACAATTGCTGCATAGAAGGGT...,,,,,no taxid assigned,no ORFs found,,,,,,,
1,k141_1,,,,351.0,TGATATTTAACCTTTGGTCTGTTTAATTTGCTGTCCTATTTTAAGT...,,,,,no taxid assigned,no ORFs found,,,,,,,
2,k141_2,,,,315.0,TAAGATAACATGCTTAACCTTTTAAAGGCATGCTGCCATTCCCAAA...,,,,,no taxid assigned,no hits to database,,,,,,,
3,k141_4,,,,264.0,AGGGTCATTGTGTGAAGCCCAGGCTGAGTGTGTCTGTCTCTTACAC...,,,,,no taxid assigned,no hits to database,,,,,,,
4,k141_3,,,,256.0,ACTTCTTACATATATTGATTAACACCTCATGTCTCTCACCTGCCTG...,,,,,no taxid assigned,no ORFs found,,,,,,,
5,percent,reads,,,,,,,,,,,,,,,,,
6,0.000000,0,,,,,,,,,,,,,,,,,
7,100.000000,5,,,,,,,,,,,,,,,,,
