In [1]:
import os
import pandas as pd
from Bio import SeqIO

In [2]:
# Work from the benchmark directory so the relative CSV paths below resolve.
benchmark_dir = "/fs/ess/PAS0439/MING/cilates_fungi_classifier/GutEuk_benchmark/"
os.chdir(benchmark_dir)

In [3]:
# Per-group GutEuk prediction tables, read in the same order as before.
fungi_out, protozoa_out, proka_out, sags_out = (
    pd.read_csv(csv_name)
    for csv_name in (
        "fungi_GutEuk_output.csv",
        "protozoa_GutEuk_output.csv",
        "proka_GutEuk_output.csv",
        "sags_test_chopped_GutEuk_output.csv",
    )
)
# Table joined to the predictions on its "sequence" column further down.
seqorigin = pd.read_csv("../testset_seq_origin_chopped.csv")

In [4]:
# Stack the four prediction tables row-wise into one frame.
prediction_tables = [proka_out, protozoa_out, fungi_out, sags_out]
GutEuk_out = pd.concat(prediction_tables)

In [5]:
# Attach the origin table to each prediction (default inner join: rows without
# a matching sequence ID on both sides are dropped).
GutEuk_out_full = GutEuk_out.merge(
    seqorigin,
    left_on="sequence_id",
    right_on="sequence",
)

In [6]:
# Collect ID and length of every test sequence that is at least 3000 bp long.
test_fasta = "/fs/scratch/PAS0439/Ming/databases/gut_eukaryotes_classifier/test/test_chopped.fasta"
long_records = [
    (str(fasta_record.id), len(fasta_record.seq))
    for fasta_record in SeqIO.parse(test_fasta, "fasta")
    if len(fasta_record.seq) >= 3000
]
seq_list = [record_id for record_id, _ in long_records]
seq_length = [n_bases for _, n_bases in long_records]

In [7]:
# One row per retained sequence: its ID and its length in bases.
sequence_length_df = pd.DataFrame(
    {
        "sequence": seq_list,
        "seq_length": seq_length,
    }
)

In [8]:
# Drop the left-hand join key; after the inner merge "sequence" holds the same
# IDs. Reassigning (instead of inplace=True) with errors="ignore" keeps this
# cell safe to re-run once the column is already gone.
GutEuk_out_full = GutEuk_out_full.drop(columns="sequence_id", errors="ignore")

In [9]:
# Add the sequence length; the inner join keeps only sequences that passed the
# length filter applied while reading the FASTA.
GutEuk_out_final = GutEuk_out_full.merge(sequence_length_df, on="sequence")

In [10]:
# Persist the merged per-sequence table (row index not written).
GutEuk_out_final.to_csv(
    "/fs/ess/PAS0439/MING/cilates_fungi_classifier/stats_visualization/GutEuk_benchmark_full.csv",
    index=False,
)

In [11]:
# Total number of bases per genome, summed over its retained sequences.
GutEuk_out_final_genome_length = (
    GutEuk_out_final
    .groupby("genome")["seq_length"]
    .sum()
    .reset_index()
)

In [12]:
# Parallel accumulators for the per-genome summary: origin label, genome name,
# and fraction of bases classified correctly.
genome_origin_guteuk, genome_name, precentage_corrected_genome = [], [], []

In [13]:
# Genomes present in the benchmark, grouped by their labelled origin.
fungi_genomes = set(GutEuk_out_final.query('genome_type == "fungi"').genome)
protozoa_genomes = set(GutEuk_out_final.query('genome_type == "protozoa"').genome)
# This cell used to test for "SAGs" while the other cells of this notebook test
# for "SAG", so one side could never match. Both spellings are accepted here.
# NOTE(review): confirm the actual label in testset_seq_origin_chopped.csv.
sags_genomes = set(GutEuk_out_final.query('genome_type in ["SAG", "SAGs"]').genome)
prokaryotes_genomes = set(GutEuk_out_final.query('genome_type == "prokaryotes"').genome)

In [14]:
# Fraction of base pairs assigned to the correct class, computed per genome.
#
# The accumulators are rebuilt from scratch so re-running this cell does not
# append duplicate rows, and no loop variable reuses the name `seq_length`
# (which holds the list of sequence lengths built earlier in the notebook).
genome_origin_guteuk = []
genome_name = []
precentage_corrected_genome = []

# SAG contigs are scored against "protozoa". The label is spelled both "SAG"
# and "SAGs" in this notebook, so both spellings are mapped.
expected_class = GutEuk_out_final.genome_type.replace(["SAG", "SAGs"], "protozoa")

# Bases of correctly classified sequences, summed per genome (0 for misses).
correct_bp_by_genome = (
    GutEuk_out_final.seq_length
    .where(GutEuk_out_final.stage2_prediction == expected_class, 0)
    .groupby(GutEuk_out_final.genome)
    .sum()
)
total_bp_by_genome = GutEuk_out_final_genome_length.set_index("genome")["seq_length"]

genome_groups = [
    ("fungi", fungi_genomes),
    ("protozoa", protozoa_genomes),
    ("SAG", sags_genomes),
    ("prokaryotes", prokaryotes_genomes),
]
for origin_label, genomes in genome_groups:
    # Sorted so the row order of the output is reproducible between runs.
    for genome in sorted(genomes):
        genome_origin_guteuk.append(origin_label)
        genome_name.append(genome)
        precentage_corrected_genome.append(
            correct_bp_by_genome[genome] / total_bp_by_genome[genome]
        )

        


In [15]:
# Assemble the per-genome summary from the three parallel lists.
res = pd.DataFrame(
    {
        "genome_origin": genome_origin_guteuk,
        "genome_name": genome_name,
        "precentage_corrected_genome": precentage_corrected_genome,
    }
)

In [16]:
# Persist the per-genome summary (row index not written).
res.to_csv(
    "/fs/ess/PAS0439/MING/cilates_fungi_classifier/stats_visualization/GutEuk_benchmark_corrected_bp_by_genome.csv",
    index=False,
)

In [17]:
# Collapse the origin labels to three classes by scoring SAG contigs as
# protozoa. The SAG label is spelled both "SAG" and "SAGs" in this notebook,
# so both spellings are mapped; every other label is kept unchanged.
GutEuk_out_final["genome_type_three_cat"] = GutEuk_out_final.genome_type.replace(
    ["SAG", "SAGs"], "protozoa"
)

In [18]:
# Flag each sequence as correctly (1.0) or incorrectly (0.0) classified by
# comparing the stage-2 call with the three-class label. One vectorised
# comparison replaces a per-row .loc write inside iterrows(); the column stays
# float, as the row-wise version produced.
is_correct = GutEuk_out_final["stage2_prediction"] == GutEuk_out_final["genome_type_three_cat"]
GutEuk_out_final["accuracy"] = is_correct.astype(float)

In [19]:
# Independent copy, so the filtering below leaves GutEuk_out_final untouched.
GutEuk_out_final_filtered = GutEuk_out_final.copy(deep=True)

In [20]:
# Genomes left out of the length-binned statistics.
# NOTE(review): the reason for excluding these two is not recorded in the
# notebook — please document it.
excluded_genomes = [
    "Smimuc2_AssemblyScaffolds_Repeatmasked.fasta",
    "Vavcu1_AssemblyScaffolds_Repeatmasked.fasta",
]
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
GutEuk_out_final_filtered = GutEuk_out_final_filtered[
    ~GutEuk_out_final_filtered.genome.isin(excluded_genomes)
].copy()

In [21]:
def ceiling(x: int, bin_size: int = 5000):
    """Return the lower edge of the length bin that contains ``x``.

    Despite its name, this rounds *down*: ``x`` is floored to the nearest
    multiple of ``bin_size`` (e.g. 4999 -> 0, 5000 -> 5000 with the default).

    Parameters
    ----------
    x : int
        Sequence length.
    bin_size : int, default 5000
        Width of each length bin.

    Returns
    -------
    int
        Largest multiple of ``bin_size`` that is <= ``x``.
    """
    return (x // bin_size) * bin_size
    


In [22]:
# Assign every sequence to its length bin (lower bin edge, via ceiling()).
GutEuk_out_final_filtered["sequence_length_range"] = (
    GutEuk_out_final_filtered["seq_length"].map(ceiling)
)

In [23]:
# Number of sequences per (length bin, genome type).
total_seq_by_len_category = (
    GutEuk_out_final_filtered
    .groupby(["sequence_length_range", "genome_type"])["sequence"]
    .count()
    .reset_index()
)

In [24]:
# Give the count table the column names used for the merge further down.
total_seq_by_len_category = total_seq_by_len_category.rename(
    columns={
        "genome_type": "genome_category",
        "sequence": "total_sequence",
    }
)

In [25]:
# Number of correctly classified sequences per (length bin, genome type):
# the sum of the 0/1 accuracy flags.
corrected_seq_by_len_category = (
    GutEuk_out_final_filtered
    .groupby(["sequence_length_range", "genome_type"])["accuracy"]
    .sum()
    .reset_index()
)

In [26]:
# Align the column names with the total-count table before merging.
corrected_seq_by_len_category = corrected_seq_by_len_category.rename(
    columns={
        "genome_type": "genome_category",
        "accuracy": "corrected_sequence",
    }
)

In [27]:
# Put correct and total counts side by side for each (length bin, category).
accuracy_by_len_category = corrected_seq_by_len_category.merge(
    total_seq_by_len_category,
    on=["sequence_length_range", "genome_category"],
)

In [28]:
# Accuracy = correctly classified sequences / all sequences in the group.
accuracy_by_len_category["accuracy"] = accuracy_by_len_category["corrected_sequence"].div(
    accuracy_by_len_category["total_sequence"]
)

In [29]:
# Fold the SAG rows into "protozoa" and re-aggregate to three classes.
# - Both "SAG" and "SAGs" are mapped, since the label is spelled both ways in
#   this notebook.
# - Only the two count columns are summed. The previous `.sum([...])` passed a
#   list as GroupBy.sum's first parameter (`numeric_only`) and worked only
#   because a non-empty list is truthy.
accuracy_by_len_category_3classes = (
    accuracy_by_len_category
    .assign(
        genome_category=accuracy_by_len_category.genome_category.replace(
            ["SAG", "SAGs"], "protozoa"
        )
    )
    .groupby(["sequence_length_range", "genome_category"])[
        ["corrected_sequence", "total_sequence"]
    ]
    .sum()
    .reset_index()
)
# Accuracy must be recomputed from the pooled counts; summing ratios is wrong.
accuracy_by_len_category_3classes["accuracy"] = (
    accuracy_by_len_category_3classes.corrected_sequence
    / accuracy_by_len_category_3classes.total_sequence
)


In [30]:
# Persist the three-class accuracy per length bin (row index not written).
accuracy_by_len_category_3classes.to_csv(
    "/fs/ess/PAS0439/MING/cilates_fungi_classifier/stats_visualization/GutEuk_benchmark_accuracy_by_contig_length.csv",
    index=False,
)

In [31]:
# Separate working copy for the precision/recall analysis, with the length bin
# recomputed from seq_length.
GutEuk_out_final_precision_recall = GutEuk_out_final_filtered.copy(deep=True)
GutEuk_out_final_precision_recall["sequence_length_range"] = (
    GutEuk_out_final_precision_recall["seq_length"].map(ceiling)
)

In [32]:
# Precision / recall per length bin at several stage-2 confidence cut-offs.
#
# One parameterised implementation replaces six copy-pasted cells. A zero
# denominator (tp + fp == 0 or tp + fn == 0) now yields NaN instead of raising
# ZeroDivisionError.

# Lower edges of the length bins that are evaluated. Bin 0 holds the shortest
# sequences: the FASTA filter keeps lengths >= 3000, so 3000 <= x < 5000.
LENGTH_BINS = [0, 5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000]
# Classes scored against the stage-2 prediction.
STAGE2_CLASSES = ["prokaryotes", "fungi", "protozoa"]


def _precision_recall(tp, fp, fn):
    """Return ``(precision, recall)``; NaN where the denominator is zero."""
    precision = tp / (tp + fp) if (tp + fp) else float("nan")
    recall = tp / (tp + fn) if (tp + fn) else float("nan")
    return precision, recall


def apply_stage2_confidence_cutoff(predictions, cutoff):
    """Return a copy of ``predictions`` with low-confidence stage-2 calls masked.

    Rows whose stage-1 prediction is not "prokaryotes" and whose
    ``stage2_confidence`` is below ``cutoff`` get ``stage2_prediction`` set to
    "undetermined". The input frame is not modified.
    """
    thresholded = predictions.copy()
    low_confidence = (thresholded["stage1_prediction"] != "prokaryotes") & (
        thresholded["stage2_confidence"] < cutoff
    )
    thresholded.loc[low_confidence, "stage2_prediction"] = "undetermined"
    return thresholded


def precision_recall_by_length(predictions, threshold):
    """Tabulate precision and recall per category and length bin.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Needs the columns ``genome_type_three_cat``, ``stage1_prediction``,
        ``stage2_prediction`` and ``sequence_length_range``.
    threshold : float
        Value written to the ``threshold`` column of the result; it is a label
        only and does not filter anything here.

    Returns
    -------
    pandas.DataFrame
        Columns ``category``, ``sequence_length``, ``precision``, ``recall``,
        ``threshold``. Bins with tp + fp + fn == 0 are omitted.
    """
    truth = predictions["genome_type_three_cat"]
    stage2_call = predictions["stage2_prediction"]
    length_bin = predictions["sequence_length_range"]
    rows = []

    # Three-class metrics, scored on the stage-2 prediction.
    for cate in STAGE2_CLASSES:
        is_class = truth == cate
        called_class = stage2_call == cate
        for seq_len in LENGTH_BINS:
            in_bin = length_bin == seq_len
            tp = int((is_class & called_class & in_bin).sum())
            fp = int((~is_class & called_class & in_bin).sum())
            fn = int((is_class & ~called_class & in_bin).sum())
            if tp + fp + fn == 0:
                continue
            preci, recal = _precision_recall(tp, fp, fn)
            rows.append((cate, seq_len, preci, recal))

    # Eukaryote-vs-prokaryote metrics, scored on the stage-1 prediction: any
    # stage-1 call other than "prokaryotes" counts as a eukaryote call.
    is_eukaryote = truth.isin(["protozoa", "fungi"])
    is_prokaryote = truth == "prokaryotes"
    called_eukaryote = predictions["stage1_prediction"] != "prokaryotes"
    for seq_len in LENGTH_BINS:
        in_bin = length_bin == seq_len
        tp = int((is_eukaryote & called_eukaryote & in_bin).sum())
        fp = int((is_prokaryote & called_eukaryote & in_bin).sum())
        fn = int((is_eukaryote & ~called_eukaryote & in_bin).sum())
        if tp + fp + fn == 0:
            continue
        preci, recal = _precision_recall(tp, fp, fn)
        rows.append(("eukaryotes", seq_len, preci, recal))

    table = pd.DataFrame(rows, columns=["category", "sequence_length", "precision", "recall"])
    table["threshold"] = threshold
    return table


# Threshold 0.5: predictions are used as-is, no cut-off is applied.
preci_recall_by_len50 = precision_recall_by_length(GutEuk_out_final_precision_recall, threshold=0.5)

GutEuk_out_final_precision_recall_stage2_60 = apply_stage2_confidence_cutoff(GutEuk_out_final_precision_recall, 0.60)
preci_recall_by_len60 = precision_recall_by_length(GutEuk_out_final_precision_recall_stage2_60, threshold=0.6)

GutEuk_out_final_precision_recall_stage2_70 = apply_stage2_confidence_cutoff(GutEuk_out_final_precision_recall, 0.70)
preci_recall_by_len70 = precision_recall_by_length(GutEuk_out_final_precision_recall_stage2_70, threshold=0.7)

GutEuk_out_final_precision_recall_stage2_80 = apply_stage2_confidence_cutoff(GutEuk_out_final_precision_recall, 0.80)
preci_recall_by_len80 = precision_recall_by_length(GutEuk_out_final_precision_recall_stage2_80, threshold=0.8)

GutEuk_out_final_precision_recall_stage2_90 = apply_stage2_confidence_cutoff(GutEuk_out_final_precision_recall, 0.90)
preci_recall_by_len90 = precision_recall_by_length(GutEuk_out_final_precision_recall_stage2_90, threshold=0.9)

GutEuk_out_final_precision_recall_stage2_100 = apply_stage2_confidence_cutoff(GutEuk_out_final_precision_recall, 1)
preci_recall_by_len100 = precision_recall_by_length(GutEuk_out_final_precision_recall_stage2_100, threshold=1)

# All thresholds stacked into one long table.
preci_recall_by_len = pd.concat([preci_recall_by_len50, preci_recall_by_len60, preci_recall_by_len70, preci_recall_by_len80, preci_recall_by_len90, preci_recall_by_len100])

In [39]:
# Persist the stacked precision/recall table (row index not written).
preci_recall_by_len.to_csv(
    "/fs/ess/PAS0439/MING/cilates_fungi_classifier/stats_visualization/GutEuk_benchmark_preci_recall_by_len.csv",
    index=False,
)

In [44]:
# Inspect the sequences shorter than 5000 bases.
GutEuk_out_final_precision_recall[GutEuk_out_final_precision_recall["seq_length"] < 5000]

Unnamed: 0,stage1_prediction,stage2_prediction,stage1_confidence,stage2_confidence,sequence,genome,genome_type,seq_length,genome_type_three_cat,accuracy,sequence_length_range
