### Evaluate GutEuk's performance across different genomes (see the EukRep paper, Fig. 2C)

In [1]:
import pandas as pd

In [2]:
# Load the per-sequence predictions of each stage. Only the sequence ID and
# the ensembled call are kept, and the call column is renamed per stage so
# both can later live side by side in one frame.
stage1_res = (
    pd.read_csv("/fs/ess/PAS0439/MING/cilates_fungi_classifier/test_res_stage1.csv")
    .loc[:, ["seq", "ensembled_out"]]
    .rename(columns={"ensembled_out": "stage1_pred"})
)
stage2_res = (
    pd.read_csv("/fs/ess/PAS0439/MING/cilates_fungi_classifier/test_res_stage2.csv")
    .loc[:, ["seq", "ensembled_out"]]
    .rename(columns={"ensembled_out": "stage2_pred"})
)

In [3]:
# Per-sequence provenance table; it is joined on `seq` below and supplies the
# `genome` and `category` columns used by the later cells.
seq_origin = pd.read_csv(
    filepath_or_buffer="/fs/ess/PAS0439/MING/cilates_fungi_classifier/testset_seq_origin.csv"
)

In [4]:
# Inner join: keep stage-1 predictions that have a provenance record.
testing_merged_tmp = stage1_res.merge(seq_origin, how="inner", on="seq")
# Left join: sequences without a stage-2 row are kept, with stage2_pred = NaN.
testing_merged = testing_merged_tmp.merge(stage2_res, how="left", on="seq")

In [5]:
# Translate the numeric ensemble calls into taxon labels, column-wise.
#
#   stage1_pred: 0 -> "prokaryotes", anything else -> "eukaryotes"
#   stage2_pred: "prokaryotes" wherever stage 1 said prokaryote; otherwise
#                0 -> "fungi", 1 -> "protozoa", and any other value (e.g. the
#                NaN left by the left join) is passed through unchanged.
#
# This replaces a row-by-row iterrows()/.loc loop, which was very slow on a
# frame of this size and wrote strings one cell at a time into numeric columns
# (deprecated in recent pandas). Whole-column assignment avoids both problems.
stage1_raw = testing_merged["stage1_pred"]
stage2_raw = testing_merged["stage2_pred"]

# Also accept the already-translated label: without this, re-running the cell
# would see "prokaryotes" != 0 and relabel every row as "eukaryotes".
is_prokaryote = (stage1_raw == 0) | (stage1_raw == "prokaryotes")

# map() yields NaN for anything that is not 0/1; fall back to the raw value
# there so unmapped entries (and labels from a previous run) are preserved.
stage2_named = stage2_raw.map({0: "fungi", 1: "protozoa"})
stage2_labels = stage2_named.where(stage2_named.notna(), stage2_raw.astype(object))
stage2_labels = stage2_labels.mask(is_prokaryote, "prokaryotes")

testing_merged["stage1_pred"] = is_prokaryote.map({True: "prokaryotes", False: "eukaryotes"})
testing_merged["stage2_pred"] = stage2_labels

In [6]:
# Display the rows whose category is 'SAGs'.
testing_merged[testing_merged["category"] == "SAGs"]

Unnamed: 0,seq,stage1_pred,genome,category,stage2_pred
388825,Diplodinium.flabellum.SAG1_40_BT_15855_fragment_6,eukaryotes,Diplodinium.flabellum.SAG1,SAGs,protozoa
388826,Diplodinium.flabellum.SAG1_49_BT_15486_fragment_1,eukaryotes,Diplodinium.flabellum.SAG1,SAGs,protozoa
388827,Diplodinium.flabellum.SAG1_49_BT_15486_fragment_2,eukaryotes,Diplodinium.flabellum.SAG1,SAGs,protozoa
388828,Diplodinium.flabellum.SAG1_49_BT_15486_fragment_3,eukaryotes,Diplodinium.flabellum.SAG1,SAGs,protozoa
388829,Diplodinium.flabellum.SAG1_49_BT_15486_fragment_4,eukaryotes,Diplodinium.flabellum.SAG1,SAGs,protozoa
...,...,...,...,...,...
855083,Diplodinium.flabellum.SAG1_40_BT_15855_fragment_1,eukaryotes,Diplodinium.flabellum.SAG1,SAGs,protozoa
855084,Diplodinium.flabellum.SAG1_40_BT_15855_fragment_2,eukaryotes,Diplodinium.flabellum.SAG1,SAGs,protozoa
855085,Diplodinium.flabellum.SAG1_40_BT_15855_fragment_3,eukaryotes,Diplodinium.flabellum.SAG1,SAGs,protozoa
855086,Diplodinium.flabellum.SAG1_40_BT_15855_fragment_4,eukaryotes,Diplodinium.flabellum.SAG1,SAGs,protozoa


In [7]:
# Unique genome names for each ground-truth category.
fungi_genomes = set(testing_merged.loc[testing_merged["category"] == "fungi", "genome"])
protozoa_genomes = set(testing_merged.loc[testing_merged["category"] == "protozoa", "genome"])
sags_genomes = set(testing_merged.loc[testing_merged["category"] == "SAGs", "genome"])
prokaryotes_genomes = set(testing_merged.loc[testing_merged["category"] == "prokaryotes", "genome"])

In [8]:
# Parallel accumulators, one entry per genome: the origin label, the accuracy,
# and the genome name. They are appended to by the per-category cells below.
genome_origin, accuracy_list, genome_list = [], [], []

In [9]:
# Per-genome accuracy for the fungal genomes: the fraction of a genome's
# sequences whose stage-2 label equals the ground-truth `category`.
for fungi in fungi_genomes:
    df = testing_merged.query('genome == @fungi')
    # One vectorised comparison replaces the row-by-row iterrows() counters.
    # A missing stage-2 label never equals a category string, so it still
    # counts as incorrect, exactly as before.
    correct = int((df['stage2_pred'] == df['category']).sum())
    accuracy = correct / len(df)
    genome_origin.append("fungi")
    accuracy_list.append(accuracy)
    genome_list.append(fungi)
        


In [10]:
# Per-genome accuracy for the protozoan genomes: the fraction of a genome's
# sequences whose stage-2 label equals the ground-truth `category`.
for protozoa in protozoa_genomes:
    df = testing_merged.query('genome == @protozoa')
    # One vectorised comparison replaces the row-by-row iterrows() counters.
    # A missing stage-2 label never equals a category string, so it still
    # counts as incorrect, exactly as before.
    correct = int((df['stage2_pred'] == df['category']).sum())
    accuracy = correct / len(df)
    genome_origin.append("protozoa")
    accuracy_list.append(accuracy)
    genome_list.append(protozoa)


In [11]:
# Per-genome accuracy for the SAG genomes. Their `category` value is "SAGs",
# which is not a label the classifier emits, so a sequence is counted as
# correct when its stage-2 label is "protozoa".
for sag in sags_genomes:
    df = testing_merged.query('genome == @sag')
    # Vectorised count instead of iterating rows; the unused per-row read of
    # `category` from the old loop is gone.
    correct = int((df['stage2_pred'] == "protozoa").sum())
    accuracy = correct / len(df)
    genome_origin.append("SAG")
    accuracy_list.append(accuracy)
    genome_list.append(sag)


In [12]:
# Per-genome accuracy for the prokaryotic genomes. These are scored on the
# stage-1 label: the fraction of a genome's sequences whose `stage1_pred`
# equals the ground-truth `category`.
for proka in prokaryotes_genomes:
    df = testing_merged.query('genome == @proka')
    # One vectorised comparison replaces the row-by-row iterrows() counters.
    correct = int((df['stage1_pred'] == df['category']).sum())
    accuracy = correct / len(df)
    genome_origin.append("prokaryotes")
    accuracy_list.append(accuracy)
    genome_list.append(proka)


In [13]:
# Assemble the three parallel lists into one table (one row per genome) and
# write it out without the index column.
accuracy_per_genome = pd.DataFrame(
    dict(
        genome_origin=genome_origin,
        accuracy=accuracy_list,
        genome=genome_list,
    )
)
accuracy_per_genome.to_csv(
    "/fs/ess/PAS0439/MING/cilates_fungi_classifier/stats_visualization/accuracy_per_genome_stage2.csv",
    index=None,
)