### Remove genomes that were used to train Tiara and EukRep from the testing set

In [1]:
import os
import pandas as pd
import re

In [2]:
# Every CSV in this notebook is addressed by a bare file name, so switch the
# working directory to the project folder first.
PROJECT_DIR = "/fs/ess/PAS0439/MING/cilates_fungi_classifier/"
os.chdir(PROJECT_DIR)

In [3]:
# Table of the genomes used to train Tiara and EukRep; its "dataset" column
# records which of the two tools each row belongs to.
training_table_path = "tiara_eukrep_training.csv"
genomes_in_training = pd.read_csv(training_table_path)

In [4]:
# Split the training table by the tool named in its "dataset" column.
tiara_rows = 'dataset == "Tiara"'
eukrep_rows = 'dataset == "EukRep"'
tiara_train = genomes_in_training.query(tiara_rows)
eukrep_train = genomes_in_training.query(eukrep_rows)

In [5]:
# Prokaryote genomes: load the full table, then keep the rows labelled "Test".
proka_table = pd.read_csv("dataset_proka_repre.csv")
testing_pro = proka_table.query('dataset == "Test"')

In [6]:
# Fungal genomes: load the full table, then keep the rows labelled "Test".
fungi_table = pd.read_csv("dataset_fungi_repre.csv")
testing_fungi = fungi_table.query('dataset == "Test"')

In [7]:
# Protozoa genomes: load the full table, then keep the rows labelled "Test".
protozoa_table = pd.read_csv("dataset_protozoa_repre.csv")
testing_protozoa = protozoa_table.query('dataset == "Test"')

In [8]:
# Tiara training genomes whose taxonomy string contains "Eukaryota".
is_eukaryote = tiara_train.taxa.str.contains('Eukaryota')
tiara_euk = tiara_train[is_eukaryote]

In [9]:
# One set of "_"-separated taxonomy tokens per fungal test genome.
testing_fungi_taxa = list(map(set, testing_fungi.taxa.str.split("_")))

In [10]:
# IDs of all prokaryote test genomes followed by all protozoa test genomes.
testing_protozoa_proka = [*testing_pro.id, *testing_protozoa.id]

In [11]:
# --- Prokaryote / protozoa test genomes that also appear in Tiara's training set ---
# A Tiara ID is matched on its accession part only (the text between the first
# and second "_"). Both the GCF_ and the GCA_ spelling are tried against the
# test IDs, with GCF_ taking precedence when both are present.
test_ids = set(testing_protozoa_proka)  # set lookup instead of scanning the list per row
filtered_proka_protozoa = []
for ID in tiara_train["id"]:
    id_parts = ID.split("_")
    if len(id_parts) < 2:
        # No "_" in this ID, so there is no accession part to compare.
        continue
    accession = id_parts[1]
    for prefix in ("GCF_", "GCA_"):
        if prefix + accession in test_ids:
            filtered_proka_protozoa.append(prefix + accession)
            break

# --- Fungal test genomes that share a name token with a Tiara eukaryote ---
# Names of Tiara's eukaryotic training genomes as sets of space-separated words.
filtered_fungi_taxa_pre = [  # for manual check
    set(species.split(' ')) for species in tiara_euk["name"]
]

testing_fungi_taxa = [set(f) for f in testing_fungi.taxa.str.split("_")]

# A fungal test genome is flagged when it has at least one token in common
# with any Tiara eukaryote. Each test genome is recorded once, no matter how
# many training genomes it overlaps with.
fungi_manual_check = [
    test_tokens
    for test_tokens in testing_fungi_taxa
    if any(test_tokens & train_tokens for train_tokens in filtered_fungi_taxa_pre)
]
            
    

In [12]:
## genomes used in training Tiara
# Collect, per test table, the row labels of genomes that overlap with Tiara's
# training data.

# Fungi: the row's "_"-separated taxonomy tokens equal one of the flagged sets.
filtered_fungi_index = [
    row_label
    for row_label, taxa_string in zip(testing_fungi.index, testing_fungi['taxa'])
    if set(taxa_string.split("_")) in fungi_manual_check
]

# Protozoa: the row's ID is one of the IDs matched against Tiara's training set.
filtered_protozoa_index = [
    row_label
    for row_label, genome_id in zip(testing_protozoa.index, testing_protozoa['id'])
    if genome_id in filtered_proka_protozoa
]

# Prokaryotes: same ID-based test as for protozoa.
filtered_proka_index = [
    row_label
    for row_label, genome_id in zip(testing_pro.index, testing_pro['id'])
    if genome_id in filtered_proka_protozoa
]
        

In [13]:
# Token lists for the eukaryotic test genomes: protozoa names are split on
# spaces, fungal names on underscores.
# Prokaryote genomes are checked manually instead, since different taxonomies
# were used (GTDB vs. NCBI).
protozoa_tokens = list(testing_protozoa.taxa.str.split(' '))
fungi_tokens = [taxa_string.split('_') for taxa_string in testing_fungi.taxa]
testing_taxa = protozoa_tokens + fungi_tokens

In [14]:
# Names of EukRep's training genomes, each split into space-separated words.
eukrep_euk_genomes = [genome_name.split(' ') for genome_name in eukrep_train.name]

In [15]:
## genomes used in training EukRep
# Flag every eukaryotic test genome that shares at least one name token with
# any EukRep training genome. Entries are the token *lists* taken from
# `testing_taxa` (not sets), and each test genome is recorded once, no matter
# how many training genomes it overlaps with.
eukrep_token_sets = [set(tokens) for tokens in eukrep_euk_genomes]  # convert once, not per comparison
eukrep_filtered_euk = [
    test_tokens
    for test_tokens in testing_taxa
    if any(
        not eukrep_tokens.isdisjoint(test_tokens)
        for eukrep_tokens in eukrep_token_sets
    )
]

In [16]:
# Add the test genomes flagged against EukRep's training set to the drop lists.
#
# The flagged entries in `eukrep_filtered_euk` may be token lists, and a set
# never compares equal to a list, so testing a set for membership there can
# never succeed. Normalise both sides to frozensets so that genomes with the
# same tokens actually match (this also makes the lookup a hash lookup).
eukrep_flagged = {frozenset(tokens) for tokens in eukrep_filtered_euk}

# Fungal taxonomy strings are "_"-separated.
for index, taxa_string in zip(testing_fungi.index, testing_fungi['taxa']):
    if frozenset(taxa_string.split("_")) in eukrep_flagged:
        filtered_fungi_index.append(index)

# Protozoa taxonomy strings are space-separated.
for index, taxa_string in zip(testing_protozoa.index, testing_protozoa['taxa']):
    if frozenset(taxa_string.split(" ")) in eukrep_flagged:
        filtered_protozoa_index.append(index)

In [17]:
# Prokaryote benchmark set: test rows whose label is not in the drop list.
proka_flagged = testing_pro.index.isin(filtered_proka_index)
testing_pro_benchmark = testing_pro[~proka_flagged]

In [18]:
# Fungal benchmark set: test rows whose label is not in the drop list.
fungi_flagged = testing_fungi.index.isin(filtered_fungi_index)
testing_fungi_benchmark = testing_fungi[~fungi_flagged]

In [19]:
# Protozoa benchmark set: test rows whose label is not in the drop list.
protozoa_flagged = testing_protozoa.index.isin(filtered_protozoa_index)
testing_protozoa_benchmark = testing_protozoa[~protozoa_flagged]

In [20]:
# Write the three benchmark tables to CSV in the working directory.
# `index=False` (the documented boolean form) keeps the DataFrame row labels
# out of the files.
for benchmark_table, csv_name in (
    (testing_pro_benchmark, "dataset_proka_benchmark.csv"),
    (testing_fungi_benchmark, "dataset_fungi_benchmark.csv"),
    (testing_protozoa_benchmark, "dataset_protozoa_benchmark.csv"),
):
    benchmark_table.to_csv(csv_name, index=False)