In [1]:
import pandas as pd

In [None]:
# Read the gzip-compressed, tab-separated GTDB metadata table into a DataFrame.
# The python parsing engine is requested explicitly, as in the original cell.
gtdb_df = pd.read_csv(
    'bac120_metadata_r220.tsv.gz',
    sep='\t',
    compression='gzip',
    engine='python',
)
# Bare last expression so the notebook renders the frame.
gtdb_df

Genomes to replace:
d__Bacteria;p__Verrucomicrobiota;c__Verrucomicrobiae;o__Opitutales;f__JAIRLM01;g__JAIRLM01;s__JAIRLM01 sp031268795
d__Bacteria;p__Verrucomicrobiota;c__Verrucomicrobiae;o__Opitutales;f__JAIRLM01;g__JAISOW01;s__JAISOW01 sp031275345
d__Bacteria;p__Verrucomicrobiota;c__Verrucomicrobiae;o__Opitutales;f__LL51;g__JAIRKQ01;s__JAIRKQ01 sp031269795
d__Bacteria;p__Verrucomicrobiota;c__Verrucomicrobiae;o__Opitutales;f__LL51;g__JAIDVZ01;s__JAIDVZ01 sp031257495
d__Bacteria;p__Verrucomicrobiota;c__Verrucomicrobiae;o__Opitutales;f__UBA9783;g__CABUVL01;s__CABUVL01 sp031278975
d__Bacteria;p__Verrucomicrobiota;c__Verrucomicrobiae;o__Opitutales;f__UBA9783;g__CABUVL01;s__CABUVL01 sp031258545

Different family-level representatives are needed for f__JAIRLM01, f__LL51, and f__UBA9783.

In [None]:
# Keep only Verrucomicrobiota genomes that pass every quality gate:
#   * CheckM and CheckM2 completeness >= 90
#   * CheckM and CheckM2 contamination <= 5
#   * CheckM strain heterogeneity == 0
# `.copy()` detaches the result from `gtdb_df`, so the column assignment below
# operates on an independent frame and does not raise SettingWithCopyWarning.
verruco_df = gtdb_df.loc[
    gtdb_df['gtdb_taxonomy'].str.contains('p__Verrucomicrobiota;')
    & (gtdb_df['checkm_completeness'] >= 90)
    & (gtdb_df['checkm_contamination'] <= 5)
    & (gtdb_df['checkm2_completeness'] >= 90)
    & (gtdb_df['checkm2_contamination'] <= 5)
    & (gtdb_df['checkm_strain_heterogeneity'] == 0)
].copy()

# Split the ';'-delimited GTDB taxonomy string into one column per rank.
verruco_df[['domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']] = (
    verruco_df['gtdb_taxonomy'].str.split(';', expand=True)
)

# Order the rows so that the first row of every genus is its preferred
# representative: genus ascending, then the type-species flag, the MIMAG
# high-quality flag and CheckM2 completeness descending, and finally CheckM2
# contamination ascending. Rebinding (instead of inplace=True) keeps the cell
# idempotent when it is re-run.
verruco_df = verruco_df.sort_values(
    by=['genus', 'gtdb_type_species_of_genus', 'mimag_high_quality',
        'checkm2_completeness', 'checkm2_contamination'],
    ascending=[True, False, False, False, True],
)
print(verruco_df)

In [None]:

# Select one representative genome per class, skipping every class whose name
# contains 'c__Verrucomicrobiae'.
#
# The representative is the first *row* of each class in the current sort
# order of `verruco_df`. `groupby('class').first()` is deliberately avoided:
# it returns the first non-null value of each column independently, so a
# "representative" could be stitched together from several different genomes
# whenever the leading row has a missing field.
#
# The trailing sort / set_index / reset_index reproduces the layout that
# `groupby(...).first().reset_index()` yields: rows ordered by class, 'class'
# as the first column, and a fresh 0..n-1 index.
verruco_class_reps_df = (
    verruco_df
    .loc[lambda frame: ~frame['class'].str.contains('c__Verrucomicrobiae')]
    .drop_duplicates('class', keep='first')
    .sort_values('class')
    .set_index('class')
    .reset_index()
)
print(verruco_class_reps_df)


In [67]:

# Pick up to two representatives per order within c__Verrucomicrobiae, leaving
# out o__Opitutales. The frame is first reduced to one row per family (the
# first row in the current sort order), so the two rows kept for an order come
# from different families.
verruco_order_reps_df = (
    verruco_df
    .drop_duplicates('family', keep='first')
    .loc[
        lambda frame: frame['class'].str.contains('c__Verrucomicrobiae')
        & ~frame['order'].str.contains('o__Opitutales')
    ]
    .groupby('order')
    .head(2)
    .reset_index()  # the previous row labels are kept as an 'index' column
)
print(verruco_order_reps_df)

     index           accession  ambiguous_bases  checkm2_completeness  \
0   423244  GB_GCA_018652875.1                0                 94.24   
1   550574  GB_GCA_019244415.1                0                 92.57   
2    67043  RS_GCF_000020225.1                0                100.00   
3   362903  GB_GCA_913048455.1                0                 97.68   
4   177008  GB_GCA_945861535.1                0                 96.56   
5    11951  GB_GCA_016871575.1                0                 95.37   
6    73552  GB_GCA_026397915.1                0                 94.14   
7   205013  GB_GCA_028293625.1                0                 96.40   
8   329145  GB_GCA_945877805.1                0                 96.12   
9   220658  GB_GCA_020718645.1                0                 94.06   
10  438743  RS_GCF_900105685.1                0                 99.92   
11  105369  GB_GCA_020849895.1                0                 94.39   
12  211635  GB_GCA_026646655.1                0    

In [None]:
# "Best" candidates per Opitutales family: one row per distinct taxonomy string
# (the first row in the current sort order), restricted to
# c__Verrucomicrobiae / o__Opitutales with f__T3Sed10-336 excluded, then at
# most three rows for each family.
verruco_family_reps_df_best = (
    verruco_df
    .drop_duplicates('gtdb_taxonomy', keep='first')
    .loc[
        lambda frame: frame['class'].str.contains('c__Verrucomicrobiae')
        & frame['order'].str.contains('o__Opitutales')
        & ~frame['family'].str.contains('f__T3Sed10-336')
    ]
    .groupby('family')
    .head(3)
    .reset_index()  # the previous row labels are kept as an 'index' column
)

print(verruco_family_reps_df_best)

In [None]:
# Genus-diverse candidates per Opitutales family: one row per genus (the first
# row in the current sort order), restricted to c__Verrucomicrobiae /
# o__Opitutales with f__T3Sed10-336 excluded, then at most three rows for each
# family.
# An alternative tried here was a random draw of up to three rows per family
# (groupby(...).apply(lambda x: x.sample(min(3, len(x))))) instead of head(3).
verruco_family_reps_df_genus = (
    verruco_df
    .drop_duplicates('genus', keep='first')
    .loc[
        lambda frame: frame['class'].str.contains('c__Verrucomicrobiae')
        & frame['order'].str.contains('o__Opitutales')
        & ~frame['family'].str.contains('f__T3Sed10-336')
    ]
    .groupby(['family'])
    .head(3)
    .reset_index()  # the previous row labels are kept as an 'index' column
)

print(verruco_family_reps_df_genus)

In [66]:
# Stack the genus-diverse candidates ahead of the "best" candidates, so that
# within each family the rows from distinct genera come first. After dropping
# repeated species, the first two rows of every family are kept; when a family
# has fewer than two distinct genera, the next-best candidate fills the gap.
# NOTE(review): the original comment spoke of 3 per family, but head(2) keeps
# 2 - confirm which count is intended.
# NOTE(review): the final reset_index() turns the concatenated frames' row
# labels into a new 'index' column; those labels can repeat across the two
# inputs - confirm the column is wanted.
df_merge_verrcu_family = (
    pd.concat([verruco_family_reps_df_genus, verruco_family_reps_df_best])
    .drop_duplicates('species')
    .groupby('family', sort=False)
    .head(2)
    .sort_values(by=['family', 'genus'])
    .drop(columns=['index'])
    .reset_index()
)

print(df_merge_verrcu_family)

    index           accession  ambiguous_bases  checkm2_completeness  \
0       4  GB_GCA_934660865.1                0                 96.54   
1       6  GB_GCA_949287935.1                0                 95.67   
2       3  GB_GCA_913049555.1                0                 95.29   
3       8  GB_GCA_001872735.1                7                 98.92   
4       9  RS_GCF_016595555.1                0                 97.63   
5      23  GB_GCA_030432235.1                0                 95.91   
6      10  RS_GCF_000025905.1                0                100.00   
7      11  RS_GCF_027257905.1                0                 99.88   
8      14  GB_GCA_018673815.1                0                 97.28   
9      21  GB_GCA_913048125.1                0                 91.60   
10     15  GB_GCA_013215165.1                0                 97.24   
11     21  GB_GCA_028280845.1                0                 97.74   
12     22  GB_GCA_028285755.1                0                 9