Create a subset of the GOLD data that contains only the taxa that are in at least 3 ecosystems

In [1]:
import pandas as pd
import os, sys
from math import ceil
from ast import literal_eval
from loguru import logger

# Remove the handlers currently registered on the loguru logger and install a
# single sink that writes DEBUG-and-above messages to stderr.
logger.remove()
logger.add(sys.stderr, level="DEBUG")

# Input locations, relative to the notebook's working directory
data_dir = os.path.expanduser('../data/')
e6_dir = os.path.expanduser('../data/eggnog6/')
gold_dir = os.path.expanduser('../data/gold/')

# Output location for everything derived from the eggNOG/GOLD overlap.
# exist_ok=True keeps the cell re-runnable without a separate existence check.
gold_e6_overlap_dir = os.path.expanduser('../data/gold_e6_overlap/')
os.makedirs(gold_e6_overlap_dir, exist_ok=True)

Files have been downloaded in the previous step, so we can find the overlap with GOLD data:

In [2]:
# Load the bacterial eggNOG orthologs table and invert it, so that each taxon ID
# is paired with the list of NOG IDs it appears in.

og2seqs_and_species_bacteria_filepath = os.path.join(
    e6_dir, 'e6.og2seqs_and_species_bacteria.tsv')

# Only the second and fifth tab-separated fields are needed:
# the NOG ID and the comma-separated taxon IDs.
df = pd.read_csv(
    og2seqs_and_species_bacteria_filepath,
    sep='\t',
    usecols=[1, 4],
    names=['NOG_ID', 'taxon_ID'],
)


def _to_taxon_ids(cell):
    """Parse a comma-separated string of taxon IDs into a list of ints."""
    return list(map(int, cell.split(',')))


# Replace each comma-separated string with the parsed list of integer IDs
df['taxon_ID'] = df['taxon_ID'].apply(_to_taxon_ids)

# One row per (NOG_ID, taxon_ID) combination
exploded_df = df.explode('taxon_ID')

# Collect, for every taxon, the list of NOG IDs it belongs to
nogs_by_taxon = exploded_df.groupby('taxon_ID')['NOG_ID']
all_eggnog_taxa_df = nogs_by_taxon.apply(list).reset_index()

In [3]:
def readGoldData(filepath:str)->pd.DataFrame:
    """Load a GOLD organisms CSV and keep only rows with a usable ecosystem annotation.

    The file is read as comma-separated, ISO-8859-1 encoded text. Rows are removed when
    any of the four ecosystem columns is missing, or when 'ORGANISM ECOSYSTEM TYPE' is
    one of the placeholder values listed in ``uninformative_types`` below.

    Args:
        filepath: Path to the GOLD organisms CSV file.

    Returns:
        A new, independent DataFrame containing the remaining rows. The original row
        index of the file is preserved (it is not reset). Because the result is a copy,
        callers can add or overwrite columns without a SettingWithCopyWarning.
    """
    ecosystem_cols = [
        'ORGANISM ECOSYSTEM PATH ID',
        'ORGANISM ECOSYSTEM',
        'ORGANISM ECOSYSTEM CATEGORY',
        'ORGANISM ECOSYSTEM TYPE',
    ]
    # Placeholder values that carry no ecosystem information
    uninformative_types = ['Unclassified', 'Undefined media', 'Unknown material', 'Unspecified system']

    allOrganisms = pd.read_csv(filepath, encoding='ISO-8859-1', sep=',', index_col=None, low_memory=False)
    # drop NaN values in the columns of interest
    allOrganisms = allOrganisms.dropna(subset=ecosystem_cols, axis=0)
    # remove unknown information
    is_uninformative = allOrganisms['ORGANISM ECOSYSTEM TYPE'].isin(uninformative_types)

    # .copy() detaches the result from the filtered frame so later column
    # assignments by the caller operate on an independent object.
    return allOrganisms.loc[~is_uninformative].copy()

In [4]:
# Read GOLD data
all_organisms_df = readGoldData(os.path.join(gold_dir, 'goldData_organisms.csv'))

# Rows without an NCBI taxon ID cannot be matched to eggNOG and would make the
# integer cast fail, so drop them first. The cast is done with .assign() so a new
# frame is built rather than a column being written into a filtered frame.
tax_id_col = 'ORGANISM NCBI TAX ID'
n_rows_before = len(all_organisms_df)
all_organisms_df = all_organisms_df.dropna(subset=[tax_id_col])
n_rows_missing_id = n_rows_before - len(all_organisms_df)
if n_rows_missing_id:
    logger.warning(f"Dropped {n_rows_missing_id} GOLD rows without an NCBI taxon ID")
all_organisms_df = all_organisms_df.assign(
    **{tax_id_col: all_organisms_df[tax_id_col].astype('int')})

# Get overlap of eggNOG and GOLD: one row per GOLD organism whose taxon ID also
# occurs in eggNOG, so a taxon ID can appear on several rows.
overlap_gold_df = (
    all_eggnog_taxa_df
    .merge(all_organisms_df, how='inner', left_on='taxon_ID', right_on=tax_id_col)
    .drop(columns=tax_id_col)
    .sort_values(by=['taxon_ID'])
)

# Save overlap data: each matched taxon ID once (the merged frame repeats a taxon
# ID for every GOLD organism row that carries it).
matched_taxa = overlap_gold_df['taxon_ID'].drop_duplicates()
e6_overlap_filepath = os.path.join(gold_e6_overlap_dir, 'e6_gold.overlap.txt')
matched_taxa.to_csv(e6_overlap_filepath, header=False, index=False)

# Get all OGs that are present in the overlap. Each taxon's OG list is taken once,
# so 'freq' is the number of matched taxa an OG occurs in rather than the number of
# GOLD organism rows.
ogs = overlap_gold_df.drop_duplicates(subset='taxon_ID')['NOG_ID'].explode()
# get the frequency of each OG (most frequent first)
ogs = ogs.value_counts().reset_index()
ogs.columns = ['NOG_ID','freq']
# Optional filter, currently disabled: keep only OGs present in at least 1/3 of the taxa
# ogs = ogs[ogs['freq'] >= len(matched_taxa) / 3]

# Save OGs data (IDs only, one per line)
e6_ogs_filepath = os.path.join(gold_e6_overlap_dir, 'e6_gold.ogs.txt')
ogs['NOG_ID'].to_csv(e6_ogs_filepath,sep='\t',header=False,index=False)
print(f"Matches between GOLD and eggNOG: {len(matched_taxa)}")

Matches between GOLD and eggNOG: 5590


Now there should be a file `e6_gold.ogs.txt` containing all OGs in the overlap, and a file `e6_gold.overlap.txt` containing all tax IDs that are present in both the bacterial OGs and the GOLD data.

Next we need to count how many different ecosystems each taxon in our overlap occurs in. We don't want taxa that are present in only one or two ecosystems, since they are not very informative.

In [5]:
# Correct the 'ORGANISM ECOSYSTEM TYPE' column of the GOLD data, then count the
# number of distinct ecosystem types each taxon is in.

# Define the minimum count of ecosystems a taxon should be in
min_count = 3

# Rename synonyms (old ecosystem type -> canonical ecosystem type)
replacementDict = {
    'Abdomen': 'Abdominal/Peritoneal cavity',
    'Water treatment plant': 'Wastewater',
    'Integument': 'Integumentary system',
    # Only needed when the dict is applied by substring replacement, where
    # 'Integument' also matches inside 'Integumentary system'; with the exact-value
    # replacement below this entry never matches a real value.
    'Integumentary systemary system': 'Integumentary system',
    'Anaerobic digestor': 'Anaerobic',
}
# Replace whole values only. A substring replacement (Series.str.replace) would
# also rewrite values that merely contain one of the keys.
overlap_gold_df['ORGANISM ECOSYSTEM TYPE'] = (
    overlap_gold_df['ORGANISM ECOSYSTEM TYPE'].replace(replacementDict))

############################################################################################################

# Create a df with counts of (taxon, ecosys) pairs, i.e. the number of GOLD rows
# per taxon and ecosystem type
taxon_ecosys_paircounts_df = (
    overlap_gold_df
    .groupby(['taxon_ID', 'ORGANISM ECOSYSTEM TYPE'])
    .size()
    .reset_index(name='COUNT')
)
display(taxon_ecosys_paircounts_df)

# Each taxon has one row per ecosystem type in the pair table, so the group size
# is the number of ecosystems a taxon is in
taxon_ecosys_counts_df = (
    taxon_ecosys_paircounts_df
    .groupby('taxon_ID')
    .size()
    .reset_index(name='COUNT')
)
display(taxon_ecosys_counts_df)

# Sort by 'COUNT' in descending order
taxon_ecosys_counts_df = taxon_ecosys_counts_df.sort_values(by='COUNT', ascending=False)

# Save the counts for ALL taxa (with the 'COUNT' column); the min_count filter is
# applied only afterwards
taxon_ecosys_counts_df.to_csv(os.path.join(gold_e6_overlap_dir, 'e6_gold.taxon_ecosys_counts_df.tsv'),
                               sep='\t', index=False)

# Keep the taxon IDs whose 'COUNT' is >= min_count (single 'taxon_ID' column)
filtered_taxa = taxon_ecosys_counts_df.loc[
    taxon_ecosys_counts_df['COUNT'] >= min_count, ['taxon_ID']]

# Save filtered_taxa to a file
filtered_taxa.to_csv(os.path.join(gold_e6_overlap_dir, 'e6_gold.filtered_taxa.txt'),
                    sep='\t', index=False, header=False)

# Print the length of filtered_taxa
print(f"Number of taxa in at least {min_count} ecosystems: {len(filtered_taxa)}")

# Select necessary columns from overlap_gold_df
subset_columns = ['ORGANISM NAME', 'taxon_ID',
                  'ORGANISM GOLD PHYLUM', 'ORGANISM GENUS',
                  'ORGANISM ECOSYSTEM TYPE', 'ORGANISM ECOSYSTEM SUBTYPE']
overlap_gold_subset_df = overlap_gold_df[subset_columns]

# keep only rows where 'taxon_ID' is in filtered_taxa
overlap_gold_subset_df = overlap_gold_subset_df[overlap_gold_subset_df['taxon_ID'].isin(
    filtered_taxa['taxon_ID'])]

# Save overlap_gold_subset_df to a file
overlap_gold_subset_df.to_csv(os.path.join(gold_e6_overlap_dir, 'e6_gold.overlap_subset_df.tsv'),
                              sep='\t', index=False)

Unnamed: 0,taxon_ID,ORGANISM ECOSYSTEM TYPE,COUNT
0,23,Echinodermata,1
1,48,Soil,3
2,52,Soil,3
3,54,Desert,1
4,54,Soil,3
...,...,...,...
6844,1981712,Soil,1
6845,1987383,Digestive system,1
6846,2044587,Digestive system,4
6847,2052828,Composting,1


Unnamed: 0,taxon_ID,COUNT
0,23,1
1,48,1
2,52,1
3,54,2
4,69,3
...,...,...
5585,1981712,1
5586,1987383,1
5587,2044587,1
5588,2052828,1


Number of taxa in at least 3 ecosystems: 219
