In [1]:
import pandas as pd
import os
import multiprocessing as mp
import tqdm
import csv

# the data directory is expected next to the directory the notebook is run from,
# i.e. inside the parent of the current working directory
_project_root = os.path.dirname(os.getcwd())
data_dir = os.path.join(_project_root, "data")
filtered_dir = os.path.join(data_dir, "filtered")

# paths to the filtered NOG list and to the STRING download directory
nog_list_filepath = os.path.join(
    filtered_dir, "nogs.taxa_filtered.gff_filtered.txt"
)
STRING_dir_filepath = os.path.join(data_dir, "STRING_v12")


In [2]:
%%bash -s "$STRING_dir_filepath" "$filtered_dir"
# Build the flat gene -> NOG and gene -> COG lookup tables.
#   $1: STRING directory (must contain COG.mappings.v12.0.txt)
#   $2: directory with map.nog_taxa_members.tsv; both outputs are written there too
# Abort on the first failing command (also inside pipelines) so that a success
# message is never printed next to a missing or truncated output file.
set -euo pipefail

string_dir="$1"
filtered_dir="$2"

# prepare a gene to NOG mapping tsv file by assigning each gene in the comma-delimited list of genes
# (column 6) to the NOG ID (column 2): a many-to-one mapping
awk -F'\t' '{n=split($6,a,","); for(i=1;i<=n;i++) print a[i]"\t"$2}' \
    "$filtered_dir/map.nog_taxa_members.tsv" > "$filtered_dir/map.gene_NOG.tsv"
echo "Gene-NOG mapping written to $filtered_dir/map.gene_NOG.tsv"

# read in the cog mappings file and prepare a gene to COG mapping file,
# skipping rows whose COG field is empty
cog_mappings_file="$string_dir/COG.mappings.v12.0.txt" # columns 1 and 4 are gene and COG IDs, respectively
cut -f1,4 "$cog_mappings_file" | awk '$2!=""' > "$filtered_dir/map.gene_COG.tsv"
echo "Gene-COG mapping written to $filtered_dir/map.gene_COG.tsv"

Gene-NOG mapping written to /root/work/projects/hgt_ecosystem/data/filtered/map.gene_NOG.tsv
Gene-COG mapping written to /root/work/projects/hgt_ecosystem/data/filtered/map.gene_COG.tsv


In [3]:
# prepare a NOG to COG mapping file by combining the gene to NOG and gene to COG mappings
# the gene to COG table is large, so it is processed chunk-wise in parallel using `multiprocessing`
# about 5mins to run

gene_nog_path = f"{filtered_dir}/map.gene_NOG.tsv"
gene_cog_path = f"{filtered_dir}/map.gene_COG.tsv"

# the gene to NOG table has no header row: column 1 is the gene, column 2 its NOG
gene_NOG_df = pd.read_csv(
    gene_nog_path, sep="\t", header=None, names=["gene", "NOG"]
)

# count the lines of the gene to COG file by streaming over it, so the file
# never has to be held in memory as a whole
num_lines = 0
with open(gene_cog_path) as f:
    for num_lines, _line in enumerate(f, start=1):
        pass
print(f"Number of lines in the gene-COG mapping file: {num_lines}")

# plain dict lookup gene -> NOG (for a repeated gene the last row wins)
gene_NOG_dict = gene_NOG_df.set_index("gene")["NOG"].to_dict()

# define fn to process each chunk of the gene-COG mapping file
def process_chunk(chunk, gene_NOG_dict):
    """Translate one chunk of the gene-COG table into unique (NOG, COG) rows.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Rows of the gene-COG table; needs a "gene" and a "COG" column.
    gene_NOG_dict : dict
        Lookup from gene ID to NOG ID.

    Returns
    -------
    pandas.DataFrame
        Columns ["NOG", "COG"], restricted to genes that are keys of
        `gene_NOG_dict`, with duplicate rows removed. Rows keep the index
        labels they had in `chunk`; `chunk` itself is not modified.
    """
    in_a_nog = chunk["gene"].isin(gene_NOG_dict.keys())
    pairs = (
        chunk[in_a_nog]
        .assign(NOG=lambda rows: rows["gene"].map(gene_NOG_dict))
        # the gene ID is no longer needed once its NOG is attached
        .drop(columns=["gene"])
        .drop_duplicates()
    )
    # put the NOG column first
    return pairs[["NOG", "COG"]]


# read in the gene-COG mapping file in chunks and process the chunks in parallel, with progress bars
# about 5mins to run
chunksize = 100_000  # rows per chunk; an int, since it is a row count
# ceiling division, so an exact multiple of `chunksize` is not counted as one chunk too many
num_chunks = max(1, -(-num_lines // chunksize))
# at most one worker per CPU core: one process per chunk would start hundreds of processes
num_threads = max(1, min(os.cpu_count() or 1, num_chunks))
chunk_results = []


def _init_chunk_worker(mapping):
    """Pool initializer: store the gene -> NOG mapping once per worker process.

    Keeping the mapping as a worker-side global avoids sending the whole
    dictionary along with every single chunk.
    """
    global _worker_gene_NOG_dict
    _worker_gene_NOG_dict = mapping


def _process_chunk_in_worker(chunk):
    """Run `process_chunk` on `chunk` with the mapping stored by `_init_chunk_worker`."""
    return process_chunk(chunk, _worker_gene_NOG_dict)


print(f"Processing {num_chunks} chunks of size {chunksize} lines each")
with mp.Pool(
    num_threads, initializer=_init_chunk_worker, initargs=(gene_NOG_dict,)
) as pool:
    for chunk in tqdm.tqdm(
        pd.read_csv(
            f"{filtered_dir}/map.gene_COG.tsv",
            sep="\t",
            header=None,
            names=["gene", "COG"],
            chunksize=chunksize,
        ),
        total=num_chunks,
        desc="Submitting chunks",
    ):
        # asynchronously process each chunk; the results are collected below
        chunk_results.append(
            pool.apply_async(_process_chunk_in_worker, args=(chunk,))
        )

    # results must be fetched while the pool is still alive
    nog_cog_frames = [
        result.get()
        for result in tqdm.tqdm(
            chunk_results, total=num_chunks, desc="Retrieving and writing results"
        )
    ]

# each chunk is only de-duplicated internally, so the same NOG-COG pair can be
# reported by several chunks: remove those repeats before writing
nog_cog_pairs = pd.concat(nog_cog_frames, ignore_index=True).drop_duplicates()
nog_cog_pairs.to_csv(
    f"{filtered_dir}/map.NOG_COG.tsv", sep="\t", header=False, index=False
)

print(f"NOG-COG mapping written to {filtered_dir}/map.NOG_COG.tsv")

Number of lines in the gene-COG mapping file: 56103122
Processing 562 chunks of size 100000.0 lines each


Submitting chunks: 100%|██████████| 562/562 [02:09<00:00,  4.33it/s]
Retrieving and writing results: 100%|██████████| 562/562 [00:49<00:00, 11.26it/s]  


NOG-COG mapping written to /root/work/projects/hgt_ecosystem/data/filtered/map.NOG_COG.tsv


In [4]:
%%bash -s "$data_dir"
# Download the COG, arCOG and KOG tables used to add a COG category column
# for each value of the COG column.
#   $1: data directory; all files are saved under $1/COG
# Abort on the first failing command: `wget -O` leaves an empty or partial file
# behind when a download fails, and that must not be mistaken for usable data.
set -euo pipefail

cog_dir="$1/COG"
mkdir -p "$cog_dir" && echo "Created directory $cog_dir"

# COG 2024 functional categories
# https://ftp.ncbi.nlm.nih.gov/pub/COG/COG2024/data/cog-24.fun.tab
wget -O "$cog_dir/cog_fun_2024.tsv" https://ftp.ncbi.nlm.nih.gov/pub/COG/COG2024/data/cog-24.fun.tab
# COG 2024 definitions
# https://ftp.ncbi.nlm.nih.gov/pub/COG/COG2024/data/cog-24.def.tab
wget -O "$cog_dir/cog_def_2024.tsv" https://ftp.ncbi.nlm.nih.gov/pub/COG/COG2024/data/cog-24.def.tab

# some of the NOGs in eggNOG map to arCOGs or KOGs, which are not present in the COG file
# download the arCOG and KOG definitions files and use them for the fn category assignment.
# The NOGs don't end up being eukaryotic or archaeal, but the fn categories are still useful.
# https://ftp.ncbi.nlm.nih.gov/pub/wolf/COGs/arCOG/ar14.arCOGdef19.tab
wget -O "$cog_dir/arCOG_def_2019.tsv" https://ftp.ncbi.nlm.nih.gov/pub/wolf/COGs/arCOG/ar14.arCOGdef19.tab
# https://ftp.ncbi.nlm.nih.gov/pub/COG/KOG/kog
wget -O "$cog_dir/KOG_def_2003.tsv" https://ftp.ncbi.nlm.nih.gov/pub/COG/KOG/kog

Created directory /root/work/projects/hgt_ecosystem/data/COG


--2025-03-17 13:31:35--  https://ftp.ncbi.nlm.nih.gov/pub/COG/COG2024/data/cog-24.fun.tab


Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.7, 130.14.250.11, 130.14.250.13, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1284 (1.3K)
Saving to: ‘/root/work/projects/hgt_ecosystem/data/COG/cog_fun_2024.tsv’

     0K .                                                     100% 50.3M=0s

2025-03-17 13:31:36 (50.3 MB/s) - ‘/root/work/projects/hgt_ecosystem/data/COG/cog_fun_2024.tsv’ saved [1284/1284]

--2025-03-17 13:31:36--  https://ftp.ncbi.nlm.nih.gov/pub/COG/COG2024/data/cog-24.def.tab
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 130.14.250.31, 130.14.250.7, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 408008 (398K)
Saving to: ‘/root/work/projects/hgt_ecosystem/data/COG/cog_def_2024.tsv’

     0K .......... .......... .......... ..

In [5]:
# we use these COG database files to annotate rows in the NOG-COG-PPI-length-size-functional dataframe
# both columns hold text IDs, so read them as strings
nog_cog_df = pd.read_csv(
    f"{filtered_dir}/map.NOG_COG.tsv",
    sep="\t",
    header=None,
    names=["NOG", "COG"],
    dtype=str,
)

# if the COG column contains something called NOGXXXX and the corresponding NOG column starts with COG,
# then we replace the COG column with the NOG column
# done column-wise instead of row by row; `na=False` makes a missing ID count
# as "no match" instead of raising on `.startswith`
cog_is_placeholder = nog_cog_df["COG"].str.startswith("NOG", na=False)
nog_is_cog_id = nog_cog_df["NOG"].str.startswith("COG", na=False)
nog_cog_df["COG"] = nog_cog_df["COG"].mask(
    cog_is_placeholder & nog_is_cog_id, nog_cog_df["NOG"]
)

cog_fun_filepath = f"{data_dir}/COG/cog_fun_2024.tsv"
with open(cog_fun_filepath) as f:
    fun_lines = [raw.strip().split("\t") for raw in f]

# sort the rows of the file by their number of tab-separated fields
fun_meta_category_dict = {}
fun_categories_list = []
for fields in fun_lines:
    if len(fields) == 2:
        # meta-categories are in the format '1\tInformation storage and processing'
        fun_meta_category_dict[fields[0]] = fields[1]
    elif len(fields) == 3:
        # one of the four expected fields is absent: a "MISSING" placeholder is put
        # into the second position (the meta-category slot) to restore four fields
        # NOTE(review): an earlier comment named the third element as the absent one,
        # which does not match the insert position - confirm against the file
        fields.insert(1, "MISSING")
    if len(fields) == 4:
        # categories are in the format 'J\t1\tFCCCFC\tTranslation, ribosomal structure and biogenesis'
        fun_categories_list.append(fields)

# one row per category; the meta-category key is replaced by the meta-category name
cog_fun_df = pd.DataFrame(
    fun_categories_list,
    columns=["COG_category_symbol", "Meta_Category", "Abbreviation", "Name"],
).assign(
    Meta_Category=lambda categories: categories["Meta_Category"].map(
        fun_meta_category_dict
    )
)
# both definition tables are headerless tab-separated files of which only the
# first two columns are used: the (ar)COG ID and its category symbol(s)
# quote characters are treated as ordinary text in BOTH files, so that a stray
# quote in one of the unused free-text columns cannot merge several lines into one row
_def_table_options = dict(
    sep="\t",
    header=None,
    usecols=[0, 1],
    names=["COG", "COG_category_symbol"],
    quoting=csv.QUOTE_NONE,
)
cog_def_df = pd.read_csv(f"{data_dir}/COG/cog_def_2024.tsv", **_def_table_options)
arcog_def_df = pd.read_csv(
    f"{data_dir}/COG/arCOG_def_2019.tsv", **_def_table_options
)

# KOG is in a different format: only the lines starting with "[" are used, and
# their first two whitespace-separated tokens are the bracketed category and the KOG ID
kog_lines = []
with open(f"{data_dir}/COG/KOG_def_2003.tsv") as f:
    for raw in f:
        stripped = raw.strip()
        if stripped.startswith("["):
            kog_lines.append(stripped.split()[:2])

kog_def_df = pd.DataFrame(kog_lines, columns=["KOG_category_symbol", "KOG"])
# keep the character right after the opening bracket, i.e. the first category letter
# (per the original note, the letters are listed in order of priority)
kog_def_df["KOG_category_symbol"] = kog_def_df["KOG_category_symbol"].str.get(1)

# attach the COG category symbol(s) to every NOG-COG pair; pairs whose COG is
# not in the COG definitions keep a missing symbol for now
annotated = pd.merge(nog_cog_df, cog_def_df, on="COG", how="left")

# fill the still-missing symbols from the arCOG definitions first, then from the KOG definitions
fallback_lookups = (
    arcog_def_df.set_index("COG")["COG_category_symbol"],
    kog_def_df.set_index("KOG")["KOG_category_symbol"],
)
category_symbols = annotated["COG_category_symbol"]
for lookup in fallback_lookups:
    category_symbols = category_symbols.fillna(annotated["COG"].map(lookup))

# for all the entries with more than 1 COG category (e.g. EH), we take the first letter
annotated["COG_category_symbol"] = category_symbols.str[0]

# translate the category letter into its name and its meta-category
category_info = cog_fun_df.set_index("COG_category_symbol")
annotated["COG_category_name"] = annotated["COG_category_symbol"].map(
    category_info["Name"]
)
annotated["COG_meta_category"] = annotated["COG_category_symbol"].map(
    category_info["Meta_Category"]
)

# drop na since what's left are NOGs that have no mappings to COG, arCOG, or KOG
annotated = annotated.dropna()
# remove 'poorly characterized' meta-category NOGs
is_poorly_characterized = annotated["COG_meta_category"].str.contains(
    "POORLY CHARACTERIZED"
)
nog_cog_ppi_len_size_func_cog_df = annotated[~is_poorly_characterized]

display(nog_cog_ppi_len_size_func_cog_df)

categories_out_path = f"{filtered_dir}/map.NOG_COG_categories.tsv"
nog_cog_ppi_len_size_func_cog_df.to_csv(
    categories_out_path, sep="\t", header=True, index=False
)
print(
    f"NOG-COG-functionalcategories mapping written to {filtered_dir}/map.NOG_COG_categories.tsv"
)

Unnamed: 0,NOG,COG,COG_category_symbol,COG_category_name,COG_meta_category
0,COG0001,COG0001,H,Coenzyme transport and metabolism,METABOLISM
1,COG1020,COG0001,H,Coenzyme transport and metabolism,METABOLISM
2,COG1670,COG0001,H,Coenzyme transport and metabolism,METABOLISM
3,COG0002,COG0002,E,Amino acid transport and metabolism,METABOLISM
4,COG0003,COG0003,P,Inorganic ion transport and metabolism,METABOLISM
...,...,...,...,...,...
11274,D1K1I,arCOG04219,L,"Replication, recombination and repair",INFORMATION STORAGE AND PROCESSING
11276,D8RMU,arCOG05249,C,Energy production and conversion,METABOLISM
11277,D5DBZ,arCOG06390,Q,"Secondary metabolites biosynthesis, transport ...",METABOLISM
11281,D2KCE,arCOG09463,V,Defense mechanisms,CELLULAR PROCESSES AND SIGNALING


NOG-COG-functionalcategories mapping written to /root/work/projects/hgt_ecosystem/data/filtered/map.NOG_COG_categories.tsv
