In [None]:
import re

import pandas as pd

from glob import glob
from shutil import move

from collections import defaultdict

### Find CoverM files to be reprocessed

In [None]:
# Sample metadata table; its "sample_name" column is the list of samples to match against.
smd = pd.read_csv("/local/path/to/data/metadata_1454_cluster_labels.csv", index_col=0)
samples = smd["sample_name"].to_list()

# Every gzipped file sitting directly in the CoverM output directory.
all_coverm = glob("/data/gpfs/projects/punim0639/vini/coverm/*.gz")

# Map each file path to the "_"-separated tokens of its base name.
coverm = {}
for coverm_path in all_coverm:
    base_name = coverm_path.split("/")[-1]
    # Text before the first "." with its last three characters removed
    # (presumably a fixed-width suffix -- TODO confirm against the file naming scheme).
    trimmed_stem = base_name.split(".")[0][:-3]
    coverm[coverm_path] = trimmed_stem.split("_")

In [None]:
def find_sample(l, sample_names=None):
    """Return the first sample name containing any of the tokens in ``l``.

    Parameters
    ----------
    l : list of str
        Tokens taken from a CoverM file name. When the first token is
        ``"Arc"``, the first three tokens are re-joined with ``"_"`` and that
        single string is used as the only search token.
    sample_names : list of str, optional
        Sample names to search, in priority order. Defaults to the
        notebook-level ``samples`` list.

    Returns
    -------
    str or bool
        The first sample name that contains at least one token as a
        substring, or ``False`` when nothing matches (including when ``l`` is
        empty or holds only empty strings).
    """
    if sample_names is None:
        sample_names = samples
    if not l:
        return False
    if l[0] == "Arc":
        l = ["_".join(l[:3])]
    # An empty string is a substring of every name, so it would match the
    # first sample unconditionally; drop such tokens before searching.
    tokens = [token for token in l if token]
    # NOTE(review): matching is by substring, so very short tokens can match
    # unrelated samples -- confirm this is acceptable for the naming scheme.
    for name in sample_names:
        if any(token in name for token in tokens):
            return name
    return False

    
def invert_dict(d):
    """Group the keys of ``d`` by their value.

    Returns a ``defaultdict(list)`` mapping every value found in ``d`` to the
    list of keys that carried it, in the iteration order of ``d``.
    """
    keys_by_value = defaultdict(list)
    for original_key in d:
        keys_by_value[d[original_key]].append(original_key)
    return keys_by_value

In [None]:
# Look up the sample for every CoverM file exactly once, then keep only the
# files for which a sample was found (find_sample returns False otherwise).
coverm_matches = {path: find_sample(tokens) for path, tokens in coverm.items()}
coverm_mapping = {path: sample for path, sample in coverm_matches.items() if sample}

# sample name -> list of CoverM files assigned to that sample
inverted_dict = invert_dict(coverm_mapping)
files_to_be_used = [item for sublist in inverted_dict.values() for item in sublist]

print("Files to be used:", len(files_to_be_used))

# Set membership keeps the loop linear in the number of files.
files_to_keep = set(files_to_be_used)
for file in all_coverm:
    if file not in files_to_keep:
        # Files without a matching sample are moved out of the working directory.
        move(file, "/data/gpfs/projects/punim0639/vini/coverm/not_used/")

## Clean tables

1. Get tables from `dask_step_3_format_concatenated_tables.py` (output suffix: `_formatted`)
 
2. Run this notebook (output suffix: `_formatted_filtered`)

3. Format tables with `create_R_tables.py` (output suffix: `_formatted_filtered_clean`)

4. Normalise data with `preprocess_data.R` (output suffix: `_formatted_filtered_clean_normalised`)

In [None]:
input_dir = "/local/path/to/"

# Count tables to load; each one is read from
# <input_dir>data/counts/<name>_trimmed_mean_formatted.csv with its first
# column as the index.
count_table_names = ["KEGG_ko", "KEGG_Pathway", "BRITE", "COG_category"]
tables = {
    name: pd.read_csv(input_dir + "data/counts/" + name + "_trimmed_mean_formatted.csv", index_col=0)
    for name in count_table_names
}

### Pathways

In [None]:
def parse_file(file_path):
    """Parse a two-level, ``#``-sectioned, tab-separated list file.

    The file is read line by line (surrounding whitespace stripped):

    * a line starting with a single ``#`` opens a new section;
    * a line starting with ``##`` opens a new subsection of the current section;
    * any other non-blank line is split on its first tab, and the text after
      the tab is appended to the current subsection.

    Blank lines are skipped, and entries that appear before the first section
    header are ignored.

    Parameters
    ----------
    file_path : str
        Path of the file to read.

    Returns
    -------
    dict
        ``{section: {subsection: [value, ...]}}``.

    Raises
    ------
    ValueError
        If a subsection appears before any section, if an entry appears inside
        a section before any subsection, or if an entry line has no tab.
    """
    sections = {}
    current_section = None
    subsection = None

    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                continue  # blank lines carry no information

            if line.startswith('##'):
                if current_section is None:
                    raise ValueError(
                        "{}:{}: subsection {!r} appears before any section".format(
                            file_path, line_number, line
                        )
                    )
                subsection = line[2:].strip()
                sections[current_section][subsection] = []  # list of entry values
            elif line.startswith('#'):
                current_section = line[1:].strip()
                sections[current_section] = {}
                # A new section starts without a subsection; forgetting the old
                # one prevents entries from being filed under a stale name.
                subsection = None
            else:
                if current_section is None:
                    continue  # entry before the first section header
                parts = line.split('\t', 1)
                if len(parts) != 2:
                    raise ValueError(
                        "{}:{}: expected a tab-separated entry, got {!r}".format(
                            file_path, line_number, line
                        )
                    )
                if subsection is None:
                    raise ValueError(
                        "{}:{}: entry {!r} appears before any subsection of {!r}".format(
                            file_path, line_number, line, current_section
                        )
                    )
                sections[current_section][subsection].append(parts[1])

    return sections

# Parse the pathway list into {section: {subsection: [entry, ...]}}.
pathways = parse_file("/data/gpfs/projects/punim1293/vini/db/kegg/pathway/pathway.list")

In [None]:
# Print the whole parsed hierarchy: section, then each subsection followed by
# its entries on tab-prefixed lines.
for section_name, subsections in pathways.items():
    print(section_name)
    for subsection_name, entries in subsections.items():
        print(subsection_name)
        for entry in entries:
            print("\t", entry)

In [None]:
# Top-level sections left out of the filtered copy.
level_1_to_remove = ["Drug Development", "Human Diseases", "Organismal Systems", "Genetic Information Processing"]
filtered_pathways = {
    section_name: subsections
    for section_name, subsections in pathways.items()
    if section_name not in level_1_to_remove
}

In [None]:
# Human Diseases: keep only the subsections listed in keys_to_keep.
keys_to_keep = ["Drug resistance: antimicrobial", ]#"Infectious disease: bacterial"]
filtered_pathways["Human Diseases"] = {
    subsection_name: entries
    for subsection_name, entries in pathways["Human Diseases"].items()
    if subsection_name in keys_to_keep
}

In [None]:
# Filter cellular processes: keep the subsections listed in keys_to_keep as
# they appear in the parsed file ...
keys_to_keep = ['Cellular community - prokaryotes',]
filtered_pathways["Cellular Processes"] = {k: v for k, v in pathways["Cellular Processes"].items() if k in keys_to_keep}
# ... and, for "Cell motility", only this hand-picked subset of entries.
# (Previously keys_to_keep was assigned here, which left cell_motility unused
# and stored a subsection name where entry names are expected.)
cell_motility = ['Bacterial chemotaxis', 'Flagellar assembly']
filtered_pathways["Cellular Processes"]["Cell motility"] = cell_motility

In [None]:
# Environmental Information Processing: keep only the subsections listed in keys_to_keep.
keys_to_keep = ["Membrane transport"]
filtered_pathways["Environmental Information Processing"] = {
    subsection_name: entries
    for subsection_name, entries in pathways["Environmental Information Processing"].items()
    if subsection_name in keys_to_keep
}

In [None]:
# Print what is left after filtering: section, then each subsection followed
# by its entries on tab-prefixed lines.
for section_name, subsections in filtered_pathways.items():
    print(section_name)
    for subsection_name, entries in subsections.items():
        print(subsection_name)
        for entry in entries:
            print("\t", entry)

In [None]:
# Flatten {section: {subsection: [entry, ...]}} into a single list of entries.
filtered_pathways_cols = [
    entry
    for subsections in filtered_pathways.values()
    for entries in subsections.values()
    for entry in entries
]

# NOTE(review): despite its name, this list is used below to EXCLUDE columns --
# confirm that exclusion (not addition) is the intended behaviour.
cols_to_add = [
    "Glycosphingolipid biosynthesis - lacto and neolacto series",
    "Glycosphingolipid biosynthesis - globo and isoglobo series",
    "Glycosphingolipid biosynthesis - ganglio series",
    "Biosynthesis of various plant secondary metabolites",
    "Primary bile acid biosynthesis",
    "Secondary bile acid biosynthesis",
]

# Keep entries that exist as columns of the KEGG_Pathway table and are not in
# the exclusion list, then select those columns.
kegg_pathway_columns = tables["KEGG_Pathway"].columns
filtered_cols = [
    entry
    for entry in filtered_pathways_cols
    if entry in kegg_pathway_columns and entry not in cols_to_add
]
kegg_pathway_filtered = tables["KEGG_Pathway"][filtered_cols]

In [None]:
# Show the retained column names whose last character is a digit
# (presumably a check for suffixed/duplicated column names -- TODO confirm).
[i for i in kegg_pathway_filtered if i[-1].isdigit()]

In [None]:
# Write the filtered KEGG pathway table next to the input tables.
kegg_pathway_filtered.to_csv(input_dir + "data/counts/KEGG_Pathway_trimmed_mean_formatted_filtered.csv")

In [None]:
# # Discard this approach, use CSR framework instead
# kos = tables_normalised["KEGG_ko"].columns.to_series().reset_index(drop=True)
# kos.to_csv("~/biogo-hub/data/misc/kegg_ko_columns_prefiltering.csv", index=False)
# pathways = tables_normalised["KEGG_Pathway"].columns.to_series().reset_index(drop=True)
# pathways.to_csv("~/biogo-hub/data/misc/kegg_Pathway_columns_prefiltering.csv", index=False)

# remaining_pathways = pd.read_csv("~/biogo-hub/data/misc/kegg_Pathway_columns_prefiltering.csv", header=None).values.flatten()
# filtered_pathways = [i for i in pathways if i not in remaining_pathways]
# remaining_kos = pd.read_csv("~/biogo-hub/data/misc/kegg_ko_columns_prefiltering.csv", header=None).values.flatten()
# filtered_kos = [i for i in kos if i not in remaining_kos]

### COG categories

In [None]:
# COG categories removed from the table before export.
to_filter = [
    'Cell cycle control, cell division, chromosome partitioning',
    'Cytoskeleton',
    'Function unknown',
    'Signal transduction mechanisms',
    'Chromatin structure and dynamics',
    ]

# Report categories that are not present (already dropped by an earlier run of
# this cell, or misspelled) instead of failing with a KeyError on re-run.
cog_columns_missing = [c for c in to_filter if c not in tables["COG_category"].columns]
if cog_columns_missing:
    print("COG categories not present (nothing to drop):", cog_columns_missing)

# errors="ignore" makes the cell safe to run more than once.
tables["COG_category"] = tables["COG_category"].drop(columns=to_filter, errors="ignore")

In [None]:
# Write the COG category table (after the column filtering above) next to the input tables.
tables["COG_category"].to_csv(input_dir + "data/counts/COG_category_trimmed_mean_formatted_filtered.csv")

### BRITE

In [None]:
# Inspect the BRITE column names.
tables["BRITE"].columns

### KEGG KO

In [None]:
# Work on a copy and shorten each column name to its first 30 characters,
# appending "_<position>" so that truncated names stay distinct.
# NOTE(review): kegg_ko is not written to disk in the visible cells -- confirm
# whether a *_formatted_filtered.csv export is expected for it.
kegg_ko = tables["KEGG_ko"].copy()
kegg_ko_short_names = []
for position, full_name in enumerate(tables["KEGG_ko"].columns):
    kegg_ko_short_names.append(full_name[:30] + "_" + str(position))
kegg_ko.columns = kegg_ko_short_names

In [None]:
# Show which tables are loaded.
tables.keys()

In [None]:
# Show which tables are loaded (same check as the previous cell).
tables.keys()

In [None]:
# Work on a copy and shorten each column name to its first 30 characters,
# appending "_<position>" so that truncated names stay distinct.
BRITE = tables["BRITE"].copy()
brite_short_names = []
for position, full_name in enumerate(tables["BRITE"].columns):
    brite_short_names.append(full_name[:30] + "_" + str(position))
BRITE.columns = brite_short_names

In [None]:
# Write the BRITE table with shortened column names next to the input tables.
BRITE.to_csv(input_dir + "data/counts/BRITE_trimmed_mean_formatted_filtered.csv")

In [None]:
# Summary statistics of column-name length across every loaded table.
column_name_lengths = [
    len(column_name)
    for table in tables.values()
    for column_name in table.columns.to_list()
]
pd.Series(column_name_lengths).describe()

### After running this

1. Run `create_R_tables.py`
2. Run `preprocess_data.R`