In [1]:
import os
import pandas as pd

In [2]:
all_node_files = [f for f in os.listdir('../baseline-spras-nodes') if f.endswith('.txt')]
train_split_nodes = [f for f in os.listdir('../gnn-train-split-samples-nodes') if f.endswith('.txt')]

In [3]:
test_and_empty_train = list(set(all_node_files) - set(train_split_nodes))
len(test_and_empty_train)

252

In [4]:
def filter_files_with_data(file_list, folder_path):
    """
    Given:
      - file_list: list of filenames (e.g. ["a.csv","b.csv",…])
      - folder_path: path to the directory containing them

    Returns:
      - A new list containing only those files where, under the
        columns NODEID & prize, there is at least one non-empty row.
    """
    kept = []
    for fname in file_list:
        full_path = os.path.join(folder_path, fname)
        try:
            df = pd.read_csv(full_path, sep='\t')
        except Exception as e:
            # unreadable or not a CSV → skip
            print(f"Skipping {fname!r}: {e}")
            continue

        # Only consider files that actually have both columns:
        if {'NODEID','prize'}.issubset(df.columns):
            # Take just those two columns, coerce blanks → NaN:
            subset = df[['NODEID','prize']].replace(r'^\s*$', pd.NA, regex=True)
            # Drop any rows where *both* are NA:
            non_empty = subset.dropna(how='all')
            if non_empty.empty:
                # no rows had data in either column → skip the file
                continue

        # If we get here, either:
        #  • the file lacked one/both columns (we leave it in), or
        #  • it had at least one data‐row under NODEID/prize
        kept.append(fname)

    return kept


In [5]:
folder = "../baseline-spras-nodes"
filtered = filter_files_with_data(test_and_empty_train, folder)
print("Files with data in NODEID/prize:", filtered)
len(filtered)

Files with data in NODEID/prize: ['ether_lipid_metaboli_train_7450_nodes.txt', 'nitrogen_metabolism_train_3442_nodes.txt', 'aminoacyl_trna_biosy_train_5354_nodes.txt', 'fatty_acid_degradati_train_2865_nodes.txt', 'porphyrin_and_chloro_train_4873_nodes.txt', 'ubiquinone_and_other_train_4287_nodes.txt', 'starch_and_sucrose_m_train_4604_nodes.txt', 'pantothenate_and_coa_train_8812_nodes.txt', 'phenylalanine__tyros_train_9660_nodes.txt', 'pentose_and_glucuron_train_8480_nodes.txt', 'glycosphingolipid_bi__2_train_1757_nodes.txt', 'beta_alanine_metabol_train_4217_nodes.txt', 'histidine_metabolism_train_4981_nodes.txt', 'fatty_acid_biosynthe_train_1183_nodes.txt', 'steroid_biosynthesis_train_5084_nodes.txt', 'tryptophan_metabolis_train_6753_nodes.txt', 'pentose_phosphate_pa_train_8739_nodes.txt', 'citrate_cycle__tca_c_train_5392_nodes.txt', 'steroid_hormone_bios_train_8921_nodes.txt', 'fatty_acid_degradati_train_2123_nodes.txt', 'glycerophospholipid_train_8885_nodes.txt', 'purine_metabolism_t

137

In [6]:
output_path = "../processed-data-files/test_split_node_files.txt" 

with open(output_path, "w") as f:
    for item in filtered:
        f.write(f"{item}\n")