In [1]:
# ==========================================================
# Cell 1: Context and Parameters
# ==========================================================
# matchms_tol_0.0035_1%I_all_peaks_with_0s_only_matching.tsv → 382 entries
# matchms_tol_0.0035_1%I_top5_with_0s_only_matching.tsv → 380 entries
# RECETOX_GC-EI_MS_20201028.msp → experimental data, 384 molecules, 32 isomers (includes Bromuconazole_isomer1)
# RECETOX_GC-EI-MS_20201028.sdf → experimental data, 367 molecules, 4 isomers (Bromuconazole only)
# simulated_spectra.msp → raw simulated data, 369 molecules, 4 isomers
# simulated_matchms_filter_1%I_all_peaks.msp → simulated filtered data (all peaks), 367 molecules, 4 isomers
# simulated_matchms_filter_1%I_top5.msp → simulated filtered data (top 5 peaks), 367 molecules, 4 isomers
# Simulated spectra names were generated from the experimental SDF file (query).

In [2]:
# ==============================
# Step 0: Imports and Parameters
# ==============================
from pathlib import Path
from utils import *


# Parameters
# Root of the benchmark data tree. This is the only machine-specific value among
# the input paths: change it here and TSV_FILE / SDF_FILE follow.
DATA_DIR = "/home/recetox/RECETOX_SIM/QCxMS2_benchmark/data"

# Input files, kept as plain strings so the values handed to the loading helpers
# are exactly what they were when the paths were written out in full.
TSV_FILE = f"{DATA_DIR}/matching/matchms_tol_0.0035_1%I_all_peaks_with_0s_only_matching.tsv"
SDF_FILE = f"{DATA_DIR}/sdf/recetox_gc-ei-ms_20201028_properties.sdf"

# Score binning: BINS holds the bin edges and LABELS one name per interval
# between consecutive edges (5 edges -> 4 labels).
BINS = [0, 0.4, 0.6, 0.75, 1.0]
LABELS = ["very_poor", "poor", "borderline", "very_good"]

# Column names as they appear in the TSV header.
SCORE_COL = "CosineHungarian_0.0035_0.0_1.0_scores"
MATCHES_COL = "CosineHungarian_0.0035_0.0_1.0_matches"
QUERY_COL = "query"

# Category (one of LABELS) that the rest of the notebook filters on.
TARGET_CATEGORY = "very_poor"
# NOTE(review): OUTPUT_DIR is not referenced by any cell shown in this notebook.
OUTPUT_DIR = Path("./output")

In [3]:
# ==============================
# Step 1: Load TSV
# ==============================
# Read the tab-separated matching results named by TSV_FILE.
df_scores = pd.read_csv(TSV_FILE, sep="\t")
print(f"Loaded {len(df_scores)} rows from TSV file.")

# Fail fast if the file does not carry the columns configured in the parameters
# cell (e.g. a TSV produced with a different tolerance has differently named
# score/match columns). Without this check the problem only surfaces in a later
# step, far from its cause.
missing_cols = [col for col in (SCORE_COL, MATCHES_COL, QUERY_COL) if col not in df_scores.columns]
if missing_cols:
    raise KeyError(f"{TSV_FILE} is missing expected column(s): {missing_cols}")

Loaded 381 rows from TSV file.


In [4]:
# Short column names used from this cell onwards.
SCORE_COL_RENAMED = "scores"
MATCHES_COL_RENAMED = "matches"

# Map the long TSV headers onto the short names. QUERY_COL is set to "query" in
# the parameters cell, so its entry maps the name onto itself; the recorded log
# lists only the score and match renames.
column_renames = {
    SCORE_COL: SCORE_COL_RENAMED,
    MATCHES_COL: MATCHES_COL_RENAMED,
    QUERY_COL: "query",
}
df_scores = rename_columns(df_scores, column_renames, verbose=True)

Renamed column 'CosineHungarian_0.0035_0.0_1.0_scores' → 'scores'
Renamed column 'CosineHungarian_0.0035_0.0_1.0_matches' → 'matches'


In [5]:
# ==============================
# Step 2: Remove isotopically labeled molecules (early filtering)
# ==============================
# The helper's result is unpacked into the remaining rows (df_scores) and a second
# object (removed_iso), presumably the removed entries — confirm against utils.
# In the recorded run two entries were dropped (Perylene_2H12, Phenanthrene_2H10),
# leaving 379 rows.
# The literal "query" is the column name after the rename step above.
# df_scores is rebound here, so re-running only this cell works on the already
# filtered frame.
df_scores, removed_iso = remove_isotopically_labeled(df_scores, column="query", verbose=True)

Removed 2 isotopically labeled molecules
['Perylene_2H12', 'Phenanthrene_2H10']
Remaining rows after removal: 379


In [6]:
# ==============================
# Step 3: Select best isomers
# ==============================
# select_best_isomers returns three objects, unpacked below. Its logic is not
# visible in this notebook; judging by the names and the log line of the recorded
# run ("Selected 366 unique isomers | 362 both-max | 4 fallback"), df_unique
# presumably holds one chosen row per molecule, both_max_df the cases where one
# isomer is best on both score and matches, and alternative_selection_df the
# cases resolved by a fallback rule — TODO confirm against utils.
# Only df_unique is used by the later cells shown here.
df_unique, both_max_df, alternative_selection_df = select_best_isomers(
    df_scores,
    score_col=SCORE_COL_RENAMED,
    match_col=MATCHES_COL_RENAMED,
    verbose=True
)

Selected 366 unique isomers | 362 both-max | 4 fallback


In [7]:
# ==============================
# Step 4: Categorize by scores
# ==============================
# Categorize the rows of df_unique by their "scores" value, using the bin edges
# (BINS) and names (LABELS) from the parameters cell. The recorded run prints
# per-category counts under a "category" heading, and a "category" column is
# present in the merged frame produced later.
# NOTE(review): how a score lying exactly on a bin edge (e.g. 0.0) is assigned
# depends on categorize_by_score, which is not shown here — confirm.
# df_unique is rebound, so the uncategorized frame is no longer reachable by name.
df_unique = categorize_by_score(
    df_unique,
    score_col=SCORE_COL_RENAMED,
    bins=BINS,
    labels=LABELS,
    verbose=True
)


Categorized scores:
category
very_poor     157
very_good     110
poor           57
borderline     42
Name: count, dtype: int64


In [8]:
# ==============================
# Step 5: Filter for specific category
# ==============================
# Restrict df_unique to TARGET_CATEGORY (set in the parameters cell). The result
# goes to a new name, so df_unique itself stays available. The recorded run
# reports 157 molecules for 'very_poor'.
df_filtered = filter_by_category(df_unique, TARGET_CATEGORY, verbose=True)


Filtered 157 molecules in category 'very_poor'


In [9]:
# ==============================
# Step 6: Extract descriptors from SDF
# ==============================
# Parse SDF_FILE into a descriptor table (df_descriptors) and a second object,
# mol_list, which is passed to the 3D-generation step later on.
# The recorded run extracts 367 molecules and also shows a
# "Skipping unrecognized collection type ... MDLV30/STERAC1" message, presumably
# emitted by the SDF parser — the extraction still reports success.
df_descriptors, mol_list = extract_descriptors_from_sdf(SDF_FILE, verbose=True)
# Bare last expression: displays the descriptor column names as the cell output.
df_descriptors.columns


Extracted descriptors for 367 molecules successfully.


[11:57:43] Skipping unrecognized collection type at line 26872: MDLV30/STERAC1 BONDS=(1 7)


Index(['molname', 'class', 'superclass', 'subclass', 'n_atoms', 'n_bonds',
       'inchikey', 'smiles', 'has_halogen', 'Cl', 'Br', 'F', 'S', 'P', 'Si',
       'rotatable_bonds', 'stereo_centers', 'molecular_complexity',
       'molecular_flexibility', 'composition'],
      dtype='object')

In [10]:
# ==============================
# Step 7: Merge descriptors with filtered TSV results
# ==============================
# Join the per-molecule descriptors onto the filtered score rows: "query" on the
# score side is matched against "molname" on the descriptor side.
df_full = merge_descriptors(
    df_existing=df_filtered,
    df_descriptors=df_descriptors,
    left_on="query",
    right_on="molname",
    verbose=True
)

# Sanity check on the join. The join type used by merge_descriptors is not
# visible here, so both failure shapes are covered: rows dropped or duplicated
# (row count differs from the input) and rows kept without a descriptor match
# (missing "molname"). This only warns; it never stops the notebook.
n_unmatched = int(df_full["molname"].isna().sum())
if len(df_full) != len(df_filtered) or n_unmatched:
    print(
        f"WARNING: merge changed the row count ({len(df_filtered)} -> {len(df_full)}) "
        f"or left {n_unmatched} row(s) without a matching descriptor entry"
    )

# Bare last expression: displays the merged column names as the cell output.
df_full.columns

Merged descriptors — final dataset contains 157 rows


Index(['query', 'reference', 'scores', 'matches', 'category', 'molname',
       'class', 'superclass', 'subclass', 'n_atoms', 'n_bonds', 'inchikey',
       'smiles', 'has_halogen', 'Cl', 'Br', 'F', 'S', 'P', 'Si',
       'rotatable_bonds', 'stereo_centers', 'molecular_complexity',
       'molecular_flexibility', 'composition'],
      dtype='object')

In [11]:
# Generate 3D conformers for the molecules in TARGET_CATEGORY.
# df_full was already filtered to TARGET_CATEGORY in Step 5, so the subset filter
# must use the same parameter: a hard-coded category would select nothing as soon
# as TARGET_CATEGORY is changed in the parameters cell.
mol_list_3D, failed_keys = generate_3D_subset(
    mol_list=mol_list,
    df=df_full,
    subset_filter={'category': TARGET_CATEGORY}
)

# Write all to XYZ files; the subfolder is named after the category so runs for
# different categories do not end up in the same folder.
write_multiple_mols_to_xyz(mol_list_3D, subfolder=f"{TARGET_CATEGORY}_scores")


print(f"3D molecules generated: {len(mol_list_3D)}")
print(f"Failures: {len(failed_keys)}")




3D molecules generated: 157
Failures: 0


In [12]:
# ==============================
# Save final merged DataFrame
# ==============================
# The file name carries TARGET_CATEGORY so that switching the category in the
# parameters cell cannot overwrite the results of another category.
# NOTE(review): "all_peaks" is still fixed in the name; if TSV_FILE is pointed at
# the top5 matching file this name should change too.
final_file = f"/home/recetox/RECETOX_SIM/QCxMS2_benchmark/data/processed/{TARGET_CATEGORY}_scores_unique_descriptors_all_peaks.tsv"
# to_csv does not create missing directories, so make sure the target exists.
Path(final_file).parent.mkdir(parents=True, exist_ok=True)
df_full.to_csv(final_file, sep="\t", index=False)