In [None]:
from pathlib import Path
import pandas as pd
from IPython.display import display, Markdown

# Define your project root (adjust if needed)
ROOT = Path(r"C:\DOCTORAL HUB\nmr_pipeline_project")

# Quick check
print(f" ROOT path set to: {ROOT.resolve()}")


 ROOT path set to: C:\DOCTORAL HUB\nmr_pipeline_project


In [2]:
import pandas as pd
from pathlib import Path
from IPython.display import display, Markdown
import json

# --- Define paths ---
ROOT = Path("C:/DOCTORAL HUB/nmr_pipeline_project")
VERIFIED = ROOT / "data" / "curated" / "verified"
DESC = ROOT / "data" / "descriptors"
MERGED = ROOT / "data" / "merged"
REPORTS = ROOT / "data" / "reports"

MERGED.mkdir(parents=True, exist_ok=True)
REPORTS.mkdir(parents=True, exist_ok=True)

# --- Load verified datasets ---
mols = pd.read_csv(VERIFIED / "molecules_verified.csv")
assigns = pd.read_csv(VERIFIED / "assignments_verified.csv")

display(Markdown("## Data Loaded Successfully"))
print(f"Molecules: {len(mols):,}")
print(f"Assignments: {len(assigns):,}")
display(mols.head(2))
display(assigns.head(2))


## Data Loaded Successfully

Molecules: 61,215
Assignments: 678,775


Unnamed: 0,mol_idx,name,smiles,inchi,solvent,temperature_k,has_13c,has_1h,n_atoms
0,1,,C1(C(C(C2(C(C1([H])[H])(C(C(=C(C2([H])[H])[H])...,InChI=1S/C15H22O3/c1-13(2)7-4-8-14(3)12(13)6-5...,CDCl3,298.0,True,False,40
1,2,Subergorgiol,C1(C(C2(C3(C1(C(=C(C3(C([H])([H])[H])[H])C(O[H...,InChI=1S/C15H24O/c1-10-4-7-15-11(2)12(9-16)8-1...,CDCl3,298.0,True,False,40


Unnamed: 0,mol_idx,atom_index,element,shift_ppm
0,1,11,C,17.6
1,1,1,C,18.3


In [2]:
ecfp0 = pd.read_csv(DESC / "ecfp0.csv")
ecfp2 = pd.read_csv(DESC / "ecfp2.csv")
ecfp4 = pd.read_csv(DESC / "ecfp4.csv")
hose = pd.read_csv(DESC / "hose.csv")

display(Markdown("## Descriptors Loaded"))
print(f"ECFP0: {len(ecfp0):,} rows")
print(f"ECFP2: {len(ecfp2):,} rows")
print(f"ECFP4: {len(ecfp4):,} rows")
print(f"HOSE: {len(hose):,} rows")


## Descriptors Loaded

ECFP0: 60,792 rows
ECFP2: 60,792 rows
ECFP4: 60,792 rows
HOSE: 513,880 rows


In [3]:
display(Markdown("##  Validation of Index Keys"))

common_idx = set(mols['mol_idx']).intersection(assigns['mol_idx']).intersection(ecfp0['mol_idx'])
print(f" Common molecule indices across all datasets: {len(common_idx):,}")


##  Validation of Index Keys

 Common molecule indices across all datasets: 60,792


In [4]:
display(Markdown("## Merging ECFP Descriptors"))

# Drop duplicates just in case
ecfp0 = ecfp0.drop_duplicates(subset=["mol_idx"])
ecfp2 = ecfp2.drop_duplicates(subset=["mol_idx"])
ecfp4 = ecfp4.drop_duplicates(subset=["mol_idx"])

# Merge all ECFP descriptors side-by-side
ecfp_merged = ecfp0.merge(ecfp2, on="mol_idx", suffixes=("_r0", "_r2")).merge(ecfp4, on="mol_idx", suffixes=("", "_r4"))

print(f" ECFP merged shape: {ecfp_merged.shape}")
display(ecfp_merged.head(3))


## Merging ECFP Descriptors

 ECFP merged shape: (60792, 3073)


Unnamed: 0,mol_idx,bit_0_r0,bit_1_r0,bit_2_r0,bit_3_r0,bit_4_r0,bit_5_r0,bit_6_r0,bit_7_r0,bit_8_r0,...,bit_1014,bit_1015,bit_1016,bit_1017,bit_1018,bit_1019,bit_1020,bit_1021,bit_1022,bit_1023
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [7]:
print("Assignments columns:")
print(assigns.columns.tolist())

print("\nHOSE columns:")
print(hose.columns.tolist())


Assignments columns:
['mol_idx', 'atom_index', 'element', 'shift_ppm']

HOSE columns:
['mol_idx', 'atom_index', 'element', 'shift_ppm', 'hose_1', 'hose_2', 'hose_3', 'hose_4']


In [11]:
from pathlib import Path
import pandas as pd
from IPython.display import display, Markdown

display(Markdown("## Chunk-Based Safe ECFP Compression (No Memory Errors)"))

# Path to your large ECFP descriptor file
ecfp_path = Path(ROOT / "data/descriptors/ecfp0.csv")  # or ecfp2.csv / ecfp4.csv if you prefer

# Output path for reduced descriptor file
compact_path = Path(ROOT / "data/descriptors/ecfp_compact.csv")

# Define how many fingerprint columns to keep
N_COLS_TO_KEEP = 128
CHUNKSIZE = 5000  # number of rows processed at once

# Read only the header to identify columns
with open(ecfp_path, 'r') as f:
    header = f.readline().strip().split(',')

# Always keep mol_idx + first N_COLS_TO_KEEP fingerprint columns
cols_to_use = ['mol_idx'] + header[1:N_COLS_TO_KEEP + 1]

print(f" Columns to use: {len(cols_to_use)} / {len(header)} total")

# Prepare the output file
pd.DataFrame(columns=cols_to_use).to_csv(compact_path, index=False)

# Process file in chunks to avoid memory overflow
for chunk in pd.read_csv(ecfp_path, usecols=cols_to_use, chunksize=CHUNKSIZE):
    chunk.to_csv(compact_path, mode='a', header=False, index=False)

print(f" Compact ECFP saved at: {compact_path}")


## Chunk-Based Safe ECFP Compression (No Memory Errors)

 Columns to use: 129 / 1025 total
 Compact ECFP saved at: C:\DOCTORAL HUB\nmr_pipeline_project\data\descriptors\ecfp_compact.csv


In [None]:
# --- Load all necessary data for Phase 3 (Final Fixed CSV Version) ---
from pathlib import Path
import pandas as pd
from IPython.display import display, Markdown

# Define project root
ROOT = Path(r"C:\DOCTORAL HUB\nmr_pipeline_project")
print(f" ROOT path set to: {ROOT.resolve()}")

# Define correct CSV paths
assign_path = ROOT / "data/curated/verified/assignments_verified.csv"
hose_path = ROOT / "data/descriptors/hose.csv"
ecfp_path = ROOT / "data/descriptors/ecfp_compact.csv"

# Check if files exist before reading
for path in [assign_path, hose_path, ecfp_path]:
    print(f"{'' if path.exists() else '❌'} {path.name} -> {path}")

# Load data (all CSVs)
assigns = pd.read_csv(assign_path)
hose = pd.read_csv(hose_path)
ecfp_small = pd.read_csv(ecfp_path)

print(f"\n Assignments shape: {assigns.shape}")
print(f" HOSE shape: {hose.shape}")
print(f" ECFP shape: {ecfp_small.shape}")


✅ ROOT path set to: C:\DOCTORAL HUB\nmr_pipeline_project
✅ assignments_verified.csv -> C:\DOCTORAL HUB\nmr_pipeline_project\data\curated\verified\assignments_verified.csv
✅ hose.csv -> C:\DOCTORAL HUB\nmr_pipeline_project\data\descriptors\hose.csv
✅ ecfp_compact.csv -> C:\DOCTORAL HUB\nmr_pipeline_project\data\descriptors\ecfp_compact.csv

✅ Assignments shape: (678775, 4)
✅ HOSE shape: (513880, 8)
✅ ECFP shape: (60792, 129)


In [11]:
# --- Phase 3 Final Merge: Streamed Mode (Memory-Safe) ---
from pathlib import Path
import pandas as pd

# Define output folder and file
output_path = ROOT / "data/merged/merged_phase3.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)

# --- Step 1: Merge HOSE (atom-level) with Assignments ---
print(" Merging HOSE (atom-level) with Assignments...")
merged = assigns.merge(hose, on=["mol_idx", "atom_index"], how="inner")
print(f" After HOSE merge: {merged.shape}")

# --- Step 2: Prepare ECFP data ---
print("\n Preparing compact ECFP data...")
ecfp_small = ecfp_small.drop_duplicates(subset=["mol_idx"]).reset_index(drop=True)
ecfp_small = ecfp_small.fillna(0)
print(f" Compact ECFP prepared: {ecfp_small.shape}")

# --- Step 3: Streamed merge in chunks to avoid memory overflow ---
print("\n Starting streamed merge in chunks...")
CHUNK = 20000  # adjust if needed
cols = list(merged.columns) + list(ecfp_small.columns[1:])

# Write header only once
pd.DataFrame(columns=cols).to_csv(output_path, index=False)

# Merge and append chunk-by-chunk
for i in range(0, len(merged), CHUNK):
    part = merged.iloc[i:i+CHUNK].merge(ecfp_small, on="mol_idx", how="left")
    part.to_csv(output_path, mode="a", header=False, index=False)
    print(f" Merged chunk {i//CHUNK + 1}/{(len(merged)//CHUNK) + 1} ({len(part)} rows)")

print(f"\n All chunks merged successfully!\n Final saved file: {output_path}")


 Merging HOSE (atom-level) with Assignments...
 After HOSE merge: (756690, 10)

 Preparing compact ECFP data...
 Compact ECFP prepared: (60792, 129)

 Starting streamed merge in chunks...
 Merged chunk 1/38 (20000 rows)
 Merged chunk 2/38 (20000 rows)
 Merged chunk 3/38 (20000 rows)
 Merged chunk 4/38 (20000 rows)
 Merged chunk 5/38 (20000 rows)
 Merged chunk 6/38 (20000 rows)
 Merged chunk 7/38 (20000 rows)
 Merged chunk 8/38 (20000 rows)
 Merged chunk 9/38 (20000 rows)
 Merged chunk 10/38 (20000 rows)
 Merged chunk 11/38 (20000 rows)
 Merged chunk 12/38 (20000 rows)
 Merged chunk 13/38 (20000 rows)
 Merged chunk 14/38 (20000 rows)
 Merged chunk 15/38 (20000 rows)
 Merged chunk 16/38 (20000 rows)
 Merged chunk 17/38 (20000 rows)
 Merged chunk 18/38 (20000 rows)
 Merged chunk 19/38 (20000 rows)
 Merged chunk 20/38 (20000 rows)
 Merged chunk 21/38 (20000 rows)
 Merged chunk 22/38 (20000 rows)
 Merged chunk 23/38 (20000 rows)
 Merged chunk 24/38 (20000 rows)
 Merged chunk 25/38 (20000 ro

In [13]:
# --- Phase 3 Verification: Data Integrity & Consistency ---
import pandas as pd

print(" Verifying Phase 3 Merged Data Integrity...\n")

# Load the merged dataset (from the output file we created)
merged_path = ROOT / "data/merged/merged_phase3.csv"
merged_df = pd.read_csv(merged_path)

# --- Basic Shapes ---
print(f"Assignments: {assigns.shape}")
print(f"HOSE: {hose.shape}")
print(f"ECFP (Compact): {ecfp_small.shape}")
print(f"Merged: {merged_df.shape}")

# --- Molecule and Atom Counts ---
mol_in_assign = assigns['mol_idx'].nunique()
mol_in_hose = hose['mol_idx'].nunique()
mol_in_ecfp = ecfp_small['mol_idx'].nunique()
mol_in_merged = merged_df['mol_idx'].nunique()

atom_in_assign = assigns.shape[0]
atom_in_hose = hose.shape[0]
atom_in_merged = merged_df.shape[0]

print("\n Molecule Counts")
print(f"Assignments mol_idx: {mol_in_assign}")
print(f"HOSE mol_idx:        {mol_in_hose}")
print(f"ECFP mol_idx:        {mol_in_ecfp}")
print(f"Merged mol_idx:      {mol_in_merged}")

print("\n Atom Counts")
print(f"Assignments atoms: {atom_in_assign}")
print(f"HOSE atoms:        {atom_in_hose}")
print(f"Merged atoms:      {atom_in_merged}")

# --- Missing Value & Duplicate Checks ---
missing_vals = merged_df.isnull().sum().sum()
duplicates = merged_df.duplicated(subset=["mol_idx", "atom_index"]).sum()

print("\n Data Quality Checks")
print(f"Missing values: {missing_vals}")
print(f"Duplicate (mol_idx + atom_index): {duplicates}")

# --- Sanity Assertions ---
if mol_in_merged <= mol_in_ecfp:
    print("\n Molecule count preserved.")
else:
    print("\n Molecule mismatch detected!")

if atom_in_merged >= 0.95 * atom_in_assign:
    print(" Atom-level mapping nearly complete.")
else:
    print(" Potential loss in atom-level mapping — investigate.")

if missing_vals == 0:
    print(" No missing values detected.")
else:
    print(" Missing values exist — consider imputing or reviewing.")

if duplicates == 0:
    print(" No duplicate atom entries.")
else:
    print(f" {duplicates} duplicate atom entries found — verify consistency.")

print("\n Integrity verification complete.")



 Verifying Phase 3 Merged Data Integrity...

Assignments: (678775, 4)
HOSE: (513880, 8)
ECFP (Compact): (60792, 129)
Merged: (756690, 138)

 Molecule Counts
Assignments mol_idx: 61215
HOSE mol_idx:        49761
ECFP mol_idx:        60792
Merged mol_idx:      49761

 Atom Counts
Assignments atoms: 678775
HOSE atoms:        513880
Merged atoms:      756690

 Data Quality Checks
Missing values: 0
Duplicate (mol_idx + atom_index): 322543

 Molecule count preserved.
 Atom-level mapping nearly complete.
 No missing values detected.
 322543 duplicate atom entries found — verify consistency.

 Integrity verification complete.


In [14]:
# --- Phase 3 Cleanup: Remove duplicate atom entries (Safe + Verified) ---
print(" Cleaning up duplicate atom entries (mol_idx + atom_index)...")

# Track before cleanup
before_rows = merged_df.shape[0]
before_mols = merged_df["mol_idx"].nunique()

# Remove exact duplicates (same molecule + atom)
merged_clean = merged_df.drop_duplicates(subset=["mol_idx", "atom_index"], keep="first")

# Track after cleanup
after_rows = merged_clean.shape[0]
after_mols = merged_clean["mol_idx"].nunique()

# Save cleaned version safely
clean_path = ROOT / "data/merged/merged_phase3_clean.csv"
merged_clean.to_csv(clean_path, index=False)

# --- Verification summary ---
print("\n Cleanup complete!")
print(f" Original merged rows: {before_rows:,}")
print(f" Cleaned merged rows: {after_rows:,}")
print(f" Duplicates removed:  {before_rows - after_rows:,}")
print(f" Molecules before:    {before_mols:,}")
print(f" Molecules after:     {after_mols:,}")

if before_mols == after_mols:
    print("\n Molecule-level integrity preserved — no loss of chemical information.")
else:
    print("\n Warning: Molecule count changed — check mapping consistency.")

print(f"\n Cleaned dataset saved to:\n{clean_path}")


 Cleaning up duplicate atom entries (mol_idx + atom_index)...

 Cleanup complete!
 Original merged rows: 756,690
 Cleaned merged rows: 434,147
 Duplicates removed:  322,543
 Molecules before:    49,761
 Molecules after:     49,761

 Molecule-level integrity preserved — no loss of chemical information.

 Cleaned dataset saved to:
C:\DOCTORAL HUB\nmr_pipeline_project\data\merged\merged_phase3_clean.csv
