In [1]:
import pandas as pd
from pathlib import Path
import shutil

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [2]:
# All raw inputs live under one root; the individual dataset folders hang off it.
raw_data_root = "./raw_data"

# Root under which the external-test folders are written.
tgt_root_dir = "./CGCNN_MT/data"

# Folder of the WS24v2 validation set (label CSV plus a CIFs/ subfolder).
ws4_set_dir = f"{raw_data_root}/WS24v2/data_sets/validation_set"

# Folder of the blinded test set table (blinded_40_elsevier_MOFs.csv).
tsd_ssd_dir = f"{raw_data_root}/Nandy_2022_SciData/blinded_test_set"

# Folder searched for "<MofName>.cif" files of the thermal/solvent test set.
core_mof_dir = Path(f"{raw_data_root}/CoRE2019")

In [3]:
# Destination folders of the two external test sets.
external_root = Path(tgt_root_dir)
tgt_ws_dir = external_root / "WS24v2_external_test"
tgt_ts_dir = external_root / "TS_external_test"

# Create both folders, including any missing parents; existing ones are kept.
for external_dir in (tgt_ws_dir, tgt_ts_dir):
    external_dir.mkdir(parents=True, exist_ok=True)


In [4]:
# Label tables that ship with each raw dataset.
ws_label_file = Path(ws4_set_dir).joinpath("sources_labels_v2.csv")
ts_label_file = Path(tsd_ssd_dir).joinpath("blinded_40_elsevier_MOFs.csv")

# Read them in the same order: water-stability table first, then the
# thermal / solvent-removal table.
df_ws, df_ts = (
    pd.read_csv(label_file) for label_file in (ws_label_file, ts_label_file)
)

## Prediction performance on the external test set, as reported in the source reference

In [5]:
from sklearn import metrics

# Score the predictions stored in the blinded test-set table against the
# ground-truth columns of the same table.
# (The former bare `df_ts.head()` was dropped: it was not the last expression
# of the cell, so its result was computed and discarded without being shown.)
ss_true = df_ts['label (solvent removal stability)']
ts_true = df_ts['label (thermal stability)']
ts_pred = df_ts['predicted Td (thermal stability)']

# Solvent removal stability: accuracy is computed from the predicted labels,
# AUC from the predicted probabilities.
ss_acc = metrics.accuracy_score(ss_true, df_ts['predicted label (solvent removal stability)'])
ss_auc = metrics.roc_auc_score(ss_true, df_ts['predicted probability (solvent removal stability)'])

# Thermal stability: R^2 and mean absolute error of the predicted Td.
ts_r2 = metrics.r2_score(ts_true, ts_pred)
ts_mae = metrics.mean_absolute_error(ts_true, ts_pred)

print('Solvent removal stability accuracy:', ss_acc)
print('Solvent removal stability AUC:', ss_auc)
print('Thermal stability R^2:', ts_r2)
print('Thermal stability MAE:', ts_mae)

Solvent removal stability accuracy: 0.775
Solvent removal stability AUC: 0.8800000000000001
Thermal stability R^2: 0.3144906985649343
Thermal stability MAE: 54.582698990000004


In [6]:
## Prepare external test set for water stability
# Column renames applied to the raw water-stability table.
ws_renames = {
    'water_stability_label': 'water4_label',
    'file_name': 'MofName',
}
# Columns kept in the exported table, in this order.
valid_cols = ['MofName', 'MOF_name', "CCDC_refcode", 'water_label', 'water4_label', 'acid_label', 'base_label', 'boiling_label']

df_ws = df_ws.rename(columns=ws_renames)
df_ws = df_ws.assign(
    # Binary label: 1 when water4_label is above 2, otherwise 0.
    water_label=df_ws["water4_label"].map(lambda level: 1 if level > 2 else 0),
    # Keep only the part of the file name before its first '.'.
    MofName=df_ws["MofName"].map(lambda file_name: file_name.split('.')[0]),
)[valid_cols]

# Every row of this table belongs to the external test partition.
df_ws.insert(1, "Partition", "external_test")


### Copy CIF files to external test set directory
tgt_ws_cif_dir = tgt_ws_dir / 'cifs'
tgt_ws_cif_dir.mkdir(exist_ok=True)

# Copy every CIF that exists; remember the names whose file is missing.
failed_mofs = []
for cifid in df_ws['MofName']:
    cif_file = Path(ws4_set_dir) / f"CIFs/{cifid}.cif"
    if cif_file.exists():
        shutil.copy(str(cif_file), tgt_ws_cif_dir)
    else:
        failed_mofs.append(cifid)

# Only MOFs whose CIF was found stay in the exported label table.
has_cif = ~df_ws['MofName'].isin(failed_mofs)
df_ws = df_ws.loc[has_cif]
df_ws.to_csv(tgt_ws_dir / 'id_prop.csv', index=False)

##  Prepare external test set for thermal stability and solvent removal stability
# Column renames applied to the raw thermal / solvent-removal table.
ts_renames = {
    'label (solvent removal stability)': 'ss_label',
    'label (thermal stability)': 'ts_label',
    'CoRE_name': 'MofName',
}
# Columns kept in the exported table, in this order.
valid_cols = ['MofName', "refcode", 'ts_label', 'ss_label']

df_ts = df_ts.rename(columns=ts_renames)[valid_cols]
# Every row of this table belongs to the external test partition.
df_ts.insert(1, "Partition", "external_test")
df_ts.to_csv(tgt_ts_dir / 'id_prop.csv', index=False)

### Copy CIF files to external test set directory
tgt_ts_cif_dir = tgt_ts_dir/'cifs'
tgt_ts_cif_dir.mkdir(exist_ok=True)

# df_ts only carries MofName / Partition / refcode / ts_label / ss_label at
# this point, so the identifier column is 'MofName' (there is no 'CifId'
# column; indexing it raised KeyError).
# NOTE(review): assumes the CoRE2019 folder stores each structure as
# "<MofName>.cif" -- confirm against the folder contents.
missing_ts_cifs = []
for cifid in df_ts['MofName']:
    cif_file = Path(core_mof_dir)/f"{cifid}.cif"
    if not cif_file.exists():
        # Same policy as the water-stability copy loop: skip and remember.
        missing_ts_cifs.append(cifid)
        continue
    shutil.copy(str(cif_file), tgt_ts_cif_dir)

# Make skipped structures visible instead of dropping them silently.
if missing_ts_cifs:
    print(f"{len(missing_ts_cifs)} CIF file(s) not found in {core_mof_dir}: {missing_ts_cifs}")

## Clean cifs

In [6]:
import subprocess
from pathlib import Path
import os
import time
import shutil

def run_slurm_job(work_dir, executor="sbatch", script_name="run"):
    """Start ``<executor> <work_dir>/<script_name>`` through the shell, inside ``work_dir``.

    Args:
        work_dir: Directory that contains the script; it is also used as the
            working directory of the launched command. May be relative to the
            current working directory.
        executor: Command (optionally with flags) that receives the script
            path as its last argument.
        script_name: File name of the script inside ``work_dir``.

    Returns:
        The ``subprocess.Popen`` handle. The process is not waited for; its
        stdout and stderr are pipes that the caller has to read (for example
        with ``communicate()``).
    """
    # Make the directory absolute first. The command runs with cwd=work_dir,
    # so a relative script path would be interpreted relative to work_dir a
    # second time and the script would not be found.
    work_dir = Path(work_dir).resolve()
    script_path = work_dir / script_name
    process = subprocess.Popen(
        f"{executor} {script_path}",
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True,
        env=os.environ.copy(),
        cwd=str(work_dir),
    )
    return process

In [8]:
# Slurm batch script for the CIF-cleaning job. The {placeholders} are filled
# in with str.format() before the script is written to disk and submitted.
# NOTE(review): the `--santize` flag is kept exactly as spelled; it presumably
# matches the option name that clean_cif.py defines -- confirm before changing.
job_templet = """#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --output={log_dir}/%x_%A.out
#SBATCH --error={log_dir}/%x_%A.err
#SBATCH --partition=C9654 
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task={n_cpus}
#SBATCH --mem-per-cpu=50G

export PATH=/opt/share/miniconda3/envs/mofmthnn/bin/:$PATH
export LD_LIBRARY_PATH=/opt/share/miniconda3/envs/mofmthnn/lib/:$LD_LIBRARY_PATH

srun python -u clean_cif.py --cif_dir {cif_dir} --output_dir {output_dir} --santize {santize} --log_file {log_file} --n_cpus {n_cpus}
""".strip()


script_name = "clean.sh"
# Absolute path: the job is launched with this folder as its working
# directory, so a relative script path would be resolved against it twice.
work_dir = Path("./CGCNN_MT/datamodule").resolve()

cif_dirs = [
    # "./CGCNN_MT/data/TS_external_test/cifs",
    "./CGCNN_MT/data/WS24v2_external_test/cifs"
]

for cif_dir in cif_dirs:
    # The batch script runs from work_dir, not from the notebook's folder, so
    # every path written into it has to be absolute.
    cif_dir = Path(cif_dir).resolve()
    job_name = f"clean_{cif_dir.parent.name}"
    n_cpus = 1
    # NOTE(review): this is rendered into the script as the text "False";
    # confirm that clean_cif.py parses that text as a boolean.
    santize = False
    output_dir = cif_dir.parent / "clean_cifs"
    log_dir = cif_dir.parent
    # Start from an empty output folder so results of an earlier run do not mix in.
    if output_dir.exists():
        shutil.rmtree(output_dir)
    log_file = output_dir / "clean.log"
    job_script = job_templet.format(
        job_name=job_name,
        cif_dir=cif_dir,
        output_dir=output_dir,
        log_file=log_file,
        n_cpus=n_cpus,
        santize=santize,
        log_dir=log_dir,
    )
    with open(work_dir/script_name, "w") as f:
        f.write(job_script)
    process = run_slurm_job(work_dir, executor="sbatch", script_name=script_name)
    # communicate() drains both pipes and waits for the submit command, so an
    # error message on stderr is neither lost nor able to block the process.
    stdout, stderr = process.communicate()
    for line in stdout.decode().splitlines():
        if line.strip():
            print(line.strip())
    if process.returncode != 0:
        raise RuntimeError(
            f"Submitting {job_name} failed (exit code {process.returncode}): "
            f"{stderr.decode().strip()}"
        )
    print(f"Submitted job {job_name} with PID {process.pid}")
    time.sleep(1)

Submitted batch job 199515
Submitted job clean_WS24v2_external_test with PID 2140453


In [7]:
# Folders whose raw CIF count is compared with the cleaned CIF count.
cif_dirs = [
    # "./CGCNN_MT/data/TS_external_test/cifs",
    "./CGCNN_MT/data/WS24v2_external_test/cifs"
]

for cif_dir in cif_dirs:
    # Cleaned files are expected in the sibling "clean_cifs" folder.
    output_dir = Path(cif_dir).parent / "clean_cifs"
    print(cif_dir)
    n_raw = sum(1 for _ in Path(cif_dir).glob("*.cif"))
    n_clean = sum(1 for _ in output_dir.glob("*.cif"))
    print(n_raw, n_clean)
    print("-"*10)

./CGCNN_MT/data/WS24v2_external_test/cifs
46 46
----------


## Prepare graph data files

In [9]:
# Slurm batch script for the graph-data preparation job (prepare_data.py).
# The {placeholders} are filled in with str.format() before the script is
# written to disk and submitted. This rebinds the name `job_templet`.
job_templet = """#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --output={log_dir}/%x_%A.out
#SBATCH --error={log_dir}/%x_%A.err
#SBATCH --partition=C9654 
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task={n_cpus}
#SBATCH --mem-per-cpu=50G

export PATH=/opt/share/miniconda3/envs/mofmthnn/bin/:$PATH
export LD_LIBRARY_PATH=/opt/share/miniconda3/envs/mofmthnn/lib/:$LD_LIBRARY_PATH

srun python -u prepare_data.py --cif_dir {cif_dir} --n_cpus {n_cpus}
""".strip()


script_name = "prepare.sh"
# Absolute path: the job is launched with this folder as its working
# directory, so a relative script path would be resolved against it twice.
work_dir = Path("./CGCNN_MT/datamodule").resolve()

cif_dirs = [
    # "./CGCNN_MT/data/TS_external_test/clean_cifs",
    "./CGCNN_MT/data/WS24v2_external_test/clean_cifs"
]

for cif_dir in cif_dirs:
    # The batch script runs from work_dir, not from the notebook's folder, so
    # every path written into it has to be absolute.
    cif_dir = Path(cif_dir).resolve()
    job_name = f"prepare_{cif_dir.parent.name}"
    log_dir = cif_dir.parent
    n_cpus = 1
    job_script = job_templet.format(
        job_name=job_name,
        cif_dir=cif_dir,
        log_dir=log_dir,
        n_cpus=n_cpus,
    )
    with open(work_dir/script_name, "w") as f:
        f.write(job_script)
    process = run_slurm_job(work_dir, executor="sbatch", script_name=script_name)
    # communicate() drains both pipes and waits for the submit command, so an
    # error message on stderr is neither lost nor able to block the process.
    stdout, stderr = process.communicate()
    for line in stdout.decode().splitlines():
        if line.strip():
            print(line.strip())
    if process.returncode != 0:
        raise RuntimeError(
            f"Submitting {job_name} failed (exit code {process.returncode}): "
            f"{stderr.decode().strip()}"
        )
    print(f"Submitted job {job_name} with PID {process.pid}")
    time.sleep(1)

Submitted batch job 199523
Submitted job prepare_WS24v2_external_test with PID 2562377


## Prepare Zeo++ and RAC feature files

In [10]:
# Slurm batch script for the feature-generation job (feature_generation.py).
# The {placeholders} are filled in with str.format() before the script is
# written to disk and submitted. Unlike the other templates, the CPU count is
# fixed to 96 here and the text is not stripped.
job_templet_feat = """#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --output={log_dir}/%x_%A.out
#SBATCH --error={log_dir}/%x_%A.err
#SBATCH --partition=C9654 
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=96
export PATH=/opt/share/miniconda3/envs/mofmthnn/bin/:$PATH
export LD_LIBRARY_PATH=/opt/share/miniconda3/envs/mofmthnn/lib/:$LD_LIBRARY_PATH

srun python -u feature_generation.py --cif_dir {cif_dir} --prob_radius {prob_radius}
"""

# Absolute paths: the job is launched with work_dir as its working directory,
# so relative paths would be resolved against that folder instead of the
# notebook's folder.
work_dir = Path("./ML/featuring").resolve()
data_dir = Path("./CGCNN_MT/data").resolve()
task_names = [
    # "TS_external_test", 
    "WS24v2_external_test"
    ]
script_name = "run_slurm.sh"

for task_name in task_names:
    job_name = f"cif_feat_{task_name}"
    cif_dir = data_dir/f"{task_name}/clean_cifs"
    # Value passed to feature_generation.py as --prob_radius.
    prob_radius = 1.86
    job_script = job_templet_feat.format(
        job_name=job_name,
        cif_dir=cif_dir,
        prob_radius=prob_radius,
        log_dir=cif_dir.parent,
    )
    with open(work_dir/script_name, "w") as f:
        f.write(job_script)
    process = run_slurm_job(work_dir, executor="sbatch", script_name=script_name)
    # communicate() drains both pipes and waits for the submit command, so an
    # error message on stderr is neither lost nor able to block the process.
    stdout, stderr = process.communicate()
    for line in stdout.decode().splitlines():
        if line.strip():
            print(line.strip())
    if process.returncode != 0:
        raise RuntimeError(
            f"Submitting {job_name} failed (exit code {process.returncode}): "
            f"{stderr.decode().strip()}"
        )
    print(f"Submitted job {job_name} with PID {process.pid}")
    time.sleep(1)

Submitted batch job 199524
Submitted job cif_feat_WS24v2_external_test with PID 2562727


### Merge Zeo++ and RAC features with the label tables

In [11]:


## WS24_external_test
feat_file = "./CGCNN_MT/data/WS24v2_external_test/RAC_and_zeo_features.csv"
id_prop_file = "./CGCNN_MT/data/WS24v2_external_test/id_prop.csv"
# The merged table is written next to the feature file.
out_file = feat_file.replace(".csv", "_with_id_prop.csv")

df_new = pd.read_csv(feat_file)
df = pd.read_csv(id_prop_file)
print(df_new.shape, df.shape)

# Align the feature table with the label table: drop the file-path column and
# use the same key name ('MofName') on both sides.
df_new = df_new.drop(columns=['cif_file']).rename(columns={'name': 'MofName'})

id_prop_cols = ["MofName", "Partition", "water_label", "water4_label", "acid_label", "base_label", "boiling_label"]
# Left join on the label table; rows without features are kept here (no dropna).
labels_ws = df[id_prop_cols]
df_new = labels_ws.merge(df_new, on="MofName", how="left")
print(df_new.shape)
df_new.to_csv(out_file, index=False)


# TS_external_test
feat_file = "./CGCNN_MT/data/TS_external_test/RAC_and_zeo_features.csv"
id_prop_file = "./CGCNN_MT/data/TS_external_test/id_prop.csv"
# The merged table is written next to the feature file.
out_file = feat_file.replace(".csv", "_with_id_prop.csv")

df_new = pd.read_csv(feat_file)
df = pd.read_csv(id_prop_file)
print(df_new.shape, df.shape)

# Align the feature table with the label table: drop the file-path column and
# use the same key name ('MofName') on both sides.
df_new = df_new.drop(columns=['cif_file']).rename(columns={'name': 'MofName'})

# Binary thermal label: 1 when ts_label is at least 359, otherwise 0.
# NOTE(review): the origin of the 359 cut-off is not visible here -- confirm.
df = df.assign(ts2_label=df["ts_label"].ge(359).astype(int))
id_prop_cols = ["MofName", "Partition", "ts_label", "ts2_label", "ss_label"]
labels_ts = df[id_prop_cols]
# Left join on the label table, then discard every row with a missing value.
df_new = labels_ts.merge(df_new, on="MofName", how="left").dropna(axis=0, how='any')
print(df_new.shape)
df_new.to_csv(out_file, index=False)

(46, 192) (46, 9)
(46, 197)
