In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from matplotlib.colors import Normalize
from CGCNN_MT.predict import main as predict_main
# Load the data

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [2]:
def plot_tsne_for_task_by_targets(task, latent_feas, targets, targets_map, fig_file=None, figsize=(10, 8), size=10, **kwargs):
    """Embed a task's last-layer features with t-SNE and colour the points by target.

    Features and targets of every available split ('train', 'val', 'test',
    'external_test') are concatenated, projected to 2-D with t-SNE
    (``random_state=42``) and drawn as a scatter plot. Tasks with fewer than
    20 distinct target values are drawn as categories (one colour/marker per
    class); all other tasks are drawn with a continuous 'viridis' colour bar.

    Args:
        task: Task name; entries are looked up under the key ``f"{task}_{split}"``.
        latent_feas: Mapping from ``"{task}_{split}"`` to a feature array of
            shape (n_samples, n_features), as required by ``TSNE.fit_transform``.
        targets: Mapping with the same keys as ``latent_feas`` holding one
            target value per sample.
        targets_map: Mapping ``task -> {int label: display name}`` used to
            label the legend for categorical tasks. Labels (or whole tasks)
            missing from the map are shown by their raw value.
        fig_file: Optional path; when given the figure is saved there at 300 dpi.
        figsize: Figure size in inches, passed to matplotlib.
        size: Marker area passed to the scatter call as ``s``.
        **kwargs: Only ``alpha`` (marker opacity, default 0.5) is read.

    Raises:
        ValueError: If no split of ``task`` is present in ``latent_feas`` or
            the number of feature rows and targets differ.
    """
    split_names = ['train', 'val', 'test', "external_test"]
    keys = [f"{task}_{split}" for split in split_names if f"{task}_{split}" in latent_feas]
    if not keys:
        raise ValueError(f"No latent features found for task {task!r}")

    # Concatenate all latent features and targets for the task
    all_latent_feas = np.concatenate([latent_feas[key] for key in keys], axis=0)
    all_targets = np.concatenate([targets[key] for key in keys], axis=0)
    n_samples = len(all_latent_feas)
    if len(all_targets) != n_samples:
        # Fail before the expensive embedding rather than when building the frame.
        raise ValueError(
            f"Task {task!r}: {n_samples} feature rows but {len(all_targets)} targets"
        )

    # Perform TSNE. scikit-learn requires perplexity < n_samples, so the
    # default of 30 is lowered for very small sample sets.
    perplexity = min(30.0, max(1.0, n_samples - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    tsne_results = tsne.fit_transform(all_latent_feas)

    # Create a DataFrame for visualization
    tsne_df = pd.DataFrame(tsne_results, columns=['TSNE1', 'TSNE2'])
    tsne_df['Target'] = all_targets

    # Determine if the task is classification or regression
    unique_targets = np.unique(all_targets)
    is_classification = len(unique_targets) < 20

    # Plotting
    alpha = kwargs.get('alpha', 0.5)
    fig, ax = plt.subplots(figsize=figsize)

    if is_classification:
        label_names = targets_map.get(task, {})
        # Fall back to the raw value when no display name is registered.
        tsne_df['Target'] = tsne_df['Target'].apply(
            lambda x: label_names.get(int(x), str(x))
        )
        sns.scatterplot(
            x="TSNE1", y="TSNE2",
            hue="Target",
            style="Target",
            s=size,  # constant marker area; `sizes=` alone is ignored by seaborn
            palette=sns.color_palette("hsv", len(unique_targets), desat=0.6),
            data=tsne_df,
            legend="full",
            alpha=alpha,
            markers=True,
            ax=ax,
        )
    else:
        norm = Normalize(vmin=all_targets.min(), vmax=all_targets.max())
        cmap = plt.get_cmap('viridis')
        scatter = ax.scatter(
            tsne_df['TSNE1'], tsne_df['TSNE2'],
            c=all_targets, cmap=cmap, norm=norm,
            s=size,
            alpha=alpha,
        )
        fig.colorbar(scatter, ax=ax, label='Target')

    ax.set_xlabel("TSNE1")
    ax.set_ylabel("TSNE2")
    ax.set_title(f"TSNE visualization of last-layer features for task: {task} (by targets)")
    if fig_file is not None:
        fig.savefig(fig_file, dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure; this function is typically called once per task in a loop.
    plt.close(fig)

### Generating the last-layer features for the training, validation, and test sets

In [5]:
# Trained run(s) to evaluate: short run name -> model type and checkpoint directory.
task2models = {
 'TSD_SSD_WS24_all_attn': {'Model': 'att_cgcnn',
  'Path': './CGCNN_MT/logs/TSD_SSD_WS24_water_WS24_water4_WS24_acid_WS24_base_WS24_boiling_seed42_att_cgcnn/version_43'}}

# Directory handed to predict_main as `result_dir`.
result_dir = Path("./CGCNN_MT/evaluation")

tasks = [
    "TSD_SSD_WS24_all_attn"
]
model_dirs = [task2models[t] for t in tasks]

# `data_dirs` and `col2tasks` are parallel lists: col2tasks[i] belongs to data_dirs[i].
# Each dict presumably maps a label column of that dataset to the task name
# the model was trained on -- confirm against CGCNN_MT.predict.
data_dirs = [
    "./CGCNN_MT/data/TSD",
    "./CGCNN_MT/data/SSD",
    "./CGCNN_MT/data/WS24",
]
col2tasks = [
    {"Label": "TSD"},
    {"Label": "SSD"},
    {"water_label": "WS24_water", "water4_label": "WS24_water4", "acid_label": "WS24_acid", "base_label": "WS24_base", "boiling_label": "WS24_boiling"},
    ]

# zip() would silently drop trailing entries if the two lists drift apart.
if len(col2tasks) != len(data_dirs):
    raise ValueError(
        f"col2tasks has {len(col2tasks)} entries but data_dirs has {len(data_dirs)}"
    )

# Run prediction for every (run, dataset, split) combination.
for task in tasks:
    model_dir = task2models[task]["Path"]
    for col2task, data_dir in zip(col2tasks, data_dirs):
        for split in ["train", "val", "test"]:
            # Only the return values of the last call stay bound after the loop.
            all_outputs, all_metrics = predict_main(model_dir, data_dir, col2task, split=split, result_dir=result_dir)

/opt/share/miniconda3/envs/mofmthnn/lib/python3.9/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /opt/share/miniconda3/envs/mofmthnn/lib/python3.9/si ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/opt/share/miniconda3/envs/mofmthnn/lib/python3.9/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /opt/share/miniconda3/envs/mofmthnn/lib/python3.9/si ...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]


task_types:  ['regression', 'classification', 'classification', 'classification_4', 'classification', 'classification', 'classification']
Model hyperparameters:////////////////////////////////////////////////////////////
batch_size: 32
num_workers: 2
random_seed: 42
accelerator: gpu
devices: 1
max_epochs: 500
auto_lr_bs_find: False
progress_bar: False
focal_alpha: 0.25
focal_gamma: 2
optim: adam
lr: 0.001
weight_decay: 1e-05
momentum: 0.9
optim_config: fine
group_lr: True
lr_mult: 1
lr_scheduler: reduce_on_plateau
lr_decay_steps: 20
lr_decay_rate: 0.8
lr_decay_min_lr: 1e-06
max_steps: -1
decay_power: 1
warmup_steps: 2
load_best: False
log_dir: logs
patience: 50
min_delta: 0.001
monitor: val_Metric
mode: max
eval_freq: 10
max_num_nbr: 10
radius: 8
dmin: 0
step: 0.2
use_cell_params: True
use_extra_fea: False
augment: False
model_name: att_cgcnn
atom_fea_len: 144
extra_fea_len: 28
h_fea_len: 144
n_conv: 4
n_h: 8
att_S: 64
dropout_prob: 0.55
att_pooling: False
task_norm: True
dwa_temp: 2.0

Predicting: |          | 0/? [00:00<?, ?it/s]

(501,) (501,)
TSD/val_R2Score: 0.4820, TSD/val_MeanAbsoluteError: 45.6930


/opt/share/miniconda3/envs/mofmthnn/lib/python3.9/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /opt/share/miniconda3/envs/mofmthnn/lib/python3.9/si ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/opt/share/miniconda3/envs/mofmthnn/lib/python3.9/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /opt/share/miniconda3/envs/mofmthnn/lib/python3.9/si ...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]


Metrics saved to CGCNN_MT/evaluation/TSD_SSD_WS24_water_WS24_water4_WS24_acid_WS24_base_WS24_boiling_seed42_att_cgcnn@version_43/val_metrics.csv
task_types:  ['regression', 'classification', 'classification', 'classification_4', 'classification', 'classification', 'classification']
Model hyperparameters:////////////////////////////////////////////////////////////
batch_size: 32
num_workers: 2
random_seed: 42
accelerator: gpu
devices: 1
max_epochs: 500
auto_lr_bs_find: False
progress_bar: False
focal_alpha: 0.25
focal_gamma: 2
optim: adam
lr: 0.001
weight_decay: 1e-05
momentum: 0.9
optim_config: fine
group_lr: True
lr_mult: 1
lr_scheduler: reduce_on_plateau
lr_decay_steps: 20
lr_decay_rate: 0.8
lr_decay_min_lr: 1e-06
max_steps: -1
decay_power: 1
warmup_steps: 2
load_best: False
log_dir: logs
patience: 50
min_delta: 0.001
monitor: val_Metric
mode: max
eval_freq: 10
max_num_nbr: 10
radius: 8
dmin: 0
step: 0.2
use_cell_params: True
use_extra_fea: False
augment: False
model_name: att_cgcnn


Predicting: |          | 0/? [00:00<?, ?it/s]

/opt/share/miniconda3/envs/mofmthnn/lib/python3.9/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /opt/share/miniconda3/envs/mofmthnn/lib/python3.9/si ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/opt/share/miniconda3/envs/mofmthnn/lib/python3.9/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /opt/share/miniconda3/envs/mofmthnn/lib/python3.9/si ...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]


Metrics saved to CGCNN_MT/evaluation/TSD_SSD_WS24_water_WS24_water4_WS24_acid_WS24_base_WS24_boiling_seed42_att_cgcnn@version_43/val_metrics.csv
task_types:  ['regression', 'classification', 'classification', 'classification_4', 'classification', 'classification', 'classification']
Model hyperparameters:////////////////////////////////////////////////////////////
batch_size: 32
num_workers: 2
random_seed: 42
accelerator: gpu
devices: 1
max_epochs: 500
auto_lr_bs_find: False
progress_bar: False
focal_alpha: 0.25
focal_gamma: 2
optim: adam
lr: 0.001
weight_decay: 1e-05
momentum: 0.9
optim_config: fine
group_lr: True
lr_mult: 1
lr_scheduler: reduce_on_plateau
lr_decay_steps: 20
lr_decay_rate: 0.8
lr_decay_min_lr: 1e-06
max_steps: -1
decay_power: 1
warmup_steps: 2
load_best: False
log_dir: logs
patience: 50
min_delta: 0.001
monitor: val_Metric
mode: max
eval_freq: 10
max_num_nbr: 10
radius: 8
dmin: 0
step: 0.2
use_cell_params: True
use_extra_fea: False
augment: False
model_name: att_cgcnn


Predicting: |          | 0/? [00:00<?, ?it/s]

/opt/share/miniconda3/envs/mofmthnn/lib/python3.9/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /opt/share/miniconda3/envs/mofmthnn/lib/python3.9/si ...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]


Predicting WS24_water4...
prop_cols: ['water4_label']


Predicting: |          | 0/? [00:00<?, ?it/s]

/opt/share/miniconda3/envs/mofmthnn/lib/python3.9/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /opt/share/miniconda3/envs/mofmthnn/lib/python3.9/si ...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]


Predicting WS24_acid...
prop_cols: ['acid_label']


Predicting: |          | 0/? [00:00<?, ?it/s]

/opt/share/miniconda3/envs/mofmthnn/lib/python3.9/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /opt/share/miniconda3/envs/mofmthnn/lib/python3.9/si ...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]


Predicting WS24_base...
prop_cols: ['base_label']


Predicting: |          | 0/? [00:00<?, ?it/s]

/opt/share/miniconda3/envs/mofmthnn/lib/python3.9/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /opt/share/miniconda3/envs/mofmthnn/lib/python3.9/si ...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]


Predicting WS24_boiling...
prop_cols: ['boiling_label']


Predicting: |          | 0/? [00:00<?, ?it/s]

Metrics saved to CGCNN_MT/evaluation/TSD_SSD_WS24_water_WS24_water4_WS24_acid_WS24_base_WS24_boiling_seed42_att_cgcnn@version_43/val_metrics.csv


In [6]:
# Evaluation directory to read features and results from.
# Alternative run, kept for quick switching:
# log_dir = "./CGCNN_MT/evaluation/TSD_SSD_WS24_water_WS24_water4_WS24_acid_WS24_base_WS24_boiling_seed42_att_cgcnn@version_142"
log_dir = "./CGCNN_MT/evaluation/TSD_SSD_WS24_water_WS24_water4_WS24_acid_WS24_base_WS24_boiling_seed42_att_cgcnn@version_43"
log_dir = Path(log_dir)

# Both dicts are keyed by f"{task}_{split}".
latent_feas = {}
targets = {}

# Display names for the integer class labels of each classification task.
targets_map = {
    "SSD": {0: "unstable", 1: "stable"},
    "WS24_water": {0: "unstable", 1: "stable"},
    "WS24_water4": {0: "unstable", 1: "low kinetic stability", 2: "high kinetic stability", 3: "thermodynamic stable"},
    "WS24_acid": {0: "unstable", 1: "stable"},
    "WS24_base": {0: "unstable", 1: "stable"},
    "WS24_boiling": {0: "unstable", 1: "stable"},
}
for task in ["TSD", "SSD", "WS24_water", "WS24_water4", "WS24_acid", "WS24_base", "WS24_boiling"]:
    for split in [
        'train',
        'val',
        'test',
        # "external_test"
        ]:
        # np.load on an .npz keeps the archive open until closed, so use a
        # context manager; the indexed array is read into memory before exit.
        with np.load(log_dir / f"{split}_last_layer_fea_{task}.npz") as latent_fea:
            # Take the first array stored in the archive.
            latent_feas[f"{task}_{split}"] = latent_fea[latent_fea.files[0]]
        res_df = pd.read_csv(log_dir / f"{split}_results_{task}.csv")
        # Optional: binarise TSD at 359 to plot it as a two-class task.
        # if task == "TSD":
        #     targets[f"{task}_{split}"] = (res_df['GroundTruth'].values > 359).astype(int)
        # else:
        targets[f"{task}_{split}"] = res_df['GroundTruth'].values

In [7]:
# Draw and save one t-SNE figure per task next to the evaluation results.
for task in (
    "TSD",
    "SSD",
    "WS24_water",
    "WS24_water4",
    "WS24_acid",
    "WS24_base",
    "WS24_boiling",
):
    fig_file = log_dir / f"{task}_last_layer_tsne_by_targets.png"
    plot_tsne_for_task_by_targets(
        task,
        latent_feas,
        targets,
        targets_map,
        fig_file=fig_file,
        figsize=(8, 6),
        size=30,
        alpha=0.6,
    )