# deCIFer: Model Comparison & RMSD Evaluation  

- Loads and analyzes model-generated CIF comparison data  
- Computes mean RMSD values to assess structural similarity  
- Calculates match rates to evaluate generation accuracy  
- Derives match scores for ranking model performance  
- Prints a summary table for easy model comparison  


In [1]:
import gzip
import pickle
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import os

from tqdm.auto import tqdm
from pymatgen.core import Structure, Composition
from pymatgen.analysis.diffraction.xrd import XRDCalculator

# Import only the needed functions from decifer.utility
from decifer.utility import (
    generate_continuous_xrd_from_cif,
    extract_formula_nonreduced,
    extract_space_group_symbol,
    space_group_symbol_to_number,
)

def load_comparison_dataframe(file_path):
    """
    Read a gzip-compressed pickle and wrap its contents in a DataFrame.

    Args:
        file_path: Location of the ``.pkl.gz`` file to read.

    Returns:
        A ``pandas.DataFrame`` constructed from the unpickled object.

    Note:
        Unpickling can execute arbitrary code, so only point this at
        files from a trusted source.
    """
    with gzip.open(file_path, 'rb') as compressed:
        records = pickle.load(compressed)
    return pd.DataFrame(records)


In [3]:
# Directory that holds every comparison file evaluated below. Keeping it in
# one place avoids repeating (and mistyping) the long prefix for each model.
comparison_dir = (
    '../experiments/model__conditioned_mlp_augmentation__context_3076__robust/'
    'comparison_files_fullXRD__robust'
)

# Display label -> comparison file name inside `comparison_dir`.
# Insertion order is the order in which rows appear in the summary table.
comparison_files = {
    'U-deCIFer / None': 'deCIFer-U_(None).pkl.gz',
    'U-deCIFer / Comp': 'deCIFer-U_(Comp).pkl.gz',
    'U-deCIFer / CompSG': 'deCIFer-U_(CompSG).pkl.gz',
    'deCIFer / None': 'deCIFer_(None_N-0p00_B-0p05).pkl.gz',
    'deCIFer / Comp': 'deCIFer_(Comp_N-0p00_B-0p05).pkl.gz',
    'deCIFer / CompSG': 'deCIFer_(CompSG_N-0p00_B-0p05).pkl.gz',
    'deCIFer / Comp (N05B05)': 'deCIFer_(Comp_N-0p05_B-0p05).pkl.gz',
    'deCIFer / Comp (N00B10)': 'deCIFer_(Comp_N-0p00_B-0p10).pkl.gz',
    'deCIFer / Comp (N05B10)': 'deCIFer_(Comp_N-0p05_B-0p10).pkl.gz',
    'deCIFer / Comp (N10B05)': 'deCIFer_(Comp_N-0p10_B-0p05).pkl.gz',
    'deCIFer / Comp (N00B20)': 'deCIFer_(Comp_N-0p00_B-0p20).pkl.gz',
    'deCIFer / Comp (N10B20)': 'deCIFer_(Comp_N-0p10_B-0p20).pkl.gz',
    'deCIFer CHILI / Comp (N00B05)': 'deCIFer_chili_(Comp_N-0p00_B-0p05).pkl.gz',
    'deCIFer CHILI / Comp (N05B10)': 'deCIFer_chili_(Comp_N-0p05_B-0p10).pkl.gz',
    'deCIFer CHILI / Comp (N10B20)': 'deCIFer_chili_(Comp_N-0p10_B-0p20).pkl.gz',
}

# Display label -> full relative path. Joined with a literal "/" (not
# os.path.join) so the strings are the same on every platform.
paths = {
    label: f"{comparison_dir}/{file_name}"
    for label, file_name in comparison_files.items()
}

# Table header followed by a separator rule.
print(f"{'Model / Descriptor':<30} {'Mean RMSD':<15} {'Match Rate (%)':<15} {'Match Score':<15}")
print("-" * 70)

# One table row per entry in `paths`, in insertion order.
for name, path in paths.items():
    df = load_comparison_dataframe(path)
    rmsd = df['rmsd']

    # Match rate: fraction of rows whose 'rmsd' entry is not missing.
    match_rate = rmsd.notna().sum() / len(df)

    if not match_rate > 0:
        # Nothing matched: there is no RMSD to average, and the score is 0.
        mean_rmsd = float('nan')
        match_score = 0
    else:
        # Series.mean() skips missing entries by default, so this averages
        # over matched rows only.
        mean_rmsd = rmsd.mean()
        # Higher match rate and lower RMSD both raise the score; the tiny
        # epsilon keeps the division defined when the mean RMSD is exactly 0.
        match_score = match_rate / (mean_rmsd + 1e-13)

    # Report the match rate as a percentage.
    match_rate_percentage = match_rate * 100

    print(f"{name:<30} {mean_rmsd:<15.5f} {match_rate_percentage:<15.2f} {match_score:<15.5f}")


Model / Descriptor             Mean RMSD       Match Rate (%)  Match Score    
----------------------------------------------------------------------
U-deCIFer / None               nan             0.00            0.00000        
U-deCIFer / Comp               0.04735         49.30           10.41217       
U-deCIFer / CompSG             0.02938         87.07           29.63337       
deCIFer / None                 0.01872         5.01            2.67440        
deCIFer / Comp                 0.02039         91.50           44.87920       
deCIFer / CompSG               0.02022         94.53           46.75593       
deCIFer / Comp (N05B05)        0.02172         88.21           40.61428       
deCIFer / Comp (N00B10)        0.02108         91.16           43.25372       
deCIFer / Comp (N05B10)        0.02171         89.28           41.11860       
deCIFer / Comp (N10B05)        0.02473         84.07           33.99305       
deCIFer / Comp (N00B20)        0.03029         80.38        