In [14]:
pip install ase

Collecting ase
  Using cached ase-3.25.0-py3-none-any.whl.metadata (4.2 kB)
Using cached ase-3.25.0-py3-none-any.whl (3.0 MB)
Installing collected packages: ase
Successfully installed ase-3.25.0
Note: you may need to restart the kernel to use updated packages.


In [25]:
# 🚀 QMOF fingerprint pipeline (100% matching last year's format)

import os
import json
import gzip
import pandas as pd
from tqdm import tqdm
from pymatgen.core import Structure

# 🚀 1️⃣ Load QMOF structure data (official json.gz)
json_path = '/Users/viewyan/Downloads/13147324/qmof_database/qmof_structure_data.json.gz'

# gzip.open() defaults to binary mode; json.loads() accepts the decompressed
# bytes directly, so no explicit decoding step is needed.
with gzip.open(json_path) as gz_file:
    data = json.loads(gz_file.read())

print(f'Loaded {len(data)} structures.')

# 🚀 2️⃣ Define fingerprint function (you should already have this in your colab!)
# Example: stoichiometric 45-dimensional fingerprint
from collections import defaultdict

def compute_stoich_fp(structure, feature_list):
    """Return the stoichiometric fingerprint of ``structure``.

    Args:
        structure: object exposing ``.sites``; every site must provide
            ``.specie.symbol`` (the element symbol of that site).
        feature_list: ordered element symbols; one output entry per symbol.

    Returns:
        list of float, same length and order as ``feature_list``. Each entry
        is (number of sites with that symbol) / (total number of sites), or
        ``0.0`` when the symbol does not occur in the structure.

    Note:
        The denominator counts *all* sites, including elements that are not
        in ``feature_list``, so the vector does not necessarily sum to 1.
    """
    # Tally how many sites carry each element symbol.
    tally = {}
    for site in structure.sites:
        symbol = site.specie.symbol
        tally[symbol] = tally.get(symbol, 0) + 1

    n_sites = sum(tally.values())

    # Membership is checked before dividing, so an empty structure yields
    # all zeros instead of a division by zero.
    return [
        tally[element] / n_sites if element in tally else 0.0
        for element in feature_list
    ]

# 🚀 3️⃣ Define 45-feature element list (this was used last year)
# Symbols are listed in increasing atomic-number order; this order becomes the
# column order of the fingerprint DataFrame built from this list.
feature_list_45 = (
    'H Li Be B C N O F Na Mg '
    'Al Si P S Cl K Ca Sc Ti V '
    'Cr Mn Fe Co Ni Cu Zn Ga Ge As '
    'Se Br Rb Sr Y Zr Nb Mo Pd Ag '
    'Cd In Sn Sb I'
).split()

# 🚀 4️⃣ Loop over structures and compute fingerprints
mof_names = []
fingerprints = []

for entry in tqdm(data):
    # Placeholder label: the failure report below must not look up
    # entry['name'] again, otherwise an entry without a 'name' key would
    # raise inside the handler and abort the whole run.
    mof_name = '<unknown>'
    try:
        mof_name = entry['name']
        structure = Structure.from_dict(entry['structure'])
        fp = compute_stoich_fp(structure, feature_list_45)
    except Exception as e:
        # Best-effort: report the entry that could not be featurized and move on.
        print(f'Failed for {mof_name}: {e}')
    else:
        # Append to both lists together so names and fingerprint rows stay aligned.
        mof_names.append(mof_name)
        fingerprints.append(fp)

# 🚀 5️⃣ Save to DataFrame (one row per successfully processed structure)
df_fps = pd.DataFrame(fingerprints, columns=feature_list_45)
df_fps.insert(0, 'MOF_name', mof_names)

print(f'Generated DataFrame shape: {df_fps.shape}')

# 🚀 6️⃣ Save CSV
output_path = '/Users/viewyan/Downloads/13147324/qmof_database/reproduce-stoich45_fingerprints.csv'
df_fps.to_csv(output_path, index=False)

print(f'Fingerprint saved to {output_path} ✅')


Loaded 20425 structures.


100%|██████████| 20425/20425 [4:04:19<00:00,  1.39it/s]    


Generated DataFrame shape: (20425, 46)
Fingerprint saved to /Users/viewyan/Downloads/13147324/qmof_database/stoich45_fingerprints.csv ✅


In [26]:
def compute_stoich_fp(structure, feature_list):
    """Compute per-element site fractions for ``structure``.

    NOTE(review): this re-declares the function already defined in the
    previous cell with the same behaviour; the later definition shadows the
    earlier one in the kernel.

    Args:
        structure: object with a ``.sites`` iterable whose items expose
            ``.specie.symbol``.
        feature_list: ordered element symbols defining the output layout.

    Returns:
        list of float aligned with ``feature_list``: the fraction of all
        sites occupied by each symbol, ``0.0`` for symbols that are absent.
    """
    symbols = [site.specie.symbol for site in structure.sites]
    n_atoms = len(symbols)

    # Occurrences of each symbol among the sites.
    occurrences = {}
    for symbol in symbols:
        occurrences[symbol] = occurrences.get(symbol, 0) + 1

    fractions = []
    for element in feature_list:
        # Absent elements short-circuit to 0.0, so n_atoms == 0 never divides.
        fractions.append(occurrences[element] / n_atoms if element in occurrences else 0.0)
    return fractions

# 🚀 3️⃣ Define the full periodic-table element list.
# NOTE: despite the "120" in the variable/file name (kept to match last year's
# naming), the list holds the 118 symbols H–Og in atomic-number order, so the
# resulting DataFrame has 118 feature columns.
feature_list_120 = (
    'H He '
    'Li Be B C N O F Ne '
    'Na Mg Al Si P S Cl Ar '
    'K Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn Ga Ge As Se Br Kr '
    'Rb Sr Y Zr Nb Mo Tc Ru Rh Pd Ag Cd In Sn Sb Te I Xe '
    'Cs Ba La Ce Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb Lu '
    'Hf Ta W Re Os Ir Pt Au Hg Tl Pb Bi Po At Rn '
    'Fr Ra Ac Th Pa U Np Pu Am Cm Bk Cf Es Fm Md No Lr '
    'Rf Db Sg Bh Hs Mt Ds Rg Cn Nh Fl Mc Lv Ts Og'
).split()

# 🚀 4️⃣ Loop over structures and compute fingerprints
mof_names = []
fingerprints = []

for entry in tqdm(data):
    # Placeholder label: the failure report below must not look up
    # entry['name'] again, otherwise an entry without a 'name' key would
    # raise inside the handler and abort the whole run.
    mof_name = '<unknown>'
    try:
        mof_name = entry['name']
        structure = Structure.from_dict(entry['structure'])
        fp = compute_stoich_fp(structure, feature_list_120)
    except Exception as e:
        # Best-effort: report the entry that could not be featurized and move on.
        print(f'Failed for {mof_name}: {e}')
    else:
        # Append to both lists together so names and fingerprint rows stay aligned.
        mof_names.append(mof_name)
        fingerprints.append(fp)

# 🚀 5️⃣ Save to DataFrame (one row per successfully processed structure)
df_fps = pd.DataFrame(fingerprints, columns=feature_list_120)
df_fps.insert(0, 'MOF_name', mof_names)

print(f'Generated DataFrame shape: {df_fps.shape}')

# 🚀 6️⃣ Save CSV (same name as last year)
output_path = '/Users/viewyan/Downloads/13147324/qmof_database/reproduce-stoich120_fingerprints.csv'
df_fps.to_csv(output_path, index=False)

print(f'Fingerprint saved to {output_path} ✅')

100%|██████████| 20425/20425 [04:12<00:00, 80.83it/s] 


Generated DataFrame shape: (20425, 119)
Fingerprint saved to /Users/viewyan/Downloads/13147324/qmof_database/reproduce-stoich120_fingerprints.csv ✅
