# Build training sets (skip this part if you have already completed Tutorial_2.1 or 2.2)
**!!! We strongly recommend using our Docker image to set up the Jupyter backend.** 
**!!! If you choose not to use the Docker image, you'll need to install Slurm on your local machine, which can be tricky.** 
**!!! If you don't want to install the Slurm workload manager, you'll need to modify the code in utils.py, replacing 'qsub 0_run.pbs' with 'python 0_run.py' inside the splitRun function. Additionally, please ensure that the number of threads does not exceed the number of CPU threads on your computer. Exceeding this limit may lead to resource contention issues.**
**!!! 优先使用我们提供的Docker镜像运行Jupyter后端.** 
**!!! 如果您不想使用Docker镜像，则需要在本机上安装Slurm任务管理系统，这可能会比较复杂。请注意，在运行Jupyter后端之前，需要完成Slurm的安装,否则计算会报错.** 
**!!! 如果您不想安装Slurm任务管理系统，那么需要修改utils.py的代码，在splitRun函数内部替换 qsub 0_run.pbs为 python 0_run.py，并且确认线程数不会超过电脑的cpu线程数量，否则会出现计算资源挤占的问题.** 

https://github.com/xiaohang007/SLICES?tab=readme-ov-file#jupyter-backend-setup

In [None]:
%matplotlib inline
from slices.utils import temporaryWorkingDirectory,splitRun,show_progress,collect_json,collect_csv,adaptive_dynamic_binning
import os
import pandas as pd
import json
import pandas as pd
import numpy as np

if __name__ == "__main__":
    with temporaryWorkingDirectory("./data/mp20_nonmetal/"):
        # Merge the three MP-20 splits (test, val, train - in that order) into a
        # single list of records; the data is re-split further down.
        data_path_prefix = "../mp20/"
        output = []
        for split_file in ("test.csv", "val.csv", "train.csv"):
            split = pd.read_csv(data_path_prefix + split_file)
            for material_id, eform, cif, band_gap in zip(
                split["material_id"],
                split["formation_energy_per_atom"],
                split["cif"],
                split["band_gap"],
            ):
                output.append({
                    "material_id": material_id,
                    "formation_energy_per_atom": eform,
                    "cif": cif,
                    "band_gap": band_gap,
                })
        with open('cifs.json', 'w') as f:
            json.dump(output, f)

        # Dispatch the per-chunk jobs, wait for them, then gather their outputs.
        # NOTE(review): the actual filtering / SLICES encoding is done by the job
        # script that splitRun launches in this directory - not visible here.
        splitRun(filename='./cifs.json', threads=16, skip_header=False)
        show_progress()
        collect_json(output="cifs_filtered.json",
                     glob_target="./**/output.json", cleanup=False)
        collect_csv(output="mp20_eform_bandgap_nonmetal.csv",
                    glob_target="./**/result.csv", cleanup=True,
                    header="SLICES,eform,bandgap\n")

        # Remove the intermediate JSON without spawning a shell; guarded so a
        # missing file stays a non-fatal condition, as it was with `rm`.
        if os.path.exists("cifs.json"):
            os.remove("cifs.json")

        # Load the collected data
        data = pd.read_csv('mp20_eform_bandgap_nonmetal.csv')
        # Assume the last column is the target value ("bandgap" per the header above)
        target_column = data.columns[-1]

        # Split into train/test with adaptive dynamic binning
        train_data, test_data, bins = adaptive_dynamic_binning(data, target_column)

        # Check the per-bin distribution of the target in each split
        print("\n训练集分布:")
        print(train_data[target_column].value_counts(bins=bins, normalize=True).sort_index())
        print("\n测试集分布:")
        print(test_data[target_column].value_counts(bins=bins, normalize=True).sort_index())

        # Save the split datasets
        train_data.to_csv('train_data_reduce_zero.csv', index=False)
        test_data.to_csv('test_data_reduce_zero.csv', index=False)

# Train MatterGPT for Multi-Property Material Inverse Design (using [bandgap, eform] as an example)
<span style="color:red">**CUDA in xiaohang07/slices:chgnet2 Docker works on WSL@Windows 11 but may fail on Ubuntu; for Ubuntu, use host machine to train and generate SLICES with MatterGPT.**</span>

**Set --max_epochs 5 to speed up the test run.**

**To accelerate the training process, consider adjusting the batch_size appropriately.**


In [1]:
from slices.utils import temporaryWorkingDirectory
import os
with temporaryWorkingDirectory("./MatterGPT/bandgap_eform/1_train_generate"):
    # Run train.py inside the `chgnet` conda environment. The dataset paths
    # point at the CSVs written by the dataset-building cell.
    train_status = os.system('''
    /bin/bash -c "source /opt/conda/etc/profile.d/conda.sh && conda activate chgnet && \
    python train.py --run_name bandgap_Aug1 --batch_size 12 --num_props 2 --max_epochs 5 --n_embd 768  --n_layer 12 --n_head 12 --learning_rate 3.3e-4 \
    --train_dataset '../../../data/mp20_nonmetal/train_data_reduce_zero.csv' --test_dataset '../../../data/mp20_nonmetal/test_data_reduce_zero.csv' "
    ''')
# os.system reports failure only through its return value; surface it so the
# generation cell is not run against missing or stale weights. The check sits
# outside the `with` block so the working directory has already been restored.
if train_status != 0:
    raise RuntimeError(f"train.py failed (os.system returned status {train_status})")

Co Co Co Co As As As As S S S S 0 9 ooo 0 6 ooo 0 4 ooo 0 8 ooo 0 11 oo- 0 5 ooo 1 4 ooo 1 10 ooo 1 6 o+o 1 11 ooo 1 7 o+o 1 8 o++ 2 5 --o 2 8 -oo 2 7 -oo 2 10 o-- 2 4 o-o 2 9 ooo 3 7 -oo 3 5 -oo 3 11 -oo 3 6 ooo 3 9 oo+ 3 10 ooo 4 8 o+o 5 9 +oo 6 11 ooo 7 10 +-o 
[-0.6162958260416668, 0.8586]
Constructing vocabulary...
Max length of slices:  392
Number of characters: 130
vocab_size: 132

['+++', '++-', '++o', '+-+', '+--', '+-o', '+o+', '+o-', '+oo', '-++', '-+-', '-+o', '--+', '---', '--o', '-o+', '-o-', '-oo', '0', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '3', '4', '5', '6', '7', '8', '9', '<', '>', 'Ag', 'Al', 'Ar', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cl', 'Co', 'Cr', 'Cs', 'Cu', 'Dy', 'Er', 'Eu', 'F', 'Fe', 'Ga', 'Gd', 'Ge', 'H', 'He', 'Hf', 'Hg', 'Ho', 'I', 'In', 'Ir', 'K', 'Kr', 'La', 'Li', 'Lu', 'Mg', 'Mn', 'Mo', 'N', 'Na', 'Nb', 'Nd', 'Ne', 'Ni', 'O', 'Os', 'P', 'Pb', 'Pd', 'Pm', 'Pr', 'Pt', 'Rb', 'Re', 'Rh', 'Ru', 'S',

epoch 1 iter 976: train loss 0.60171. lr 3.032845e-04: 100%|██████████| 977/977 [04:48<00:00,  3.39it/s]


test loss: 0.4337577740190483 

Saving at epoch 1 with best test loss: 0.4337577740190483


epoch 2 iter 976: train loss 0.39700. lr 2.219852e-04: 100%|██████████| 977/977 [04:47<00:00,  3.39it/s]


test loss: 0.37552824658662204 

Saving at epoch 2 with best test loss: 0.37552824658662204


epoch 3 iter 976: train loss 0.32866. lr 1.180529e-04: 100%|██████████| 977/977 [04:47<00:00,  3.40it/s]


test loss: 0.3379483014224512 

Saving at epoch 3 with best test loss: 0.3379483014224512


epoch 4 iter 976: train loss 0.23985. lr 3.300000e-05: 100%|██████████| 977/977 [04:47<00:00,  3.40it/s]


test loss: 0.3136829395405194 

Saving at epoch 4 with best test loss: 0.3136829395405194


epoch 5 iter 976: train loss 0.33603. lr 3.300000e-05: 100%|██████████| 977/977 [04:47<00:00,  3.40it/s]


test loss: 0.3058865898051243 

Saving at epoch 5 with best test loss: 0.3058865898051243


# Generate SLICES strings with specified [$E_{form}$, $E_{gap}$] = [-2.0 eV/atom, 1.0 eV]  
<span style="color:red">**CUDA in xiaohang07/slices:chgnet2 Docker works on WSL@Windows 11 but may fail on Ubuntu; for Ubuntu, use host machine to train and generate SLICES with MatterGPT.**</span>

**Set --gen_size 20 to speed up the test run.**

**To accelerate the generation process, consider adjusting the batch_size appropriately.**

In [3]:
from slices.utils import temporaryWorkingDirectory
import os
with temporaryWorkingDirectory("./MatterGPT/bandgap_eform/1_train_generate"):
    # Run generate.py inside the `chgnet` conda environment, using the weights
    # file named after the training run (bandgap_Aug1.pt).
    generate_status = os.system('''
    /bin/bash -c "source /opt/conda/etc/profile.d/conda.sh && conda activate chgnet && \
    python generate.py --model_weight bandgap_Aug1.pt --prop_targets '[[-2.0,1.0]]' --gen_size 20 --batch_size 2 --csv_name inverse_designed_2props_SLICES  --n_head 12"
    ''')
# os.system reports failure only through its return value; surface it so the
# reconstruction cell is not run on a missing or stale CSV. The check sits
# outside the `with` block so the working directory has already been restored.
if generate_status != 0:
    raise RuntimeError(f"generate.py failed (os.system returned status {generate_status})")

Target 1 is bandgap: 1.0 eV and eform: -2.0 eV/atom.
./Voc_prior
Loading model
Model loaded


100%|██████████| 10/10 [01:06<00:00,  6.65s/it]


Total generated SLICES: 19
Unique canonical SLICES: 19
Valid ratio: 0.95
Unique ratio: 1.0


# Reconstruct crystals from SLICES and assess novelty

In [1]:
from slices.utils import temporaryWorkingDirectory,splitRun_csv,show_progress,collect_csv
import os
import glob
from slices.utils import splitRun_csv, show_progress, collect_csv
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
import numpy as np
import matplotlib.ticker as ticker
import pickle,json
from pymatgen.core.structure import Structure
def load_and_save_structure_database():
    """Build a pickled reference database of structures from the filtered CIFs.

    Reads ``../../../data/mp20_nonmetal/cifs_filtered.json`` (relative to the
    current working directory), parses each record's ``"cif"`` string into a
    pymatgen ``Structure`` and stores ``[structure, band_gap]`` pairs in
    ``structure_database.pkl``. Records that fail inside the parsing step are
    skipped after printing the exception.
    """
    with open('../../../data/mp20_nonmetal/cifs_filtered.json', 'r') as json_file:
        records = json.load(json_file)

    parsed_entries = []
    for record in records:
        # Read the CIF text outside the try so a missing "cif" key still raises.
        cif_text = record["cif"]
        try:
            parsed = Structure.from_str(cif_text, "cif")
            parsed_entries.append([parsed, record["band_gap"]])
        except Exception as err:
            print(err)

    # Persist the parsed structures for later use
    with open('structure_database.pkl', 'wb') as pkl_file:
        pickle.dump(parsed_entries, pkl_file)
        
def process_data():
    """Run the reconstruction / novelty pipeline in the current directory.

    Steps, in order: build ``structure_database.pkl``, split the generated
    SLICES CSV across 8 threads, wait for the jobs, and merge every
    ``./job_*/result.csv`` into ``results.csv``.
    """
    load_and_save_structure_database()

    generated_csv = '../1_train_generate/inverse_designed_2props_SLICES.csv'
    splitRun_csv(filename=generated_csv, threads=8, skip_header=True)
    show_progress()

    result_header = "bandgap_target,eform_target,SLICES,poscar,novelty\n"
    collect_csv(
        output="results.csv",
        glob_target="./job_*/result.csv",
        cleanup=True,
        header=result_header,
    )
if __name__ == "__main__":
    # All relative paths used by process_data() resolve against this directory.
    novelty_workdir = "./MatterGPT/bandgap_eform/2_inverse_novelty"
    with temporaryWorkingDirectory(novelty_workdir):
        process_data()

Computational tasks have been submitted.


     100%|███████████████| 100/100 [15:12<00:00,  9.13s/it]   


Results have been collected into: results.csv


# Evaluate the bandgap & eform distribution of the reconstructed crystals at PBE level (requires a workstation or an HPC cluster to run VASP in a reasonable time)

In [None]:
```bash
# Enter the DFT stage directory (path is relative to the notebook's directory).
cd ./MatterGPT/bandgap_eform/3_DFT
# Launch the stage's driver script.
# NOTE(review): presumably this starts the VASP calculations - confirm in 1_run.py.
python 1_run.py
# done
```