# Build training sets (skip this part if you have already completed Tutorial_2.1)
**!!! We strongly recommend using our Docker image for the setup of Jupyter backend.** 
**!!! If you choose not to use the Docker image, you'll need to install Slurm on your local machine, which can be tricky.** 
**!!! If you don't want to install the Slurm workload manager, you'll need to modify the code in utils.py, replacing 'qsub 0_run.pbs' with 'python 0_run.py' inside the splitRun function. Additionally, please ensure that the number of threads does not exceed the number of CPU threads on your computer. Exceeding this limit may lead to resource contention issues.**
**!!! 优先使用我们提供的Docker镜像运行Jupyter后端.** 
**!!! 如果您不想使用Docker镜像，则需要在本机上安装Slurm任务管理系统，这可能会比较复杂。请注意，在运行Jupyter后端之前，需要完成Slurm的安装,否则计算会报错.** 
**!!! 如果您不想安装Slurm任务管理系统，那么需要修改utils.py的代码，在splitRun函数内部替换 qsub 0_run.pbs为 python 0_run.py，并且确认线程数不会超过电脑的cpu线程数量，否则会出现计算资源挤占的问题.** 

https://github.com/xiaohang007/SLICES?tab=readme-ov-file#jupyter-backend-setup

In [None]:
%matplotlib inline
from slices.utils import temporaryWorkingDirectory,splitRun,show_progress,collect_json,collect_csv
import os
import pandas as pd
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
def _combine_and_shuffle(parts, template, random_state):
    """Concatenate ``parts``, drop the helper 'bin' column and shuffle the rows.

    ``template`` is an empty frame with the expected columns; it is returned
    (minus 'bin') when ``parts`` is empty, because ``pd.concat([])`` raises.
    """
    combined = pd.concat(parts) if parts else template
    combined = combined.drop('bin', axis=1)
    if combined.empty:
        return combined.reset_index(drop=True)
    return combined.sample(frac=1, random_state=random_state).reset_index(drop=True)


def stratified_split(data, target_column, test_size=0.1, random_state=42):
    """Split ``data`` into train/test frames, stratified by binned target value.

    The target column is coerced to numeric; rows that cannot be converted
    (or are missing) are reported via ``print`` and dropped. The remaining
    rows are binned with ``pd.cut`` on the edges [-inf, 0, 0.5, 1, 2, inf]
    and every bin is split separately with ``train_test_split`` so that both
    outputs contain rows from each populated bin. A bin with a single row
    goes entirely to the training set.

    Args:
        data: DataFrame containing ``target_column``. It is not modified.
        target_column: Name of the column used for cleaning and binning.
        test_size: Passed to ``train_test_split`` for every bin.
        random_state: Seed for the per-bin split and for the final shuffle.

    Returns:
        ``(train_data, test_data)``: shuffled DataFrames with a fresh
        0..n-1 index and without the temporary 'bin' column.
    """
    numeric_target = pd.to_numeric(data[target_column], errors='coerce')

    non_numeric = data[numeric_target.isna()]
    if not non_numeric.empty:
        print("以下行无法转换为数值类型:")
        print(non_numeric)
        print("\n无法转换的唯一值:")
        print(non_numeric[target_column].unique())

    # Work on copies: the caller's frame must keep its original values, and
    # adding the 'bin' column must not write into a view of another frame.
    converted = data.copy()
    converted[target_column] = numeric_target
    data_cleaned = converted.dropna(subset=[target_column]).copy()
    print(f"\n原始数据行数: {len(data)}")
    print(f"清理后数据行数: {len(data_cleaned)}")

    data_cleaned['bin'] = pd.cut(data_cleaned[target_column],
                                 bins=[-np.inf, 0, 0.5, 1, 2, np.inf],
                                 labels=['zero', 'low', 'medium', 'high', 'very_high'])

    # Collect the per-bin pieces and concatenate once at the end; seeding
    # pd.concat with empty frames is deprecated and can degrade dtypes.
    train_parts = []
    test_parts = []
    for bin_label in data_cleaned['bin'].unique():
        bin_data = data_cleaned[data_cleaned['bin'] == bin_label]
        if len(bin_data) > 1:
            bin_train, bin_test = train_test_split(
                bin_data, test_size=test_size, random_state=random_state)
            test_parts.append(bin_test)
        else:
            # A single row cannot be split; keep it for training.
            bin_train = bin_data
        train_parts.append(bin_train)

    empty_template = data_cleaned.iloc[0:0]
    train_data = _combine_and_shuffle(train_parts, empty_template, random_state)
    test_data = _combine_and_shuffle(test_parts, empty_template, random_state)
    return train_data, test_data

if __name__ == "__main__":
    # Merge the three MP-20 splits into one JSON file, run the SLICES
    # conversion jobs on it, then re-split the collected results into
    # train/test sets stratified by the last CSV column.
    with temporaryWorkingDirectory("./data/mp20_nonmetal/"):
        data_path_prefix = "../mp20/"
        output = []
        # Keep the test -> val -> train order so cifs.json has the same
        # record order as before.
        for split_file in ("test.csv", "val.csv", "train.csv"):
            split = pd.read_csv(data_path_prefix + split_file)
            for material_id, eform, cif, band_gap in zip(
                    split["material_id"],
                    split["formation_energy_per_atom"],
                    split["cif"],
                    split["band_gap"]):
                output.append({"material_id": material_id,
                               "formation_energy_per_atom": eform,
                               "cif": cif,
                               "band_gap": band_gap})
        with open('cifs.json', 'w') as f:
            json.dump(output, f)

        splitRun(filename='./cifs.json', threads=16, skip_header=False)
        show_progress()
        collect_json(output="cifs_filtered.json",
                     glob_target="./**/output.json", cleanup=False)
        collect_csv(output="mp20_eform_bandgap_nonmetal.csv",
                    glob_target="./**/result.csv", cleanup=True,
                    header="SLICES,eform,bandgap\n")

        # Remove the intermediate file without shelling out to `rm`;
        # a missing file is tolerated, as it was before.
        try:
            os.remove("cifs.json")
        except FileNotFoundError:
            pass

        data = pd.read_csv('mp20_eform_bandgap_nonmetal.csv')
        target_column = data.columns[-1]  # assume the last column is the target value
        train_data, test_data = stratified_split(data, target_column)
        print(train_data[target_column].value_counts(bins=5, normalize=True))
        print(test_data[target_column].value_counts(bins=5, normalize=True))
        train_data.to_csv('train_data_reduce_zero.csv', index=False)
        test_data.to_csv('test_data_reduce_zero.csv', index=False)

# Train MatterGPT for Single-Property Material Inverse Design (using band gap as an example)
<span style="color:red">**CUDA in xiaohang07/slices:chgnet2 Docker works on WSL@Windows 11 but may fail on Ubuntu; for Ubuntu, use host machine to train and generate SLICES with MatterGPT.**</span>

**Set `--max_epochs 5` to speed up the test run.**

**To accelerate the training process, consider adjusting the batch_size appropriately.**


In [None]:
from slices.utils import temporaryWorkingDirectory
import os
# Train MatterGPT on the band-gap dataset from inside the training folder.
with temporaryWorkingDirectory("./MatterGPT/bandgap/1_train_generate"):
    # The command activates the `chgnet` conda environment and then launches
    # train.py; dataset paths are relative to the training folder.
    train_command = '''
    /bin/bash -c "source /opt/conda/etc/profile.d/conda.sh && conda activate chgnet && \
    python train.py --run_name bandgap_Aug1 --batch_size 12 --num_props 1 --max_epochs 5 --n_embd 768  --n_layer 12 --n_head 12 --learning_rate 1e-4 \
    --train_dataset '../../../data/mp20_nonmetal/train_data_reduce_zero.csv' --test_dataset '../../../data/mp20_nonmetal/test_data_reduce_zero.csv' "
    '''
    os.system(train_command)

# Generate SLICES strings with specified $E_{gap}$ = [1,2,3,4] eV
<span style="color:red">**CUDA in xiaohang07/slices:chgnet2 Docker works on WSL@Windows 11 but may fail on Ubuntu; for Ubuntu, use host machine to train and generate SLICES with MatterGPT.**</span>

**Set --gen_size 5 to speed up the test run.**

**To accelerate the generation process, consider adjusting the batch_size appropriately.**

In [None]:
from slices.utils import temporaryWorkingDirectory
import os
# Generate SLICES strings with the trained model for the requested band-gap targets.
with temporaryWorkingDirectory("./MatterGPT/bandgap/1_train_generate"):
    # The --prop_targets list is wrapped in single quotes. The whole command
    # already sits inside the double-quoted `bash -c "..."` argument, so inner
    # double quotes would end that argument early: the list would be split
    # into separate shell words and the flags after it would never reach
    # generate.py.
    os.system('''
    /bin/bash -c "source /opt/conda/etc/profile.d/conda.sh && conda activate chgnet && \
    python generate.py --model_weight bandgap_Aug1.pt --prop_targets '[1.0, 2.0, 3.0, 4.0]' --gen_size 5 --batch_size 5 --csv_name inverse_designed_SLICES --n_head 12"
    ''')

# Reconstruct crystals from SLICES and assess novelty

In [None]:
from slices.utils import temporaryWorkingDirectory,splitRun_csv,show_progress,collect_csv
import os
import glob
from slices.utils import splitRun_csv, show_progress, collect_csv
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
import numpy as np
import matplotlib.ticker as ticker
import pickle,json
from pymatgen.core.structure import Structure
def load_and_save_structure_database():
    """Parse the filtered CIF records and pickle them with their band gaps.

    Reads '../../../data/mp20_nonmetal/cifs_filtered.json' (relative to the
    current working directory), converts each record's "cif" string with
    ``Structure.from_str`` and writes a list of ``[structure, band_gap]``
    pairs to 'structure_database.pkl'. Records that raise during conversion
    are printed and skipped.
    """
    with open('../../../data/mp20_nonmetal/cifs_filtered.json', 'r') as handle:
        records = json.load(handle)

    parsed_entries = []
    for record in records:
        cif_text = record["cif"]
        try:
            structure = Structure.from_str(cif_text, "cif")
            parsed_entries.append([structure, record["band_gap"]])
        except Exception as err:
            # Best effort: report the problem and continue with the next record.
            print(err)

    # Serialize the collected pairs for the downstream jobs.
    with open('structure_database.pkl', 'wb') as handle:
        pickle.dump(parsed_entries, handle)
        
def process_data():
    """Run the reconstruction/novelty workflow in the current directory.

    Builds 'structure_database.pkl', hands the generated SLICES CSV to
    ``splitRun_csv``, waits on ``show_progress`` and finally gathers the
    per-job 'result.csv' files into 'results.csv'.
    """
    load_and_save_structure_database()

    generated_csv = '../1_train_generate/inverse_designed_SLICES.csv'
    splitRun_csv(filename=generated_csv, threads=10, skip_header=True)
    show_progress()

    results_header = "bandgap_target,SLICES,poscar,novelty\n"
    collect_csv(
        output="results.csv",
        glob_target="./job_*/result.csv",
        cleanup=True,
        header=results_header,
    )
if __name__ == "__main__":
    # Execute the workflow from inside its dedicated folder so that the
    # relative paths used by process_data() resolve correctly.
    novelty_dir = "./MatterGPT/bandgap/2_inverse_novelty"
    with temporaryWorkingDirectory(novelty_dir):
        process_data()

# Evaluate the formation energy distribution of the reconstructed crystals at PBE level (a workstation or even an HPC cluster is needed to run VASP in a reasonable time)

In [None]:
```bash
cd ./MatterGPT/bandgap/3_DFT
python 1_run.py
# done
```