### Step 0
#### First, perform the DFT molecular dynamics calculation on the samples in **sample_data**, ensuring that the molecular dynamics trajectory file ***SystemLabel.ANI*** is obtained. This file stores the geometric structure evolution of the system at each time step.

In [None]:
import os
import sys
import numpy as np

In [None]:
# Read the *.ANI file
def _parse_ani_frame(frame_lines):
    """Turn the raw lines of one trajectory frame into a coordinate array.

    The first two lines of a frame (the atom-count line and the line that
    follows it) are ignored. Every remaining non-blank line is split on
    whitespace; the first token is the element symbol and the remaining
    tokens are kept as-is, followed by a species index (1 for "C", 0 for
    anything else).

    Returns:
        A numpy array with one row per atom: the tokens after the element
        symbol plus the species index. Because the tokens are strings, numpy
        stores the whole array (species index included) as strings.
    """
    rows = []
    for raw_line in frame_lines[2:]:
        tokens = raw_line.split()
        if not tokens:
            # Blank lines (e.g. trailing newlines at the end of the file)
            # carry no atom record; indexing them would raise IndexError.
            continue
        species = 1 if tokens[0] == "C" else 0
        rows.append(tokens[1:] + [species])
    return np.array(rows)


def read_ani_file(filename):
    """Read a multi-frame *.ANI trajectory file.

    A new frame starts at every line that consists only of digits (the atom
    count), provided a previous frame is already being collected.

    Args:
        filename: Path of the *.ANI file to read.

    Returns:
        A list with one numpy array per frame, as produced by
        ``_parse_ani_frame``.
    """
    structures = []
    frame_lines = []
    with open(filename, 'r') as file:
        for line in file:
            stripped = line.strip()
            if not frame_lines and not stripped:
                # Ignore blank lines before the first frame header so they
                # are not mistaken for the start of a frame.
                continue
            if stripped.isdigit() and frame_lines:
                structures.append(_parse_ani_frame(frame_lines))
                frame_lines = []
            frame_lines.append(line)

    # The last frame has no following count line to trigger the flush.
    if frame_lines:
        structures.append(_parse_ani_frame(frame_lines))

    return structures

In [None]:
# Save the individual molecular dynamics trajectory to the input file of siesta
import re
def write_structure_to_folder(structure, fdf_file, foldername):
    """Write one structure into ``<foldername>/input.fdf`` based on a template.

    The template's ``AtomicCoordinatesAndAtomicSpecies`` block is replaced by
    the rows of ``structure``, the value of ``MD.FinalTimeStep`` is set to 1,
    and an ``MD.Steps 0`` line is inserted right after the
    ``MD.FinalTimeStep`` line. If the template has no ``MD.FinalTimeStep``
    entry, the file is written without the ``MD.Steps`` line.

    Args:
        structure: 2-D sequence (e.g. numpy array) whose rows hold four
            values each; they are written space-separated, one row per line.
        fdf_file: Path of the template fdf file to read.
        foldername: Output folder; created if it does not exist.
    """
    os.makedirs(foldername, exist_ok=True)
    filename = os.path.join(foldername, 'input.fdf')

    coords_str = '\n'.join(
        f'{atom[0]} {atom[1]} {atom[2]} {atom[3]}' for atom in structure
    )

    with open(fdf_file, 'r') as f:
        content = f.read()

    pattern = r'%block\s+AtomicCoordinatesAndAtomicSpecies\s*\n(.*?)\n%endblock\s+AtomicCoordinatesAndAtomicSpecies'
    new_block = f'%block AtomicCoordinatesAndAtomicSpecies\n{coords_str}\n%endblock AtomicCoordinatesAndAtomicSpecies'

    # A callable replacement is inserted literally, so the coordinate text is
    # never interpreted as regex escape/group references.
    content = re.sub(pattern,
                     lambda _match: new_block,
                     content,
                     flags=re.DOTALL)  # make "." match newlines as well

    pattern = r'(MD\.FinalTimeStep\s+)\d+'
    content = re.sub(pattern, r'\g<1>1', content)

    insert_pos = content.find('MD.FinalTimeStep')
    if insert_pos != -1:
        newline_pos = content.find('\n', insert_pos)
        if newline_pos == -1:
            # The key sits on the last line without a trailing newline:
            # append instead of inserting at the start of the file.
            content = content + '\nMD.Steps             0\n'
        else:
            line_end = newline_pos + 1
            content = content[:line_end] + 'MD.Steps             0\n' + content[line_end:]

    with open(filename, 'w') as f:
        f.write(content)

In [None]:
# Read the molecular dynamics trajectory and write every selected frame into
# its own numbered folder under ./expand_dataset/raw/.

ani_filename = './sample_data/0/graphene72.ANI'
output_base_folder = './expand_dataset/raw/'
fdf_file = './sample_data/0/input.fdf'

# Drop the first 400 frames and keep the remainder as the dataset
# (the last 600 frames when the trajectory holds 1000 time steps).
structures = read_ani_file(ani_filename)
structures = structures[400:]
print(f"Select the structure of the last {len(structures)} time steps as the dataset.")

# Output folders are numbered starting from 1, hence step + 1.
for step, structure in enumerate(structures):
    foldername = os.path.join(output_base_folder, str(step + 1))
    write_structure_to_folder(structure, fdf_file, foldername)

In [None]:
import os
import shutil
from pathlib import Path

def batch_copy_files(src_dir, dst_base, file_patterns, num_range):
    """Copy the files matching ``file_patterns`` into numbered sub-folders.

    For every value ``i`` in ``num_range`` the folder ``dst_base/<i>`` is
    created (if needed) and each matching file from ``src_dir`` is copied
    into it with ``shutil.copy2``.

    Args:
        src_dir: Directory searched (non-recursively unless the pattern
            says otherwise) for the files to copy.
        dst_base: Base directory that receives the numbered sub-folders;
            created if it does not exist.
        file_patterns: Iterable of glob patterns evaluated inside ``src_dir``.
        num_range: Iterable of values used as sub-folder names.

    Raises:
        FileNotFoundError: If ``src_dir`` does not exist.
    """
    src_path = Path(src_dir)
    dst_base = Path(dst_base)

    if not src_path.exists():
        raise FileNotFoundError(f"The source directory does not exist: {src_path}")

    dst_base.mkdir(parents=True, exist_ok=True)

    src_files = []
    seen = set()
    for pattern in file_patterns:
        for candidate in src_path.glob(pattern):
            # shutil.copy2 cannot copy directories, and overlapping patterns
            # would otherwise copy the same file more than once.
            if candidate.is_file() and candidate not in seen:
                seen.add(candidate)
                src_files.append(candidate)
    print(src_files)
    if not src_files:
        print("Warning: No matching source file was found!")
        return

    for i in num_range:
        target_dir = dst_base / str(i)
        target_dir.mkdir(exist_ok=True)

        for src_file in src_files:
            shutil.copy2(src_file, target_dir)
            print(f"Copy: {src_file} -> {target_dir/ src_file.name}")

if __name__ == "__main__":
    # NOTE: despite its name, ani_filename holds the source *directory* that
    # is searched for the files to copy.
    ani_filename = './sample_data/0/'
    output_base_folder = './expand_dataset/raw'
    file_patterns = ['*.psf', '*.sh']

    # Copy the matching files into the folders named 1 through 600.
    batch_copy_files(ani_filename, output_base_folder, file_patterns, range(1, 601))

In [None]:
# Submit the generated inputs to the supercomputer for DFT calculations.
# Jobs num1..num2 (inclusive) are submitted per run; edit num1/num2 in the
# script below to submit the next batch (e.g. 100 jobs per batch).

save_str = """#!/bin/bash
num1=501
num2=600

if [ $num1 -gt $num2 ]; then
  echo "The first number must be less than or equal to the second number."
  exit 1
fi

for ((i = num1; i <= num2; i++))
do
  # Run in a subshell: a missing folder then neither submits a job from the
  # wrong directory nor shifts the working directory for later iterations.
  (cd "./expand_dataset/raw/$i" && yhbatch job.sh) || echo "Submission failed for ./expand_dataset/raw/$i" >&2
done
"""
with open('./run_dft.sh', 'w') as save_f:
    save_f.write(save_str)

import subprocess

# The script relies on bash-only syntax (the C-style "for ((...))" loop), so
# it is launched with bash explicitly; "sh" may be a shell that rejects it.
cmd = ["bash", "./run_dft.sh"]
result = subprocess.run(cmd)
if result.returncode != 0:
    print(f"run_dft.sh exited with status {result.returncode}")