In [1]:
import os
import sys
import time
from pathos.multiprocessing import ProcessingPool as Pool #多线程
import torch

from get_data_from_siesta import get_data_from_siesta
from get_rotate_coord import get_rc
from rotate import get_rh
from data import HData

In [2]:
# Absolute path of the directory the kernel is currently running in.
# It is handed to main(), which puts it on sys.path before building the dataset.
current_sys_path = os.path.abspath(os.curdir)
print("current sys path:", current_sys_path)

current sys path: /fs2/home/ndsim10/DeepQT/1_preprocess


In [3]:
# Common parent of the raw / processed / graph directories used below.
_DATASET_ROOT = "/fs2/home/ndsim10/DeepQT/DeepQTH/0_generate_dataset/expand_dataset/"

# Settings passed to main() and HData; assembled section by section.
config = {}

config["basic"] = dict(
    raw_dir=_DATASET_ROOT + "raw/",
    processed_data_dir=_DATASET_ROOT + "processed/",
    graph_dir=_DATASET_ROOT + "graph/",
    target="hamiltonian",        # main() only accepts 'hamiltonian'
    interface="siesta",
    data_format="h5",
    input_file="input.fdf",      # main() only accepts 'input.fdf'
    multiprocessing=8,           # main(): 0 -> serial, >0 -> pool with that many nodes, <0 -> pool defaults
    local_coordinate=True,
    material_dimension=2,
)

config["interpreter"] = dict(
    python_interpreter="~/miniconda3/envs/deeph-cpu/bin/python",
)

config["graph"] = dict(
    radius=7.0,  # graphene 7.0 Å, MoS2 8.0 Å, and silicon 9.0 Å
    num_l=4,
    if_lcmp_graph=True,
    separate_onsite=False,
    new_sp=False,
    shortest_path_length=5,
    r2_rand=False,
)

In [5]:
def main(config, current_sys_path):
    """Mirror the raw directory tree into the processed directory and build the dataset.

    Every directory under ``config['basic']['raw_dir']`` that contains a file
    whose name includes ``.HSX`` or ``.TSHS`` gets a directory with the same
    relative path under ``config['basic']['processed_data_dir']``. The SIESTA
    extraction and the local-coordinate rotation are currently disabled (kept
    as commented-out calls inside ``worker``), so the per-directory work is
    limited to creating that output directory. Afterwards an ``HData`` object
    is constructed from ``config`` and returned.

    Args:
        config: Nested dict. Reads ``config['basic']`` keys ``target``,
            ``input_file``, ``raw_dir``, ``processed_data_dir``, ``interface``,
            ``local_coordinate``, ``multiprocessing`` and
            ``config['graph']['radius']``; the whole dict is passed to ``HData``.
            ``multiprocessing``: 0 runs serially, a positive value is used as
            the pool's ``nodes``, a negative value uses the pool's defaults.
        current_sys_path: Path inserted at the front of ``sys.path`` (if not
            already present) before ``HData`` is constructed.

    Returns:
        The ``HData`` instance built from ``config``.

    Raises:
        AssertionError: If ``target`` is not ``'hamiltonian'`` or ``input_file``
            is not ``'input.fdf'``.
        KeyError: If one of the config keys listed above is missing.

    Side effects:
        The process working directory is changed to ``raw_dir`` and then to
        ``processed_data_dir`` and is NOT restored on return.
    """
    assert config['basic']['target'] in ['hamiltonian'], \
        f"unsupported target: {config['basic']['target']!r}"
    assert config['basic']['input_file'] in ['input.fdf'], \
        f"unsupported input_file: {config['basic']['input_file']!r}"

    raw_dir = os.path.abspath(config['basic']['raw_dir'])
    print("raw_dir:", raw_dir)
    processed_data_dir = os.path.abspath(config['basic']['processed_data_dir'])
    print("processed_data_dir:", processed_data_dir)

    # target / interface / input_file / local_coordinate / radius are only used
    # by the disabled steps in worker(). They are still read here so that a
    # missing key fails immediately and re-enabling those steps needs no other edit.
    target = config['basic']['target']
    interface = config['basic']['interface']
    input_file = config['basic']['input_file']
    local_coordinate = config['basic']['local_coordinate']
    multiprocessing = config['basic']['multiprocessing']
    radius = config['graph']['radius']

    # Walk relative to raw_dir so that `root` is a path relative to it
    # (e.g. './0'); the same relative path is re-created under processed_data_dir.
    os.chdir(raw_dir)
    relpath_list = []  # directory paths relative to raw_dir
    abspath_list = []  # the same directories as absolute paths

    for root, _dirs, files in os.walk('./'):
        # Substring match (not a suffix match), as in the original selection rule.
        if any('.HSX' in ifile or '.TSHS' in ifile for ifile in files):
            relpath_list.append(root)
            abspath_list.append(os.path.abspath(root))
    os.makedirs(processed_data_dir, exist_ok=True)
    # NOTE(review): the cwd is left at processed_data_dir on purpose to keep the
    # original behaviour; HData may rely on it -- confirm before restoring.
    os.chdir(processed_data_dir)
    print(f"Found {len(abspath_list)} directories to preprocess")

    def worker(index):
        """Create the processed output directory for the index-th raw directory."""
        # Absolute output path: the result must not depend on the working
        # directory of whichever process executes this function.
        out_dir = os.path.normpath(os.path.join(processed_data_dir, relpath_list[index]))
        os.makedirs(out_dir, exist_ok=True)

        # --- disabled preprocessing steps (translated from the original notes) ---
        # SIESTA preprocessing: raw directory in, processed directory out.
        # get_data_from_siesta(interface, input_file, abspath_list[index], out_dir)
        #
        # if local_coordinate:
        #     # Per the original note: writes rc.h5 with the 3x3 local coordinate
        #     # frame for each atom i and its sorted neighbours j within the cutoff.
        #     get_rc(out_dir, out_dir, radius=radius, neighbour_file='hamiltonians.h5')
        #     # Per the original note: writes rh.h5 with the Hamiltonian blocks
        #     # rotated into the local frames (keys unchanged; relies on e3nn).
        #     get_rh(out_dir, out_dir, target)

    begin_time = time.time()
    if multiprocessing != 0:
        if multiprocessing > 0:
            pool_dict = {'nodes': multiprocessing}  # number of parallel worker nodes
        else:
            pool_dict = {}  # let the pool pick its default
        with Pool(**pool_dict) as pool:
            nodes = pool.nodes
            print(f'Use multiprocessing (nodes = {nodes})')
            # One task per discovered directory.
            pool.map(worker, range(len(abspath_list)))
    else:
        for index in range(len(abspath_list)):
            worker(index)
    print(f'\nFinished preprocess {len(abspath_list)} directories and cost {time.time() - begin_time:.2f} seconds')

    if current_sys_path not in sys.path:
        sys.path.insert(0, current_sys_path)
    # Build the dataset using torch's current default floating-point dtype.
    dataset = HData(config, default_dtype_torch=torch.get_default_dtype())
    return dataset
    
# Entry point: run the pipeline with the notebook-level config and keep the
# resulting dataset in a module-level name for the inspection cells below.
if __name__ == "__main__":
    dataset = main(config, current_sys_path)
    

raw_dir: /fs2/home/ndsim10/DeepQT/0_generate_dataset/expand_dataset/raw
processed_data_dir: /fs2/home/ndsim10/DeepQT/0_generate_dataset/expand_dataset/processed
Found 600 directories to preprocess
Use multiprocessing (nodes = 8)

Finished preprocess 600 directories and cost 0.11 seconds
Graph data file: HGraph-h5-4l.pkl


Processing...


Process new data file......
Use multiprocessing (nodes = num_processors x num_threads = 1 x 4)


  0%|                                                   | 0/600 [00:00<?, ?it/s]

process dir: 1


  0%|                                           | 1/600 [00:02<29:01,  2.91s/it]

process dir: 10


  0%|▏                                          | 2/600 [00:05<24:13,  2.43s/it]

process dir: 100


  0%|▏                                          | 3/600 [00:07<23:16,  2.34s/it]

process dir: 101


  1%|▎                                          | 4/600 [00:09<22:30,  2.27s/it]

process dir: 102


  1%|▎                                          | 5/600 [00:11<22:05,  2.23s/it]

process dir: 103


  1%|▍                                          | 6/600 [00:13<22:00,  2.22s/it]

process dir: 104


  1%|▌                                          | 7/600 [00:15<21:41,  2.19s/it]

process dir: 105


  1%|▌                                          | 8/600 [00:17<21:11,  2.15s/it]

process dir: 106


KeyboardInterrupt: 

In [6]:
# Print the edge-feature and node-feature counts reported by the dataset, one per line.
for _feature_attr in ("num_edge_features", "num_node_features"):
    print(getattr(dataset, _feature_attr))

10
1


In [7]:
# Display the first sample of the dataset; the bare expression is the cell's output.
dataset[0]

Data(
  x=[72],
  edge_index=[2, 2664],
  edge_attr=[2664, 10],
  stru_id='1',
  voronoi_values=[72, 1],
  centralities=[72, 1],
  cart_coords=[72, 3],
  lattice=[3, 3],
  node_paths=[1, 72, 72, 5],
  edge_paths=[1, 72, 72, 4],
  atom_num_orbital=[72],
  subgraph_dict={
    subgraph_atom_idx=[197136, 2],
    subgraph_edge_idx=[197136],
    subgraph_edge_ang=[197136, 16],
    subgraph_index=[197136],
  },
  spinful=[1],
  mask=[2664, 81],
  label=[2664, 81]
)

In [8]:
# Node attribute `x` of the first sample.
# NOTE(review): presumably one atomic number per atom -- confirm against HData.
dataset[0].x

tensor([6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6])

In [9]:
# `atom_num_orbital` of the first sample.
# NOTE(review): presumably the number of orbitals per atom -- confirm against HData.
dataset[0].atom_num_orbital

array([9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9])

In [10]:
# Display the dataset-level `info` attribute.
dataset.info

{'spinful': False,
 'index_to_Z': tensor([6]),
 'Z_to_index': tensor([-1, -1, -1, -1, -1, -1,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])}