In [3]:
import pickle
import lmdb
import os
import pandas as pd
import numpy as np
import torch
import shutil
from tqdm import tqdm
from broaden import SpecBroadener
from rdkit import Chem
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

数据拆分，转Lmdb

In [6]:
"""
获取qm10的smiles
"""
# Directory that receives the generated SMILES list.
lmdb_path = 'qm10'
# Make sure the output directory exists before writing into it.
os.makedirs(lmdb_path, exist_ok=True)

# The raw files are spread over the folders D:\IR\QM10_1 ... D:\IR\QM10_35.
# The file name without its extension is taken as the molecule's SMILES.
qm10_files = [f'QM10_{i+1}' for i in range(35)]
all_smiles = set()
for qm10_file in tqdm(qm10_files):
    folder_path = rf'D:\IR\{qm10_file}'
    # splitext removes only the final extension, so a '.' inside the name
    # is kept (same rule that read_inp applies to the file name).
    all_smiles.update(os.path.splitext(f)[0] for f in os.listdir(folder_path))

# Sort the de-duplicated names so the index written for each molecule is the
# same on every run (set iteration order is not stable between runs).
all_smiles = sorted(all_smiles)
with open(f'{lmdb_path}/all_smiles.txt', 'w') as f:
    # One line per molecule: `<index>,<smiles>`
    for idx, smiles in enumerate(all_smiles):
        f.write(f'{idx},{smiles}\n')

100%|██████████| 35/35 [00:03<00:00,  9.17it/s]


In [15]:
"""
复制qm10中的文件到同一个文件夹下
"""
# Source folders D:\IR\QM10_1 ... D:\IR\QM10_35. Raw strings keep the
# backslashes literal instead of relying on unrecognised escape sequences.
src_dirs = [rf'D:\IR\QM10_{i+1}' for i in range(35)]
# Destination folder that receives every file
dest_dir = r'D:\qm10'
# If the directory were missing, shutil.copy would treat dest_dir as a file
# name and overwrite that single file on every iteration.
os.makedirs(dest_dir, exist_ok=True)

# Copy every file of every source folder into the destination folder
for dir_path in tqdm(src_dirs):
    for file in os.listdir(dir_path):
        full_file_path = os.path.join(dir_path, file)
        shutil.copy(full_file_path, dest_dir)

100%|██████████| 35/35 [26:57<00:00, 46.22s/it]


In [11]:
"""
all_smiles.txt -> train_smiles.txt and test_smiles.txt (8:2)
"""
import random

# Each line of all_smiles.txt is `<index>,<smiles>`; keep only the SMILES part.
with open('qm10/all_smiles.txt', 'r') as f:
    moles = [line.strip().split(',')[1] for line in f]

# Fixed seed so the shuffle (and therefore the split) is repeatable for a
# given all_smiles.txt.
random.seed(3407)
random.shuffle(moles)

# 8:2 train/test split of the shuffled list
train_size = int(len(moles) * 0.8)
train_smiles = moles[:train_size]
test_smiles = moles[train_size:]

# temp_smiles.txt: a tiny subset taken from the head of the shuffled list
# (it therefore overlaps with the training split).
temp_size = int(len(moles) * 0.0002)
temp_smiles = moles[:temp_size]
print('temp_size:', temp_size)

# Write all three lists. The train/test writes used to be commented out, which
# left the later cell that reads qm10/train_smiles.txt without an input on a
# fresh run. Every line has the form `1,<smiles>`.
for out_path, subset in (
    ('qm10/train_smiles.txt', train_smiles),
    ('qm10/test_smiles.txt', test_smiles),
    ('qm10/temp_smiles.txt', temp_smiles),
):
    with open(out_path, 'w') as f:
        f.writelines(f'1,{smiles}\n' for smiles in subset)

temp_size: 69


In [4]:
def read_inp(filepath):
    """Read a two-column, tab-separated file of numbers.

    The first return value is derived from the file name: the base name of
    ``filepath`` with its final extension removed.

    Args:
        filepath: Path of the file to read.

    Returns:
        A tuple ``(smiles, freqs, irs)`` where ``smiles`` is the file name
        without extension, ``freqs`` is the list of first-column values and
        ``irs`` the list of second-column values, both as floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields, or a field is not a valid float.
    """
    smiles, _ = os.path.splitext(os.path.basename(filepath))
    freqs = []
    irs = []
    with open(filepath, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not line:
                continue
            freq, ir = line.split('\t')
            freqs.append(float(freq))
            irs.append(float(ir))
    return smiles, freqs, irs

In [11]:
# Folder holding one `<smiles>.inp` file per molecule, and the LMDB directory
# that the processed records are written to.
folder_path = r'D:\qm10'
lmdb_path = 'qm10'

# Load the molecule list; each line of the file is `<index>,<smiles>`.
with open('qm10/train_smiles.txt', 'r') as f:
    moles = [line.strip().split(',')[1] for line in f]

# Open the LMDB environment for writing
env = lmdb.open(lmdb_path, map_size=12500000000, readonly=False, lock=False)
try:
    txn = env.begin(write=True)
    for idx, mol in enumerate(tqdm(moles), 1):
        file_path = os.path.join(folder_path, f'{mol}.inp')

        try:
            # Read the raw file and build the spectrum object from it
            smiles, freq, ir = read_inp(file_path)
            spec = SpecBroadener(smiles, freq, ir, 4000, 20)
        except Exception:
            # Best effort: report the offending file and move on. Unlike a bare
            # `except:`, this still lets KeyboardInterrupt stop the run.
            print(file_path)
        else:
            # Record stored for this molecule
            data_dict = {
                'smi': mol,
                'ir': spec.y,
            }
            # The key is the 1-based position in the molecule list, so a
            # skipped molecule leaves a gap in the key sequence.
            txn.put(str(idx).encode(), pickle.dumps(data_dict))

        # Commit in batches of 200 positions. This check runs for failed
        # molecules too, so a failure on a batch boundary cannot delay a commit.
        if idx % 200 == 0:
            txn.commit()
            txn = env.begin(write=True)

    # Commit whatever is left in the last (possibly partial) batch
    txn.commit()
finally:
    # Always release the environment, even if the loop was interrupted
    env.close()

print("数据已成功转换为LMDB格式并保存到文件:", lmdb_path)

 76%|███████▌  | 212399/279996 [1:12:05<23:16, 48.40it/s]

D:\qm10\CC1=NC=CN1C(=C)C#C.inp


100%|██████████| 279996/279996 [1:34:00<00:00, 49.64it/s]

数据已成功转换为LMDB格式并保存到文件: qm10





In [10]:
# Close the LMDB environment that `env` currently refers to.
# NOTE(review): this looks like a manual cleanup cell (e.g. after an
# interrupted run) — confirm it is still needed.
env.close()

In [12]:
# Re-open the 'qm10' LMDB environment for writing (no map_size is passed here).
env = lmdb.open('qm10',readonly=False, lock=False)
with env.begin(write=True) as txn:
    # NOTE(review): the original comment called this "garbage collection", but
    # Cursor.delete() only removes the record the cursor is positioned on.
    # The cursor created here is never positioned, so presumably nothing is
    # removed — confirm against the py-lmdb docs and state the real intent.
    txn.cursor().delete()
# Close the LMDB environment
env.close()

字典制作

In [ ]:
import selfies as sf
from tqdm import tqdm

In [ ]:
# Encode every molecule of all_smiles.txt as SELFIES.
# Each line of the file is `<index>,<smiles>`; only the SMILES part is encoded.
with open('qm10/all_smiles.txt') as f:
    lines = f.readlines()
mol_selfies = [sf.encoder(line.strip().split(',')[1]) for line in tqdm(lines)]

# Show only a small sample; echoing the full list would flood the cell output
mol_selfies[:5]

In [ ]:
# get_alphabet_from_selfies returns a set, whose iteration order is not stable
# between runs. Sorting gives the saved alphabet a reproducible symbol order.
alphabet = sorted(sf.get_alphabet_from_selfies(mol_selfies))

In [ ]:
# Save the SELFIES alphabet to qm10/alphabet.npy
np.save('qm10/alphabet.npy', alphabet)