# 安装环境

In [1]:
# # !pip install torchtext
# !pip install pandas
# !pip install scikit-optimize
# !pip install torch torchvision
# # !git clone https://github.com/xzxg001/AI-chemistry.git
# !pip3 install torch==2.1.0 # 对于 x86 平台 pip3 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu
# !pip3 install pyyaml 
# !pip3 install setuptools
# !pip3 install torch-npu==2.1.0.post8
!export PYTHONPATH=usr/local/Ascend/ascend-toolkit/8.0.RC1/aarch64-linux/ascend-toolkit/latest/tools/ms_fmk_transplt/torch_npu_bridge:$PYTHONPATH

# 对数据处理进行初步尝试

In [2]:
import pandas as pd
import torch_npu
import transfer_to_npu
from torch.utils.data import Dataset, DataLoader, Subset
from typing import List, Tuple
import re
import torch
import torch.nn as nn
import time
import torch.optim as optim
import time
import torch
import torch.nn as nn
import torch_npu
from torch_npu.npu import amp
device=torch.device('npu:0')



    *************************************************************************************************************
    The torch.Tensor.cuda and torch.nn.Module.cuda are replaced with torch.Tensor.npu and torch.nn.Module.npu now..
    The torch.cuda.DoubleTensor is replaced with torch.npu.FloatTensor cause the double type is not supported now..
    The backend in torch.distributed.init_process_group set to hccl now..
    The torch.cuda.* and torch.cuda.amp.* are replaced with torch.npu.* and torch.npu.amp.* now..
    The device parameters have been replaced with npu in the function below:
    torch.logspace, torch.randint, torch.hann_window, torch.rand, torch.full_like, torch.ones_like, torch.rand_like, torch.randperm, torch.arange, torch.frombuffer, torch.normal, torch._empty_per_channel_affine_quantized, torch.empty_strided, torch.empty_like, torch.scalar_tensor, torch.tril_indices, torch.bartlett_window, torch.ones, torch.sparse_coo_tensor, torch.randn, torch.kaiser_window, torch.ten

In [3]:
# tokenizer，鉴于SMILES的特性，这里需要自己定义tokenizer和vocab
# 这里直接将smiles str按字符拆分，并替换为词汇表中的序号
class Smiles_tokenizer():
    def __init__(self, pad_token, regex, vocab_file, max_length):
        self.pad_token = pad_token
        self.regex = regex
        self.vocab_file = vocab_file
        self.max_length = max_length

        with open(self.vocab_file, "r") as f:
            lines = f.readlines()
        lines = [line.strip("\n") for line in lines]
        vocab_dic = {}
        for index, token in enumerate(lines):
            vocab_dic[token] = index
        self.vocab_dic = vocab_dic

    def _regex_match(self, smiles):
        regex_string = r"(" + self.regex + r"|"
        regex_string += r".)"
        prog = re.compile(regex_string)

        tokenised = []
        for smi in smiles:
            tokens = prog.findall(smi)
            if len(tokens) > self.max_length:
                tokens = tokens[:self.max_length]
            tokenised.append(tokens) # 返回一个所有的字符串列表
        return tokenised

    def tokenize(self, smiles):
        tokens = self._regex_match(smiles)
        # 添加上表示开始和结束的token：<cls>, <end>
        tokens = [["<CLS>"] + token + ["<SEP>"] for token in tokens]
        tokens = self._pad_seqs(tokens, self.pad_token)
        token_idx = self._pad_token_to_idx(tokens)
        return tokens, token_idx

    def _pad_seqs(self, seqs, pad_token):
        pad_length = max([len(seq) for seq in seqs])
        padded = [seq + ([pad_token] * (pad_length - len(seq))) for seq in seqs]
        return padded

    def _pad_token_to_idx(self, tokens):
        idx_list = []
        new_vocab = []
        for token in tokens:
            tokens_idx = []
            for i in token:
                if i in self.vocab_dic.keys():
                    tokens_idx.append(self.vocab_dic[i])
                else:
                    new_vocab.append(i)
                    self.vocab_dic[i] = max(self.vocab_dic.values()) + 1
                    tokens_idx.append(self.vocab_dic[i])
            idx_list.append(tokens_idx)

        with open("../new_vocab_list.txt", "a") as f:
            for i in new_vocab:
                f.write(i)
                f.write("\n")

        return idx_list

    def _save_vocab(self, vocab_path):
        with open(vocab_path, "w") as f:
            for i in self.vocab_dic.keys():
                f.write(i)
                f.write("\n")
        print("update new vocab!")

In [4]:
# 处理数据

def read_data(file_path, train=True):
    df = pd.read_csv(file_path)
    reactant1 = df["Reactant1"].tolist()
    reactant2 = df["Reactant2"].tolist()
    product = df["Product"].tolist()
    additive = df["Additive"].tolist()
    solvent = df["Solvent"].tolist()
    if train:
        react_yield = df["Yield"].tolist()
    else:
        react_yield = [0 for i in range(len(reactant1))]

    # 将reactant\additive\solvent拼到一起，之间用.分开。product也拼到一起，用>>分开
    input_data_list = []
    for react1, react2, prod, addi, sol in zip(reactant1, reactant2, product, additive, solvent):
        # input_info = ".".join([react1, react2, addi, sol])
        input_info = ".".join([react1, react2])
        input_info = ">".join([input_info, prod])
        input_data_list.append(input_info)
    output = [(react, y) for react, y in zip(input_data_list, react_yield)]

    return output


In [5]:
# 定义数据集
class ReactionDataset(Dataset):
    def __init__(self, data: List[Tuple[List[str], float]]):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def collate_fn(batch):
    REGEX = r"\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9]"
    tokenizer = Smiles_tokenizer("<PAD>", REGEX, "/home/ma-user/work/AI-chemistry/mp/vocab_full.txt", 300)
    smi_list = []
    yield_list = []
    for i in batch:
        smi_list.append(i[0])
        yield_list.append(i[1])
    tokenizer_batch = torch.tensor(tokenizer.tokenize(smi_list)[1])
    yield_list = torch.tensor(yield_list)
    return tokenizer_batch, yield_list


In [6]:
# 模型
'''
直接采用一个transformer encoder model就好了
'''
class TransformerEncoderModel(nn.Module):
    def __init__(self, input_dim, d_model, num_heads, fnn_dim, num_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, d_model)
        self.layerNorm = nn.LayerNorm(d_model)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model,
                                                        nhead=num_heads,
                                                        dim_feedforward=fnn_dim,
                                                        dropout=dropout,
                                                        batch_first=True,
                                                        norm_first=True)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer,
                                                         num_layers=num_layers,
                                                         enable_nested_tensor=False,
                                                         norm=self.layerNorm)
        self.dropout = nn.Dropout(dropout)
        self.lc = nn.Sequential(nn.Linear(d_model, 256),
                                nn.Sigmoid(),
                                nn.Linear(256, 96),
                                nn.Sigmoid(),
                                nn.Linear(96, 1))

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs = self.transformer_encoder(embedded)
        z = outputs[:,0,:]
        outputs = self.lc(z)
        return outputs.squeeze(-1)

In [7]:
def adjust_learning_rate(optimizer, epoch, start_lr):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = start_lr * (0.1 ** (epoch // 3))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


In [8]:
from torch.utils.data import random_split
import os

def prepare_data(data_path, batch_size, test_size=0.1, valid_size=0.1):
    data = read_data(data_path)
    dataset = ReactionDataset(data)

    # 确保测试集和验证集的大小加起来不超过数据集大小
    test_size = int(len(dataset) * test_size)
    valid_size = int(len(dataset) * valid_size)

    # 计算训练集的大小，确保总和等于数据集大小
    train_size = len(dataset) - test_size - valid_size

    # 分割数据集
    train_dataset, valid_test_dataset = random_split(dataset, [train_size, len(dataset) - train_size])
    valid_dataset, test_dataset = random_split(valid_test_dataset, [valid_size, test_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    return train_loader, valid_loader, test_loader

def train(train_loader, valid_loader, model_file, N_EPOCHS, LR, CLIP, INPUT_DIM, D_MODEL, NUM_HEADS, FNN_DIM, NUM_LAYERS, DROPOUT):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TransformerEncoderModel(INPUT_DIM, D_MODEL, NUM_HEADS, FNN_DIM, NUM_LAYERS, DROPOUT).to(device)
    model.train()
    optimizer = torch.optim.SGD(model.parameters(), lr=LR, weight_decay=0.01)    # 定义优化器
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)
    criterion = nn.MSELoss()
    scaler = amp.GradScaler()    # 在模型、优化器定义之后，定义GradScaler
    best_valid_loss = float('inf')
    for epoch in range(N_EPOCHS):
        total_loss = 0
        for i, (src, y) in enumerate(train_loader):
            src, y = src.to(device), y.to(device)
            optimizer.zero_grad()
            output = model(src)
            loss = criterion(output, y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
            optimizer.step()
            total_loss += loss.item()

        average_loss = total_loss / len(train_loader)
        print(f'Epoch: {epoch+1:02} | Train Loss: {average_loss:.3f}')
        scheduler.step(average_loss)

        # 计算验证损失
        valid_loss = 0
        model.eval()
        with torch.no_grad():
            for src, y in valid_loader:
                src, y = src.to(device), y.to(device)
                output = model(src)
                valid_loss += criterion(output, y).item()

        valid_loss /= len(valid_loader)
        print(f'Epoch: {epoch+1:02} | Valid Loss: {valid_loss:.3f}')
        model.train()

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), model_file)

def load_model(model_file, device):
    model = TransformerEncoderModel(INPUT_DIM, D_MODEL, NUM_HEADS, FNN_DIM, NUM_LAYERS, DROPOUT).to(device)
    if os.path.exists(model_file):
        model.load_state_dict(torch.load(model_file, map_location=device))
        model.eval()
        print(f"Model loaded from {model_file}")
        return model
    else:
        print("No existing model found. Training from scratch.")
        return None

if __name__ == '__main__':
    # 设置参数
    batch_size = 128
    N_EPOCHS = 40
    LR = 1e-4
    CLIP = 1
    INPUT_DIM = 292
    D_MODEL = 512
    NUM_HEADS = 4
    FNN_DIM = 1024
    NUM_LAYERS = 4
    DROPOUT = 0.2
    # device = torch.device('npu:{}'.format(args.gpu))
    # model.npu(args.gpu)
    model_file = "/home/ma-user/modelarts/user-job-dir/model/transformer.pth"

    train_loader, valid_loader, test_loader = prepare_data("/home/ma-user/work/AI-chemistry/mp/dataset/train_data.csv", batch_size)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = load_model(model_file, device)

    if model is None:
        train(train_loader, valid_loader, model_file, N_EPOCHS, LR, CLIP, INPUT_DIM, D_MODEL, NUM_HEADS, FNN_DIM, NUM_LAYERS, DROPOUT)
    else:
        # 如果模型已加载，可以在这里添加测试或其他操作
        pass

  if not dirpath.find("AppData\Local\Temp"):
  """
  """


No existing model found. Training from scratch.


RuntimeError: call aclnnFlashAttentionScore failed, detail:EZ9999: Inner Error!
EZ9999: 2024-12-07-23:22:08.380.296  Cannot find bin of op FlashAttentionScore, integral key 0/1/|float/ND/float/ND/float/ND/float/ND/float/ND/float/ND/float/ND/.
        TraceBack (most recent call last):
        Cannot find binary for op FlashAttentionScore.
        Kernel GetWorkspace failed. opType: 100
        Kernel Run failed. opType: 100, FlashAttentionScore
        launch failed for FlashAttentionScore, errno:561000.

[ERROR] 2024-12-07-23:22:08 (PID:1627011, Device:0, RankID:-1) ERR01100 OPS call acl api failed

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import r2_score

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def test(model, test_loader, device):
    model.eval()
    total_mse_loss = 0
    total_mae_loss = 0
    total_r2 = 0
    total_count = 0
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for src, y in test_loader:
            src, y = src.to(device), y.to(device)
            output = model(src)
            mse_loss = nn.MSELoss()(output, y)
            mae_loss = nn.L1Loss()(output, y)
            r2 = r2_score(y.cpu().numpy(), output.cpu().numpy())  # 计算 R-squared
            total_mse_loss += mse_loss.item() * len(y)
            total_mae_loss += mae_loss.item() * len(y)
            total_r2 += r2 * len(y)
            total_count += len(y)

            # 保存所有预测和目标值用于整体 R-squared 计算
            all_predictions.extend(output.cpu().numpy().flatten())
            all_targets.extend(y.cpu().numpy().flatten())

    avg_mse_loss = total_mse_loss / total_count if total_count > 0 else 0
    avg_mae_loss = total_mae_loss / total_count if total_count > 0 else 0
    avg_r2 = total_r2 / total_count if total_count > 0 else 0

    # 计算 RMSE
    rmse = (mean_squared_error(all_targets, all_predictions, squared=False)) if total_count > 0 else 0

    print(f'Test MSE Loss: {avg_mse_loss:.3f}')
    print(f'Test MAE Loss: {avg_mae_loss:.3f}')
    print(f'Test R-squared: {avg_r2:.3f}')
    print(f'Test RMSE: {rmse:.3f}')

    return avg_mse_loss, avg_mae_loss, avg_r2, rmse
# 加载模型并测试
def load_and_test(model_file, test_loader, INPUT_DIM, D_MODEL, NUM_HEADS, FNN_DIM, NUM_LAYERS, DROPOUT):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TransformerEncoderModel(INPUT_DIM, D_MODEL, NUM_HEADS, FNN_DIM, NUM_LAYERS, DROPOUT).to(device)
    if os.path.exists(model_file):
        model.load_state_dict(torch.load(model_file, map_location=device))
        model.eval()
        print(f"Model loaded from {model_file}")
        return test(model, test_loader, device)
    else:
        print("No existing model found. Please train the model first.")
        return None

# 调用测试函数
# 确保在调用之前已经定义了 model_file, test_loader, INPUT_DIM, D_MODEL, NUM_HEADS, FNN_DIM, NUM_LAYERS, DROPOUT
mse_loss, mae_loss, r2_score, rmse= load_and_test(model_file, test_loader, INPUT_DIM, D_MODEL, NUM_HEADS, FNN_DIM, NUM_LAYERS, DROPOUT)