In [1]:
from transformers import BertTokenizerFast, BertModel, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch

In [5]:
# Load the pretrained tokenizer and the sequence-classification model.
checkpoint = 'unikei/bert-base-smiles'
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Place the model on the GPU when one is available, otherwise stay on the CPU.
# The training, validation and inference cells move every batch to
# `model.device`, so placing the model here is all that is needed for them to
# run on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at unikei/bert-base-smiles and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
import pandas as pd
# Prepare the datasets (load and preprocess your own data here).
# Both CSV files are read with the GBK encoding, train first and then test.
train_data, test_data = (
    pd.read_csv(csv_path, encoding='gbk')
    for csv_path in ('mol_train.csv', 'mol_test.csv')
)

In [14]:
# Tokenize and encode the training SMILES strings.
# Sequences are padded to a common length and truncated at max_length
# (512 here; pick whatever maximum suits your data).
tokenized_data = tokenizer(
    train_data['SMILES'].tolist(),
    return_tensors="pt",
    max_length=512,
    truncation=True,
    padding=True,
)

# Features come from the tokenizer output; labels come from the TARGET column.
input_ids, attention_mask = tokenized_data['input_ids'], tokenized_data['attention_mask']
labels = torch.tensor(train_data['TARGET'].values)

# Bundle features and labels so each sample is (input_ids, attention_mask, label).
dataset = TensorDataset(input_ids, attention_mask, labels)

In [16]:
from sklearn.model_selection import train_test_split

# Mini-batch size (adjust to taste).
batch_size = 32

# Split the dataset, holding out a 0.2 fraction for validation.
# The fixed random_state makes the split repeatable.
train_dataset, val_dataset = train_test_split(
    dataset,
    random_state=42,
    test_size=0.2,
)

# Build the DataLoaders; only the training loader shuffles its samples.
val_loader = DataLoader(val_dataset, batch_size=batch_size)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

In [17]:
# Helper that computes classification accuracy.
def compute_accuracy(predictions, labels):
    """Return the fraction of samples whose arg-max prediction equals the label.

    Args:
        predictions: Tensor of per-class scores; the predicted class is the
            arg-max along dimension 1.
        labels: Tensor of true class indices; it is flattened before the
            comparison.

    Returns:
        The mean of the element-wise "prediction == label" comparison,
        computed with NumPy on the CPU.
    """
    predicted_classes = torch.argmax(predictions, dim=1).reshape(-1)
    hits = predicted_classes.eq(labels.reshape(-1))
    return hits.cpu().numpy().mean()

# Validate the model.
def validate_model(model, val_loader, loss_fn):
    """Evaluate `model` on `val_loader` and return per-sample averages.

    Loss and accuracy are weighted by the number of samples in each batch, so
    a smaller final batch does not distort the result (a plain average of
    per-batch values would over-weight it).

    Args:
        model: Callable model exposing `.eval()` and `.device`. When called
            with `labels`, its output must provide `.loss` and `.logits`.
        val_loader: Iterable of batches indexed as
            (input_ids, attention_mask, labels).
        loss_fn: Unused; the loss is taken from the model output. Kept so
            existing callers do not need to change.

    Returns:
        Tuple `(avg_val_loss, avg_val_accuracy)` as Python floats.

    Raises:
        ValueError: If `val_loader` yields no samples.
    """
    model.eval()
    total_eval_loss = 0.0
    total_correct = 0
    total_samples = 0

    # No gradients are needed anywhere in the evaluation loop.
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch[0].to(model.device)
            attention_mask = batch[1].to(model.device)
            labels = batch[2].to(model.device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            batch_samples = labels.size(0)
            # Assumes outputs.loss is the mean over the batch; multiplying by
            # the batch size recovers the summed loss for that batch.
            total_eval_loss += outputs.loss.item() * batch_samples
            predicted = outputs.logits.argmax(dim=1)
            total_correct += (predicted == labels.flatten()).sum().item()
            total_samples += batch_samples

    if total_samples == 0:
        raise ValueError("val_loader yielded no samples to validate on")

    avg_val_loss = total_eval_loss / total_samples
    avg_val_accuracy = total_correct / total_samples

    return avg_val_loss, avg_val_accuracy

In [18]:
from torch.nn import CrossEntropyLoss

# Hyperparameters
learning_rate = 5e-5  # learning rate
total_epochs = 3  # number of training epochs

# Optimizer.
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# eps and weight_decay are set explicitly to the defaults the transformers
# implementation used (1e-6 and 0.0), because torch's own defaults differ
# (1e-8 and 0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

# Loss function. The model computes its own loss when given `labels`, so this
# object is only passed through to validate_model.
loss_fn = CrossEntropyLoss()

# Train the model
for epoch in range(total_epochs):
    model.train()  # validate_model switches to eval mode, so re-enable training each epoch
    total_loss = 0
    for batch in train_loader:
        # Move the batch to the device the model lives on (e.g. a GPU)
        input_ids = batch[0].to(model.device)
        attention_mask = batch[1].to(model.device)
        labels = batch[2].to(model.device)

        # Forward pass; passing labels makes the model return the loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean of the per-batch training losses for this epoch
    print(f"Epoch {epoch + 1}/{total_epochs}, Loss: {total_loss / len(train_loader)}")

    # Evaluate on the validation split after every epoch
    avg_val_loss, avg_val_accuracy = validate_model(model, val_loader, loss_fn)
    print(f"Validation Loss: {avg_val_loss}, Accuracy: {avg_val_accuracy}")



Epoch 1/3, Loss: 0.5602369027005302
Validation Loss: 0.5421294808387757, Accuracy: 0.75625
Validation Loss: 0.5421294808387757, Accuracy: 0.75625
Validation Loss: 0.5421294808387757, Accuracy: 0.75625
Validation Loss: 0.5421294808387757, Accuracy: 0.75625
Validation Loss: 0.5421294808387757, Accuracy: 0.75625
Epoch 2/3, Loss: 0.3215409066114161
Validation Loss: 0.401987162232399, Accuracy: 0.8520833333333334
Validation Loss: 0.401987162232399, Accuracy: 0.8520833333333334
Validation Loss: 0.401987162232399, Accuracy: 0.8520833333333334
Validation Loss: 0.401987162232399, Accuracy: 0.8520833333333334
Validation Loss: 0.401987162232399, Accuracy: 0.8520833333333334
Epoch 3/3, Loss: 0.19960344313747352
Validation Loss: 0.4361471079289913, Accuracy: 0.8625
Validation Loss: 0.4361471079289913, Accuracy: 0.8625
Validation Loss: 0.4361471079289913, Accuracy: 0.8625
Validation Loss: 0.4361471079289913, Accuracy: 0.8625
Validation Loss: 0.4361471079289913, Accuracy: 0.8625


NameError: name 'test_loader' is not defined

In [22]:
# Run the model on the test set.
test_data = pd.read_csv('mol_test.csv', encoding='gbk')

# Tokenize and encode the test SMILES strings
# (512 is the chosen maximum length; adjust as needed).
tokenized_testdata = tokenizer(
    test_data['SMILES'].tolist(),
    return_tensors="pt",
    max_length=512,
    truncation=True,
    padding=True,
)
input_ids, attention_mask = tokenized_testdata['input_ids'], tokenized_testdata['attention_mask']

# Wrap the encoded inputs (no labels for the test set) and batch them.
test_dataset = TensorDataset(input_ids, attention_mask)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Inference: switch to evaluation mode and disable gradient tracking.
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        batch_input_ids = batch[0].to(model.device)
        batch_attention_mask = batch[1].to(model.device)

        # The predicted class is the arg-max over the logits of each sample.
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits
        batch_predictions = logits.argmax(dim=1)
        predictions.extend(batch_predictions.cpu().numpy())

# Attach the predictions to the test DataFrame as the TARGET column.
test_data['TARGET'] = predictions

# Save the predictions to a CSV file.
test_data.to_csv('submission.csv', index=False)