In [1]:
# Run configuration.
# How many preprocessed chunk files are read for the training split.
num_train_chunks = 200
# How many preprocessed chunk files are read for the test split.
num_test_chunks = 69
# Number of full passes over the training loader.
num_epochs = 50

In [3]:
from torch.utils.data import Dataset, DataLoader
import torch
class CustomDataset(Dataset):
    """In-memory dataset assembled from preprocessed chunk files.

    Every file in ``file_paths`` is read eagerly in ``__init__`` and must
    unpack into three parallel sequences ``(inputs, masks, labels)``. The
    i-th elements of the three sequences are grouped into one sample.

    Args:
        file_paths: Iterable of paths readable by ``torch.load``. An empty
            iterable yields an empty dataset.

    Raises:
        ValueError: If the three sequences stored in a chunk do not have the
            same length (``zip`` would otherwise silently drop the surplus).
    """

    def __init__(self, file_paths):
        self.data = []
        for file_path in file_paths:
            # NOTE(security): torch.load is pickle-based; only point this at
            # trusted files.
            # map_location='cpu' keeps the samples in host memory even if the
            # chunk was written from a GPU process; callers move batches to
            # the accelerator themselves.
            inputs, masks, labels = torch.load(file_path, map_location='cpu')
            if not (len(inputs) == len(masks) == len(labels)):
                raise ValueError(
                    f"Length mismatch in {file_path}: "
                    f"{len(inputs)} inputs, {len(masks)} masks, {len(labels)} labels"
                )
            self.data.extend(zip(inputs, masks, labels))

    def __len__(self):
        """Return the total number of samples across all loaded chunks."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of model keyword arguments."""
        input_ids, attention_mask, labels = self.data[idx]
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Chunk files are assumed to follow the "preprocessed_data_chunk_*.pt" naming pattern.
train_file_paths = [
    f'train_data/preprocessed_data_chunk_{i}.pt'
    for i in range(num_train_chunks)
]
test_file_paths = [
    f'test_data/preprocessed_data_chunk_{i}.pt'
    for i in range(num_test_chunks)
]

# Constructing a CustomDataset reads every listed chunk into memory up front.
train_dataset = CustomDataset(train_file_paths)
test_dataset = CustomDataset(test_file_paths)

# Only the training split is shuffled; the test split keeps file order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)


KeyboardInterrupt: 

In [None]:
# AdamW comes from torch.optim: the transformers implementation is deprecated
# (and dropped from recent transformers releases), so importing it from there
# breaks the cell on an up-to-date install.
from torch.optim import AdamW
from transformers import BertForSequenceClassification
from sklearn.metrics import accuracy_score

# Prefer the GPU, but fall back to CPU so the cell also runs without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialise BERT with a 2-label classification head from a local checkpoint.
# NOTE(review): this is an absolute, user-specific path; consider making the
# checkpoint location configurable.
model = BertForSequenceClassification.from_pretrained('/public/home/lvshuhang/model_space/workspace/bert-base-uncased', num_labels=2)
model.to(device)

# Set up the optimizer. eps and weight_decay are spelled out to mirror the
# defaults of the former transformers.AdamW (torch.optim.AdamW would otherwise
# apply weight_decay=0.01 and eps=1e-8) so the training recipe is unchanged.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Training loop: one optimizer step per batch.
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        # Move every tensor of the batch to the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch contains 'labels', so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

    print(f"Epoch {epoch} finished")

# Persist only the weights (state dict), not the full model object.
torch.save(model.state_dict(), 'bert_model.pt')


In [None]:
# Evaluate the model on the test loader and report accuracy.
model.eval()
predictions, true_labels = [], []

# Use whichever device the model's weights are on instead of assuming CUDA,
# so evaluation works for a CPU model as well as a GPU one.
eval_device = next(model.parameters()).device

# Gradients are never needed here, so disable autograd for the whole loop.
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(eval_device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        # Predicted class = index of the largest logit for each sample.
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        labels = batch['labels'].cpu().numpy()
        predictions.extend(preds)
        true_labels.extend(labels)

accuracy = accuracy_score(true_labels, predictions)
print(f"Test Accuracy: {accuracy}")
