In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig

# 自定义数据集类
class CustomDataset(Dataset):
    def __init__(self, data_path, tokenizer):
        self.sentences, self.labels = self.load_data(data_path)
        self.tokenizer = tokenizer

    def load_data(self, data_path):
        sentences = []
        labels = []
        with open(data_path, 'r', encoding='utf-8') as file:
            for line in file:
                sentence, label = line.strip().split('&')
                sentences.append(sentence)
                labels.append(int(label))
        return sentences, labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]

        # 使用BERT的tokenizer对句子进行编码
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label)
        }


# 设置训练参数
data_path = './data.txt'
model_name = 'hfl/chinese-roberta-wwm-ext'
batch_size = 32
label_cnt = 11
learning_rate = 1e-4
num_epochs = 30

# 加载预训练的BERT模型和tokenizer
tokenizer = BertTokenizer.from_pretrained(model_name)
config = BertConfig.from_pretrained(model_name)
config.num_hidden_layers = 2
config.num_labels = label_cnt   # 将 num_labels 添加到 config 中
model = BertForSequenceClassification.from_pretrained(model_name, config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# 加载数据集
dataset = CustomDataset(data_path, tokenizer)

# 划分训练集和验证集
train_size = int(0.8 * len(dataset))
valid_size = len(dataset) - train_size
train_dataset, valid_dataset = torch.utils.data.random_split(
    dataset, [train_size, valid_size])

# 创建数据加载器
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

# 定义优化器和损失函数
optimizer = AdamW(model.parameters(), lr=learning_rate)
# scheduler = StepLR(optimizer, step_size=30, gamma=0.1)
loss_fn = torch.nn.CrossEntropyLoss()



In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.device_count() > 1:
    model = torch.DataParallel(model, device_ids=[0, 1, 2])

model.to(device)
print(device)

cuda


In [4]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        _, predicted_labels = torch.max(outputs.logits, dim=1)
        correct_predictions += (predicted_labels == labels).sum().item()
        total_predictions += labels.size(0)
    
    # 在验证集上评估模型
    model.eval()
    valid_loss = 0
    correct_valid_predictions = 0
    total_valid_predictions = 0

    with torch.no_grad():
        for batch in valid_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, predicted_labels = torch.max(logits, dim=1)

            valid_loss += loss_fn(logits, labels).item()
            correct_valid_predictions += (predicted_labels == labels).sum().item()
            total_valid_predictions += labels.size(0)

    epoch_loss = total_loss / len(train_loader)
    epoch_valid_loss = valid_loss / len(valid_loader)
    accuracy = correct_predictions / total_predictions
    valid_accuracy = correct_valid_predictions / total_valid_predictions
    # scheduler.step()

    print(f'Epoch {epoch + 1}/{num_epochs} - Loss: {epoch_loss:.4f} - Valid Loss: {epoch_valid_loss:.4f} - Accuracy: {accuracy:.4f} - Valid Accuracy: {valid_accuracy:.4f}')

Epoch 1/30 - Loss: 2.4697 - Valid Loss: 2.4001 - Accuracy: 0.1515 - Valid Accuracy: 0.1176
Epoch 2/30 - Loss: 2.3332 - Valid Loss: 2.4109 - Accuracy: 0.1818 - Valid Accuracy: 0.1176
Epoch 3/30 - Loss: 2.3231 - Valid Loss: 2.4203 - Accuracy: 0.2273 - Valid Accuracy: 0.1176
Epoch 4/30 - Loss: 2.3509 - Valid Loss: 2.4029 - Accuracy: 0.2273 - Valid Accuracy: 0.1176
Epoch 5/30 - Loss: 2.3683 - Valid Loss: 2.3788 - Accuracy: 0.2273 - Valid Accuracy: 0.1176
Epoch 6/30 - Loss: 2.2438 - Valid Loss: 2.3475 - Accuracy: 0.2273 - Valid Accuracy: 0.1176
Epoch 7/30 - Loss: 2.2199 - Valid Loss: 2.3103 - Accuracy: 0.2424 - Valid Accuracy: 0.1765
Epoch 8/30 - Loss: 2.0366 - Valid Loss: 2.2571 - Accuracy: 0.3788 - Valid Accuracy: 0.2941
Epoch 9/30 - Loss: 1.6798 - Valid Loss: 2.2387 - Accuracy: 0.4091 - Valid Accuracy: 0.2941
Epoch 10/30 - Loss: 1.8996 - Valid Loss: 2.1302 - Accuracy: 0.4697 - Valid Accuracy: 0.2941
Epoch 11/30 - Loss: 1.7134 - Valid Loss: 1.9543 - Accuracy: 0.5455 - Valid Accuracy: 0.35

In [5]:
# 计算整体准确率
v_loader = DataLoader(dataset)
model.eval()
correct_predictions = 0
total_predictions = 0
with torch.no_grad():
        for batch in v_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, predicted_labels = torch.max(logits, dim=1)

            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

valid_accuracy = correct_predictions / total_predictions

print(valid_accuracy)

0.8433734939759037


In [6]:
# 保存模型
torch.save(model, 'job.model')

In [7]:
label = {
    0: '其他',
    1: '产品运营',
    2: '平面设计师',
    3: '财务',
    4: '市场营销',
    5: '项目主管',
    6: '开发工程师',
    7: '文员',
    8: '电商运营',
    9: '人力资源管理',
    10: '风控专员'
}


def job_predict(sentence, model, tokenizer):
    # 对输入句子进行编码
    encoding = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].squeeze()
    attention_mask = encoding['attention_mask'].squeeze()

    # 在模型中进行前向传播
    with torch.no_grad():
        outputs = model(input_ids.unsqueeze(
            0), attention_mask=attention_mask.unsqueeze(0))
        logits = outputs.logits

    # 获取预测的标签和对应的预测概率值
    predicted_probs = torch.softmax(logits, dim=1)
    # predicted_label = torch.argmax(predicted_probs, dim=1)

    predicted_labels = {}
    for index, value in enumerate(predicted_probs.squeeze().tolist()):
        # 设置概率阈值 超过该阈值的可以作为候选项 此处 0.1 较合理
        if value >= 0.1:
            predicted_labels[index] = value
    
    return dict(sorted(predicted_labels.items(), key=lambda x: x[1], reverse=True))


# 加载训练好的模型参数
model.load_state_dict(torch.load('job.pt'))

# 设置模型为评估模式
model.eval()

# 测试例子
input_sentence = '质管部部长助理.负责票证车间生产质量检查'
predicted_labels = job_predict(
    input_sentence, model, tokenizer)

print(f'输入句子: {input_sentence}')

for key,value in predicted_labels.items():
    print(f'{label[key]}  --  {value}')

TypeError: Expected state_dict to be dict-like, got <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'>.