In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split


# 自定义Dataset类
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# BERT+LSTM模型定义
class BertLSTM(nn.Module):
    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(BertLSTM, self).__init__()
        self.bert = bert
        self.lstm = nn.LSTM(bert.config.hidden_size,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            batch_first=True,
                            dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = bert_output.last_hidden_state
        lstm_output, (hidden, cell) = self.lstm(last_hidden_state)
        if self.lstm.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2], hidden[-1]), dim=1))
        else:
            hidden = self.dropout(hidden[-1])
        output = self.fc(hidden)
        return output

def evaluate(model, val_loader, device):
    """
    评估模型在验证集上的性能。
    
    参数:
    - model: 待评估的模型
    - val_loader: 验证数据加载器
    - device: 设备（CPU或GPU）
    
    返回:
    - accuracy: 准确率
    - precision: 精确率
    - recall: 召回率
    - f1: F1得分
    """
    model.eval()
    predictions, true_labels = [], []
    
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)

    return accuracy, precision, recall, f1

# 文本预测函数
def predict(text):
    model.eval()
    with torch.no_grad():
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=MAX_LEN,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        output = model(input_ids, attention_mask)
        _, prediction = torch.max(output, dim=1)
    return prediction.item()

# 数据准备
df = pd.read_excel("social support_coding scheme_0313.xlsx", sheet_name=1)
df['content'] = df['content'].astype(str)
x = df['content'].tolist()

labels_to_process = ['symp_e', 'symptom_i', 'experience_i', 'objective_i']
for label in labels_to_process:
    print(f"Processing label: {label}")
    with open("evaluate.csv", "a") as f:
        f.write(f"{label}\n")
        
    y = df[label].tolist()

    # 设置参数
    PRETRAINED_MODEL_NAME = './bert_base_chinese'
    MAX_LEN = 128
    BATCH_SIZE = 16
    HIDDEN_DIM = 256
    OUTPUT_DIM = 2
    N_LAYERS = 2
    BIDIRECTIONAL = True
    DROPOUT = 0.3
    EPOCHS = 10
    
    # 加载BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
    
    # 数据分割
    train_texts, val_texts, train_labels, val_labels = train_test_split(x, y, test_size=0.2, random_state=22)
    
    # 创建DataLoader
    train_dataset = TextDataset(train_texts, train_labels, tokenizer, MAX_LEN)
    val_dataset = TextDataset(val_texts, val_labels, tokenizer, MAX_LEN)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    
    # 加载预训练BERT模型
    bert = BertModel.from_pretrained(PRETRAINED_MODEL_NAME)
    
    # 初始化BertLSTM模型
    model = BertLSTM(bert, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
    
    # 设置优化器和损失函数
    optimizer = optim.Adam(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()
    
    # 训练模型
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = criterion.to(device)
    
    # 训练模型并保存最优模型参数
    best_valid_loss = float('inf')
    model_save_path = 'best_model.pt'
    
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        avg_train_loss = total_loss / len(train_loader)
        
        # 评估当前epoch模型在验证集上的表现
        accuracy, precision, recall, f1 = evaluate(model, val_loader, device)
        
        print(f'Epoch {epoch+1}/{EPOCHS}, Loss: {avg_train_loss:.4f}, Validation F1: {f1:.4f}')
        
        # 如果当前epoch验证损失低于之前保存的最佳模型，则保存当前模型
        if avg_train_loss < best_valid_loss:
            best_valid_loss = avg_train_loss
            torch.save(model.state_dict(), model_save_path)
    
    # 加载最优模型
    model.load_state_dict(torch.load(model_save_path))
    
    # 在验证集上评估最优模型
    accuracy, precision, recall, f1 = evaluate(model, val_loader, device)
    print('Optimal Model Performance on Validation Set:')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')

    # 将模型评估结果写入文件
    with open("evaluate.csv", "a") as f:
        f.write(f'Accuracy: {accuracy:.4f}\n')
        f.write(f'Precision: {precision:.4f}\n')
        f.write(f'Recall: {recall:.4f}\n')
        f.write(f'F1 Score: {f1:.4f}\n')
    # # 文本预测
    # sample_text = "需要预测的文本"
    # predicted_label = predict(sample_text)
    # print(f'Predicted Label: {predicted_label}')

Processing label: symp_e


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10, Loss: 0.4873, Validation F1: 0.0000




Epoch 2/10, Loss: 0.3743, Validation F1: 0.6250




Epoch 3/10, Loss: 0.2919, Validation F1: 0.6971




Epoch 4/10, Loss: 0.2542, Validation F1: 0.7170




Epoch 5/10, Loss: 0.2306, Validation F1: 0.7348




Epoch 6/10, Loss: 0.2184, Validation F1: 0.7232




Epoch 7/10, Loss: 0.2109, Validation F1: 0.7237




Epoch 8/10, Loss: 0.1982, Validation F1: 0.7113




Epoch 9/10, Loss: 0.1889, Validation F1: 0.7106




Epoch 10/10, Loss: 0.1797, Validation F1: 0.7109


  model.load_state_dict(torch.load(model_save_path))


Optimal Model Performance on Validation Set:
Accuracy: 0.8959
Precision: 0.7398
Recall: 0.6842
F1 Score: 0.7109
Processing label: symptom_i


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10, Loss: 0.5273, Validation F1: 0.0000




Epoch 2/10, Loss: 0.4367, Validation F1: 0.4978




Epoch 3/10, Loss: 0.3407, Validation F1: 0.7006




Epoch 4/10, Loss: 0.3027, Validation F1: 0.6722




Epoch 5/10, Loss: 0.2934, Validation F1: 0.6959




Epoch 6/10, Loss: 0.2843, Validation F1: 0.6967




Epoch 7/10, Loss: 0.2732, Validation F1: 0.6819




Epoch 8/10, Loss: 0.2638, Validation F1: 0.6894




Epoch 9/10, Loss: 0.2497, Validation F1: 0.7010




Epoch 10/10, Loss: 0.2436, Validation F1: 0.6923


  model.load_state_dict(torch.load(model_save_path))


Optimal Model Performance on Validation Set:
Accuracy: 0.8537
Precision: 0.6223
Recall: 0.7800
F1 Score: 0.6923
Processing label: experience_i


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10, Loss: 0.3384, Validation F1: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/10, Loss: 0.2320, Validation F1: 0.0000




Epoch 3/10, Loss: 0.2082, Validation F1: 0.2500




Epoch 4/10, Loss: 0.1964, Validation F1: 0.3846




Epoch 5/10, Loss: 0.1855, Validation F1: 0.4578




Epoch 6/10, Loss: 0.1766, Validation F1: 0.4952




Epoch 7/10, Loss: 0.1677, Validation F1: 0.5347




Epoch 8/10, Loss: 0.1645, Validation F1: 0.4186




Epoch 9/10, Loss: 0.1555, Validation F1: 0.4270




Epoch 10/10, Loss: 0.1437, Validation F1: 0.4368


  model.load_state_dict(torch.load(model_save_path))


Optimal Model Performance on Validation Set:
Accuracy: 0.9311
Precision: 0.7917
Recall: 0.3016
F1 Score: 0.4368
Processing label: objective_i


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10, Loss: 0.4326, Validation F1: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/10, Loss: 0.3358, Validation F1: 0.0000




Epoch 3/10, Loss: 0.2676, Validation F1: 0.4182




Epoch 4/10, Loss: 0.2296, Validation F1: 0.6036




Epoch 5/10, Loss: 0.2150, Validation F1: 0.6044




Epoch 6/10, Loss: 0.1959, Validation F1: 0.6190




Epoch 7/10, Loss: 0.1904, Validation F1: 0.6225




Epoch 8/10, Loss: 0.1770, Validation F1: 0.6377




Epoch 9/10, Loss: 0.1761, Validation F1: 0.6536




Epoch 10/10, Loss: 0.1641, Validation F1: 0.6174


  model.load_state_dict(torch.load(model_save_path))


Optimal Model Performance on Validation Set:
Accuracy: 0.9198
Precision: 0.6667
Recall: 0.5750
F1 Score: 0.6174
