In [None]:
# Install the third-party packages for this notebook into the kernel's environment.
%pip install transformers datasets scikit-learn pandas torch tqdm

In [1]:
# Install the OpenCC port that provides the `opencc.OpenCC` converter used below.
%pip install opencc-python-reimplemented

Collecting opencc-python-reimplemented
  Downloading opencc_python_reimplemented-0.1.7-py2.py3-none-any.whl (481 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.8/481.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: opencc-python-reimplemented
Successfully installed opencc-python-reimplemented-0.1.7
Note: you may need to restart the kernel to use updated packages.


✅ 第二步：使用 OpenCC 进行繁简转换

✏ 示例：批量转换 CSV 文件中的文本列（如 text 列）

In [6]:
import pandas as pd
from opencc import OpenCC

# Converter from Traditional Chinese to Simplified Chinese ('t2s').
cc = OpenCC('t2s')

# Load the four tab-separated ChineseEmoBank tables and normalise the name of
# the text-bearing column to 'text' (it is 'Phrase', 'Word' or 'Text' depending
# on the file). NOTE(review): the remaining column names are not visible here;
# confirm they match what downstream cells expect.
df_phrase = pd.read_csv('ChineseEmoBank/CVAP_SD/CVAP_all_SD.csv', sep='\t').rename(columns={'Phrase': 'text'})
df_sentence = pd.read_csv('ChineseEmoBank/CVAS_SD/CVAS_all.csv', sep='\t').rename(columns={'Text': 'text'})
df_text = pd.read_csv('ChineseEmoBank/CVAT_SD/CVAT_all_SD.csv', sep='\t').rename(columns={'Text': 'text'})
df_word = pd.read_csv('ChineseEmoBank/CVAW_SD/CVAW_all_SD.csv', sep='\t').rename(columns={'Word': 'text'})

# Output file names, in the same order as the frames processed below.
output_files = ['simplified_phrase.csv', 'simplified_sentence.csv', 'simplified_text.csv', 'simplified_word.csv']

# Convert the 'text' column of each table and save it as a comma-separated CSV.
for df, filename in zip([df_phrase, df_sentence, df_text, df_word], output_files):
    df = df.copy()  # work on a copy so the loaded frames keep the original text
    # Missing text becomes an empty string; everything else is converted as str.
    df['text'] = df['text'].apply(lambda x: cc.convert(str(x)) if pd.notnull(x) else "")
    df.to_csv(filename, index=False)
    print(f"{filename} 保存成功")


# Report the files that were actually written (the previous message named a
# non-existent 'simplified_data.csv').
print(f"繁体转简体完成，保存为 {', '.join(output_files)}")


simplified_phrase.csv 保存成功
simplified_sentence.csv 保存成功
simplified_text.csv 保存成功
simplified_word.csv 保存成功
繁体转简体完成，保存为 simplified_data.csv


In [8]:
# Read back the four simplified-Chinese CSV files, one DataFrame per file.
df_simplified_phrase, df_simplified_sentence, df_simplified_text, df_simplified_word = map(
    pd.read_csv,
    ('simplified_phrase.csv', 'simplified_sentence.csv', 'simplified_text.csv', 'simplified_word.csv'),
)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm
import pandas as pd
import numpy as np

# Pretrained checkpoint identifier (change this to switch to another model)
MODEL_NAME = 'hfl/chinese-macbert-base'  # alternative: 'hfl/chinese-bert-wwm-ext'

# Model definition
class VARegressionModel(nn.Module):
    """Pretrained encoder with a 2-unit linear regression head.

    The head output is passed through a sigmoid and rescaled with ``* 8 + 1``,
    so both predicted values lie in the 1-9 range.
    """

    def __init__(self, model_name):
        """Load the encoder named ``model_name`` and build the regression head."""
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_dim = self.bert.config.hidden_size
        self.regressor = nn.Linear(hidden_dim, 2)
        self.activation = nn.Sigmoid()  # squashes the head output into [0, 1]

    def forward(self, input_ids, attention_mask):
        """Return the rescaled two-value prediction for each sequence in the batch."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at position 0 (the [CLS] slot for BERT-style inputs).
        first_token = encoder_out.last_hidden_state[:, 0]
        logits = self.regressor(first_token)
        unit_interval = self.activation(logits)
        return unit_interval * 8 + 1  # map [0, 1] onto [1, 9]

# Custom dataset
class TextVADataset(Dataset):
    """Pairs each text with its label row and tokenizes lazily on access."""

    def __init__(self, texts, labels, tokenizer, max_len=128):
        """Store the texts, labels, tokenizer and the fixed token length."""
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float ``labels`` for sample ``idx``."""
        sample_text = self.texts[idx]
        target = self.labels[idx]
        # Pad/truncate every sample to max_len so items can be stacked in a batch.
        tokenized = self.tokenizer(
            sample_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
        )
        # The tokenizer output carries a leading batch axis of size 1; drop it.
        item = {key: tokenized[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.float)
        return item

# Data loading
def load_data(path):
    """Read a CSV file and split it into texts and (valence, arousal) labels.

    Args:
        path: Path of a comma-separated file with the columns
            'text', 'valence' and 'arousal'.

    Returns:
        A ``(texts, labels)`` tuple: ``texts`` is a list of ``str`` and
        ``labels`` is an array with one ``[valence, arousal]`` row per text.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    df = pd.read_csv(path)
    required = ('text', 'valence', 'arousal')
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{path} is missing required column(s): {missing}")
    # Empty CSV cells are read back as NaN (a float) and numeric-looking text is
    # parsed as a number; neither is valid tokenizer input, so force plain str.
    texts = df['text'].fillna('').astype(str).tolist()
    labels = df[['valence', 'arousal']].values
    return texts, labels

# Training loop for a single epoch
def train_model(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Each batch must be a mapping with 'input_ids', 'attention_mask' and
    'labels' tensors; the loss is the mean squared error between the model
    output and the labels.
    """
    loss_fn = nn.MSELoss()
    model.train()
    running_loss = 0
    for batch in tqdm(dataloader):
        ids, mask, targets = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()  # clear gradients left over from the previous step
        batch_loss = loss_fn(model(ids, mask), targets)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    # Average over the number of batches, not the number of samples.
    return running_loss / len(dataloader)

# Validation
def evaluate_model(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` and return ``(mae, rmse)``.

    Predictions and labels from all batches are stacked before scoring. MAE is
    averaged uniformly over the output columns; RMSE is computed per output
    column and then averaged.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        Tuple ``(mae, rmse)`` of floats.
    """
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask).cpu().numpy()
            preds.append(outputs)
            trues.append(labels)
    preds = np.vstack(preds)
    trues = np.vstack(trues)
    mae = mean_absolute_error(trues, preds)
    # `squared=False` was deprecated in scikit-learn 1.4 and later removed, so
    # take the root explicitly. The root is applied per output column before
    # averaging, which is what `squared=False` computed for multi-output targets.
    per_output_mse = mean_squared_error(trues, preds, multioutput='raw_values')
    rmse = float(np.sqrt(per_output_mse).mean())
    return mae, rmse



In [None]:
# Main training entry point
def main(train_path='train.csv', val_path='val.csv', epochs=5, batch_size=16, lr=2e-5):
    """Fine-tune the valence/arousal regressor and report validation metrics.

    Args:
        train_path: CSV file with the training samples.
        val_path: CSV file with the validation samples.
        epochs: Number of passes over the training set.
        batch_size: Batch size for both data loaders.
        lr: Learning rate for AdamW.

    Returns:
        The trained ``VARegressionModel`` (on the device used for training),
        so the caller can keep using or saving it.
    """
    # Load data
    train_texts, train_labels = load_data(train_path)
    val_texts, val_labels = load_data(val_path)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    train_dataset = TextVADataset(train_texts, train_labels, tokenizer)
    val_dataset = TextVADataset(val_texts, val_labels, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = VARegressionModel(MODEL_NAME).to(device)
    # Use PyTorch's AdamW: the implementation shipped with transformers is
    # deprecated in favour of it. eps and weight_decay are set explicitly to
    # the values the transformers optimizer used by default, so the update
    # rule stays the same as before.
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}")
        train_loss = train_model(model, train_loader, optimizer, device)
        mae, rmse = evaluate_model(model, val_loader, device)
        print(f"Train Loss: {train_loss:.4f} | Val MAE: {mae:.4f} | RMSE: {rmse:.4f}")

    return model

if __name__ == '__main__':
    main()