dataset

是的，**BERT 的分词器（如 `BertTokenizer`）会将整个句子当作一个整体来进行分词，并返回**：

* 整体的 token 列表；
* 每个 token 的 ID；
* 整体的 token 数（这才是所谓的“句子长度”）；

---

## ✅ 举个例子（英文）

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text = "The stock performance was extremely disappointing."

tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)
print("Token count:", len(tokens))
```

输出可能为：

```text
Tokens: ['the', 'stock', 'performance', 'was', 'extremely', 'dis', '##appoint', '##ing', '.']
Token count: 9
```

→ 注意：`disappointing` 被拆成了 `['dis', '##appoint', '##ing']`，但整体作为一个句子被处理。

---

## ✅ 中文也一样

```python
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
text = "我今天很开心，但是明天有点担忧。"
tokens = tokenizer.tokenize(text)
print(tokens)
print("Token count:", len(tokens))
```

输出示例：

```text
['我', '今天', '很', '开', '心', '，', '但是', '明天', '有', '点', '担', '忧', '。']
Token count: 13
```

→ 整句话是作为**一个输入整体**送入 BERT 的，分词后自然得到总 token 数。

---

## 🧠 总结一句话：

> ✔️ **BERT 分词器不是逐词分，而是“整句分”**，它会整体考虑句子的结构、标点、特殊字符等，然后拆成一组 token，并输出 token 数。

这就是为什么你在用：

```python
tokenizer(text, padding='max_length', truncation=True, max_length=128)
```

时，BERT 会：

* **自动计算整个输入的 token 长度**；
* 超出 `max_length` 就 **截断**；
* 不足就 **填充（pad）**。

---

如果你想我帮你写个小脚本：**分析一批文本的 token 平均长度、中位数、最大值**，来决定是否用 `max_len=128` 是不是合适，我可以马上写。需要吗？


In [None]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class VADataset(Dataset):
    # max_len=128是类似句子最长长度，一个token是一个句子
    def __init__(self, dataframe, tokenizer_name='bert-base-chinese', max_len=128,column_name = 'text'):
        # TODO
        # 还有按照phrase、word来命名的
        self.texts = dataframe[column_name].tolist()
        self.labels = dataframe[['Valence_Mean', 'Arousal_Mean']].values.astype(float)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': self.labels[idx]
        }


model

In [None]:
import torch.nn as nn
from transformers import AutoModel

class VAModel(nn.Module):
    def __init__(self, model_name='bert-base-chinese'):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.regressor = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = output.last_hidden_state[:, 0, :]
        return self.regressor(cls_output)


utils

In [None]:
import torch
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error

def compute_metrics(preds, labels):
    preds = preds.cpu().numpy()
    labels = labels.cpu().numpy()
    mae = mean_absolute_error(labels, preds)
    pearson_v = pearsonr(preds[:, 0], labels[:, 0])[0]
    pearson_a = pearsonr(preds[:, 1], labels[:, 1])[0]
    return {
        'MAE': mae,
        'Pearson_V': pearson_v,
        'Pearson_A': pearson_a
    }


train

In [None]:
import glob
import os

# 定义根目录
root_dir = 'ChineseEmoBank'

# 匹配所有 *_all.csv 或 *_all_SD.csv 文件
all_csv_paths = glob.glob(os.path.join(root_dir, '*', '*all*.csv'))

# 读取并合并
df_list = []
for file in all_csv_paths:
    # df = pd.read_csv(file)
    pd.read_csv(file, encoding='utf-8', nrows=10)
    df['source_file'] = os.path.basename(file)  # 可选：记录来源
    df_list.append(df)

df = pd.concat(df_list, ignore_index=True)


In [14]:
import pandas as pd
import glob

# 匹配文件路径（确保在当前工作目录或使用绝对路径）
csv_paths = sorted(glob.glob('ChineseEmoBank/CVAT_SD/CVAT_*_SD.csv'))

# 逐个读取并合并
df_list = [pd.read_csv(path, encoding='utf-8', on_bad_lines='skip') for path in csv_paths]
df_cvat_merged = pd.concat(df_list, ignore_index=True)

# 输出信息
print("✅ 合并完成，总样本数：", len(df_cvat_merged))
df_cvat_merged.head()


✅ 合并完成，总样本数： 2949


Unnamed: 0,No.\tText\tValence_Mean\tArousal_Mean\tValence_SD\tArousal_SD\tCategory
0,2525\t人生沒法假設，我沒法假設如果我吐的嚴重心情會怎樣，我只知道如何在自己現有的狀況下...
1,1045\t「好萊塢報導」指出，今天公佈的倫敦影評人協會獎，臺灣導演李安擊敗同樣入圍奧斯卡金...
2,138\t想到故宮的烏鴉都有了歡喜的感覺.\t6.250\t4.333\t0.968\t0....
3,1781\t政黨就是理念的結合，華人世界既然只剩下這兩個政黨還相信理智及科學，與我理念相符，...
4,2695\t107年度綜合所得稅申報31日最後1天，據統計，利用電子申報的民眾已超過6成，到...


In [16]:
# 保存合并后的 DataFrame 到本地 CSV 文件
df_cvat_merged.to_csv('ChineseEmoBank/CVAT_SD/CVAT_all_SD.csv', index=False, encoding='utf-8')


In [18]:
import pandas as pd

# 数据加载
df_phrase = pd.read_csv('ChineseEmoBank/CVAP_SD/CVAP_all_SD.csv', sep='\t')  # 包含 text, valence, arousal 列
df_sentence = pd.read_csv('ChineseEmoBank/CVAS_SD/CVAS_all.csv', sep='\t')  # 包含 text, valence, arousal 列
df_text = pd.read_csv('ChineseEmoBank/CVAT_SD/CVAT_all_SD.csv', sep='\t')  # 包含 text, valence, arousal 列
df_word = pd.read_csv('ChineseEmoBank/CVAW_SD/CVAW_all_SD.csv', sep='\t')  # 包含 text, valence, arousal 列


In [None]:
from transformers import AdamW, get_scheduler
from model import VAModel
from dataset import VADataset
from utils import compute_metrics
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

train_df = df.sample(frac=0.8)
val_df = df.drop(train_df.index)

train_set = VADataset(train_df)
val_set = VADataset(val_df)
train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
val_loader = DataLoader(val_set, batch_size=16)

# 模型初始化
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VAModel().to(device)
loss_fn = torch.nn.MSELoss()
optimizer = AdamW(model.parameters(), lr=2e-5)
lr_scheduler = get_scheduler("linear", optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=len(train_loader) * 5)

best_mae = float('inf')
patience, patience_counter = 3, 0

# 训练
for epoch in range(10):
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader):
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)

        outputs = model(**inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        train_loss += loss.item()

    # 验证
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)
            output = model(**inputs)
            preds.append(output)
            targets.append(labels)
    
    preds = torch.cat(preds)
    targets = torch.cat(targets)
    metrics = compute_metrics(preds, targets)

    print(f"\nEpoch {epoch+1} | Train Loss: {train_loss/len(train_loader):.4f} | Val MAE: {metrics['MAE']:.4f}")

    # Early Stopping
    if metrics['MAE'] < best_mae:
        best_mae = metrics['MAE']
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break
