In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import warnings
warnings.filterwarnings('ignore')

# 设置中文显示
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = ['Arial Unicode MS']  # macOS
# plt.rcParams['font.family'] = ['SimHei']  # Windows

# 读取数据 - 尝试不同的编码方式
encodings = ['gbk', 'gb18030', 'latin1', 'iso-8859-1']

for encoding in encodings:
    try:
        print(f"\n尝试使用 {encoding} 编码读取...")
        train_df = pd.read_csv('train.csv', encoding=encoding)
        valid_df = pd.read_csv('valid.csv', encoding=encoding)
        test_df = pd.read_csv('test.csv', encoding=encoding)
        print(f"成功使用 {encoding} 编码读取数据!")
        break
    except UnicodeDecodeError:
        print(f"{encoding} 编码读取失败")
        continue
    except Exception as e:
        print(f"发生其他错误: {e}")
        continue
else:
    raise Exception("所有编码方式都失败了，请检查文件编码格式")

# 显示数据基本信息
print("\n数据集大小:")
print(f"训练集: {train_df.shape}")
print(f"验证集: {valid_df.shape}")
print(f"测试集: {test_df.shape}")

# 显示训练集的前几条数据
print("\n训练集示例:")
print(train_df.head())

# 统计训练集中各MBTI类型的分布
print("\nMBTI类型分布:")
print(train_df['type'].value_counts())

# 检查是否有缺失值
print("\n检查缺失值:")
print(train_df.isnull().sum())


尝试使用 gbk 编码读取...
gbk 编码读取失败

尝试使用 gb18030 编码读取...
gb18030 编码读取失败

尝试使用 latin1 编码读取...
成功使用 latin1 编码读取数据!

数据集大小:
训练集: (6072, 2)
验证集: (867, 2)
测试集: (1736, 3)

训练集示例:
   type                                              posts
0  ENFP  Its a bizarre condition of mine, gets me into ...
1  ISTP  'wonder are they huggable.. :unsure: I'll try ...
2  ENTJ  'Hi ! There is some material in your answer to...
3  ENFP  'This is great and makes a lot of sense, I fee...
4  INFJ  'so he is not Mr Right? I mean, I thought INFJ...

MBTI类型分布:
type
INFP    1276
INFJ    1044
INTP     877
INTJ     771
ENFP     481
ENTP     479
ISTP     247
ISFP     197
ENTJ     169
ISTJ     135
ENFJ     132
ISFJ     110
ESTP      64
ESTJ      31
ESFJ      30
ESFP      29
Name: count, dtype: int64

检查缺失值:
type     0
posts    0
dtype: int64


In [4]:
# 1. 创建自定义数据集类
class MBTIDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # 创建MBTI类型到数字的映射
        self.label2id = {label: idx for idx, label in enumerate(sorted(set(labels)))}
        self.id2label = {idx: label for label, idx in self.label2id.items()}
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        # 将文本转换为BERT的输入格式
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.label2id[label])
        }

# 2. 初始化tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 3. 创建数据集实例
train_dataset = MBTIDataset(train_df['posts'].values, train_df['type'].values, tokenizer)
valid_dataset = MBTIDataset(valid_df['posts'].values, valid_df['type'].values, tokenizer)

print(f"标签映射:\n{train_dataset.label2id}")
print(f"\n类别数量: {len(train_dataset.label2id)}")

# 4. 检查一个样本的格式
sample = train_dataset[0]
print("\n样本格式:")
for k, v in sample.items():
    print(f"{k}: {v.shape}")

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

标签映射:
{'ENFJ': 0, 'ENFP': 1, 'ENTJ': 2, 'ENTP': 3, 'ESFJ': 4, 'ESFP': 5, 'ESTJ': 6, 'ESTP': 7, 'INFJ': 8, 'INFP': 9, 'INTJ': 10, 'INTP': 11, 'ISFJ': 12, 'ISFP': 13, 'ISTJ': 14, 'ISTP': 15}

类别数量: 16

样本格式:
input_ids: torch.Size([512])
attention_mask: torch.Size([512])
labels: torch.Size([])


In [5]:
from transformers import BertModel
from torch import nn

class MBTIClassifier(nn.Module):
    def __init__(self, num_labels=16):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# 创建数据加载器
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

# 初始化模型
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MBTIClassifier()
model.to(device)

# 定义优化器和损失函数
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

print(f"使用设备: {device}")
print(f"训练批次大小: {batch_size}")
print(f"训练集批次数: {len(train_loader)}")
print(f"验证集批次数: {len(valid_loader)}")

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

使用设备: cpu
训练批次大小: 16
训练集批次数: 380
验证集批次数: 55


In [11]:
# 检查并设置MPS设备
device = torch.device("cpu")

print(f"使用设备: {device}")

# 初始化模型
model = MBTIClassifier()
model.to(device)

# 定义优化器和损失函数
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# 训练参数
batch_size = 32
num_epochs = 5
best_val_acc = 0
best_model = None

# 创建数据加载器 - 移除多进程加载
train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size, 
    shuffle=True
)
valid_loader = DataLoader(
    valid_dataset, 
    batch_size=batch_size
)

print(f"训练批次大小: {batch_size}")
print(f"训练集批次数: {len(train_loader)}")
print(f"验证集批次数: {len(valid_loader)}")

# 添加进度条显示
from tqdm import tqdm

def train_epoch(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    
    # 添加进度条
    pbar = tqdm(train_loader, desc="Training")
    for batch in pbar:
        # 将数据移到指定设备
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        # 更新进度条
        pbar.set_postfix({
            'loss': f'{total_loss/total:.4f}',
            'acc': f'{correct/total:.4f}'
        })
    
    return total_loss / len(train_loader), correct / total

def evaluate(model, valid_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch in tqdm(valid_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    return total_loss / len(valid_loader), correct / total

# 训练循环
print("\n开始训练...")
for epoch in range(num_epochs):
    print(f'\nEpoch {epoch+1}/{num_epochs}')
    
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, valid_loader, criterion, device)
    
    print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
    
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model = model.state_dict().copy()
        print(f'新的最佳验证准确率: {best_val_acc:.4f}')
    print('-' * 50)

print(f'\n训练完成! 最佳验证准确率: {best_val_acc:.4f}')

使用设备: cpu
训练批次大小: 32
训练集批次数: 190
验证集批次数: 28

开始训练...

Epoch 1/5


Training: 100%|██████████| 190/190 [3:10:00<00:00, 60.00s/it, loss=0.0720, acc=0.2330]    
Evaluating: 100%|██████████| 28/28 [00:59<00:00,  2.11s/it]


Train Loss: 2.3020, Train Acc: 0.2330
Val Loss: 2.1654, Val Acc: 0.3022
新的最佳验证准确率: 0.3022
--------------------------------------------------

Epoch 2/5


Training: 100%|██████████| 190/190 [1:06:21<00:00, 20.96s/it, loss=0.0615, acc=0.3910] 
Evaluating: 100%|██████████| 28/28 [01:36<00:00,  3.43s/it]


Train Loss: 1.9653, Train Acc: 0.3910
Val Loss: 1.8475, Val Acc: 0.4268
新的最佳验证准确率: 0.4268
--------------------------------------------------

Epoch 3/5


Training: 100%|██████████| 190/190 [36:32<00:00, 11.54s/it, loss=0.0526, acc=0.4975] 
Evaluating: 100%|██████████| 28/28 [00:55<00:00,  2.00s/it]


Train Loss: 1.6795, Train Acc: 0.4975
Val Loss: 1.7048, Val Acc: 0.4798
新的最佳验证准确率: 0.4798
--------------------------------------------------

Epoch 4/5


Training:   5%|▍         | 9/190 [01:48<35:38, 11.82s/it, loss=0.0451, acc=0.5972]

In [None]:
# 检查并设置MPS设备
device = torch.device("cpu")
# device = (
#     torch.device("mps") 
#     if torch.backends.mps.is_available() 
#     else torch.device("cpu")
# )

print(f"使用设备: {device}")

# 初始化模型
model = MBTIClassifier()
model.to(device)

# 定义优化器和损失函数
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# 训练参数 - 减小batch_size和max_length
batch_size = 8  # 减小batch size
max_length = 256  # 减小序列长度

# 重新创建数据集实例，使用较小的max_length
train_dataset = MBTIDataset(train_df['posts'].values, train_df['type'].values, tokenizer, max_length=max_length)
valid_dataset = MBTIDataset(valid_df['posts'].values, valid_df['type'].values, tokenizer, max_length=max_length)

# 创建数据加载器
train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size, 
    shuffle=True
)
valid_loader = DataLoader(
    valid_dataset, 
    batch_size=batch_size
)

print(f"训练批次大小: {batch_size}")
print(f"最大序列长度: {max_length}")
print(f"训练集批次数: {len(train_loader)}")
print(f"验证集批次数: {len(valid_loader)}")

# 添加梯度裁剪以提高稳定性
def train_epoch(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    
    pbar = tqdm(train_loader, desc="Training")
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        
        loss.backward()
        # 添加梯度裁剪
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        # 清理缓存
        if device.type == 'mps':
            torch.mps.empty_cache()
        
        pbar.set_postfix({
            'loss': f'{total_loss/total:.4f}',
            'acc': f'{correct/total:.4f}'
        })
    
    return total_loss / len(train_loader), correct / total

def evaluate(model, valid_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch in tqdm(valid_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            # 清理缓存
            if device.type == 'mps':
                torch.mps.empty_cache()
    
    return total_loss / len(valid_loader), correct / total

# 训练循环
print("\n开始训练...")
for epoch in range(num_epochs):
    print(f'\nEpoch {epoch+1}/{num_epochs}')
    
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, valid_loader, criterion, device)
    
    print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
    
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model = model.state_dict().copy()
        print(f'新的最佳验证准确率: {best_val_acc:.4f}')
    print('-' * 50)

print(f'\n训练完成! 最佳验证准确率: {best_val_acc:.4f}')