# 中文问题对语义相似度判断

## 任务说明
开发算法模型，判断给定中文问题对在语义上的相似程度。
- 输入：问题对(question1, question2)
- 输出：0(不相似)或1(相似)

## 评价指标
最终得分 = 0.6×F1 + 0.2×Accuracy + 0.2×Recall

In [11]:
import pandas as pd
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, recall_score
import numpy as np
import time
from datetime import timedelta
from torch.amp import GradScaler  # 更新导入方式

# Seed NumPy and PyTorch so repeated runs draw the same random numbers.
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"使用设备: {device}")

def format_time(seconds):
    """Render a duration in seconds as ``H:MM:SS`` text.

    The fractional part is truncated via ``int`` before formatting; the
    result is whatever ``str(timedelta)`` yields for the whole seconds.
    """
    whole_seconds = int(seconds)
    duration = timedelta(seconds=whole_seconds)
    return str(duration)

使用设备: cuda


In [12]:
import os
import hashlib

# 数据集类
class SentencePairDataset(Dataset):
    """Sentence-pair dataset that stores precomputed embeddings, not text.

    Both sentence lists are encoded once with ``model.encode`` and the two
    resulting tensors are cached on disk as ``cache/<md5>.pt``. The hash is
    derived from the sentence texts only.

    NOTE(review): the cache key does not include the encoder, so a cache
    written with one model is reused for another model on the same
    sentences. Clear the ``cache`` directory after switching models.

    Args:
        sentences1: First sentence of every pair (sequence of ``str``).
        sentences2: Second sentence of every pair, same length as
            ``sentences1``.
        labels: One label per pair; returned as ``float`` by ``__getitem__``.
        model: Encoder exposing ``encode(batch, convert_to_tensor=True,
            batch_size=..., show_progress_bar=...)``.

    Raises:
        ValueError: If the three inputs do not have the same length.
    """

    def __init__(self, sentences1, sentences2, labels, model):
        # A pair dataset is meaningless when the columns disagree in length;
        # fail early instead of silently mis-aligning pairs and labels.
        if not (len(sentences1) == len(sentences2) == len(labels)):
            raise ValueError(
                "sentences1, sentences2 and labels must have the same length, "
                f"got {len(sentences1)}, {len(sentences2)} and {len(labels)}"
            )

        self.labels = labels

        # Cache key: MD5 over the joined texts. Kept byte-compatible with
        # caches written by earlier runs.
        # NOTE(review): there is no separator between the two joined lists,
        # so distinct inputs can in rare cases map to the same key.
        data_hash = hashlib.md5(("||".join(sentences1) + "||".join(sentences2)).encode()).hexdigest()
        cache_dir = "cache"
        cache_path = f"{cache_dir}/{data_hash}.pt"

        os.makedirs(cache_dir, exist_ok=True)

        if os.path.exists(cache_path):
            print("发现缓存文件，正在加载...")
            try:
                # weights_only=True restricts unpickling to tensors and plain
                # containers, which is all this class ever saves.
                cache = torch.load(cache_path, map_location="cpu", weights_only=True)
                embeddings1, embeddings2 = cache
                self._check_embeddings(embeddings1, embeddings2, len(labels))
                self.embeddings1, self.embeddings2 = embeddings1, embeddings2
                print("缓存加载成功！")
            except Exception as e:
                # Any unreadable or inconsistent cache is rebuilt from scratch.
                print(f"缓存加载失败：{str(e)}")
                self._process_and_cache(sentences1, sentences2, model, cache_path)
        else:
            print("未找到缓存，开始处理数据...")
            self._process_and_cache(sentences1, sentences2, model, cache_path)

    @staticmethod
    def _check_embeddings(embeddings1, embeddings2, expected_rows):
        """Raise ``ValueError`` unless both objects are tensors with one row per label."""
        for name, emb in (("embeddings1", embeddings1), ("embeddings2", embeddings2)):
            if not isinstance(emb, torch.Tensor):
                raise ValueError(f"cached {name} is not a tensor")
            if emb.dim() == 0 or emb.shape[0] != expected_rows:
                raise ValueError(
                    f"cached {name} has shape {tuple(emb.shape)}, "
                    f"expected {expected_rows} rows"
                )

    @staticmethod
    def _encode_in_batches(sentences, model, batch_size, desc):
        """Encode ``sentences`` chunk by chunk and return one CPU tensor."""
        from tqdm import tqdm

        total_batches = (len(sentences) + batch_size - 1) // batch_size
        chunks = []
        for start in tqdm(range(0, len(sentences), batch_size), total=total_batches, desc=desc):
            batch = sentences[start:start + batch_size]
            with torch.no_grad():
                embeddings = model.encode(
                    batch,
                    convert_to_tensor=True,
                    batch_size=batch_size,
                    show_progress_bar=False
                ).cpu()  # keep the full embedding matrix in host memory
            chunks.append(embeddings)
        if not chunks:
            # torch.cat rejects an empty list; an empty dataset gets an
            # empty placeholder tensor instead of crashing.
            return torch.empty((0, 0))
        return torch.cat(chunks)

    def _process_and_cache(self, sentences1, sentences2, model, cache_path):
        """Encode both sentence lists, store them on ``self`` and write the cache.

        A failure to write the cache file is reported but not raised, since
        the embeddings are already available in memory.
        """
        print(f"开始处理 {len(sentences1)} 条数据...")
        total_sentences = len(sentences1)
        batch_size = 256
        total_batches = (total_sentences + batch_size - 1) // batch_size

        print(f"将分 {total_batches} 批处理，每批 {batch_size} 条数据")
        start_time = time.time()

        print("\n处理第一组句子...")
        self.embeddings1 = self._encode_in_batches(sentences1, model, batch_size, "Group 1")

        print("\n处理第二组句子...")
        self.embeddings2 = self._encode_in_batches(sentences2, model, batch_size, "Group 2")

        print("\n保存缓存文件...")
        try:
            torch.save((self.embeddings1, self.embeddings2), cache_path)
            print("缓存保存成功！")
        except Exception as e:
            # Best effort: caching only saves time on the next run.
            print(f"警告：缓存保存失败：{str(e)}")

        process_time = time.time() - start_time
        # Two sentence groups were encoded, hence the division by 2. The
        # "or 1" guards avoid dividing by zero for an empty dataset.
        print(f"\n处理完成统计:")
        print(f"总用时: {format_time(process_time)}")
        print(f"平均每批({batch_size}条数据)处理时间: {process_time/(total_batches or 1)/2:.2f}秒")
        print(f"平均每条数据处理时间: {process_time/(total_sentences or 1)/2:.3f}秒")

    def __len__(self):
        """Return the number of sentence pairs (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the two embeddings and the label (as ``float``) for pair ``idx``."""
        return {
            'embedding1': self.embeddings1[idx],
            'embedding2': self.embeddings2[idx],
            'label': float(self.labels[idx])
        }

# 训练函数
def train_epoch(model, train_loader, criterion, optimizer, scaler, device):
    """Run one optimization pass over ``train_loader`` and return the mean batch loss.

    For every batch the cosine similarity of ``embedding1`` and
    ``embedding2`` is passed to ``criterion`` as the prediction, the scaled
    loss is back-propagated, gradients of ``model.parameters()`` are clipped
    to norm 1.0 and the optimizer is stepped through ``scaler``.

    NOTE(review): ``model`` is never called in this function; the loss
    depends only on the tensors stored in the batch. Gradients can reach
    ``model.parameters()`` only if those tensors are attached to the model's
    autograd graph. With detached embeddings ``backward()`` raises
    ``RuntimeError``. Confirm against the dataset that feeds this loop.

    Args:
        model: Module whose parameters are clipped and optimized.
        train_loader: Sized iterable of dicts with the keys ``'embedding1'``,
            ``'embedding2'`` and ``'label'``.
        criterion: Callable ``(similarity, labels) -> scalar loss``.
        optimizer: Optimizer stepped once per batch.
        scaler: Gradient scaler used for mixed-precision training.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses, or ``0.0`` when the loader is empty.
    """
    model.train()

    num_batches = len(train_loader)
    if num_batches == 0:
        # Nothing to train on; avoid dividing by zero below.
        return 0.0

    total_loss = 0.0
    start_time = time.time()

    for batch_idx, batch in enumerate(train_loader):
        emb1 = batch['embedding1'].to(device)
        emb2 = batch['embedding2'].to(device)
        labels = batch['label'].to(device)

        # torch.amp.autocast('cuda') is the non-deprecated spelling of
        # torch.cuda.amp.autocast(); it disables itself when CUDA is absent.
        with torch.amp.autocast('cuda'):
            similarity = torch.cosine_similarity(emb1, emb2, dim=1)
            # Labels may arrive as float64 (e.g. collated Python floats);
            # align them with the prediction dtype before computing the loss.
            loss = criterion(similarity, labels.to(similarity.dtype))

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        # Unscale first so clipping sees the true gradient magnitudes.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

        # Progress line every 10 batches, with a linear estimate of the
        # time remaining.
        if (batch_idx + 1) % 10 == 0:
            elapsed = time.time() - start_time
            avg_time = elapsed / (batch_idx + 1)
            remaining = avg_time * (num_batches - batch_idx - 1)
            print(f"Batch [{batch_idx+1}/{num_batches}] "
                  f"Loss: {loss.item():.4f} "
                  f"预计剩余时间: {format_time(remaining)}", end='\r')

    return total_loss / num_batches

# 评估函数
def evaluate(model, data_loader, device, threshold=0.5):
    """Score cosine-similarity predictions against the labels in ``data_loader``.

    A pair is predicted as similar (1) when the cosine similarity of its two
    embeddings is strictly greater than ``threshold``.

    Args:
        model: Module switched to eval mode; it is not otherwise used here.
        data_loader: Iterable of dicts with the keys ``'embedding1'``,
            ``'embedding2'`` and ``'label'``.
        device: Device the embeddings are moved to for the similarity.
        threshold: Similarity cut-off for the positive class. Defaults to
            0.5, the value that was previously hard-coded.

    Returns:
        Dict with ``'accuracy'``, ``'f1'``, ``'recall'`` and ``'score'``,
        where ``score = 0.6 * f1 + 0.2 * accuracy + 0.2 * recall``.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            emb1 = batch['embedding1'].to(device)
            emb2 = batch['embedding2'].to(device)
            labels = batch['label'].cpu().numpy()

            similarity = torch.cosine_similarity(emb1, emb2, dim=1)
            preds = (similarity > threshold).cpu().numpy().astype(int)

            all_preds.extend(preds)
            all_labels.extend(labels)

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    # Weighted competition score.
    score = 0.6 * f1 + 0.2 * accuracy + 0.2 * recall

    return {
        'accuracy': accuracy,
        'f1': f1,
        'recall': recall,
        'score': score
    }

# 损失函数
class FocalLoss(nn.Module):
    """Binary focal loss computed on raw logits.

    Args:
        alpha (float): Weight applied to positive samples; negatives get
            ``1 - alpha``. Used to counter class imbalance.
        gamma (float): Focusing exponent; larger values reduce the
            contribution of well-classified samples.
    """

    def __init__(self, alpha=0.28, gamma=2.2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the mean focal loss over the batch.

        Args:
            inputs (torch.Tensor): Logits of shape ``(N,)``.
            targets (torch.Tensor): Labels of shape ``(N,)`` with values 0 or 1.

        Returns:
            torch.Tensor: Scalar loss.
        """
        # Bring the targets onto the logits' device and make them floating
        # point once, so every expression below sees the same tensor.
        targets = targets.to(inputs.device).float()

        # nn.functional is used directly: this file never binds the
        # conventional ``F`` alias, so ``F.`` raised NameError here.
        bce_loss = nn.functional.binary_cross_entropy_with_logits(
            inputs,
            targets,
            reduction='none'
        )

        # exp(-BCE) recovers the probability assigned to the true class.
        pt = torch.exp(-bce_loss)

        # alpha for positives, (1 - alpha) for negatives.
        alpha_weight = self.alpha * targets + (1 - self.alpha) * (1 - targets)

        focal_loss = alpha_weight * (1 - pt) ** self.gamma * bce_loss

        return focal_loss.mean()
    

In [None]:
def main():
    """Run the full pipeline: load data, train, pick the best epoch, test, save.

    Reads ``csv/train.tsv`` and ``csv/test.csv`` as tab-separated files
    without a header, writes the best checkpoint to ``best_model.pth`` and
    the final model to ``chinese_semantic_model_final``.
    """
    # 1. Load the data
    print("加载数据...")
    train_data = pd.read_csv('csv/train.tsv', sep='\t', header=None, names=['q1', 'q2', 'label'])
    test_data = pd.read_csv('csv/test.csv', sep='\t', header=None, names=['q1', 'q2', 'label'])

    print(f"训练集大小: {len(train_data)}")
    print(f"测试集大小: {len(test_data)}")
    print("\n类别分布:")
    print(train_data['label'].value_counts())

    # 2. Load the pretrained encoder
    print("\n加载预训练模型...")
    model = SentenceTransformer('shibing624/text2vec-base-chinese')
    model = model.to(device)

    # 3. Build the training dataset
    print("\n准备训练集...")
    train_dataset = SentencePairDataset(
        train_data['q1'].tolist(),
        train_data['q2'].tolist(),
        train_data['label'].tolist(),
        model
    )

    # 4. Data loader; the batch tensors are expected to live on the CPU, so
    #    pinned memory speeds up the host-to-device copy.
    train_loader = DataLoader(
        train_dataset,
        batch_size=128,
        shuffle=True,
        pin_memory=True,
        prefetch_factor=4,       # batches prefetched per worker
        num_workers=6,           # background loading processes
        persistent_workers=True  # keep workers alive between epochs
    )

    # 5. Training setup
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.BCEWithLogitsLoss()
    scaler = GradScaler('cuda')
    num_epochs = 3
    # Start below every reachable score so the first epoch always writes a
    # checkpoint; starting at 0 with a strict ">" left no 'best_model.pth'
    # to load when every epoch scored exactly 0.
    best_score = float('-inf')

    # 6. Training loop
    # NOTE(review): train_epoch consumes the embeddings held by
    # train_dataset; confirm gradients actually reach model.parameters().
    print("\n开始训练...")
    for epoch in range(num_epochs):
        epoch_start = time.time()

        train_loss = train_epoch(model, train_loader, criterion, optimizer, scaler, device)
        epoch_time = time.time() - epoch_start

        # NOTE(review): model selection uses training-set metrics; there is
        # no held-out validation split in this pipeline.
        metrics = evaluate(model, train_loader, device)

        print(f"\nEpoch {epoch+1}/{num_epochs}:")
        print(f"训练损失: {train_loss:.4f}")
        print(f"训练集评估: Accuracy={metrics['accuracy']:.4f}, "
              f"F1={metrics['f1']:.4f}, Recall={metrics['recall']:.4f}")
        print(f"当前得分: {metrics['score']:.4f}")
        print(f"本轮用时: {format_time(epoch_time)}")

        # Keep the checkpoint with the best score seen so far.
        if metrics['score'] > best_score:
            best_score = metrics['score']
            torch.save(model.state_dict(), 'best_model.pth')

    # 7. Restore the best checkpoint for testing
    print("\n加载最佳模型进行测试...")
    model.load_state_dict(torch.load('best_model.pth', map_location=device))

    # 8. Build the test dataset
    print("准备测试集...")
    test_dataset = SentencePairDataset(
        test_data['q1'].tolist(),
        test_data['q2'].tolist(),
        test_data['label'].tolist(),
        model
    )
    test_loader = DataLoader(test_dataset, batch_size=128, pin_memory=True)

    # 9. Final evaluation
    print("进行最终测试...")
    test_metrics = evaluate(model, test_loader, device)

    print("\n最终测试结果:")
    print(f"Accuracy: {test_metrics['accuracy']:.4f}")
    print(f"F1-score: {test_metrics['f1']:.4f}")
    print(f"Recall: {test_metrics['recall']:.4f}")
    print(f"最终得分: {test_metrics['score']:.4f}")

    # 10. Save the final model
    model.save('chinese_semantic_model_final')
    print("\n模型已保存为: chinese_semantic_model_final")

if __name__ == "__main__":
    main()

加载数据...
训练集大小: 238766
测试集大小: 4401

类别分布:
label
1    138574
0    100192
Name: count, dtype: int64

加载预训练模型...

准备训练集...
未找到缓存，开始处理数据...
开始处理 238766 条数据...
将分 933 批处理，每批 256 条数据

处理第一组句子...


Group 1: 100%|██████████| 933/933 [21:44<00:00,  1.40s/it]



处理第二组句子...


Group 2:  83%|████████▎ | 778/933 [17:07<03:53,  1.51s/it]  