# 中文问题对语义相似度判断

## 任务说明
开发算法模型，判断给定中文问题对在语义上的相似程度。
- 输入：问题对(question1, question2)
- 输出：0(不相似)或1(相似)

## 评价指标
最终得分 = 0.6×F1 + 0.2×Accuracy + 0.2×Recall

In [8]:
import pandas as pd
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, recall_score
import numpy as np
import time
from datetime import timedelta

# Seed NumPy and PyTorch so repeated runs draw the same random numbers
np.random.seed(42)
torch.manual_seed(42)

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"使用设备: {device}")

def format_time(seconds):
    """Render a duration given in seconds as text, e.g. ``1:01:01``.

    Fractional seconds are truncated before formatting; the text is
    whatever ``str(timedelta)`` produces for the whole-second duration.
    """
    whole_seconds = int(seconds)
    duration = timedelta(seconds=whole_seconds)
    return str(duration)

使用设备: cuda


In [9]:
# Dataset class
class SentencePairDataset(Dataset):
    """Sentence pairs encoded once, up front, into fixed embeddings.

    Both sentence lists are passed through ``model.encode`` inside
    ``torch.no_grad()`` when the dataset is constructed, and the resulting
    tensors are kept on the CPU. Each item is a dict with keys
    ``'embedding1'``, ``'embedding2'`` and ``'label'`` (a Python float).

    Args:
        sentences1: First sentence of every pair.
        sentences2: Second sentence of every pair (same length).
        labels: One label per pair (same length).
        model: Object exposing ``encode(batch, convert_to_tensor=True,
            batch_size=..., show_progress_bar=False)`` and returning a tensor.

    Raises:
        ValueError: If the three inputs do not have the same length.
    """

    # Number of sentences handed to ``model.encode`` per progress-bar step.
    ENCODE_BATCH_SIZE = 256

    def __init__(self, sentences1, sentences2, labels, model):
        # Fail early: a length mismatch would otherwise only surface later as
        # an out-of-range index inside __getitem__.
        if not (len(sentences1) == len(sentences2) == len(labels)):
            raise ValueError(
                "sentences1, sentences2 and labels must have the same length, "
                f"got {len(sentences1)}, {len(sentences2)} and {len(labels)}"
            )

        self.sentences1 = sentences1
        self.sentences2 = sentences2
        self.labels = labels

        print(f"开始处理 {len(sentences1)} 条数据...")
        start_time = time.time()

        # Encode the first sentence of every pair
        print("处理第一组句子...")
        self.embeddings1 = self._encode_in_batches(
            model, sentences1, self.ENCODE_BATCH_SIZE
        )

        # Encode the second sentence of every pair
        print("处理第二组句子...")
        self.embeddings2 = self._encode_in_batches(
            model, sentences2, self.ENCODE_BATCH_SIZE
        )

        process_time = time.time() - start_time
        print(f"数据处理完成! 总用时: {format_time(process_time)}")
        # Skip the per-item average for an empty dataset (would divide by zero).
        if len(sentences1) > 0:
            print(f"平均每条数据处理时间: {process_time/len(sentences1):.3f}秒")

    @staticmethod
    def _encode_in_batches(model, sentences, batch_size):
        """Encode ``sentences`` chunk by chunk and return a single CPU tensor."""
        from tqdm import tqdm  # imported lazily, as the original cell did

        chunks = []
        with torch.no_grad():
            # tqdm derives the total from the range itself, so the bar is
            # correct for whichever list is being encoded.
            for start in tqdm(range(0, len(sentences), batch_size)):
                batch = sentences[start:start + batch_size]
                embeddings = model.encode(
                    batch,
                    convert_to_tensor=True,
                    batch_size=batch_size,
                    show_progress_bar=False,
                )
                # Keep the stored tensors on the CPU: DataLoader can only pin
                # CPU tensors, and consumers move each batch to the target
                # device themselves.
                chunks.append(embeddings.cpu())

        if not chunks:
            # torch.cat rejects an empty list; represent "no data" explicitly.
            return torch.empty(0)
        return torch.cat(chunks)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'embedding1': self.embeddings1[idx],
            'embedding2': self.embeddings2[idx],
            'label': float(self.labels[idx])
        }

# Training function
def train_epoch(model, train_loader, criterion, optimizer, scaler, device):
    """Run one pass over ``train_loader`` and return the mean batch loss.

    For every batch the cosine similarity of ``embedding1`` and ``embedding2``
    is fed to ``criterion`` together with the labels. When the loss is
    attached to an autograd graph, the usual scaled backward / unscale /
    gradient-clip (max norm 1.0) / step / update sequence is executed.

    When the loss is NOT attached to a graph (e.g. the batch holds embeddings
    that were precomputed under ``torch.no_grad()``), calling ``backward()``
    would raise. In that case the optimisation step is skipped, a
    ``RuntimeWarning`` is issued once, and only the loss is accumulated.

    Args:
        model: Module whose parameters are clipped/updated; put in train mode.
        train_loader: Sized iterable of dicts with keys ``'embedding1'``,
            ``'embedding2'`` and ``'label'`` (tensors).
        criterion: Loss callable ``criterion(similarity, labels)``.
        optimizer: Optimizer over ``model.parameters()``.
        scaler: Gradient scaler used for mixed-precision training.
        device: Device (or device string) the batch tensors are moved to.

    Returns:
        Mean of the per-batch loss values, or ``0.0`` for an empty loader.
    """
    import warnings

    model.train()
    num_batches = len(train_loader)
    if num_batches == 0:
        # Nothing to average; avoid dividing by zero below.
        return 0.0

    # Autocast is only enabled on CUDA; elsewhere the context is a no-op.
    device_type = torch.device(device).type
    use_amp = device_type == 'cuda'

    total_loss = 0.0
    warned_no_grad = False
    start_time = time.time()

    for batch_idx, batch in enumerate(train_loader):
        # Fetch the batch
        emb1 = batch['embedding1'].to(device)
        emb2 = batch['embedding2'].to(device)
        # Default collation turns Python floats into float64; the loss expects
        # the target dtype to match the float32 similarities.
        labels = batch['label'].to(device).float()

        # Clear stale gradients before the forward pass
        optimizer.zero_grad()

        # Mixed-precision forward pass
        with torch.autocast(device_type=device_type, enabled=use_amp):
            similarity = torch.cosine_similarity(emb1, emb2, dim=1)
            loss = criterion(similarity, labels)

        if loss.requires_grad:
            # Optimisation step
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
        elif not warned_no_grad:
            warnings.warn(
                "train_epoch: the loss has no grad_fn (inputs are detached "
                "embeddings), so no parameters are being updated.",
                RuntimeWarning,
                stacklevel=2,
            )
            warned_no_grad = True

        total_loss += loss.item()

        # Progress report every 10 batches
        if (batch_idx + 1) % 10 == 0:
            elapsed = time.time() - start_time
            avg_time = elapsed / (batch_idx + 1)
            remaining = avg_time * (num_batches - batch_idx - 1)
            print(f"Batch [{batch_idx+1}/{num_batches}] "
                  f"Loss: {loss.item():.4f} "
                  f"预计剩余时间: {format_time(remaining)}", end='\r')

    return total_loss / num_batches

# Evaluation function
def evaluate(model, data_loader, device, threshold=0.5):
    """Score cosine-similarity predictions against the labels in a loader.

    A pair is predicted as similar (1) when the cosine similarity of its two
    embeddings is strictly greater than ``threshold``.

    Args:
        model: Module switched to eval mode; it is not otherwise used here.
        data_loader: Iterable of dicts with keys ``'embedding1'``,
            ``'embedding2'`` and ``'label'`` (tensors).
        device: Device the embedding tensors are moved to.
        threshold: Decision threshold on the cosine similarity. Defaults to
            0.5, the value that was previously hard-coded.

    Returns:
        Dict with ``'accuracy'``, ``'f1'``, ``'recall'`` and ``'score'``,
        where ``score = 0.6 * f1 + 0.2 * accuracy + 0.2 * recall``.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            emb1 = batch['embedding1'].to(device)
            emb2 = batch['embedding2'].to(device)
            # Labels arrive as floats; compare like with like (ints vs ints).
            labels = batch['label'].cpu().numpy().astype(int)

            similarity = torch.cosine_similarity(emb1, emb2, dim=1)
            preds = (similarity > threshold).cpu().numpy().astype(int)

            all_preds.extend(preds)
            all_labels.extend(labels)

    accuracy = accuracy_score(all_labels, all_preds)
    # zero_division=0 yields 0.0 for an undefined metric (e.g. no positive
    # predictions) without emitting a warning.
    f1 = f1_score(all_labels, all_preds, zero_division=0)
    recall = recall_score(all_labels, all_preds, zero_division=0)
    score = 0.6 * f1 + 0.2 * accuracy + 0.2 * recall

    return {
        'accuracy': accuracy,
        'f1': f1,
        'recall': recall,
        'score': score
    }

In [None]:
def main():
    """Run the full pipeline: load data, encode, train, evaluate, save.

    Reads ``csv/train.tsv`` and ``csv/test.csv`` (tab-separated, no header,
    columns q1/q2/label), encodes both splits with the pretrained sentence
    encoder, runs the training loop, reloads the best checkpoint
    (``best_model.pth``), reports test metrics and saves the final model to
    ``chinese_semantic_model_final``.
    """
    # 1. Load the data
    print("加载数据...")
    train_data = pd.read_csv('csv/train.tsv', sep='\t', header=None, names=['q1', 'q2', 'label'])
    test_data = pd.read_csv('csv/test.csv', sep='\t', header=None, names=['q1', 'q2', 'label'])

    print(f"训练集大小: {len(train_data)}")
    print(f"测试集大小: {len(test_data)}")
    print("\n类别分布:")
    print(train_data['label'].value_counts())

    # 2. Load the pretrained encoder
    print("\n加载预训练模型...")
    model = SentenceTransformer('shibing624/text2vec-base-chinese')
    model = model.to(device)

    # 3. Build the training dataset
    # NOTE(review): the dataset stores embeddings computed under
    # torch.no_grad() before training starts, so the loss in train_epoch is
    # not connected to the encoder's parameters — confirm whether real
    # fine-tuning is intended here.
    print("\n准备训练集...")
    train_dataset = SentencePairDataset(
        train_data['q1'].tolist(),
        train_data['q2'].tolist(),
        train_data['label'].tolist(),
        model
    )

    # 4. Data loader
    # Pinning only works for CPU tensors; enabling it for tensors that already
    # live on the GPU makes the DataLoader raise.
    train_loader = DataLoader(
        train_dataset,
        batch_size=128,
        shuffle=True,
        pin_memory=device.type == 'cuda' and not train_dataset.embeddings1.is_cuda
    )

    # 5. Training setup
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.BCEWithLogitsLoss()
    scaler = torch.cuda.amp.GradScaler()
    num_epochs = 3
    # Start below any reachable score so the first epoch always writes a
    # checkpoint; otherwise a score of 0 would leave nothing to reload below.
    best_score = float('-inf')

    # 6. Training loop
    print("\n开始训练...")
    for epoch in range(num_epochs):
        epoch_start = time.time()

        # Train for one epoch
        train_loss = train_epoch(model, train_loader, criterion, optimizer, scaler, device)
        epoch_time = time.time() - epoch_start

        # Evaluate on the training set (there is no separate validation split)
        metrics = evaluate(model, train_loader, device)

        print(f"\nEpoch {epoch+1}/{num_epochs}:")
        print(f"训练损失: {train_loss:.4f}")
        print(f"训练集评估: Accuracy={metrics['accuracy']:.4f}, "
              f"F1={metrics['f1']:.4f}, Recall={metrics['recall']:.4f}")
        print(f"当前得分: {metrics['score']:.4f}")
        print(f"本轮用时: {format_time(epoch_time)}")

        # Keep the best-scoring checkpoint
        if metrics['score'] > best_score:
            best_score = metrics['score']
            torch.save(model.state_dict(), 'best_model.pth')

    # 7. Reload the best checkpoint for testing
    print("\n加载最佳模型进行测试...")
    # map_location keeps the load working when the checkpoint was written on
    # a different device than the one in use now.
    model.load_state_dict(torch.load('best_model.pth', map_location=device))

    # 8. Build the test dataset
    print("准备测试集...")
    test_dataset = SentencePairDataset(
        test_data['q1'].tolist(),
        test_data['q2'].tolist(),
        test_data['label'].tolist(),
        model
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=128,
        pin_memory=device.type == 'cuda' and not test_dataset.embeddings1.is_cuda
    )

    # 9. Final test
    print("进行最终测试...")
    test_metrics = evaluate(model, test_loader, device)

    print("\n最终测试结果:")
    print(f"Accuracy: {test_metrics['accuracy']:.4f}")
    print(f"F1-score: {test_metrics['f1']:.4f}")
    print(f"Recall: {test_metrics['recall']:.4f}")
    print(f"最终得分: {test_metrics['score']:.4f}")

    # 10. Save the final model
    model.save('chinese_semantic_model_final')
    print("\n模型已保存为: chinese_semantic_model_final")

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

加载数据...
训练集大小: 238766
测试集大小: 4401

类别分布:
label
1    138574
0    100192
Name: count, dtype: int64

加载预训练模型...
训练集大小: 238766
测试集大小: 4401

类别分布:
label
1    138574
0    100192
Name: count, dtype: int64

加载预训练模型...

准备训练集...
开始处理 238766 条数据...
处理第一组句子...


 14%|█▎        | 126/933 [00:31<03:18,  4.07it/s]