In [8]:
import pandas as pd
import torch
import numpy as np
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.model_selection import train_test_split
import time
from datetime import timedelta
import os
from torch.optim.lr_scheduler import ReduceLROnPlateau
import requests

# Seed the RNGs the pipeline relies on so that "Restart Kernel -> Run All"
# reproduces the same data shuffling and data split.
SEED = 42
torch.manual_seed(SEED)  # drives DataLoader shuffling; also seeds CUDA generators
np.random.seed(SEED)     # global NumPy RNG, used by scikit-learn when random_state is None

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")

def format_time(seconds):
    """Format a duration given in seconds the way ``str(timedelta)`` does.

    Fractional seconds are truncated with ``int()`` before formatting, so
    ``3661.9`` becomes ``"1:01:01"``.
    """
    whole_seconds = int(seconds)
    elapsed = timedelta(seconds=whole_seconds)
    return str(elapsed)

def check_internet_connection():
    """
    Probe https://huggingface.co to see whether the model hub is reachable.

    Returns:
        bool: True when the GET request completes (the HTTP status code is
        not inspected), False when the request fails.
    """
    try:
        requests.get("https://huggingface.co", timeout=5)
    except requests.RequestException:
        # RequestException is the common base of ConnectionError and Timeout,
        # so a slow response (ReadTimeout) is reported as "offline" instead of
        # escaping as an unhandled exception.
        print("网络连接失败，请检查网络。")
        return False
    print("网络连接正常。")
    return True

使用设备: cuda


In [9]:
def custom_collate_fn(batch):
    """
    Collate a list of InputExample-like items into a dict of parallel lists.

    Each item must expose ``texts`` (indexable, at least two entries) and
    ``label``. The result maps ``'sentences1'`` / ``'sentences2'`` to the first
    and second text of every item and ``'labels'`` to the labels, all in batch
    order.
    """
    first_texts, second_texts, targets = [], [], []
    for item in batch:
        first_texts.append(item.texts[0])
        second_texts.append(item.texts[1])
        targets.append(item.label)
    return dict(sentences1=first_texts, sentences2=second_texts, labels=targets)

class SentencePairDataset:
    """Map-style dataset that stores each sentence pair as an ``InputExample``."""

    def __init__(self, sentences1, sentences2, labels):
        # zip() stops at the shortest input, so surplus entries are ignored.
        # Labels are coerced to float before being stored on the example.
        built = []
        for left, right, target in zip(sentences1, sentences2, labels):
            built.append(InputExample(texts=[left, right], label=float(target)))
        self.examples = built

    def __len__(self):
        """Number of stored sentence pairs."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the stored example at position ``idx``."""
        return self.examples[idx]

def evaluate(model, data_loader, device, threshold=0.84):
    """
    Evaluate a sentence-pair model by thresholding cosine similarity.

    Every batch from ``data_loader`` is encoded with ``model.encode``; a pair
    is predicted positive (1) when the cosine similarity of its two embeddings
    is strictly greater than ``threshold``.

    Args:
        model: Object exposing ``eval()`` and
            ``encode(sentences, convert_to_tensor=True, device=...)``.
        data_loader: Iterable of dicts with keys ``'sentences1'``,
            ``'sentences2'`` and ``'labels'``.
        device: Device forwarded to ``model.encode``.
        threshold: Similarity cut-off for the positive class.

    Returns:
        dict: ``accuracy``, ``f1``, ``recall`` and the blended
        ``score = 0.6 * f1 + 0.2 * accuracy + 0.2 * recall``, plus the raw
        ``similarity`` and integer ``labels`` NumPy arrays so callers can
        re-threshold without re-encoding.

    Raises:
        ValueError: If ``data_loader`` yields no batches.

    Note:
        The model is switched to eval mode and left there; a caller that
        continues training must call ``model.train()`` again.
    """
    model.eval()
    chunks1, chunks2, gold = [], [], []

    # Encode both sides of every pair, batch by batch.
    with torch.no_grad():
        for batch in data_loader:
            chunks1.append(model.encode(batch['sentences1'], convert_to_tensor=True, device=device))
            chunks2.append(model.encode(batch['sentences2'], convert_to_tensor=True, device=device))
            gold.extend(batch['labels'])

    if not chunks1:
        # torch.cat() on an empty list fails with an unhelpful message.
        raise ValueError("evaluate() received an empty data_loader; nothing to score.")

    # One cosine-similarity call over all pairs.
    similarity = torch.cosine_similarity(
        torch.cat(chunks1, dim=0), torch.cat(chunks2, dim=0), dim=1
    )
    preds = (similarity > threshold).cpu().numpy().astype(int)

    # Labels are only consumed by scikit-learn, so keep them on the CPU
    # instead of shipping them to the device and copying them back.
    labels = torch.tensor(gold, dtype=torch.int).numpy()

    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    recall = recall_score(labels, preds)
    score = 0.6 * f1 + 0.2 * accuracy + 0.2 * recall

    return {
        'accuracy': accuracy,
        'f1': f1,
        'recall': recall,
        'score': score,
        'similarity': similarity.cpu().numpy(),
        'labels': labels
    }

def train_model(model, train_data, val_data, device, batch_size=64, epochs=3, patience=3, min_delta=1e-4):
    """
    Fine-tune a SentenceTransformer on labelled sentence pairs with a manual loop.

    Each epoch optimises ``CosineSimilarityLoss`` with Adam (lr=2e-5) and then
    scores the validation set with ``evaluate``. The validation score drives a
    ``ReduceLROnPlateau`` scheduler, best-model checkpointing and early stopping.

    Args:
        model: SentenceTransformer to train; moved to ``device``.
        train_data: Table with ``'q1'``, ``'q2'`` and ``'label'`` columns whose
            columns support ``.tolist()``.
        val_data: Same layout as ``train_data``; used for validation only.
        device: torch device used for training and evaluation.
        batch_size: Batch size for both data loaders.
        epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs that triggers
            early stopping.
        min_delta: Minimum score gain over the best score that counts as an
            improvement.

    Returns:
        The model in the state reached when the loop ended. This is not
        necessarily the best checkpoint, which is saved to ``'best_model'``.

    Side effects:
        Saves ``model_epoch_<n>`` after every epoch and ``best_model`` whenever
        the validation score improves. Any exception raised inside an epoch is
        printed with its traceback and ends training early instead of
        propagating.
    """
    import traceback
    from tqdm import tqdm

    model.to(device)
    print(f"模型是否在GPU上: {next(model.parameters()).device}")

    # Wrap both splits as InputExample datasets.
    print("准备训练数据集...")
    train_dataset = SentencePairDataset(
        train_data['q1'].tolist(),
        train_data['q2'].tolist(),
        train_data['label'].tolist()
    )
    print("准备验证数据集...")
    val_dataset = SentencePairDataset(
        val_data['q1'].tolist(),
        val_data['q2'].tolist(),
        val_data['label'].tolist()
    )

    # Data loaders; only the training loader shuffles.
    print("创建数据加载器...")
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=0,  # 0 workers avoids multiprocessing issues on Windows
        collate_fn=custom_collate_fn
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=0,
        collate_fn=custom_collate_fn
    )

    # Optimizer, scheduler and loss.
    print("初始化优化器和损失函数...")
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    # mode='max' because the monitored quantity is a score, not a loss.
    # `verbose` is intentionally not passed: it is deprecated in recent
    # PyTorch releases and the learning rate is printed every epoch below.
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=2,
                                  min_lr=1e-6)
    train_loss = losses.CosineSimilarityLoss(model)
    best_score = 0
    no_improve_epochs = 0

    print("开始训练循环...")
    for epoch in range(epochs):
        # evaluate() leaves the model in eval mode, so re-enable training here.
        model.train()

        try:
            # The context manager closes the bar on success, on errors and on
            # KeyboardInterrupt, so no stale progress bar is left behind.
            with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch+1}/{epochs}") as progress_bar:
                for batch in train_dataloader:
                    labels = torch.tensor(batch['labels'], dtype=torch.float32, device=device)

                    optimizer.zero_grad()

                    # Tokenize both sides and move the resulting tensors to the device.
                    features1 = {key: val.to(device) for key, val in model.tokenize(batch['sentences1']).items()}
                    features2 = {key: val.to(device) for key, val in model.tokenize(batch['sentences2']).items()}

                    loss = train_loss([features1, features2], labels)
                    loss.backward()
                    optimizer.step()

                    progress_bar.update(1)
                    progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

            # Validate and let the scheduler react to the score.
            val_metrics = evaluate(model, val_dataloader, device)
            current_score = val_metrics['score']

            scheduler.step(current_score)
            current_lr = optimizer.param_groups[0]['lr']

            print(f"\nEpoch {epoch+1}/{epochs}")
            print(f"当前学习率: {current_lr:.2e}")
            print(f"验证集评估: Accuracy={val_metrics['accuracy']:.4f}, "
                  f"F1={val_metrics['f1']:.4f}, Recall={val_metrics['recall']:.4f}")
            print(f"当前得分: {current_score:.4f}")

            # Keep a checkpoint for every epoch.
            epoch_save_path = f'model_epoch_{epoch+1}'
            print(f"保存当前epoch模型到: {epoch_save_path}")
            model.save(epoch_save_path)

            # Best-model checkpointing and early stopping.
            if current_score > best_score + min_delta:
                best_score = current_score
                no_improve_epochs = 0
                print(f"保存最佳模型，得分: {best_score:.4f}")
                model.save('best_model')
            else:
                no_improve_epochs += 1
                print(f"模型表现未提升，已经 {no_improve_epochs}/{patience} 个epoch")
                if no_improve_epochs >= patience:
                    print("\nEarly stopping triggered")
                    break

        except Exception as e:
            # Best-effort: report the failure and stop training, but still
            # return the model so the caller can decide what to do next.
            print(f"训练过程中出错: {e}")
            traceback.print_exc()
            break

    return model

In [10]:
def main():
    """
    Run the end-to-end pipeline: load data, fine-tune, reload the best
    checkpoint, evaluate it on the test set and save it as the final model.

    Returns None in all cases. The function returns early when the Hugging
    Face hub is unreachable, when a model fails to load, or when the final
    evaluation raises.
    """
    # 1. Load the tab-separated data (no header row; columns q1, q2, label).
    print("加载数据...")
    data = pd.read_csv('csv/train.tsv', sep='\t', header=None, names=['q1', 'q2', 'label'])
    test_data = pd.read_csv('csv/test.csv', sep='\t', header=None, names=['q1', 'q2', 'label'])

    # Hold out 10% for validation. The fixed random_state keeps the split
    # identical across runs so validation scores are comparable.
    train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

    print(f"训练集大小: {len(train_data)}")
    print(f"验证集大小: {len(val_data)}")
    print(f"测试集大小: {len(test_data)}")
    print("\n训练集类别分布:")
    print(train_data['label'].value_counts())

    # 2. Abort when the model hub cannot be reached.
    if not check_internet_connection():
        return

    # 3. Load the pretrained model (alternative checkpoint tried earlier:
    #    'bge-large-zh-v1.5').
    print("\n加载预训练模型...")
    try:
        model = SentenceTransformer('shibing624/text2vec-base-chinese')
        print("预训练模型加载成功！")
    except Exception as e:
        print(f"预训练模型加载失败: {e}")
        print("请检查网络连接或模型名称是否正确。")
        return

    # 4. Fine-tune.
    print("\n开始训练...")
    model = train_model(model, train_data, val_data, device,
                   batch_size=64,  # adjust to the available GPU memory
                   epochs=3,
                   patience=3)

    # 5. Reload the best checkpoint written by train_model for testing.
    print("\n加载最佳模型进行测试...")
    try:
        model = SentenceTransformer('best_model')
        print("模型加载成功")
    except Exception as e:
        print(f"模型加载失败: {e}")
        return

    # 6. Build the test loader.
    print("准备测试集...")
    test_dataset = SentencePairDataset(
        test_data['q1'].tolist(),
        test_data['q2'].tolist(),
        test_data['label'].tolist()
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=96,
        pin_memory=True,
        num_workers=0,  # no worker processes
        collate_fn=custom_collate_fn  # yields dicts of sentence lists and labels
    )

    # 7. Final evaluation.
    print("进行最终测试...")
    try:
        test_metrics = evaluate(model, test_loader, device)
        print("测试完成")
    except Exception as e:
        print(f"测试失败: {e}")
        return

    print("\n最终测试结果:")
    print(f"Accuracy: {test_metrics['accuracy']:.4f}")
    print(f"F1-score: {test_metrics['f1']:.4f}")
    print(f"Recall: {test_metrics['recall']:.4f}")
    print(f"最终得分: {test_metrics['score']:.4f}")

    # 8. Save the (reloaded best) model under its final name.
    model.save('chinese_semantic_model_final')
    print("\n模型已保存为: chinese_semantic_model_final")

# Entry point: run the full train/evaluate pipeline when this code is executed
# directly (this includes running the notebook cell, where __name__ is "__main__").
if __name__ == "__main__":
    main()

加载数据...
训练集大小: 214889
验证集大小: 23877
测试集大小: 4401

训练集类别分布:
label
1    124672
0     90217
Name: count, dtype: int64
网络连接正常。

加载预训练模型...
预训练模型加载成功！

开始训练...
模型是否在GPU上: cuda:0
准备训练数据集...




准备验证数据集...
创建数据加载器...
初始化优化器和损失函数...
开始训练循环...


Epoch 1/3:   2%|▏         | 80/3358 [12:51<10:54:26, 11.98s/it, loss=0.1094]  

KeyboardInterrupt: 

In [None]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, InputExample
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score, recall_score

def custom_collate_fn(batch):
    """
    Collate a list of InputExample-like items into a dict of parallel lists.

    Each item must expose ``texts`` (indexable, at least two entries) and
    ``label``. The result maps ``'sentences1'`` / ``'sentences2'`` to the first
    and second text of every item and ``'labels'`` to the labels, all in batch
    order.
    """
    first_texts, second_texts, targets = [], [], []
    for item in batch:
        first_texts.append(item.texts[0])
        second_texts.append(item.texts[1])
        targets.append(item.label)
    return dict(sentences1=first_texts, sentences2=second_texts, labels=targets)

class SentencePairDataset:
    """Map-style dataset that stores each sentence pair as an ``InputExample``."""

    def __init__(self, sentences1, sentences2, labels):
        # zip() stops at the shortest input, so surplus entries are ignored.
        # Labels are coerced to float before being stored on the example.
        built = []
        for left, right, target in zip(sentences1, sentences2, labels):
            built.append(InputExample(texts=[left, right], label=float(target)))
        self.examples = built

    def __len__(self):
        """Number of stored sentence pairs."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the stored example at position ``idx``."""
        return self.examples[idx]

def evaluate(model, data_loader, device, threshold=0.84):
    """
    Evaluate a sentence-pair model by thresholding cosine similarity.

    Every batch from ``data_loader`` is encoded with ``model.encode``; a pair
    is predicted positive (1) when the cosine similarity of its two embeddings
    is strictly greater than ``threshold``.

    Args:
        model: Object exposing ``eval()`` and
            ``encode(sentences, convert_to_tensor=True, device=...)``.
        data_loader: Iterable of dicts with keys ``'sentences1'``,
            ``'sentences2'`` and ``'labels'``.
        device: Device forwarded to ``model.encode``.
        threshold: Similarity cut-off for the positive class.

    Returns:
        dict: ``accuracy``, ``f1``, ``recall`` and the blended
        ``score = 0.6 * f1 + 0.2 * accuracy + 0.2 * recall``, plus the raw
        ``similarity`` scores and integer ``labels`` as NumPy arrays for
        further analysis (e.g. sweeping other thresholds).

    Raises:
        ValueError: If ``data_loader`` yields no batches.

    Note:
        The model is switched to eval mode and left there.
    """
    model.eval()
    chunks1, chunks2, gold = [], [], []

    # Encode both sides of every pair, batch by batch.
    with torch.no_grad():
        for batch in data_loader:
            chunks1.append(model.encode(batch['sentences1'], convert_to_tensor=True, device=device))
            chunks2.append(model.encode(batch['sentences2'], convert_to_tensor=True, device=device))
            gold.extend(batch['labels'])

    if not chunks1:
        # torch.cat() on an empty list fails with an unhelpful message.
        raise ValueError("evaluate() received an empty data_loader; nothing to score.")

    # One cosine-similarity call over all pairs.
    similarity = torch.cosine_similarity(
        torch.cat(chunks1, dim=0), torch.cat(chunks2, dim=0), dim=1
    )
    preds = (similarity > threshold).cpu().numpy().astype(int)

    # Labels are only consumed on the host, so build them on the CPU instead
    # of shipping them to the device and copying them back repeatedly.
    labels = torch.tensor(gold, dtype=torch.int).numpy()

    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    recall = recall_score(labels, preds)
    score = 0.6 * f1 + 0.2 * accuracy + 0.2 * recall

    return {
        'accuracy': accuracy,
        'f1': f1,
        'recall': recall,
        'score': score,
        'similarity': similarity.cpu().numpy(),  # raw similarity scores for further analysis
        'labels': labels                         # ground-truth labels for further analysis
    }

def test_best_model(test_file='csv/test.csv', model_path='best_model', batch_size=96, threshold=0.84):
    """
    Load a saved model, score it on a test file and print a threshold sweep.

    Args:
        test_file: Path of the tab-separated test file (no header row; the
            columns are read as q1, q2, label).
        model_path: Directory or name passed to ``SentenceTransformer``.
        batch_size: Batch size of the test data loader.
        threshold: Similarity cut-off used for the headline metrics.

    Returns:
        dict: The metrics dictionary returned by ``evaluate`` for ``threshold``.

    Raises:
        Exception: Any error raised while loading the model is printed and
            then re-raised.
    """
    # Use the GPU when PyTorch can see one.
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    print(f"使用设备: {device}")

    # Step 1: read the test pairs.
    print("加载测试数据...")
    column_names = ['q1', 'q2', 'label']
    test_data = pd.read_csv(test_file, sep='\t', header=None, names=column_names)
    print(f"测试集大小: {len(test_data)}")

    # Step 2: load the checkpoint and move it to the chosen device.
    print(f"\n加载模型 {model_path}...")
    try:
        model = SentenceTransformer(model_path)
        model.to(device)
        print("模型加载成功")
        print(f"模型是否在GPU上: {next(model.parameters()).device}")
    except Exception as e:
        print(f"模型加载失败: {e}")
        raise

    # Step 3: wrap the three columns as a dataset and a single-process loader.
    print("准备测试集...")
    first_col, second_col, label_col = (test_data[name].tolist() for name in column_names)
    test_loader = DataLoader(
        SentencePairDataset(first_col, second_col, label_col),
        batch_size=batch_size,
        pin_memory=True,
        num_workers=0,
        collate_fn=custom_collate_fn,
    )

    # Step 4: headline metrics at the requested threshold.
    print("进行测试评估...")
    test_metrics = evaluate(model, test_loader, device, threshold)

    # Step 5: report.
    print("\n测试结果:")
    print(f"Accuracy: {test_metrics['accuracy']:.4f}")
    print(f"F1-score: {test_metrics['f1']:.4f}")
    print(f"Recall: {test_metrics['recall']:.4f}")
    print(f"最终得分: {test_metrics['score']:.4f}")

    # Step 6: re-threshold the cached similarities without re-encoding.
    print("\n尝试不同阈值:")
    cached_similarity = test_metrics['similarity']
    gold_labels = test_metrics['labels']
    for th in (0.84, 0.845, 0.85, 0.855, 0.86):
        preds = (cached_similarity > th).astype(int)
        acc = accuracy_score(gold_labels, preds)
        f1 = f1_score(gold_labels, preds)
        recall = recall_score(gold_labels, preds)
        score = 0.6 * f1 + 0.2 * acc + 0.2 * recall
        print(f"阈值={th:.3f}: Accuracy={acc:.4f}, F1={f1:.4f}, Recall={recall:.4f}, Score={score:.4f}")

    return test_metrics

if __name__ == "__main__":
    # Run the test-set evaluation with the default file, model path and threshold.
    test_metrics = test_best_model()

使用设备: cuda
加载测试数据...
测试集大小: 4401

加载模型 best_model...
模型加载成功
模型是否在GPU上: cuda:0
准备测试集...
进行测试评估...

测试结果:
Accuracy: 0.8603
F1-score: 0.8510
Recall: 0.7850
最终得分: 0.8396

尝试不同阈值:
阈值=0.840: Accuracy=0.8603, F1=0.8510, Recall=0.7850, Score=0.8396
阈值=0.845: Accuracy=0.8573, F1=0.8470, Recall=0.7769, Score=0.8350
阈值=0.850: Accuracy=0.8564, F1=0.8456, Recall=0.7734, Score=0.8333
阈值=0.855: Accuracy=0.8564, F1=0.8450, Recall=0.7702, Score=0.8323
阈值=0.860: Accuracy=0.8532, F1=0.8410, Recall=0.7635, Score=0.8279


threshold=0.84  0.8947 0.8358

threshold=0.85 0.8929 0.8359

threshold=0.855 0.8868 0.8369

threshold=0.86 0.8858 0.8350

threshold=0.845 0.8893 0.8351