In [3]:
import pandas as pd
import torch
import numpy as np
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.model_selection import train_test_split
import time
from datetime import timedelta
import os
from torch.optim.lr_scheduler import ReduceLROnPlateau
import requests

# Random seeding (currently disabled: the two calls below are commented out).
#torch.manual_seed(42)
#np.random.seed(42)

# Select the compute device: CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")

def format_time(seconds):
    """Format a duration given in seconds using ``timedelta``'s string form.

    The input is truncated to whole seconds with ``int`` first, so
    ``format_time(3661.9)`` gives ``'1:01:01'``.
    """
    whole_seconds = int(seconds)
    duration = timedelta(seconds=whole_seconds)
    return str(duration)

def check_internet_connection():
    """Probe https://huggingface.co to check that the model hub is reachable.

    Returns:
        bool: True when the GET request completes within the 5 second timeout,
        False when it fails with any ``requests`` error (connection failure,
        timeout, ...).
    """
    try:
        requests.get("https://huggingface.co", timeout=5)
    except requests.RequestException:
        # RequestException is the common base of ConnectionError and Timeout.
        # Catching only ConnectionError lets a ReadTimeout escape and crash
        # the caller instead of reporting "no connection".
        print("网络连接失败，请检查网络。")
        return False
    print("网络连接正常。")
    return True

使用设备: cuda


In [4]:
def custom_collate_fn(batch):
    """
    Collate a list of InputExample-like objects into a dict of parallel lists.

    Every item must expose ``texts`` (at least two entries) and ``label``.
    The result has the keys 'sentences1', 'sentences2' and 'labels', each a
    list in the same order as ``batch``.
    """
    first_texts, second_texts, targets = [], [], []
    for item in batch:
        first_texts.append(item.texts[0])
        second_texts.append(item.texts[1])
        targets.append(item.label)
    return dict(sentences1=first_texts, sentences2=second_texts, labels=targets)

class SentencePairDataset:
    """Indexable container of sentence pairs wrapped as ``InputExample`` objects.

    Each label is converted with ``float`` while the examples are built.
    """

    def __init__(self, sentences1, sentences2, labels):
        # zip() stops at the shortest of the three inputs.
        self.examples = []
        for first, second, target in zip(sentences1, sentences2, labels):
            self.examples.append(InputExample(texts=[first, second], label=float(target)))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]

def evaluate(model, data_loader, device, threshold=0.84):
    """
    Encode every sentence pair from ``data_loader`` and score the predictions.

    A pair is predicted positive (1) when the cosine similarity of its two
    embeddings is strictly greater than ``threshold``.

    Args:
        model: object exposing ``eval()`` and
            ``encode(sentences, convert_to_tensor=True, device=...)``.
        data_loader: iterable of batches with the keys 'sentences1',
            'sentences2' and 'labels' (the shape ``custom_collate_fn`` builds).
        device: device forwarded to ``model.encode``.
        threshold: similarity cut-off for the positive class.

    Returns:
        dict with 'accuracy', 'f1', 'recall' and 'score'
        (``0.6 * f1 + 0.2 * accuracy + 0.2 * recall``), plus the raw
        'similarity' scores and integer 'labels' as NumPy arrays so callers
        can try other thresholds without re-encoding.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    model.eval()
    embeddings1_parts = []
    embeddings2_parts = []
    collected_labels = []

    # Encode batch by batch; gradients are never needed for evaluation.
    with torch.no_grad():
        for batch in data_loader:
            embeddings1_parts.append(
                model.encode(batch['sentences1'], convert_to_tensor=True, device=device))
            embeddings2_parts.append(
                model.encode(batch['sentences2'], convert_to_tensor=True, device=device))
            collected_labels.extend(batch['labels'])

    if not embeddings1_parts:
        # torch.cat on an empty list fails with an opaque error; be explicit.
        raise ValueError("evaluate() received an empty data_loader; nothing to score")

    all_embeddings1 = torch.cat(embeddings1_parts, dim=0)
    all_embeddings2 = torch.cat(embeddings2_parts, dim=0)

    # Labels are only consumed by the CPU-side metric functions, so build them
    # on the CPU once instead of moving them to `device` and back repeatedly.
    labels_np = torch.tensor(collected_labels, dtype=torch.int).numpy()

    # One vectorised cosine-similarity pass over all pairs.
    similarity = torch.cosine_similarity(all_embeddings1, all_embeddings2, dim=1)
    preds = (similarity > threshold).cpu().numpy().astype(int)

    accuracy = accuracy_score(labels_np, preds)
    f1 = f1_score(labels_np, preds)
    recall = recall_score(labels_np, preds)
    score = 0.6 * f1 + 0.2 * accuracy + 0.2 * recall

    return {
        'accuracy': accuracy,
        'f1': f1,
        'recall': recall,
        'score': score,
        'similarity': similarity.cpu().numpy(),
        'labels': labels_np
    }

def train_model(model, train_data, val_data, device, batch_size=64, epochs=3, patience=3, min_delta=1e-4):
    """
    Fine-tune a sentence-embedding model on labelled sentence pairs.

    Each epoch trains with ``CosineSimilarityLoss``, evaluates on the
    validation set, steps a ``ReduceLROnPlateau`` scheduler on the validation
    score, saves the model to ``model_epoch_<n>`` and, when the score improved
    by more than ``min_delta``, also to ``best_model``.

    Args:
        model: SentenceTransformer-style model (``tokenize``, ``save``, ...).
        train_data: DataFrame with the columns 'q1', 'q2' and 'label'.
        val_data: DataFrame with the same columns, used for validation.
        device: device the model and batches are moved to.
        batch_size: batch size for both data loaders.
        epochs: maximum number of epochs.
        patience: stop after this many consecutive epochs without improvement.
        min_delta: minimum score gain that counts as an improvement.

    Returns:
        The model in its state after the last executed epoch. This is not
        necessarily the best checkpoint; load ``best_model`` for that.
    """
    # Function-scope imports: neither module is imported in the cell's import block.
    import traceback
    from tqdm import tqdm

    model.to(device)
    print(f"模型是否在GPU上: {next(model.parameters()).device}")

    # Build the datasets.
    print("准备训练数据集...")
    train_dataset = SentencePairDataset(
        train_data['q1'].tolist(),
        train_data['q2'].tolist(),
        train_data['label'].tolist()
    )
    print("准备验证数据集...")
    val_dataset = SentencePairDataset(
        val_data['q1'].tolist(),
        val_data['q2'].tolist(),
        val_data['label'].tolist()
    )

    # Build the data loaders.
    print("创建数据加载器...")
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=0,  # 0 workers avoids multiprocessing problems on Windows
        collate_fn=custom_collate_fn
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=0,
        collate_fn=custom_collate_fn
    )

    # Optimizer, LR scheduler and loss.
    print("初始化优化器和损失函数...")
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    # mode='max' because the scheduler is stepped on the validation score.
    # NOTE(review): `verbose` is reported as deprecated in recent PyTorch
    # releases - confirm against the installed version.
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=2,
                                verbose=True, min_lr=1e-6)
    train_loss = losses.CosineSimilarityLoss(model)
    best_score = 0
    no_improve_epochs = 0

    print("开始训练循环...")
    for epoch in range(epochs):
        model.train()

        try:
            # The context manager closes the bar on success AND on error; the
            # previous version only closed it on error, leaving one open bar
            # per finished epoch (duplicated bars in the notebook output).
            with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch+1}/{epochs}") as progress_bar:
                for batch in train_dataloader:
                    sentences1 = batch['sentences1']
                    sentences2 = batch['sentences2']
                    labels = torch.tensor(batch['labels'], dtype=torch.float32, device=device)

                    optimizer.zero_grad()

                    # Tokenize both sides and move the tensors to the device.
                    features1 = model.tokenize(sentences1)
                    features2 = model.tokenize(sentences2)
                    features1 = {key: val.to(device) for key, val in features1.items()}
                    features2 = {key: val.to(device) for key, val in features2.items()}

                    # Cosine-similarity loss between the two embeddings.
                    loss = train_loss([features1, features2], labels)

                    loss.backward()
                    optimizer.step()

                    progress_bar.update(1)
                    progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

            # Validation.
            val_metrics = evaluate(model, val_dataloader, device)
            current_score = val_metrics['score']

            # Let the scheduler react to the validation score.
            scheduler.step(current_score)
            current_lr = optimizer.param_groups[0]['lr']

            print(f"\nEpoch {epoch+1}/{epochs}")
            print(f"当前学习率: {current_lr:.2e}")
            print(f"验证集评估: Accuracy={val_metrics['accuracy']:.4f}, "
                  f"F1={val_metrics['f1']:.4f}, Recall={val_metrics['recall']:.4f}")
            print(f"当前得分: {current_score:.4f}")

            # Checkpoint every epoch.
            epoch_save_path = f'model_epoch_{epoch+1}'
            print(f"保存当前epoch模型到: {epoch_save_path}")
            model.save(epoch_save_path)

            # Track the best checkpoint / early stopping.
            if current_score > best_score + min_delta:
                best_score = current_score
                no_improve_epochs = 0
                print(f"保存最佳模型，得分: {best_score:.4f}")
                model.save('best_model')
            else:
                no_improve_epochs += 1
                print(f"模型表现未提升，已经 {no_improve_epochs}/{patience} 个epoch")
                if no_improve_epochs >= patience:
                    print("\nEarly stopping triggered")
                    break

        except Exception as e:
            # Best effort: report the failure and stop training, but still
            # return the model so already-saved checkpoints remain usable.
            print(f"训练过程中出错: {e}")
            traceback.print_exc()
            break

    return model

In [None]:
def main_v2():
    """
    End-to-end pipeline: load data, fine-tune, then evaluate on the test set.

    Steps: read the train/test TSV files, split off 10% of the training data
    for validation, check connectivity, load the pretrained model, train it
    with ``train_model``, reload the ``best_model`` checkpoint, sweep several
    similarity thresholds on the test set, print the metrics for the best
    threshold and save the model as ``chinese_semantic_model_final``.

    Returns None; returns early (after printing a message) when the
    connectivity check, model loading or the test evaluation fails.
    """
    # 1. Load data
    print("加载数据...")
    data = pd.read_csv('csv/train.tsv', sep='\t', header=None, names=['q1', 'q2', 'label'])
    test_data = pd.read_csv('csv/test.csv', sep='\t', header=None, names=['q1', 'q2', 'label'])

    # Train/validation split (fixed random_state keeps the split reproducible)
    train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

    print(f"训练集大小: {len(train_data)}")
    print(f"验证集大小: {len(val_data)}")
    print(f"测试集大小: {len(test_data)}")
    print("\n训练集类别分布:")
    print(train_data['label'].value_counts())

    # 2. Check network connectivity
    if not check_internet_connection():
        return

    # 3. Load the pretrained model
    print("\n加载预训练模型...")
    try:
        model = SentenceTransformer('shibing624/text2vec-base-chinese')
        print("预训练模型加载成功！")
    except Exception as e:
        print(f"预训练模型加载失败: {e}")
        print("请检查网络连接或模型名称是否正确。")
        return

    # 4. Train
    print("\n开始训练...")
    model = train_model(model, train_data, val_data, device,
                   batch_size=32,     # smaller batches
                   epochs=10,         # more epochs to observe the trend
                   patience=2,        # react faster to a performance drop
                   min_delta=1e-3)    # require a clearer improvement

    # 5. Reload the best checkpoint for testing
    print("\n加载最佳模型进行测试...")
    try:
        model = SentenceTransformer('best_model')
        print("模型加载成功")
    except Exception as e:
        print(f"模型加载失败: {e}")
        return

    # 6. Build the test loader (same batch size as used for training above)
    print("准备测试集...")
    test_dataset = SentencePairDataset(
        test_data['q1'].tolist(),
        test_data['q2'].tolist(),
        test_data['label'].tolist()
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=32,
        pin_memory=True,
        num_workers=0,
        collate_fn=custom_collate_fn
    )

    # 7. Final test over several thresholds
    # NOTE(review): the threshold is selected on the same test set it is then
    # reported on, so the "best threshold" numbers are optimistically biased.
    # Consider tuning the threshold on the validation split instead.
    print("进行最终测试...")
    thresholds = [0.80, 0.82, 0.84, 0.85, 0.86, 0.88]
    best_threshold = 0.84
    best_score = 0

    try:
        base_metrics = evaluate(model, test_loader, device)
        # Fallback for the case where no swept threshold beats best_score:
        # best_threshold then stays at the default, whose metrics are these.
        final_metrics = base_metrics
        print("\n基准测试结果:")
        print(f"阈值=0.84 (默认):")
        print(f"Accuracy: {base_metrics['accuracy']:.4f}")
        print(f"F1-score: {base_metrics['f1']:.4f}")
        print(f"Recall: {base_metrics['recall']:.4f}")
        print(f"得分: {base_metrics['score']:.4f}")

        # Sweep the candidate thresholds
        print("\n尝试不同阈值:")
        for threshold in thresholds:
            metrics = evaluate(model, test_loader, device, threshold=threshold)
            score = metrics['score']
            print(f"\n阈值={threshold:.3f}:")
            print(f"Accuracy: {metrics['accuracy']:.4f}")
            print(f"F1-score: {metrics['f1']:.4f}")
            print(f"Recall: {metrics['recall']:.4f}")
            print(f"得分: {score:.4f}")

            if score > best_score:
                best_score = score
                best_threshold = threshold
                # Keep the metrics so the test set is not re-encoded below.
                final_metrics = metrics

        print(f"\n最佳阈值: {best_threshold:.3f}, 最佳得分: {best_score:.4f}")

    except Exception as e:
        print(f"测试失败: {e}")
        return

    # 8. Report the metrics obtained at the best threshold
    print("\n使用最佳阈值的最终结果:")
    print(f"Accuracy: {final_metrics['accuracy']:.4f}")
    print(f"F1-score: {final_metrics['f1']:.4f}")
    print(f"Recall: {final_metrics['recall']:.4f}")
    print(f"最终得分: {final_metrics['score']:.4f}")

    # 9. Save the final model
    model.save('chinese_semantic_model_final')
    print("\n模型已保存为: chinese_semantic_model_final")

# Entry point: run the full train-and-test pipeline when __name__ is "__main__".
if __name__ == "__main__":
    main_v2()

加载数据...
训练集大小: 214889
验证集大小: 23877
测试集大小: 4401

训练集类别分布:
label
1    124695
0     90194
Name: count, dtype: int64
训练集大小: 214889
验证集大小: 23877
测试集大小: 4401

训练集类别分布:
label
1    124695
0     90194
Name: count, dtype: int64
网络连接正常。

加载预训练模型...
网络连接正常。

加载预训练模型...
预训练模型加载成功！

开始训练...
模型是否在GPU上: cuda:0
准备训练数据集...
预训练模型加载成功！

开始训练...
模型是否在GPU上: cuda:0
准备训练数据集...




准备验证数据集...
创建数据加载器...
初始化优化器和损失函数...
开始训练循环...


Epoch 1/10: 100%|██████████| 6716/6716 [26:49<00:00,  5.33it/s, loss=0.0685]


Epoch 1/10
当前学习率: 2.00e-05
验证集评估: Accuracy=0.8726, F1=0.8804, Recall=0.8069
当前得分: 0.8642
保存当前epoch模型到: model_epoch_1
保存最佳模型，得分: 0.8642
保存最佳模型，得分: 0.8642


Epoch 1/10: 100%|██████████| 6716/6716 [27:32<00:00,  4.06it/s, loss=0.0685]




Epoch 2/10
当前学习率: 2.00e-05
验证集评估: Accuracy=0.8957, F1=0.9049, Recall=0.8541
当前得分: 0.8929
保存当前epoch模型到: model_epoch_2
保存最佳模型，得分: 0.8929
保存最佳模型，得分: 0.8929


Epoch 2/10: 100%|██████████| 6716/6716 [25:54<00:00,  4.32it/s, loss=0.0636]
Epoch 3/10:   0%|          | 1/6716 [00:00<22:41,  4.93it/s, loss=0.0691]
Epoch 3/10: 100%|██████████| 6716/6716 [24:52<00:00,  5.13it/s, loss=0.0682]


Epoch 3/10
当前学习率: 2.00e-05
验证集评估: Accuracy=0.8933, F1=0.9025, Recall=0.8493
当前得分: 0.8900
保存当前epoch模型到: model_epoch_3
模型表现未提升，已经 1/2 个epoch


Epoch 3/10: 100%|██████████| 6716/6716 [25:36<00:00,  4.37it/s, loss=0.0682][A


In [None]:
# NOTE(review): this cell is a de-indented copy of main_v2's body (from the
# train/validation split onwards). It cannot run as written: the `return`
# statements are at module level (SyntaxError outside a function), and `data`
# / `test_data` are only defined as locals of main_v2 in the visible cells.
# Consider deleting this cell or calling main_v2() instead.

# Train/validation split (fixed random_state keeps the split reproducible)
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

print(f"训练集大小: {len(train_data)}")
print(f"验证集大小: {len(val_data)}")
print(f"测试集大小: {len(test_data)}")
print("\n训练集类别分布:")
print(train_data['label'].value_counts())

# 2. Check network connectivity
if not check_internet_connection():
    return

# 3. Load the pretrained model
print("\n加载预训练模型...")
try:
    model = SentenceTransformer('shibing624/text2vec-base-chinese')
    print("预训练模型加载成功！")
except Exception as e:
    print(f"预训练模型加载失败: {e}")
    print("请检查网络连接或模型名称是否正确。")
    return

# 4. Train
print("\n开始训练...")
model = train_model(model, train_data, val_data, device, 
               batch_size=32,     # smaller batches
               epochs=10,         # more epochs to observe the trend
               patience=2,        # react faster to a performance drop
               min_delta=1e-3)    # require a clearer improvement

# 5. Reload the best checkpoint for testing
print("\n加载最佳模型进行测试...")
try:
    model = SentenceTransformer('best_model')
    print("模型加载成功")
except Exception as e:
    print(f"模型加载失败: {e}")
    return

# 6. Build the test loader (same batch size as used for training above)
print("准备测试集...")
test_dataset = SentencePairDataset(
    test_data['q1'].tolist(),
    test_data['q2'].tolist(),
    test_data['label'].tolist()
)
test_loader = DataLoader(
    test_dataset, 
    batch_size=32,  # same value as the training batch size
    pin_memory=True,
    num_workers=0,
    collate_fn=custom_collate_fn
)

# 7. Final test over several thresholds
print("进行最终测试...")
thresholds = [0.80, 0.82, 0.84, 0.85, 0.86, 0.88]
best_threshold = 0.84
best_score = 0

try:
    base_metrics = evaluate(model, test_loader, device)
    print("\n基准测试结果:")
    print(f"阈值=0.84 (默认):")
    print(f"Accuracy: {base_metrics['accuracy']:.4f}")
    print(f"F1-score: {base_metrics['f1']:.4f}")
    print(f"Recall: {base_metrics['recall']:.4f}")
    print(f"得分: {base_metrics['score']:.4f}")

    # Sweep the candidate thresholds
    print("\n尝试不同阈值:")
    for threshold in thresholds:
        metrics = evaluate(model, test_loader, device, threshold=threshold)
        score = metrics['score']
        print(f"\n阈值={threshold:.3f}:")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1-score: {metrics['f1']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"得分: {score:.4f}")
        
        if score > best_score:
            best_score = score
            best_threshold = threshold
            
    print(f"\n最佳阈值: {best_threshold:.3f}, 最佳得分: {best_score:.4f}")
    
except Exception as e:
    print(f"测试失败: {e}")
    return

# 8. Re-evaluate with the best threshold and print the final metrics
final_metrics = evaluate(model, test_loader, device, threshold=best_threshold)
print("\n使用最佳阈值的最终结果:")
print(f"Accuracy: {final_metrics['accuracy']:.4f}")
print(f"F1-score: {final_metrics['f1']:.4f}")
print(f"Recall: {final_metrics['recall']:.4f}")
print(f"最终得分: {final_metrics['score']:.4f}")

# 9. Save the final model
model.save('chinese_semantic_model_final')
print("\n模型已保存为: chinese_semantic_model_final")

In [1]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, InputExample
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score, recall_score

def custom_collate_fn(batch):
    """
    Collate a list of InputExample-like objects into a dict of parallel lists.

    Every item must expose ``texts`` (at least two entries) and ``label``.
    The result has the keys 'sentences1', 'sentences2' and 'labels', each a
    list in the same order as ``batch``.
    """
    first_texts, second_texts, targets = [], [], []
    for item in batch:
        first_texts.append(item.texts[0])
        second_texts.append(item.texts[1])
        targets.append(item.label)
    return dict(sentences1=first_texts, sentences2=second_texts, labels=targets)

class SentencePairDataset:
    """Indexable container of sentence pairs wrapped as ``InputExample`` objects.

    Each label is converted with ``float`` while the examples are built.
    """

    def __init__(self, sentences1, sentences2, labels):
        # zip() stops at the shortest of the three inputs.
        self.examples = []
        for first, second, target in zip(sentences1, sentences2, labels):
            self.examples.append(InputExample(texts=[first, second], label=float(target)))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]

def evaluate(model, data_loader, device, threshold=0.84):
    """
    Encode every sentence pair from ``data_loader`` and score the predictions.

    A pair is predicted positive (1) when the cosine similarity of its two
    embeddings is strictly greater than ``threshold``.

    Args:
        model: object exposing ``eval()`` and
            ``encode(sentences, convert_to_tensor=True, device=...)``.
        data_loader: iterable of batches with the keys 'sentences1',
            'sentences2' and 'labels' (the shape ``custom_collate_fn`` builds).
        device: device forwarded to ``model.encode``.
        threshold: similarity cut-off for the positive class.

    Returns:
        dict with 'accuracy', 'f1', 'recall' and 'score'
        (``0.6 * f1 + 0.2 * accuracy + 0.2 * recall``), plus the raw
        'similarity' scores and integer 'labels' as NumPy arrays for further
        analysis (e.g. trying other thresholds without re-encoding).

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    model.eval()
    embeddings1_parts = []
    embeddings2_parts = []
    collected_labels = []

    # Encode batch by batch; gradients are never needed for evaluation.
    with torch.no_grad():
        for batch in data_loader:
            embeddings1_parts.append(
                model.encode(batch['sentences1'], convert_to_tensor=True, device=device))
            embeddings2_parts.append(
                model.encode(batch['sentences2'], convert_to_tensor=True, device=device))
            collected_labels.extend(batch['labels'])

    if not embeddings1_parts:
        # torch.cat on an empty list fails with an opaque error; be explicit.
        raise ValueError("evaluate() received an empty data_loader; nothing to score")

    all_embeddings1 = torch.cat(embeddings1_parts, dim=0)
    all_embeddings2 = torch.cat(embeddings2_parts, dim=0)

    # Labels are only consumed by the CPU-side metric functions, so build them
    # on the CPU once instead of moving them to `device` and back repeatedly.
    labels_np = torch.tensor(collected_labels, dtype=torch.int).numpy()

    # One vectorised cosine-similarity pass over all pairs.
    similarity = torch.cosine_similarity(all_embeddings1, all_embeddings2, dim=1)
    preds = (similarity > threshold).cpu().numpy().astype(int)

    accuracy = accuracy_score(labels_np, preds)
    f1 = f1_score(labels_np, preds)
    recall = recall_score(labels_np, preds)
    score = 0.6 * f1 + 0.2 * accuracy + 0.2 * recall

    return {
        'accuracy': accuracy,
        'f1': f1,
        'recall': recall,
        'score': score,
        'similarity': similarity.cpu().numpy(),  # raw scores for further analysis
        'labels': labels_np                      # true labels for further analysis
    }

def test_best_model(test_file='csv/test.csv', model_path='best_model', batch_size=64, threshold=0.84,
                    thresholds=None):
    """
    Load a saved model and measure its performance on the test set.

    Args:
        test_file: path of the tab-separated test file (columns q1, q2, label,
            no header row).
        model_path: directory the model was saved to.
        batch_size: batch size used while encoding.
        threshold: similarity threshold above which a pair is predicted
            positive for the main reported metrics.
        thresholds: optional iterable of extra thresholds to compare after the
            main evaluation; defaults to 0.84, 0.845, 0.85, 0.855 and 0.86.

    Returns:
        dict: the metrics returned by ``evaluate`` at ``threshold``.

    Raises:
        Exception: whatever ``SentenceTransformer(model_path)`` raises is
            re-raised after the failure message has been printed.
    """
    if thresholds is None:
        thresholds = (0.84, 0.845, 0.85, 0.855, 0.86)

    # Pick CUDA when available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"使用设备: {device}")

    # 1. Load the test data
    print("加载测试数据...")
    test_data = pd.read_csv(test_file, sep='\t', header=None, names=['q1', 'q2', 'label'])
    print(f"测试集大小: {len(test_data)}")

    # 2. Load the saved model
    print(f"\n加载模型 {model_path}...")
    try:
        model = SentenceTransformer(model_path)
        model.to(device)
        print("模型加载成功")
        print(f"模型是否在GPU上: {next(model.parameters()).device}")
    except Exception as e:
        print(f"模型加载失败: {e}")
        raise

    # 3. Build the test loader
    print("准备测试集...")
    test_dataset = SentencePairDataset(
        test_data['q1'].tolist(),
        test_data['q2'].tolist(),
        test_data['label'].tolist()
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        pin_memory=True,
        num_workers=0,  # no worker processes
        collate_fn=custom_collate_fn
    )

    # 4. Evaluate
    print("进行测试评估...")
    test_metrics = evaluate(model, test_loader, device, threshold)

    # 5. Report
    print("\n测试结果:")
    print(f"Accuracy: {test_metrics['accuracy']:.4f}")
    print(f"F1-score: {test_metrics['f1']:.4f}")
    print(f"Recall: {test_metrics['recall']:.4f}")
    print(f"最终得分: {test_metrics['score']:.4f}")

    # 6. Compare thresholds using the similarities that were already computed,
    #    so the test set is not encoded again.
    print("\n尝试不同阈值:")
    for th in thresholds:
        preds = (test_metrics['similarity'] > th).astype(int)
        acc = accuracy_score(test_metrics['labels'], preds)
        f1 = f1_score(test_metrics['labels'], preds)
        recall = recall_score(test_metrics['labels'], preds)
        score = 0.6 * f1 + 0.2 * acc + 0.2 * recall
        print(f"阈值={th:.3f}: Accuracy={acc:.4f}, F1={f1:.4f}, Recall={recall:.4f}, Score={score:.4f}")

    return test_metrics

if __name__ == "__main__":
    # Run the test-set evaluation with the default arguments.
    test_metrics = test_best_model()

使用设备: cuda
加载测试数据...
测试集大小: 4401

加载模型 best_model...
模型加载成功
模型是否在GPU上: cuda:0
准备测试集...
进行测试评估...

测试结果:
Accuracy: 0.8278
F1-score: 0.8087
Recall: 0.7161
最终得分: 0.7940

尝试不同阈值:
阈值=0.840: Accuracy=0.8278, F1=0.8087, Recall=0.7161, Score=0.7940
阈值=0.845: Accuracy=0.8241, F1=0.8034, Recall=0.7068, Score=0.7882
阈值=0.850: Accuracy=0.8234, F1=0.8017, Recall=0.7023, Score=0.7862
阈值=0.855: Accuracy=0.8207, F1=0.7975, Recall=0.6947, Score=0.7816
阈值=0.860: Accuracy=0.8187, F1=0.7941, Recall=0.6880, Score=0.7778


threshold=0.84  0.8947 0.8358

threshold=0.85 0.8929 0.8359

threshold=0.855 0.8868 0.8369

threshold=0.86 0.8858 0.8350

threshold=0.845 0.8893 0.8351