# 中文语义相似度判断模型

使用 sentence-transformers 训练中文语义相似度判断模型,判断两个中文句子的语义是否相同。

## 主要步骤
1. 加载数据集
2. 加载预训练模型
3. 模型训练
4. 模型评估
5. 保存模型

In [None]:
# 导入必要的库
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score, recall_score
import numpy as np
from tqdm.auto import tqdm

# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"使用设备: {device}")

In [None]:
# Dataset loading
def load_data(train_path, dev_path, test_path):
    """Read the train, dev and test splits and report their sizes.

    Each file is parsed as tab-separated text without a header row; the
    three columns are named ``q1``, ``q2`` and ``label``.

    Args:
        train_path: Path of the training split.
        dev_path: Path of the validation split.
        test_path: Path of the test split.

    Returns:
        A ``(train_data, dev_data, test_data)`` tuple of DataFrames.
    """
    columns = ['q1', 'q2', 'label']
    # All three files are read before anything is printed.
    train_df, dev_df, test_df = (
        pd.read_csv(path, sep='\t', header=None, names=columns)
        for path in (train_path, dev_path, test_path)
    )

    print(f"训练集大小: {len(train_df)}")
    print(f"验证集大小: {len(dev_df)}")
    print(f"测试集大小: {len(test_df)}")
    return train_df, dev_df, test_df

# Read the three dataset splits from the local csv/ directory.
# NOTE(review): the train split has a .tsv extension while dev/test use .csv,
# yet load_data parses all three as tab-separated — confirm the file names.
train_data, dev_data, test_data = load_data(
    train_path='csv/train.tsv',
    dev_path='csv/dev.csv',
    test_path='csv/test.csv',
)

In [None]:
# Load the pretrained Chinese sentence-embedding model and move it to the
# selected device.
model = SentenceTransformer('shibing624/text2vec-base-chinese')
model = model.to(device)

# Build one InputExample per sentence pair. Walking the three columns in
# lockstep avoids DataFrame.iterrows(), which materialises a Series for every
# row and is needlessly slow on large training sets.
train_examples = [
    InputExample(texts=[q1, q2], label=float(label))
    for q1, q2, label in zip(
        train_data['q1'], train_data['q2'], train_data['label']
    )
]

# Training setup: shuffled mini-batches of 32 pairs, optimised with
# CosineSimilarityLoss against the float label of each pair.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.CosineSimilarityLoss(model)

# Warm up the learning rate over the first 10% of all optimisation steps
# (batches per epoch * number of epochs).
num_epochs = 3
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

print("开始训练...")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    show_progress_bar=True
)

print("训练完成!")

In [None]:
# Model evaluation
def evaluate_model(model, data, threshold=0.5):
    """Score sentence-pair predictions of ``model`` against the labels in ``data``.

    Both sentence columns are encoded, the cosine similarity of each pair is
    computed, and a pair is predicted as 1 when its similarity is strictly
    greater than ``threshold`` (0 otherwise).

    Args:
        model: Object exposing ``encode(sentences, convert_to_tensor=...,
            batch_size=...)``.
        data: Table with ``q1``, ``q2`` and ``label`` columns.
            NOTE(review): labels are presumably binary 0/1 — confirm.
        threshold: Similarity cut-off for a positive prediction.

    Returns:
        A dict with ``accuracy``, ``f1``, ``recall`` and ``final_score``,
        where ``final_score = 0.6 * f1 + 0.2 * accuracy + 0.2 * recall``.
    """
    left_sentences = data['q1'].tolist()
    right_sentences = data['q2'].tolist()

    # Encode each side in batches of 32, keeping the results as tensors.
    encode_options = {'convert_to_tensor': True, 'batch_size': 32}
    left_vectors = model.encode(left_sentences, **encode_options)
    right_vectors = model.encode(right_sentences, **encode_options)

    # Pairwise cosine similarity, thresholded into integer predictions.
    similarity = torch.cosine_similarity(left_vectors, right_vectors)
    predicted = (similarity > threshold).cpu().numpy().astype(int)

    gold = data['label']
    metrics = {
        'accuracy': accuracy_score(gold, predicted),
        'f1': f1_score(gold, predicted),
        'recall': recall_score(gold, predicted),
    }
    # Weighted blend of the three metrics.
    metrics['final_score'] = (
        0.6 * metrics['f1']
        + 0.2 * metrics['accuracy']
        + 0.2 * metrics['recall']
    )
    return metrics

# Evaluate the trained model on the test split and print a four-line report.
print("在测试集上评估模型...")
test_metrics = evaluate_model(model, test_data)

print("\n".join((
    "\n测试集评估结果:",
    f"Accuracy: {test_metrics['accuracy']:.4f}",
    f"F1-score: {test_metrics['f1']:.4f}",
    f"Recall: {test_metrics['recall']:.4f}",
    f"最终得分: {test_metrics['final_score']:.4f}",
)))

In [None]:
# Save the model to a local path (relative to the working directory) and
# report where it was written.
model_save_path = 'chinese_semantic_model'
model.save(model_save_path)
print(f"模型已保存到: {model_save_path}")