In [12]:
import json
import random
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.datasets import SentencesDataset
from sentence_transformers import InputExample
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

# 定义数据文件路径
DATA_FILE = 'labeled_data.json'

# 加载标注数据
with open(DATA_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)

# 创建 InputExample 列表
train_examples = []
for item in data:
    substring = item['substring'].strip()
    matched_chunk = item['matched_chunk'].strip()
    if not substring or not matched_chunk:
        continue  # 跳过空的句子对
    train_examples.append(InputExample(
        texts=[substring, matched_chunk],
        label=float(item['label'])  # 确保标签为 float 类型
    ))


# 创建 SentencesDataset 和 DataLoader
train_dataset = SentencesDataset(train_examples, model=SentenceTransformer('multi-qa-MiniLM-L6-cos-v1'))
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1)


In [13]:
from sentence_transformers import losses

# 定义损失函数
train_loss = losses.CosineSimilarityLoss(model=SentenceTransformer('multi-qa-MiniLM-L6-cos-v1'))


In [15]:
# 初始化模型
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# 配置优化器参数
optimizer_params = {
    'lr': 1e-5  # 学习率可以根据需要调整，例如 1e-5, 3e-5, 5e-5 等
}

# 启动微调
model.fit(    
    train_objectives=[(train_dataloader, train_loss)],    
    epochs=3,    
    warmup_steps=100,    
    optimizer_params=optimizer_params,  # 使用默认优化器 AdamW 并设置学习率
    use_amp=None  # 启用自动混合精度（如果支持）
)


100%|██████████| 30/30 [00:01<00:00, 26.06it/s]

{'train_runtime': 1.1503, 'train_samples_per_second': 26.081, 'train_steps_per_second': 26.081, 'train_loss': 0.06850192546844483, 'epoch': 3.0}





In [None]:
model.save('fine_tuned_simcse_model')
