In [4]:

import os
import random

# Set before sentence_transformers is imported, as in the original cell, so the
# Weights & Biases integration stays off.
# NOTE(review): the captured run log reports WANDB_DISABLED as deprecated
# (removal announced for v5) - revisit when upgrading the training stack.
os.environ["WANDB_DISABLED"] = "true"

import numpy as np
import pandas as pd
import torch
from numpy import dot
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from torch.utils.data import DataLoader

# --- Configuration ---------------------------------------------------------
SEED = 42
TRAIN_CSV = "Labeled_Substitution_Pairs.csv"
BASE_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
OUTPUT_DIR = "output_siamese_model"
TEXT_COLUMNS = ["Item_Desc", "Sub_Desc"]
LABEL_COLUMN = "Valid"
BATCH_SIZE = 16
EPOCHS = 4
WARMUP_STEPS = 100

# The DataLoader below shuffles, so seed every RNG that may be involved to keep
# the run repeatable under Restart & Run All.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# 1. Load the labelled (item description, substitute description, valid) pairs.
df = pd.read_csv(TRAIN_CSV)

# A row with a missing description or label cannot be a training example; a NaN
# label in particular would become a NaN regression target for the loss.
labeled = df.dropna(subset=TEXT_COLUMNS + [LABEL_COLUMN])
samples = [
    InputExample(texts=[item_desc, sub_desc], label=label)
    for item_desc, sub_desc, label in zip(
        labeled["Item_Desc"].astype(str),
        labeled["Sub_Desc"].astype(str),
        labeled[LABEL_COLUMN].astype(float).tolist(),
    )
]

# 2. Build the model: transformer encoder followed by a pooling layer sized to
#    the encoder's word-embedding dimension.
bert = models.Transformer(BASE_MODEL)
pooling = models.Pooling(bert.get_word_embedding_dimension())
model = SentenceTransformer(modules=[bert, pooling])

# 3. Wrap the samples in a shuffling DataLoader and attach the cosine-similarity loss.
train_dataloader = DataLoader(samples, shuffle=True, batch_size=BATCH_SIZE)
train_loss = losses.CosineSimilarityLoss(model)

# 4. Fine-tune and save the result to OUTPUT_DIR.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=WARMUP_STEPS,
    show_progress_bar=True,
    output_path=OUTPUT_DIR,
)

# 5. Sanity check: cosine similarity between two closely related descriptions.
emb1 = model.encode("LID PLUG")
emb2 = model.encode("WOODEN LID PLUG")
cos_sim = dot(emb1, emb2) / (norm(emb1) * norm(emb2))
print("Cosine Similarity:", cos_sim)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  return forward_call(*args, **kwargs)


Step,Training Loss


Cosine Similarity: 0.69084436


In [5]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# --- Configuration ---------------------------------------------------------
FULL_TABLE_CSV = "substitution_table_cleaned_for_labeling.csv"
MODEL_DIR = "output_siamese_model"
SUB_SLOTS = range(1, 4)   # the table stores candidates in Sub_1_* .. Sub_3_* columns
TOP_K = 3                 # number of ranked substitutes kept per item
CODE_COLUMNS = ["Item"] + [f"Sub_{i}_Code" for i in SUB_SLOTS]

# Load the full substitution table. Product codes are identifiers, not numbers:
# read them as text so they are never parsed as int/float and always compare
# consistently between the Item and Sub_*_Code columns.
df_full = pd.read_csv(FULL_TABLE_CSV, dtype={column: str for column in CODE_COLUMNS})

# Load the fine-tuned model saved by the training cell.
model = SentenceTransformer(MODEL_DIR)

# Unpivot the wide table into one row per (Item, Sub_Code) candidate pair.
pairs_df = pd.concat(
    [
        pd.DataFrame({
            "Item": df_full["Item"],
            "Item_Desc": df_full["Description"],
            "Sub_Code": df_full[f"Sub_{i}_Code"],
            "Sub_Desc": df_full[f"Sub_{i}_Desc"],
        })
        for i in SUB_SLOTS
    ],
    ignore_index=True,
)

pairs_df = (
    pairs_df
    # Rows with a missing code or description are not real candidates; drop
    # them so they can never be scored or ranked.
    .dropna(subset=["Item_Desc", "Sub_Code", "Sub_Desc"])
    # An item is not a substitute for itself.
    .loc[lambda frame: frame["Item"] != frame["Sub_Code"]]
    # Keep each (Item, Sub_Code) pair once.
    .drop_duplicates(subset=["Item", "Sub_Code"])
)

item_vecs = model.encode(
    pairs_df["Item_Desc"].astype(str).tolist(),
    convert_to_tensor=True,
    show_progress_bar=True,
)
sub_vecs = model.encode(
    pairs_df["Sub_Desc"].astype(str).tolist(),
    convert_to_tensor=True,
    show_progress_bar=True,
)

# Row-wise similarity cos(item_vecs[i], sub_vecs[i]). This replaces building the
# full N x N matrix with util.cos_sim and reading only its diagonal, which costs
# O(N^2) memory for the same N values (up to floating-point rounding).
scores = util.pairwise_cos_sim(item_vecs, sub_vecs).cpu().numpy()
pairs_df = pairs_df.assign(Score=scores)

# Best TOP_K candidates per item, highest score first.
top3_df = (
    pairs_df.sort_values(["Item", "Score"], ascending=[True, False])
    .groupby("Item")
    .head(TOP_K)
    .reset_index(drop=True)
)
top3_df["Rank"] = top3_df.groupby("Item").cumcount() + 1

# Pivot to one row per item. Reindex to the full (field, rank) grid first so the
# result always has every Sub_k / Score_k column, even when no item has TOP_K
# surviving candidates (a fixed-length rename would otherwise raise).
ranks = range(1, TOP_K + 1)
wide = top3_df.pivot(index="Item", columns="Rank", values=["Sub_Code", "Score"])
wide = wide.reindex(columns=pd.MultiIndex.from_product([["Sub_Code", "Score"], ranks]))
final = wide.reset_index()
final.columns = (
    ["Item"]
    + [f"Sub_{rank}" for rank in ranks]
    + [f"Score_{rank}" for rank in ranks]
)

# Preview the first rows.
final.head()


Batches:   0%|          | 0/237 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Batches:   0%|          | 0/237 [00:00<?, ?it/s]

Unnamed: 0,Item,Sub_1,Sub_2,Sub_3,Score_1,Score_2,Score_3
0,01440825DNU,1446305,1449108,,0.962323,0.940105,
1,01449315-1,1449326,1449320,,0.99403,0.989039,
2,06165-500,06165-900,6166001,06166-500,0.536422,0.513839,0.510709
3,06165-900,6166001,06166-500,06165-500,0.84839,0.777837,0.536422
4,06166-500,6166001,06165-900,06165-500,0.808913,0.777837,0.510709


In [6]:
# Build an item-code -> description lookup from the full table.
code_to_desc = df_full.set_index("Item")["Description"].to_dict()

# Attach the looked-up description next to each ranked substitute code.
# Codes that do not appear in the Item column map to NaN.
sub_ranks = (1, 2, 3)
for rank in sub_ranks:
    final[f"Sub_{rank}_Desc"] = final[f"Sub_{rank}"].map(code_to_desc)

# Show the result with each substitute's code, description and score side by side.
display_columns = ["Item"]
for rank in sub_ranks:
    display_columns += [f"Sub_{rank}", f"Sub_{rank}_Desc", f"Score_{rank}"]
final[display_columns].head()


Unnamed: 0,Item,Sub_1,Sub_1_Desc,Score_1,Sub_2,Sub_2_Desc,Score_2,Sub_3,Sub_3_Desc,Score_3
0,01440825DNU,1446305,38-400 WHITE RIBBED PP CAPS F217 LINER (1/EA),0.962323,1449108,70-400 BLACK PP CAPS F217 LINER (1/EA),0.940105,,,
1,01449315-1,1449326,70-450G WHITE METAL CAP NO BUTTON PLASTISOL LI...,0.99403,1449320,70-450G BLACK METAL CAP W/ BUTTON PLASTISOL LI...,0.989039,,,
2,06165-500,06165-900,WHITE LINEN DINNER NAPKIN (900/CS),0.536422,6166001,NATURAL LINEN DINNER NAPKIN (600/CS),0.513839,06166-500,HOFFMASTER NATURAL LINEN DINNER NAPKIN (500/CS),0.510709
3,06165-900,6166001,NATURAL LINEN DINNER NAPKIN (600/CS),0.84839,06166-500,HOFFMASTER NATURAL LINEN DINNER NAPKIN (500/CS),0.777837,06165-500,LT1217-6 WHITE LINEN DINNER NAPKIN / GUEST TOW...,0.536422
4,06166-500,6166001,NATURAL LINEN DINNER NAPKIN (600/CS),0.808913,06165-900,WHITE LINEN DINNER NAPKIN (900/CS),0.777837,06165-500,LT1217-6 WHITE LINEN DINNER NAPKIN / GUEST TOW...,0.510709
