In [2]:
import pandas as pd

# Read the tab-separated source table.
df = pd.read_csv('PMCXML6.tsv', delimiter='\t')

# x        -> extracted title texts (at most data_size of them)
# sections -> for each kept text, the set {section, altsection} of its row
x = []
sections = []
data_size = 177918  # upper bound on the number of texts collected
skipped_rows = 0    # rows whose title could not be split into >= 3 fields
for ind, row in df.iterrows():
    if len(x) >= data_size:
        break
    title = row['title']
    # A missing title is read by pandas as a float NaN, which has no .split();
    # treat it like a title with no usable field.
    parts = title.split('>') if isinstance(title, str) else []
    if len(parts) < 3:
        # Previously this raised IndexError/AttributeError and aborted the
        # whole loop; skip the malformed row and count it for inspection.
        skipped_rows += 1
        continue
    # The wanted text is the third '>'-separated field of the title.
    desired_text = parts[2].strip()
    if desired_text != "":
        x.append(desired_text)
        sections.append({row['section'], row['altsection']})

len(x)


177918

In [3]:

from sentence_transformers import SentenceTransformer, InputExample, losses
# Load the fine-tuned SentenceTransformer model from the given path.
fine_tuned_model = SentenceTransformer(
    "models/fine_tuned_sentence_bert_model_ContrastiveLoss"
)

# Try the new model: encode every extracted text in x.
embeddings = fine_tuned_model.encode(x)

In [7]:
import torch
import torch.nn as nn

# Redefine the Classifier class (presumably so that a saved instance of it can
# be resolved when the model file is loaded later in the notebook).
class Classifier(nn.Module):
    """Feed-forward classification head: Linear -> ReLU -> Linear.

    Args:
        embedding_dim: Size of the last dimension of the input.
        num_classes: Number of output logits per sample.
    """

    def __init__(self, embedding_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(embedding_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return the raw (un-normalised) class logits for ``x``."""
        hidden = self.relu(self.fc1(x))
        # No softmax is applied here because CrossEntropyLoss already
        # handles it internally.
        return self.fc2(hidden)

# Allow PyTorch to load Classifier by adding it to the safe-globals allowlist
# (PyTorch 2.6 tightened its loading security restrictions).
# NOTE(review): the torch.load call later in this notebook passes
# weights_only=False; this allowlist is presumably only consulted for
# weights_only=True loads — confirm whether this registration is still needed.
torch.serialization.add_safe_globals([Classifier])


In [8]:
import torch

# SECURITY: weights_only=False performs a full unpickle, which can execute
# arbitrary code embedded in the file — only load checkpoints you trust.
# map_location="cpu" lets a checkpoint that was saved on a GPU load on a
# CPU-only machine and keeps the model on the CPU, where the NumPy embeddings
# fed to it are converted to tensors.
model = torch.load("classifier_model.pth", map_location="cpu", weights_only=False)

model.eval()  # switch to evaluation mode

# 3. **Define the predict function**
def predict(model, new_embeddings):
    """Return the predicted class index for every row of ``new_embeddings``.

    Args:
        model: Callable (typically an ``nn.Module``) mapping a float32 tensor
            to a 2-D tensor of per-class scores.
        new_embeddings: Array-like, NumPy array or tensor of input vectors.

    Returns:
        NumPy array holding, per row, the index of the highest score (dim=1).
    """
    # as_tensor reuses an existing tensor instead of copying it, which also
    # avoids the UserWarning torch.tensor() emits for tensor inputs.
    inputs = torch.as_tensor(new_embeddings, dtype=torch.float32)

    # Run on the device the model's parameters live on; plain callables and
    # parameter-free modules leave the inputs where they are.
    params = getattr(model, "parameters", None)
    first_param = next(iter(params()), None) if callable(params) else None
    if first_param is not None:
        inputs = inputs.to(first_param.device)

    with torch.no_grad():  # no gradient tracking is needed for inference
        outputs = model(inputs)  # forward pass
        predictions = torch.argmax(outputs, dim=1)  # highest-scoring class per sample

    # .cpu() makes the NumPy conversion valid even when the model ran on a GPU.
    return predictions.cpu().numpy()

# 4. **Test on new data**
import numpy as np

# Label every embedding with the loaded classifier.
predictions_supervised = predict(model, embeddings)
print("Predicted labels:", predictions_supervised)


Predicted labels: [3 1 1 ... 0 0 0]


In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Stored reference vectors (assumed to be one average embedding per class —
# TODO confirm against the code that produced averages.npy).
averages = np.load('averages.npy')

# Cosine similarity of every embedding against every reference vector; the
# predicted label is the index of the most similar reference.
similarity_matrix = cosine_similarity(embeddings, averages)
predicted_averages = similarity_matrix.argmax(axis=1)
print("Predicted labels:", predicted_averages)


Predicted labels: [3 1 1 ... 0 0 0]


In [10]:
def get_acc(pred, y_test):
    """Return the percentage of positions at which ``pred`` equals ``y_test``.

    Args:
        pred: Predicted labels (array-like).
        y_test: Reference labels (array-like) with the same shape as ``pred``.

    Returns:
        Percentage in [0, 100], or NaN when the inputs are empty.

    Raises:
        ValueError: If ``pred`` and ``y_test`` have different shapes.
    """
    # Convert up front: comparing two Python lists with == yields a single
    # bool rather than an element-wise result, which silently gave a wrong score.
    pred = np.asarray(pred)
    y_test = np.asarray(y_test)
    if pred.shape != y_test.shape:
        raise ValueError(
            f"pred and y_test must have the same shape, got {pred.shape} and {y_test.shape}"
        )
    if y_test.size == 0:
        # Accuracy over zero samples is undefined; avoid the division by zero.
        return np.float64("nan")
    return np.sum(y_test == pred) / y_test.size * 100

# Percentage of samples on which the nearest-average labels agree with the
# supervised classifier's labels. The classifier output fills the y_test slot,
# so this is agreement between two predictors, not accuracy against ground truth.
get_acc(pred=predicted_averages, y_test=predictions_supervised)

99.56834047145314