## whole

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
import pandas

In [None]:
import pickle
with open('structured_abstract_sections.pkl', 'rb') as f:
    normalized_sections = pickle.load(f)

sentences = []
indices = []
for normal_sect in normalized_sections:
    print(f'{normal_sect}: {len(normalized_sections[normal_sect])}')
    index = [len(sentences)]
    sentences += [text.lower() for text in normalized_sections[normal_sect]]
    index.append(len(sentences))
    indices.append(index)
    
print(indices)

methods: 1492
background: 215
results: 314
conclusions: 488
objective: 523
[[0, 1492], [1492, 1707], [1707, 2021], [2021, 2509], [2509, 3032]]


In [None]:

from collections import defaultdict
from tqdm import tqdm
fine_tuned_model = SentenceTransformer("models/fine_tuned_sentence_bert_model_ContrastiveLoss_lower")

# 測試新模型
embeddings = fine_tuned_model.encode(sentences)

import numpy as np
# np.save("embeddings_lower_test.npy", embeddings)

# 读取 embeddings
# loaded_embeddings = np.load("embeddings_lower.npy")


In [4]:
averages = []
for start, end in indices:
    embedding = embeddings[start:end]
    averages.append(np.mean(embedding, axis=0))

In [5]:
averages = np.array(averages)
np.save("averages_lower.npy", averages)

In [6]:
print(averages)

[[ 0.03558871  0.03356181 -0.06619266 ...  0.10495782  0.01723036
   0.0366231 ]
 [-0.13124542  0.07400809 -0.09880937 ...  0.03050551 -0.00203388
   0.02440936]
 [-0.07807221  0.11255819 -0.10282388 ...  0.00877835  0.00285315
  -0.00887733]
 [-0.09178653  0.079993   -0.07731932 ...  0.05086267  0.01297714
  -0.02654091]
 [-0.07913794  0.06918538 -0.09001073 ...  0.09340516 -0.0497851
   0.01945007]]


## test

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
import pandas

In [2]:
import pickle
import numpy as np
with open('structured_abstract_sections.pkl', 'rb') as f:
    normalized_sections = pickle.load(f)

sentences = []
indices = []
for normal_sect in normalized_sections:
    print(f'{normal_sect}: {len(normalized_sections[normal_sect])}')
    index = [len(sentences)]
    sentences += [text.lower() for text in normalized_sections[normal_sect]]
    index.append(len(sentences))
    indices.append(index)
    
labels = np.zeros(indices[-1][1], dtype=int)  # 創建一個與最大索引一致的 0 陣列

# 根據 indices 填充標籤
for i, (start, end) in enumerate(indices):
    labels[start:end] = i


methods: 1492
background: 215
results: 314
conclusions: 488
objective: 523


In [3]:
from collections import defaultdict
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
import pandas

fine_tuned_model = SentenceTransformer("models/fine_tuned_sentence_bert_model_ContrastiveLoss_test_lower")

# 測試新模型
embeddings = fine_tuned_model.encode(sentences)


In [4]:
from sklearn.model_selection import train_test_split

num_samples = embeddings.shape[0]
embedding_dim = embeddings.shape[1]
num_classes = 5

train_embeddings, test_embeddings, train_labels, test_labels = train_test_split(
    embeddings, labels, test_size=0.1, random_state=42, shuffle=True
)

In [5]:
averages = []
for i in range(5):
    mask = train_labels == i
    averages.append(np.mean(train_embeddings[mask], axis=0))

In [6]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


averages = np.load('averages_lower.npy')
threshold = 0.95
similarity_matrix = cosine_similarity(test_embeddings, averages)
predicted_averages = np.argmax(similarity_matrix, axis=1)
print("Predicted labels:", predicted_averages)

Predicted labels: [0 0 0 3 2 3 0 2 4 1 3 1 0 0 0 2 0 0 0 1 4 0 0 0 2 4 0 0 0 0 1 0 2 4 2 2 0
 4 1 0 2 2 0 3 0 4 0 0 2 3 4 2 4 3 3 3 0 1 0 1 0 0 0 3 0 3 0 3 0 0 0 0 0 3
 0 0 4 0 0 3 2 0 4 4 4 0 0 4 4 0 3 0 0 1 3 0 0 4 4 4 2 0 0 0 0 3 4 3 3 0 0
 0 0 3 1 0 3 0 0 4 0 3 4 4 4 3 2 4 0 2 0 0 1 2 0 3 0 0 2 0 3 0 2 3 4 0 0 3
 0 3 0 0 4 0 1 0 0 3 4 0 0 4 3 0 2 0 4 0 0 0 0 0 0 0 4 1 0 4 0 0 0 1 0 4 1
 3 0 3 3 1 2 0 3 4 0 2 4 0 3 3 4 0 0 0 0 1 0 0 0 4 3 4 0 0 2 0 0 0 0 0 4 0
 0 0 3 0 4 3 0 0 0 2 0 4 0 2 2 0 4 2 4 2 0 1 4 0 0 0 0 4 3 1 0 1 0 4 0 2 4
 0 4 4 2 0 2 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 4 0 0 0 0 4 0 0 3 0 0 0 4 0 2
 0 4 4 1 0 2 0 0]


In [8]:
def get_acc(pred, y_test):
    return np.sum(y_test == pred) / len(y_test)

acc = get_acc(predicted_averages, test_labels)

from sklearn.metrics import precision_score, recall_score, f1_score
precision = precision_score(test_labels, predicted_averages, average='weighted')  # 二分类
recall = recall_score(test_labels, predicted_averages, average='weighted')
f1 = f1_score(test_labels, predicted_averages, average='weighted')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"accuracy: {acc:.4f}")


Precision: 0.9233
Recall: 0.9211
F1-score: 0.9219
accuracy: 0.9211
