In [2]:
import pickle
import numpy as np
# Load the pickled sections; presumably a mapping of section name -> list of
# sentence strings (it is iterated by key and indexed by key below).
with open('structured_abstract_sections.pkl', 'rb') as f:
    normalized_sections = pickle.load(f)

# Flatten all sections into one sentence list, recording for each section the
# half-open [start, end] positions it occupies in that list.
sentences = []
indices = []
for normal_sect in normalized_sections:
    print(f'{normal_sect}: {len(normalized_sections[normal_sect])}')
    start = len(sentences)
    sentences.extend(normalized_sections[normal_sect])
    index = [start, len(sentences)]
    indices.append(index)

# One integer label per sentence; the end offset of the last section is the
# total sentence count.
labels = np.full(indices[-1][1], 0, dtype=int)

# Section number i (in iteration order) becomes the class id of its sentences.
for i, (start, end) in enumerate(indices):
    labels[slice(start, end)] = i


methods: 1492
background: 215
results: 314
conclusions: 488
objective: 523


In [4]:
from collections import defaultdict
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
import pandas

# Load the sentence encoder saved under models/ (the directory name suggests it
# was fine-tuned with ContrastiveLoss; that training is not shown here).
fine_tuned_model = SentenceTransformer("models/fine_tuned_sentence_bert_model_ContrastiveLoss")

# Try the new model: embed every collected sentence.
embeddings = fine_tuned_model.encode(sentences)


In [8]:
# Sanity check: one row per encoded sentence, one column per embedding dimension.
embeddings.shape

(3032, 384)

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np

# 1. PyTorch Dataset over precomputed embeddings; rows are (optionally) shuffled once on construction.
class EmbeddingDataset(Dataset):
    """In-memory dataset of (embedding, label) pairs.

    Args:
        embeddings: Array-like whose first axis indexes samples; stored as a
            float32 tensor.
        labels: Array-like of integer class ids, one per sample; stored as an
            int64 tensor.
        shuffle: If True (the default, matching the historical behaviour) the
            rows are permuted once at construction time. Pass False to keep
            the input order, e.g. for an evaluation set.
        seed: Optional integer. When given, the permutation is drawn from a
            private generator seeded with it, so the order is reproducible and
            NumPy's global RNG is left untouched. When None, the permutation
            is drawn from NumPy's global RNG exactly as before.

    Raises:
        ValueError: If ``embeddings`` and ``labels`` differ in length.
    """

    def __init__(self, embeddings, labels, shuffle=True, seed=None):
        embeddings = np.asarray(embeddings, dtype=np.float32)
        labels = np.asarray(labels, dtype=np.int64)

        # A length mismatch would otherwise pair rows with the wrong labels
        # (or silently drop rows) once both arrays are indexed below.
        if len(embeddings) != len(labels):
            raise ValueError(
                f"embeddings and labels must have the same length, "
                f"got {len(embeddings)} and {len(labels)}"
            )

        if shuffle:
            if seed is None:
                # Legacy path: in-place shuffle driven by the global NumPy RNG.
                order = np.arange(len(labels))
                np.random.shuffle(order)
            else:
                order = np.random.default_rng(seed).permutation(len(labels))
            # Apply the same permutation to both arrays so pairs stay aligned.
            embeddings = embeddings[order]
            labels = labels[order]

        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(embedding, label)`` tensors stored at position ``idx``."""
        return self.embeddings[idx], self.labels[idx]

In [6]:
# 2. Classification model
class Classifier(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Linear, returning raw logits.

    Args:
        embedding_dim: Size of each input feature vector.
        num_classes: Number of output logits (one per class).
        hidden_dim: Width of the hidden layer. Defaults to 128, the value that
            was previously hard-coded.
    """

    def __init__(self, embedding_dim, num_classes, hidden_dim=128):
        super().__init__()
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)  # hidden layer
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, num_classes)  # output layer
        # forward() never applies this module. It is kept only so the attribute
        # still exists for any code that reads `model.softmax`; it holds no
        # parameters, so it does not affect the state_dict.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map a batch of feature vectors to unnormalised class scores.

        No softmax is applied here: nn.CrossEntropyLoss expects raw logits and
        performs the log-softmax itself.
        """
        return self.fc2(self.relu(self.fc1(x)))

In [18]:
# 3. Define the training loop
from tqdm import tqdm
def train_model(train_loader, model, epochs=10, lr=0.001):
    """Fit ``model`` in place with Adam and cross-entropy loss.

    Args:
        train_loader: Iterable yielding ``(inputs, targets)`` batches.
        model: Module to optimise; it is switched to training mode.
        epochs: Number of full passes over ``train_loader``.
        lr: Learning rate handed to Adam.

    Returns:
        The same ``model`` object that was passed in, after training.
    """
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=lr)

    model.train()
    for epoch in range(epochs):
        print(f'epoch {epoch}:')
        batch_losses = []
        for features, targets in tqdm(train_loader):
            adam.zero_grad()
            batch_loss = loss_fn(model(features), targets)
            batch_loss.backward()
            adam.step()
            batch_losses.append(batch_loss.item())

        # The reported figure is the sum of per-batch losses, not their mean.
        total_loss = sum(batch_losses)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

    return model

In [19]:
def evaluate_model(test_loader, model):
    """Compute and print top-1 accuracy of ``model`` over ``test_loader``.

    The model is switched to eval mode and left in that mode; gradients are
    disabled while predicting.

    Args:
        test_loader: Iterable yielding ``(inputs, targets)`` batches.
        model: Module mapping a batch of inputs to per-class scores along
            dimension 1.

    Returns:
        The fraction of samples whose arg-max prediction equals the target,
        or NaN when ``test_loader`` yields no samples (previously this case
        raised ZeroDivisionError).
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch_embeddings, batch_labels in test_loader:
            predictions = model(batch_embeddings).argmax(dim=1)
            correct += (predictions == batch_labels).sum().item()
            total += batch_labels.size(0)

    # Accuracy is undefined without samples; report NaN instead of dividing by zero.
    accuracy = correct / total if total else float("nan")
    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy

In [21]:
from sklearn.model_selection import train_test_split

# Seed every RNG this cell relies on (dataset shuffling uses NumPy; weight
# initialisation and DataLoader batch order use torch) so a re-run repeats.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

num_samples = embeddings.shape[0]
embedding_dim = embeddings.shape[1]
# Derive the class count from the labels instead of hard-coding it; labels are
# consecutive integers starting at 0, so the largest id + 1 covers all of them.
num_classes = int(labels.max()) + 1

# Hold out 10% for testing. The sections are very unevenly sized, so stratify
# on the labels to keep the class proportions the same in both splits.
train_embeddings, test_embeddings, train_labels, test_labels = train_test_split(
    embeddings, labels, test_size=0.1, random_state=SEED, shuffle=True, stratify=labels
)

# 7. Build the PyTorch Datasets and DataLoaders
train_dataset = EmbeddingDataset(train_embeddings, train_labels)
test_dataset = EmbeddingDataset(test_embeddings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


# 8. Initialise and train the model
model = Classifier(embedding_dim, num_classes)
trained_model = train_model(train_loader, model)

# 9. Evaluate on the held-out split
# NOTE(review): the embeddings come from a separately fine-tuned encoder; if
# these sentences were part of that fine-tuning, the held-out accuracy is
# optimistic - confirm before relying on it.
evaluate_model(test_loader, trained_model)


epoch 0:


100%|██████████| 86/86 [00:00<00:00, 812.12it/s]


Epoch 1/10, Loss: 64.5294
epoch 1:


100%|██████████| 86/86 [00:00<00:00, 853.04it/s]


Epoch 2/10, Loss: 5.2218
epoch 2:


100%|██████████| 86/86 [00:00<00:00, 864.02it/s]


Epoch 3/10, Loss: 1.1370
epoch 3:


100%|██████████| 86/86 [00:00<00:00, 833.37it/s]


Epoch 4/10, Loss: 0.5337
epoch 4:


100%|██████████| 86/86 [00:00<00:00, 836.63it/s]


Epoch 5/10, Loss: 0.3191
epoch 5:


100%|██████████| 86/86 [00:00<00:00, 886.72it/s]


Epoch 6/10, Loss: 0.2147
epoch 6:


100%|██████████| 86/86 [00:00<00:00, 839.29it/s]


Epoch 7/10, Loss: 0.1543
epoch 7:


100%|██████████| 86/86 [00:00<00:00, 838.62it/s]


Epoch 8/10, Loss: 0.1167
epoch 8:


100%|██████████| 86/86 [00:00<00:00, 801.49it/s]


Epoch 9/10, Loss: 0.0918
epoch 9:


100%|██████████| 86/86 [00:00<00:00, 797.04it/s]

Epoch 10/10, Loss: 0.0742
Test Accuracy: 1.0000





1.0

In [22]:
# Refit a fresh classifier on every labelled embedding (no hold-out split).
dataset = EmbeddingDataset(embeddings, labels)
loader = DataLoader(dataset, shuffle=True, batch_size=32)

model = Classifier(embedding_dim, num_classes)
trained_model = train_model(loader, model)

# Persist the whole module object (a ".pt" extension would work equally well).
torch.save(model, "classifier_model.pth")


epoch 0:


100%|██████████| 95/95 [00:00<00:00, 863.90it/s]


Epoch 1/10, Loss: 62.9716
epoch 1:


100%|██████████| 95/95 [00:00<00:00, 894.10it/s]


Epoch 2/10, Loss: 3.4928
epoch 2:


100%|██████████| 95/95 [00:00<00:00, 870.94it/s]


Epoch 3/10, Loss: 0.8722
epoch 3:


100%|██████████| 95/95 [00:00<00:00, 848.59it/s]


Epoch 4/10, Loss: 0.4247
epoch 4:


100%|██████████| 95/95 [00:00<00:00, 843.94it/s]


Epoch 5/10, Loss: 0.2569
epoch 5:


100%|██████████| 95/95 [00:00<00:00, 876.00it/s]


Epoch 6/10, Loss: 0.1736
epoch 6:


100%|██████████| 95/95 [00:00<00:00, 876.99it/s]


Epoch 7/10, Loss: 0.1258
epoch 7:


100%|██████████| 95/95 [00:00<00:00, 846.97it/s]


Epoch 8/10, Loss: 0.0955
epoch 8:


100%|██████████| 95/95 [00:00<00:00, 813.48it/s]


Epoch 9/10, Loss: 0.0750
epoch 9:


100%|██████████| 95/95 [00:00<00:00, 839.56it/s]

Epoch 10/10, Loss: 0.0606



