## Takes all

In [1]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
# !pip install pandas
# !pip install -U sentence-transformers
# !pip install --upgrade transformers

In [2]:
# !pip show datasets

In [3]:
# !pip uninstall torchvision
# !pip install torchvision


In [4]:
import torch
print(torch.__version__)  
print(torch.version.cuda)  
print(torch.cuda.is_available())  
import torch
print(torch.__config__.show())
print(torch.cuda.device_count())  
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")  
print(torch.backends.cudnn.enabled)  


2.6.0+cu126
12.6
True
PyTorch built with:
  - GCC 11.2
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.5.3 (Git Hash 66f0cb9eb66affd2da3bf5f8d897376f04aae6af)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX512
  - CUDA Runtime 12.6
  - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90
  - CuDNN 90.5.1
  - Magma 2.6.1
  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=2236df1770800ffea5697b11b0bb0d910b2e59e1, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvi

In [5]:
import pickle

with open('structured_abstract_sections.pkl', 'rb') as f:
    normalized_sections = pickle.load(f)

normalized_sections 


defaultdict(list,
            {'methods': ['a case report',
              'acquisition of evidence',
              'action',
              'actions',
              'activities',
              'admission findings',
              'ams subject classification',
              'analyses',
              'analysis',
              'analytical approach',
              'analytical techniques',
              'anatomy',
              'anesthesia',
              'animal',
              'animal or sample population',
              'animal population',
              'animal studied',
              'animal studies',
              'animal(s)',
              'animals',
              'animals and interventions',
              'animals and methods',
              'animals or sample population',
              'animals studied',
              'animals, materials and methods',
              'animals, methods',
              'approach',
              'approach and methods',
              'area covered',
      

In [6]:
# def jaccard_similarity(set1, set2):
#     intersection = len(set1 & set2)
#     union = len(set1 | set2)
#     return intersection / union if union != 0 else 0

# jaccard_similarity({'Results', 'Results'}, {'Other', 'Conclusion'})


In [7]:
import random
from itertools import combinations
import pandas as pd


# Generate pairs with labels
pairs = []
for category, texts in normalized_sections.items():
    # Pairs within the same category
    category_pairs = list(combinations(texts, 2))
    sampled_pairs = random.sample(category_pairs, min(len(category_pairs), 20000))  # Randomly sample up to 5 pairs
    pairs.extend([(pair[0].lower(), pair[1].lower(), 1) for pair in sampled_pairs])  # Ensure label 1 for same-category pairs


# Pairs across different categories
categories = list(normalized_sections.keys())
for i in range(len(categories)):
    for j in range(i + 1, len(categories)):
        cross_category_pairs = [(text1.lower(), text2.lower(), 0) for text1 in normalized_sections[categories[i]] for text2 in normalized_sections[categories[j]]]
        pairs.extend(random.sample(cross_category_pairs, min(len(cross_category_pairs), 20000)))  # Randomly sample up to 5 pairs

random.shuffle(pairs)

len(pairs)

300000

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

class SentencePairDataset(Dataset):
    def __init__(self, sentence_pairs):
        self.samples = [InputExample(texts=[s1, s2], label=float(score)) for s1, s2, score in sentence_pairs]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


# 建立 Dataset
train_dataset = SentencePairDataset(pairs)

# sentence-transformers 需要一個特殊的 DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)


In [9]:
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer("all-MiniLM-L6-v2")
loss = losses.ContrastiveLoss(model)


In [10]:
from datasets import Dataset

In [11]:
# 訓練模型
model.fit(
    train_objectives=[(train_dataloader, loss)],
    epochs=3,
    warmup_steps=100
)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0288
1000,0.0134
1500,0.0078
2000,0.0046
2500,0.0027
3000,0.0019
3500,0.0012
4000,0.0011
4500,0.0008
5000,0.0008


In [12]:
# !pip install transformers[torch]

In [13]:
# 載入模型
model.save("models/fine_tuned_sentence_bert_model_ContrastiveLoss_lower")
fine_tuned_model = SentenceTransformer("models/fine_tuned_sentence_bert_model_ContrastiveLoss_lower")

# 測試新模型
sentences = ["app-group", "Clinical trials assessing bsAbs targeting immunomodulatory checkpoints"]
embeddings = fine_tuned_model.encode(sentences)

print(embeddings.shape)  # (2, 384) -> 2 個句子，每個有 384 維度的嵌入


(2, 384)


In [14]:
embeddings

array([[ 3.81922238e-02,  3.16624939e-02, -6.62201941e-02,
        -2.82742903e-02, -6.49375170e-02, -5.59838861e-03,
        -9.41137783e-03,  2.86402423e-02, -4.05042470e-02,
         5.26936613e-02,  6.55639917e-02, -4.11509126e-02,
        -7.74642546e-03,  3.77339683e-02, -3.14994454e-02,
        -1.78947672e-02, -8.22214857e-02,  5.32596884e-03,
        -9.29981247e-02,  1.81015301e-02,  1.04592443e-01,
        -1.74847133e-02, -6.05730414e-02, -2.37231720e-02,
        -5.20856120e-02,  7.31796026e-02,  2.75901631e-02,
         2.74554752e-02, -2.94812508e-02, -1.71331301e-01,
        -4.00610678e-02,  1.64926425e-02, -2.00661088e-05,
         9.91813187e-03, -5.47005087e-02,  3.74852531e-02,
         6.93096220e-02, -2.40421519e-02,  4.46278676e-02,
         2.62302905e-02, -3.85145992e-02, -7.76876509e-02,
        -1.45605893e-03, -5.00101037e-02,  6.56801462e-02,
         2.70292144e-02,  3.44689898e-02, -1.70487668e-02,
         2.87249368e-02,  1.49954846e-02, -7.27025047e-0

## Split TrainingSet for test

In [15]:
import pickle

with open('structured_abstract_sections.pkl', 'rb') as f:
    normalized_sections = pickle.load(f)

# normalized_sections 


In [16]:
import pickle
import numpy as np
with open('structured_abstract_sections.pkl', 'rb') as f:
    normalized_sections = pickle.load(f)

sentences = []
indices = []
for normal_sect in normalized_sections:
    index = [len(sentences)]
    sentences += [text.lower() for text in normalized_sections[normal_sect]]
    index.append(len(sentences))
    indices.append(index)
    
labels = np.zeros(indices[-1][1], dtype=int)  # 創建一個與最大索引一致的 0 陣列

# 根據 indices 填充標籤
for i, (start, end) in enumerate(indices):
    labels[start:end] = i


In [17]:
from sklearn.model_selection import train_test_split

train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.1, random_state=42, shuffle=True
)

len(train_sentences), len(test_sentences), len(train_labels), len(test_labels)

(2728, 304, 2728, 304)

In [18]:
import itertools
import random

sentence_pairs = list(itertools.combinations(range(len(train_sentences)), 2))

# 設定要抽取的數量
num_samples = min(300000, len(sentence_pairs))  # 確保不超出總數

# 隨機抽取 num_samples 個 pair
random_pairs = random.sample(sentence_pairs, num_samples)

# 產生 pair 資料
paired_data = [
    (train_sentences[i], train_sentences[j], 1 if train_labels[i] == train_labels[j] else 0)
    for i, j in random_pairs
]

# 顯示結果
for pair in paired_data:
    print(pair)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



('design and sample', 'experimental design and results', 0)
('sampling', 'recommendation 3', 0)
('problems and aims', 'qualifying statements', 0)
('the aim of our study', 'innovation', 0)
('purpose/aim', 'global importance', 0)
('question', 'aims and objective', 1)
('participants and setting', 'method & material', 1)
('material/subjects and methods', 'material', 1)
('summary and conclusion', 'evolution', 0)
('aims and purpose', 'suggestions', 0)
('aims/objective', 'implications for health care provision', 0)
('response', 'summary and background', 0)
('in summary', 'outcomes and measurements', 0)
('purpose of the investigation', 'background and study aim', 1)
('main method', 'impact on industry', 0)
('methodological design and justification', 'implication for practice', 0)
('object and background', 'sample and methodology', 0)
('summary/conclusions', 'linking evidence to action', 1)
('mean outcome measure', 'backgrounds and purpose', 0)
('animals studied', 'background/rationale', 0)
('s

In [19]:
import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

class SentencePairDataset(Dataset):
    def __init__(self, sentence_pairs):
        self.samples = [InputExample(texts=[s1, s2], label=float(score)) for s1, s2, score in sentence_pairs]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


# 建立 Dataset
train_dataset = SentencePairDataset(paired_data)

# sentence-transformers 需要一個特殊的 DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)


In [20]:
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer("all-MiniLM-L6-v2")
loss = losses.ContrastiveLoss(model)


In [21]:
from datasets import Dataset

In [22]:
# 訓練模型
model.fit(
    train_objectives=[(train_dataloader, loss)],
    epochs=3,
    warmup_steps=100
)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0219
1000,0.0078
1500,0.0052
2000,0.0039
2500,0.0029
3000,0.0024
3500,0.0022
4000,0.0014
4500,0.0014
5000,0.0011


In [23]:
# !pip install transformers[torch]

In [24]:
# 載入模型
model.save("models/fine_tuned_sentence_bert_model_ContrastiveLoss_test_lower")
fine_tuned_model = SentenceTransformer("models/fine_tuned_sentence_bert_model_ContrastiveLoss_test_lower")

# 測試新模型
sentences = ["app-group", "Clinical trials assessing bsAbs targeting immunomodulatory checkpoints"]
embeddings = fine_tuned_model.encode(sentences)

print(embeddings.shape)  # (2, 384) -> 2 個句子，每個有 384 維度的嵌入


(2, 384)


In [25]:
embeddings

array([[-1.73796564e-02,  6.00601770e-02, -1.01303004e-01,
         6.66818675e-03, -2.97952276e-02, -1.27259572e-03,
         1.59807317e-02,  4.41389112e-03,  5.23447357e-02,
         1.75563674e-02,  5.22087589e-02, -8.08838159e-02,
        -3.54103657e-04,  9.07596550e-04, -1.06292926e-01,
        -5.30305970e-03,  3.24896653e-03,  1.81797165e-02,
        -5.90559244e-02, -2.75385268e-02,  5.91224469e-02,
        -1.84478518e-02, -1.04863839e-02, -2.16055498e-03,
        -5.03064319e-03,  1.36983320e-01, -9.41397715e-03,
         2.15070918e-02, -2.35166792e-02, -1.65800408e-01,
         1.42532140e-02, -3.52050690e-03,  4.97850738e-02,
         2.96302810e-02, -1.36532053e-01, -1.78069174e-02,
         8.30272064e-02, -3.28578837e-02,  2.41799615e-02,
         8.51735380e-03, -5.07006496e-02, -1.60126597e-01,
        -1.54736657e-02, -6.09617913e-03,  7.84252658e-02,
         6.15704758e-03,  5.26723973e-02,  5.29377498e-02,
         3.63083184e-02,  4.72613648e-02, -3.59471552e-0