## Takes all

In [1]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
# !pip install pandas
# !pip install -U sentence-transformers
# !pip install --upgrade transformers

In [2]:
# !pip show datasets

In [3]:
# !pip uninstall torchvision
# !pip install torchvision


In [4]:
import torch
print(torch.__version__)  
print(torch.version.cuda)  
print(torch.cuda.is_available())  
import torch
print(torch.__config__.show())
print(torch.cuda.device_count())  
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")  
print(torch.backends.cudnn.enabled)  


2.6.0+cu126
12.6
True
PyTorch built with:
  - GCC 11.2
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.5.3 (Git Hash 66f0cb9eb66affd2da3bf5f8d897376f04aae6af)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX512
  - CUDA Runtime 12.6
  - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90
  - CuDNN 90.5.1
  - Magma 2.6.1
  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=2236df1770800ffea5697b11b0bb0d910b2e59e1, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvi

In [5]:
import pickle

with open('structured_abstract_sections.pkl', 'rb') as f:
    normalized_sections = pickle.load(f)

normalized_sections 


defaultdict(list,
            {'methods': ['a case report',
              'acquisition of evidence',
              'action',
              'actions',
              'activities',
              'admission findings',
              'ams subject classification',
              'analyses',
              'analysis',
              'analytical approach',
              'analytical techniques',
              'anatomy',
              'anesthesia',
              'animal',
              'animal or sample population',
              'animal population',
              'animal studied',
              'animal studies',
              'animal(s)',
              'animals',
              'animals and interventions',
              'animals and methods',
              'animals or sample population',
              'animals studied',
              'animals, materials and methods',
              'animals, methods',
              'approach',
              'approach and methods',
              'area covered',
      

In [6]:
# def jaccard_similarity(set1, set2):
#     intersection = len(set1 & set2)
#     union = len(set1 | set2)
#     return intersection / union if union != 0 else 0

# jaccard_similarity({'Results', 'Results'}, {'Other', 'Conclusion'})


In [7]:
import random
from itertools import combinations
import pandas as pd


# Generate pairs with labels
pairs = []
for category, texts in normalized_sections.items():
    # Pairs within the same category
    category_pairs = list(combinations(texts, 2))
    sampled_pairs = random.sample(category_pairs, min(len(category_pairs), 20000))  # Randomly sample up to 5 pairs
    pairs.extend([(pair[0].lower(), pair[1].lower(), 1) for pair in sampled_pairs])  # Ensure label 1 for same-category pairs


# Pairs across different categories
categories = list(normalized_sections.keys())
for i in range(len(categories)):
    for j in range(i + 1, len(categories)):
        cross_category_pairs = [(text1.lower(), text2.lower(), 0) for text1 in normalized_sections[categories[i]] for text2 in normalized_sections[categories[j]]]
        pairs.extend(random.sample(cross_category_pairs, min(len(cross_category_pairs), 20000)))  # Randomly sample up to 5 pairs

random.shuffle(pairs)

len(pairs)

300000

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

class SentencePairDataset(Dataset):
    def __init__(self, sentence_pairs):
        self.samples = [InputExample(texts=[s1, s2], label=float(score)) for s1, s2, score in sentence_pairs]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


# 建立 Dataset
train_dataset = SentencePairDataset(pairs)

# sentence-transformers 需要一個特殊的 DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)


In [9]:
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer("all-MiniLM-L6-v2")
loss = losses.ContrastiveLoss(model)


In [10]:
from datasets import Dataset

In [11]:
# 訓練模型
model.fit(
    train_objectives=[(train_dataloader, loss)],
    epochs=3,
    warmup_steps=100
)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0288
1000,0.0134
1500,0.0078
2000,0.0046
2500,0.0027
3000,0.0019
3500,0.0012
4000,0.0011
4500,0.0008
5000,0.0008


In [12]:
# !pip install transformers[torch]

In [13]:
# 載入模型
model.save("models/fine_tuned_sentence_bert_model_ContrastiveLoss_lower")
fine_tuned_model = SentenceTransformer("models/fine_tuned_sentence_bert_model_ContrastiveLoss_lower")

# 測試新模型
sentences = ["app-group", "Clinical trials assessing bsAbs targeting immunomodulatory checkpoints"]
embeddings = fine_tuned_model.encode(sentences)

print(embeddings.shape)  # (2, 384) -> 2 個句子，每個有 384 維度的嵌入


(2, 384)


In [14]:
embeddings

array([[ 3.81922238e-02,  3.16624939e-02, -6.62201941e-02,
        -2.82742903e-02, -6.49375170e-02, -5.59838861e-03,
        -9.41137783e-03,  2.86402423e-02, -4.05042470e-02,
         5.26936613e-02,  6.55639917e-02, -4.11509126e-02,
        -7.74642546e-03,  3.77339683e-02, -3.14994454e-02,
        -1.78947672e-02, -8.22214857e-02,  5.32596884e-03,
        -9.29981247e-02,  1.81015301e-02,  1.04592443e-01,
        -1.74847133e-02, -6.05730414e-02, -2.37231720e-02,
        -5.20856120e-02,  7.31796026e-02,  2.75901631e-02,
         2.74554752e-02, -2.94812508e-02, -1.71331301e-01,
        -4.00610678e-02,  1.64926425e-02, -2.00661088e-05,
         9.91813187e-03, -5.47005087e-02,  3.74852531e-02,
         6.93096220e-02, -2.40421519e-02,  4.46278676e-02,
         2.62302905e-02, -3.85145992e-02, -7.76876509e-02,
        -1.45605893e-03, -5.00101037e-02,  6.56801462e-02,
         2.70292144e-02,  3.44689898e-02, -1.70487668e-02,
         2.87249368e-02,  1.49954846e-02, -7.27025047e-0

## Split TrainingSet for test

In [8]:
import pickle

with open('structured_abstract_sections.pkl', 'rb') as f:
    normalized_sections = pickle.load(f)

# normalized_sections 


In [9]:
import pickle
import numpy as np
with open('structured_abstract_sections.pkl', 'rb') as f:
    normalized_sections = pickle.load(f)

sentences = []
indices = []
for normal_sect in normalized_sections:
    index = [len(sentences)]
    sentences += [text.lower() for text in normalized_sections[normal_sect]]
    index.append(len(sentences))
    indices.append(index)
    
labels = np.zeros(indices[-1][1], dtype=int)  # 創建一個與最大索引一致的 0 陣列

# 根據 indices 填充標籤
for i, (start, end) in enumerate(indices):
    labels[start:end] = i


In [10]:
from sklearn.model_selection import train_test_split

train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.1, random_state=42, shuffle=True, stratify=labels
)

len(train_sentences), len(test_sentences), len(train_labels), len(test_labels)

(2728, 304, 2728, 304)

In [11]:
import itertools
import random

sentence_pairs = list(itertools.combinations(range(len(train_sentences)), 2))

# 設定要抽取的數量
num_samples = min(300000, len(sentence_pairs))  # 確保不超出總數

# 隨機抽取 num_samples 個 pair
random_pairs = random.sample(sentence_pairs, num_samples)

# 產生 pair 資料
paired_data = [
    (train_sentences[i], train_sentences[j], 1 if train_labels[i] == train_labels[j] else 0)
    for i, j in random_pairs
]

# 顯示結果
for pair in paired_data:
    print(pair)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



('materials and methods/results', 'methodology and principle findings', 1)
('animal studies', 'dependent variable', 1)
('aim of the work', 'study group', 0)
('study design, size and duration', 'studies included', 1)
('treatment and further course', 'context of the problem', 0)
('knowledge translation', 'participants and interventions', 0)
('solutions', 'ethnopharmacological evidence', 0)
('preventive measures', 'background data and objective', 0)
('exclusions', 'strategies for improvement', 1)
('tools', 'study questions', 0)
('key conclusions', 'setting, participants, and measurements', 0)
('design and patient', 'case management', 0)
('extraction', 'initial assessment', 1)
('objectives and background', 'main measurements', 0)
('treatment and further course', 'main findings', 0)
('participants, design, and setting', 'context', 0)
('outcome measurements and statistical analysis', 'methods and principal findings', 0)
('trial registration numbers', 'design, setting, and methods', 0)
('main

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

class SentencePairDataset(Dataset):
    def __init__(self, sentence_pairs):
        self.samples = [InputExample(texts=[s1, s2], label=float(score)) for s1, s2, score in sentence_pairs]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


# 建立 Dataset
train_dataset = SentencePairDataset(paired_data)

# sentence-transformers 需要一個特殊的 DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)


In [13]:
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer("all-MiniLM-L6-v2")
loss = losses.ContrastiveLoss(model)


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

In [14]:
from datasets import Dataset

In [15]:
# 訓練模型
model.fit(
    train_objectives=[(train_dataloader, loss)],
    epochs=3,
    warmup_steps=100
)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0227
1000,0.0078
1500,0.0051
2000,0.0039
2500,0.0029
3000,0.0023
3500,0.0019
4000,0.0017
4500,0.0013
5000,0.0011


In [16]:
# !pip install transformers[torch]

In [17]:
# 載入模型
model.save("models/fine_tuned_sentence_bert_model_ContrastiveLoss_test_lower")
fine_tuned_model = SentenceTransformer("models/fine_tuned_sentence_bert_model_ContrastiveLoss_test_lower")

# 測試新模型
sentences = ["app-group", "Clinical trials assessing bsAbs targeting immunomodulatory checkpoints"]
embeddings = fine_tuned_model.encode(sentences)

print(embeddings.shape)  # (2, 384) -> 2 個句子，每個有 384 維度的嵌入


(2, 384)


In [18]:
embeddings

array([[ 4.17924765e-03,  3.45877223e-02, -9.21235532e-02,
         1.44595020e-02, -3.96834910e-02, -1.86315421e-02,
        -6.50714338e-03,  2.53910720e-02,  1.79709296e-03,
         2.60717124e-02,  5.75391613e-02, -8.87471288e-02,
         1.00253765e-02,  1.94314849e-02, -8.96238983e-02,
        -1.58346463e-02, -2.86811660e-03,  4.30291928e-02,
        -4.04424444e-02, -2.73826607e-02,  5.41427769e-02,
         1.35404000e-03, -1.93879716e-02, -2.29367823e-03,
        -7.84898736e-03,  1.00642987e-01,  1.10555971e-02,
         5.48007106e-03, -1.86366923e-02, -1.65223181e-01,
        -3.02798674e-03,  5.59376646e-03,  8.65042880e-02,
         3.79058793e-02, -1.03149295e-01,  9.49927978e-03,
         7.09327981e-02, -1.97094008e-02,  3.68797965e-02,
         2.35274713e-02, -5.49881458e-02, -9.07984227e-02,
         1.08563807e-03, -2.47757938e-02,  7.69404247e-02,
         1.60226915e-02,  4.88440320e-03,  6.32022023e-02,
         4.57598455e-02,  7.04873502e-02, -3.35025974e-0

## Split TrainingSet for val (threshold)

In [19]:
import pickle

with open('structured_abstract_sections.pkl', 'rb') as f:
    normalized_sections = pickle.load(f)

# normalized_sections 


In [20]:
import pickle
import numpy as np
with open('structured_abstract_sections.pkl', 'rb') as f:
    normalized_sections = pickle.load(f)

sentences = []
indices = []
for normal_sect in normalized_sections:
    index = [len(sentences)]
    sentences += [text.lower() for text in normalized_sections[normal_sect]]
    index.append(len(sentences))
    indices.append(index)
    
labels = np.zeros(indices[-1][1], dtype=int)  # 創建一個與最大索引一致的 0 陣列

# 根據 indices 填充標籤
for i, (start, end) in enumerate(indices):
    labels[start:end] = i


In [30]:
from sklearn.model_selection import train_test_split

train_val_sentences, test_sentences, train_val_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.1, random_state=42, shuffle=True, stratify=labels
)

train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_val_sentences, train_val_labels, test_size=1/9, random_state=42, shuffle=True, stratify=train_val_labels
)

# np.unique(train_labels, return_counts=True)

In [31]:
import itertools
import random

sentence_pairs = list(itertools.combinations(range(len(train_sentences)), 2))

# 設定要抽取的數量
num_samples = min(300000, len(sentence_pairs))  # 確保不超出總數

# 隨機抽取 num_samples 個 pair
random_pairs = random.sample(sentence_pairs, num_samples)

# 產生 pair 資料
paired_data = [
    (train_sentences[i], train_sentences[j], 1 if train_labels[i] == train_labels[j] else 0)
    for i, j in random_pairs
]

# 顯示結果
for pair in paired_data:
    print(pair)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)




('data retrieval', 'historical background', 0)
('object & method', 'aim and purpose', 1)
('advantages', 'recommendation and perspective', 1)
('equipment', 'sample size', 1)
('discussion and results', 'outcomes measured', 0)
('experimental studies', 'implications for public health practice', 0)
('surgical procedures', 'investigation', 1)
('interventions', 'future and projects', 0)
('outputs', 'research methodology', 0)
('equipment', 'main observations', 1)
('implementation', 'data source and methods', 1)
('aims/purpose', 'relevance/impact', 0)
('aims & results', 'baseline data', 0)
('measurements and methods', 'results/findings', 0)
('implications for nursing management', 'rationale and aims', 0)
('cases report', 'the technology', 1)
('main outcomes measures', 'main components of program', 1)
('design(s)', 'main variables examined', 1)
('outcome parameters', 'comparison with existing method', 0)
('anamnesis', 'techniques', 0)
('data collection', 'main results', 0)
('background and meth

In [32]:
import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

class SentencePairDataset(Dataset):
    def __init__(self, sentence_pairs):
        self.samples = [InputExample(texts=[s1, s2], label=float(score)) for s1, s2, score in sentence_pairs]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


# 建立 Dataset
train_dataset = SentencePairDataset(paired_data)

# sentence-transformers 需要一個特殊的 DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)


In [33]:
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer("all-MiniLM-L6-v2")
loss = losses.ContrastiveLoss(model)


In [34]:
from datasets import Dataset

In [35]:
# 訓練模型
model.fit(
    train_objectives=[(train_dataloader, loss)],
    epochs=3,
    warmup_steps=100
)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0219
1000,0.008
1500,0.005
2000,0.0038
2500,0.0027
3000,0.0021
3500,0.0018
4000,0.0013
4500,0.0011
5000,0.0009


In [36]:
# !pip install transformers[torch]

In [37]:
# 載入模型
model.save("models/fine_tuned_sentence_bert_model_ContrastiveLoss_val_lower")
fine_tuned_model = SentenceTransformer("models/fine_tuned_sentence_bert_model_ContrastiveLoss_val_lower")

# 測試新模型
sentences = ["app-group", "Clinical trials assessing bsAbs targeting immunomodulatory checkpoints"]
embeddings = fine_tuned_model.encode(sentences)

print(embeddings.shape)  # (2, 384) -> 2 個句子，每個有 384 維度的嵌入


(2, 384)


In [38]:
embeddings

array([[ 4.31893347e-03,  2.82279011e-02, -8.73085782e-02,
        -5.86429425e-03, -1.77082680e-02, -5.44984406e-03,
         6.77867327e-03,  1.66296139e-02,  2.88612917e-02,
         1.64461471e-02,  4.93651740e-02, -9.32786241e-02,
         2.11312454e-02,  1.94746573e-02, -8.28971714e-02,
        -1.34033728e-02, -3.17748450e-02,  8.90460387e-02,
        -5.20083867e-02, -3.73761691e-02,  6.14107735e-02,
        -1.93957333e-02, -1.94289198e-03, -3.31252581e-03,
        -2.31436305e-02,  9.40469429e-02, -8.35296989e-04,
         1.25182047e-02, -2.85523273e-02, -1.76783144e-01,
        -2.71311565e-03, -5.43274032e-03,  8.56659710e-02,
         4.30930741e-02, -9.21915993e-02,  6.69465400e-03,
         7.16858134e-02, -9.48328432e-03,  3.44643928e-02,
         2.58485656e-02, -5.26101962e-02, -1.02019705e-01,
         7.74042308e-03, -1.20331552e-02,  8.83939788e-02,
         1.72494147e-02,  1.45379957e-02,  4.93470058e-02,
         6.24467842e-02,  5.58870733e-02, -5.14357723e-0