In [1]:
import pandas as pd

# Load the raw article table from CSV and display it.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, presumably an
# index that was written out when the CSV was saved — confirm.
df_test = pd.read_csv("../../data/all_articles5-v2.csv")
df_test

Unnamed: 0,article_id,pmid,title,abstract,full_text,section_names,sections,missing_fields
0,PMC498926,26530308,Erratum,"<S> lopez de lapuente a, pinto-medel mj, astob...","in the above-mentioned article, published in m...",['Body'],"['in the above-mentioned article, published in...",[]
1,PMC516104,28018390,Emerging Technologies for the Production of Re...,<S> plant cell walls are composed predominantl...,the demand for chemical energy is projected to...,"['Introduction', 'Biofuel Feedstocks', 'Engine...",['the demand for chemical energy is projected ...,[]
2,PMC503397,27721798,Cardiac Troponin and Tropomyosin: Structural a...,<S> inherited myopathies affect both skeletal ...,cardiomyopathies represent a collection of dis...,"['Introduction', 'From code to message', 'The ...",['cardiomyopathies represent a collection of d...,[]
3,PMC831342,34327328,Corrigendum: A Frequency-Domain Machine Learni...,,results of a bivariate regression of oef0 agai...,['Body'],['results of a bivariate regression of oef0 ag...,['abstract']
4,PMC878323,35110838,Evolving Techniques in RSI: Can the Choice of ...,"<S> how to cite this article: george b, joachi...",,['O'],[''],[]
...,...,...,...,...,...,...,...,...
71915,PMC746268,32130345,2020 Brazilian Thoracic Association recommenda...,<S> abstract\nthe pharmacological management o...,the pharmacological management of asthma has c...,"['INTRODUCTION', 'Concept', 'Epidemiology of a...",['the pharmacological management of asthma has...,[]
71916,PMC875428,35082458,Change is Inevitable Progress is Optional,<S> how to cite this article: peter j. change ...,the most important factor that determined the ...,"['T', 'B', 'D']",['the most important factor that determined th...,[]
71917,PMC343773,22932762,Novel Vectors of Malaria Parasite in the Weste...,<S> the main method of malaria control is base...,to the editor: the primary malaria control tec...,['Body'],['to the editor: the primary malaria control t...,[]
71918,PMC986711,36677594,Natural Products for Cosmetic Applications,,natural products provide an interesting and la...,['Body'],['natural products provide an interesting and ...,['abstract']


In [2]:
# Rename the full-text column to "article", then keep only rows where both the
# article body and the abstract are present and not whitespace-only.
df_test = (
    df_test
    .rename(columns={"full_text": "article"})
    .dropna(subset=["article", "abstract"])
)

# Both masks are evaluated on the same frame, so they can be combined with `&`.
has_text = (
    df_test["article"].str.strip().astype(bool)
    & df_test["abstract"].str.strip().astype(bool)
)

# `.loc[...]` followed by `.astype(...)` builds a fresh frame instead of assigning
# columns on a filtered slice, which avoids pandas' SettingWithCopyWarning.
df_test = df_test.loc[has_text].astype({"article": str, "abstract": str})
len(df_test)

46309

In [4]:
import ast
# Parse the stringified Python lists read from the CSV into real lists.
# Values that are already lists are passed through unchanged so this cell can be
# re-run safely; anything else still goes through ast.literal_eval and fails loudly.
df_test['section_names'] = df_test['section_names'].apply(
    lambda value: value if isinstance(value, list) else ast.literal_eval(value)
)
df_test['section_names']

0                                                   [Body]
1        [Introduction, Biofuel Feedstocks, Engineering...
2        [Introduction, From code to message, The sarco...
5        [DATA AVAILABILITY STATEMENT, CONFLICT OF INTE...
6        [1. Application of the Next-Generation Sequenc...
                               ...                        
71910    [Background, Main body, Conclusion, Supplement...
71914                                               [Body]
71915    [INTRODUCTION, Concept, Epidemiology of asthma...
71916                                            [T, B, D]
71917                                               [Body]
Name: section_names, Length: 46309, dtype: object

In [5]:
from itertools import chain
# Flatten the per-article heading lists into one flat list, preserving row order.
texts = [heading for headings in df_test["section_names"] for heading in headings]
texts

['Body',
 'Introduction',
 'Biofuel Feedstocks',
 'Engineering Plants To Enhance Cellulosic Biomass',
 'Challenges In Bioethanol Production – From Pre-Treatment To Fermentation',
 'Pre-treatments',
 'Biological Pre-treatments',
 'Chemical Pre-treatments',
 'Physical Pre-treatments',
 'Thermal Pre-treatments',
 'Enzymatic Pre-treatment',
 'Fermentation',
 'Integration Of Multiple Technologies For Lignocellulose Conversion',
 'Cellulosic Biofuel Conversion Methods And The Prospect Of Emerging Technologies',
 'The Importance of Government Policies for a Successful Biofuels Industry',
 'Future Directions',
 'Author Contributions',
 'Conflict of Interest Statement',
 'Introduction',
 'From code to message',
 'The sarcomere',
 'The thin filament regulators',
 'Troponin I',
 'Troponin T',
 'Troponin C',
 'Tropomyosin',
 'Allosteric communication defines the hierarchy of functional and pathological states',
 'Structural biology efforts and treatment in the sarcomere',
 'Signaling in hypertroph

In [7]:
from sentence_transformers import SentenceTransformer, InputExample, losses
# Load the fine-tuned sentence-embedding model from a local directory; its
# encode() method is used later in this notebook to embed section headings.
fine_tuned_bert_model = SentenceTransformer("../../../section_normalization/models/fine_tuned_sentence_bert_model_ContrastiveLoss_test_lower")



In [8]:
import torch
import torch.nn as nn

class Classifier(nn.Module):
    """Two-layer feed-forward head: Linear -> ReLU -> Linear.

    Args:
        embedding_dim: Size of each input embedding vector.
        num_classes: Number of output logits produced per input.
    """

    def __init__(self, embedding_dim, num_classes):
        super().__init__()
        # Attribute names are part of the module's state and must stay as they are.
        self.fc1 = nn.Linear(embedding_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return raw (un-normalised) class logits for the input batch `x`."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Add Classifier to torch's allowlist of globals that may be unpickled by
# torch.load(..., weights_only=True).
# NOTE(review): the torch.load call later in this notebook passes weights_only=False,
# which presumably does not consult this allowlist — confirm whether this is still needed.
torch.serialization.add_safe_globals([Classifier])


In [9]:
import torch
import pickle

# Load the pickled classifier object from disk.
# SECURITY NOTE: weights_only=False lets torch.load unpickle arbitrary objects, which
# can execute arbitrary code — only load checkpoint files from a trusted source.
model = torch.load("../../../section_normalization/classsifer_model/classifier_model_lower_test.pth", weights_only=False)

# Put the module in evaluation mode before running inference.
model.eval()  
# Minimum softmax probability a prediction needs; predict() reads this global and
# returns -1 for rows whose top probability is below it.
threshold = 0.4

def predict(model, new_embeddings, min_confidence=None):
    """Classify embeddings with `model`, returning -1 for low-confidence rows.

    Args:
        model: A torch module that maps a 2-D float tensor of embeddings to
            per-row class logits (softmax is taken over dim 1).
        new_embeddings: Array-like of embeddings. A single 1-D embedding is
            treated as a batch of one; an empty input yields an empty result.
        min_confidence: Optional softmax-probability cutoff. When omitted, the
            notebook-level `threshold` variable is used, exactly as before.

    Returns:
        A 1-D numpy integer array with one entry per input row: the index of
        the most probable class, or -1 when that probability is below the cutoff.
    """
    # Resolve the cutoff; the global is only read when no override is supplied.
    cutoff = threshold if min_confidence is None else min_confidence

    features = torch.as_tensor(new_embeddings, dtype=torch.float32)

    # Nothing to classify: skip the forward pass, which would otherwise fail on
    # a tensor that has no feature dimension.
    if features.numel() == 0:
        return torch.empty(0, dtype=torch.long).numpy()

    # A lone embedding vector becomes a batch of one so that dim=1 exists below.
    if features.dim() == 1:
        features = features.unsqueeze(0)

    # Pick GPU when available, otherwise CPU, and move model and inputs there.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    features = features.to(device)

    with torch.no_grad():  # inference only: no gradient tracking needed
        logits = model(features)
        probabilities = torch.softmax(logits, dim=1)
        # Highest probability per row and the class index it belongs to.
        max_probs, max_indices = torch.max(probabilities, dim=1)
        # Rows that do not reach the cutoff are labelled -1.
        predictions = torch.where(
            max_probs >= cutoff,
            max_indices,
            torch.full_like(max_indices, -1),
        )

    return predictions.cpu().numpy()


In [10]:
# Class names in index order; position in this tuple is the classifier's class id.
_section_labels = ("methods", "background", "results", "conclusions", "objective")

# Name -> id. "none" maps to -1, the id used for below-threshold predictions.
label2id = {name: idx for idx, name in enumerate(_section_labels)}
label2id["none"] = -1

# Inverse lookup: id -> name.
id2label = dict(zip(label2id.values(), label2id.keys()))

In [11]:
predictions_all = []

# Classify every article's section headings, one article at a time.
# The loop variable is called `section_names` (not `texts`) so it does not
# overwrite the flattened `texts` list built earlier in the notebook.
for section_names in df_test['section_names']:
    # An article without headings has nothing to classify; skip the model calls,
    # which are not guaranteed to accept an empty batch.
    if len(section_names) == 0:
        predictions_all.append([])
        continue

    # Headings are lower-cased before encoding; a missing heading becomes "".
    clean_texts = [name.lower() if name is not None else "" for name in section_names]

    embeddings = fine_tuned_bert_model.encode(clean_texts)
    predictions_supervised = predict(model, new_embeddings=embeddings)

    # Missing headings are always labelled "none"; otherwise map the class id
    # (including -1 for low-confidence predictions) to its label name.
    predictions_all.append([
        "none" if raw_name is None else id2label[int(pred)]
        for raw_name, pred in zip(section_names, predictions_supervised)
    ])

# assign() returns a new frame, so no column is set on a filtered slice in place.
df_test = df_test.assign(predicted_section_names=predictions_all)
df_test

Unnamed: 0,article_id,pmid,title,abstract,article,section_names,sections,missing_fields,predicted_section_names
0,PMC498926,26530308,Erratum,"<S> lopez de lapuente a, pinto-medel mj, astob...","in the above-mentioned article, published in m...",[Body],"['in the above-mentioned article, published in...",[],[methods]
1,PMC516104,28018390,Emerging Technologies for the Production of Re...,<S> plant cell walls are composed predominantl...,the demand for chemical energy is projected to...,"[Introduction, Biofuel Feedstocks, Engineering...",['the demand for chemical energy is projected ...,[],"[background, methods, methods, objective, meth..."
2,PMC503397,27721798,Cardiac Troponin and Tropomyosin: Structural a...,<S> inherited myopathies affect both skeletal ...,cardiomyopathies represent a collection of dis...,"[Introduction, From code to message, The sarco...",['cardiomyopathies represent a collection of d...,[],"[background, conclusions, background, methods,..."
5,PMC997282,36865013,Unmet needs in clinical trials in CKD: questio...,<S> abstract\nmany advances have been made in ...,data sharing is not applicable to this article...,"[DATA AVAILABILITY STATEMENT, CONFLICT OF INTE...",['data sharing is not applicable to this artic...,[],"[methods, background]"
6,PMC922807,35743947,Microbial Community Composition of the Antarct...,<S> antarctica represents a unique environment...,"before the mid-1980s, microbiological taxonomy...",[1. Application of the Next-Generation Sequenc...,"['before the mid-1980s, microbiological taxono...",[],"[conclusions, background, background, methods,..."
...,...,...,...,...,...,...,...,...,...
71910,PMC811586,33980313,Patient engagement in fertility research: benc...,<S> background\npatient and public involvement...,patient and public involvement (ppi) in resear...,"[Background, Main body, Conclusion, Supplement...",['patient and public involvement (ppi) in rese...,[],"[background, methods, conclusions, background,..."
71914,PMC721866,32400027,Immunological fortification at our barrier org...,<S> summary\n\nour barrier surfaces are fundam...,the barriers of the human body represent our f...,[Body],['the barriers of the human body represent our...,[],[methods]
71915,PMC746268,32130345,2020 Brazilian Thoracic Association recommenda...,<S> abstract\nthe pharmacological management o...,the pharmacological management of asthma has c...,"[INTRODUCTION, Concept, Epidemiology of asthma...",['the pharmacological management of asthma has...,[],"[background, objective, background, results, m..."
71916,PMC875428,35082458,Change is Inevitable Progress is Optional,<S> how to cite this article: peter j. change ...,the most important factor that determined the ...,"[T, B, D]",['the most important factor that determined th...,[],"[methods, objective, methods]"


In [12]:
# Columns kept for the exported dataset, in output order.
_output_columns = [
    'article_id',
    'pmid',
    'title',
    'abstract',
    'section_names',
    'sections',
    'predicted_section_names',
]
df_test = df_test.loc[:, _output_columns]
df_test

Unnamed: 0,article_id,pmid,title,abstract,section_names,sections,predicted_section_names
0,PMC498926,26530308,Erratum,"<S> lopez de lapuente a, pinto-medel mj, astob...",[Body],"['in the above-mentioned article, published in...",[methods]
1,PMC516104,28018390,Emerging Technologies for the Production of Re...,<S> plant cell walls are composed predominantl...,"[Introduction, Biofuel Feedstocks, Engineering...",['the demand for chemical energy is projected ...,"[background, methods, methods, objective, meth..."
2,PMC503397,27721798,Cardiac Troponin and Tropomyosin: Structural a...,<S> inherited myopathies affect both skeletal ...,"[Introduction, From code to message, The sarco...",['cardiomyopathies represent a collection of d...,"[background, conclusions, background, methods,..."
5,PMC997282,36865013,Unmet needs in clinical trials in CKD: questio...,<S> abstract\nmany advances have been made in ...,"[DATA AVAILABILITY STATEMENT, CONFLICT OF INTE...",['data sharing is not applicable to this artic...,"[methods, background]"
6,PMC922807,35743947,Microbial Community Composition of the Antarct...,<S> antarctica represents a unique environment...,[1. Application of the Next-Generation Sequenc...,"['before the mid-1980s, microbiological taxono...","[conclusions, background, background, methods,..."
...,...,...,...,...,...,...,...
71910,PMC811586,33980313,Patient engagement in fertility research: benc...,<S> background\npatient and public involvement...,"[Background, Main body, Conclusion, Supplement...",['patient and public involvement (ppi) in rese...,"[background, methods, conclusions, background,..."
71914,PMC721866,32400027,Immunological fortification at our barrier org...,<S> summary\n\nour barrier surfaces are fundam...,[Body],['the barriers of the human body represent our...,[methods]
71915,PMC746268,32130345,2020 Brazilian Thoracic Association recommenda...,<S> abstract\nthe pharmacological management o...,"[INTRODUCTION, Concept, Epidemiology of asthma...",['the pharmacological management of asthma has...,"[background, objective, background, results, m..."
71916,PMC875428,35082458,Change is Inevitable Progress is Optional,<S> how to cite this article: peter j. change ...,"[T, B, D]",['the most important factor that determined th...,"[methods, objective, methods]"


In [13]:
# Write the frame, including the predicted section labels, to a Parquet file.
df_test.to_parquet("../dataset/all_articles5-v2_predicted_section_names_lower.parquet")

In [14]:
# Display the final frame for a visual check.
df_test

Unnamed: 0,article_id,pmid,title,abstract,section_names,sections,predicted_section_names
0,PMC498926,26530308,Erratum,"<S> lopez de lapuente a, pinto-medel mj, astob...",[Body],"['in the above-mentioned article, published in...",[methods]
1,PMC516104,28018390,Emerging Technologies for the Production of Re...,<S> plant cell walls are composed predominantl...,"[Introduction, Biofuel Feedstocks, Engineering...",['the demand for chemical energy is projected ...,"[background, methods, methods, objective, meth..."
2,PMC503397,27721798,Cardiac Troponin and Tropomyosin: Structural a...,<S> inherited myopathies affect both skeletal ...,"[Introduction, From code to message, The sarco...",['cardiomyopathies represent a collection of d...,"[background, conclusions, background, methods,..."
5,PMC997282,36865013,Unmet needs in clinical trials in CKD: questio...,<S> abstract\nmany advances have been made in ...,"[DATA AVAILABILITY STATEMENT, CONFLICT OF INTE...",['data sharing is not applicable to this artic...,"[methods, background]"
6,PMC922807,35743947,Microbial Community Composition of the Antarct...,<S> antarctica represents a unique environment...,[1. Application of the Next-Generation Sequenc...,"['before the mid-1980s, microbiological taxono...","[conclusions, background, background, methods,..."
...,...,...,...,...,...,...,...
71910,PMC811586,33980313,Patient engagement in fertility research: benc...,<S> background\npatient and public involvement...,"[Background, Main body, Conclusion, Supplement...",['patient and public involvement (ppi) in rese...,"[background, methods, conclusions, background,..."
71914,PMC721866,32400027,Immunological fortification at our barrier org...,<S> summary\n\nour barrier surfaces are fundam...,[Body],['the barriers of the human body represent our...,[methods]
71915,PMC746268,32130345,2020 Brazilian Thoracic Association recommenda...,<S> abstract\nthe pharmacological management o...,"[INTRODUCTION, Concept, Epidemiology of asthma...",['the pharmacological management of asthma has...,"[background, objective, background, results, m..."
71916,PMC875428,35082458,Change is Inevitable Progress is Optional,<S> how to cite this article: peter j. change ...,"[T, B, D]",['the most important factor that determined th...,"[methods, objective, methods]"
