In [1]:
import pandas as pd
import numpy as np
import json, ast

def parse_listlike(s):
    if pd.isna(s):               
        return []
    s = str(s).strip()
    if s == '':
        return []
    if s[0] in '[{':
        try:
            return json.loads(s)
        except Exception:
            pass
    try:
        return ast.literal_eval(s)
    except Exception:
        return [s]               


In [2]:
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting pandas
  Downloading pandas-1.1.5-cp36-cp36m-manylinux1_x86_64.whl (9.5 MB)
     |████████████████████████████████| 9.5 MB 2.0 MB/s            
Installing collected packages: pandas
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
detecting-scientific-claim 1.0.0 requires fastText, which is not installed.
detecting-scientific-claim 1.0.0 requires allennlp==0.9.0, but you have allennlp 0.6.1 which is incompatible.[0m
Successfully installed pandas-1.1.5


In [1]:
import pandas as pd
df = pd.read_pickle('for_section_normalization.pkl')
df

Unnamed: 0,article_id,title,abstract,section_names,sections,missing_fields
0,PMC498926,Erratum,"<S> lopez de lapuente a, pinto-medel mj, astob...",[Body],"[[in the above-mentioned article, published in...",[]
1,PMC516104,Emerging Technologies for the Production of Re...,<S> plant cell walls are composed predominantl...,"[Introduction, Biofuel Feedstocks, Engineering...",[[the demand for chemical energy is projected ...,[]
2,PMC503397,Cardiac Troponin and Tropomyosin: Structural a...,<S> inherited myopathies affect both skeletal ...,"[Introduction, From code to message, The sarco...",[[cardiomyopathies represent a collection of d...,[]
3,PMC831342,Corrigendum: A Frequency-Domain Machine Learni...,,[Body],[[results of a bivariate regression of oef0 ag...,[abstract]
4,PMC878323,Evolving Techniques in RSI: Can the Choice of ...,"<S> how to cite this article: george b, joachi...",[O],[[]],[]
...,...,...,...,...,...,...
71916,PMC746268,2020 Brazilian Thoracic Association recommenda...,<S> abstract\nthe pharmacological management o...,"[INTRODUCTION, Concept, Epidemiology of asthma...",[[the pharmacological management of asthma has...,[]
71917,PMC875428,Change is Inevitable Progress is Optional,<S> how to cite this article: peter j. change ...,"[T, B, D]",[[the most important factor that determined th...,[]
71918,PMC343773,Novel Vectors of Malaria Parasite in the Weste...,<S> the main method of malaria control is base...,[Body],[[to the editor: the primary malaria control t...,[]
71919,PMC986711,Natural Products for Cosmetic Applications,,[Body],[[natural products provide an interesting and ...,[abstract]


In [4]:
!pip install pyarrow

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting pyarrow
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m221.6 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-22.0.0


In [2]:
df.to_parquet('for_section_normalization.parquet', engine='pyarrow')

In [None]:
import pandas as pd
def combine_sentence_with_section_debug(row):
    try:
        l1 = row['token']
        l2 = row['section']
        return_list = []
        
        # 你的 combine 邏輯
        if isinstance(l1, list):
            for sent in l1:
                if isinstance(sent, str): return_list.append(sent)
                elif isinstance(sent, list): return_list.append(sent)
        if isinstance(l2, list):
            for sec in l2:
                if isinstance(sec, str): return_list.append([sec])
                elif isinstance(sec, list): return_list.append(sec)
                
        return return_list
    except Exception as e:
        return np.nan  # 如果連 combine 都爆，回傳 NaN
def process_token_preserve_sentences(x):
    # 如果已經是 list，嘗試保留兩層結構 (句子 -> 單字)
    if isinstance(x, list):
        clean_sentences = []
        for item in x:
            if isinstance(item, list):
                # 遞迴或攤平 list 裡的字串
                sub_sent = []
                for sub in item:
                    if isinstance(sub, str):
                        sub_sent.extend(sub.split())
                if sub_sent: clean_sentences.append(sub_sent)
            elif isinstance(item, str):
                clean_sentences.append(item.split())
        return clean_sentences
    
    # 如果是字串，用 eval 解開
    if isinstance(x, str):
        try:
            return process_token_preserve_sentences(ast.literal_eval(x))
        except:
            return []
    return []

def clean_section(lst):
    if not isinstance(lst, list): return []
    return [str(x) for x in lst if x is not None]

print("正在合併與炸開資料...")
# 為了速度，你可以先取前 10000 筆測試，或是跑全量 (只跑處理很快)
df_new = pd.read_pickle("for_section_normalization.pkl")
df_new = df_new.rename(columns={"article_id": "pmcids", "section_names": "section", "sections": "token"})
df_new["sid"] = df_new.index

# 資料清洗
print("data cleaning...")
df_new["section"] = df_new["section"].apply(clean_section)
df_new["token"] = df_new["token"].apply(process_token_preserve_sentences)

debug_df = df_new.copy() # 或者 df_new.head(10000).copy()

# 套用 combine
debug_df['combined'] = debug_df.apply(combine_sentence_with_section_debug, axis=1)

# 3. 執行 Explode (這是最容易產生 NaN 的地方)
exploded_debug = debug_df.explode('combined')

# 4. 關鍵檢查：找出 'combined' 欄位不是 List 的資料
# 正常的資料應該要是 list (例如 ['word', 'word'])
# 壞掉的資料通常是 float (NaN) 或 None
bad_rows = exploded_debug[~exploded_debug['combined'].apply(lambda x: isinstance(x, list))]

# --- 顯示結果 ---
print(f"總資料筆數 (Exploded): {len(exploded_debug)}")
print(f"壞掉的資料筆數 (非 List): {len(bad_rows)}")

if len(bad_rows) > 0:
    print("\n=== 壞資料範例 (前 5 筆) ===")
    print(bad_rows[['pmcids', 'combined']].head(5))
    
    print("\n=== 壞資料的原始樣貌 (回推前 5 筆) ===")
    # 透過 index 找回原始 df_new 的樣子，看看原本長怎樣
    bad_indices = bad_rows.index[:5]
    print(df_new.loc[bad_indices, ['pmcids', 'token', 'section']])
else:
    print("\n恭喜！沒有發現非 List 的壞資料，可能是之前的清洗已經生效，或是 explode 行為符合預期。")

正在合併與炸開資料...
data cleaning...
總資料筆數 (Exploded): 936898
壞掉的資料筆數 (非 List): 3

=== 壞資料範例 (前 5 筆) ===
          pmcids combined
24005  PMC185215      NaN
31308  PMC586152      NaN
64908  PMC836436      NaN

=== 壞資料的原始樣貌 (回推前 5 筆) ===
          pmcids token section
24005  PMC185215    []      []
31308  PMC586152    []      []
64908  PMC836436    []      []


: 

In [3]:
import numpy as np
import pandas as pd

# === 1) 取 label 區間（第 8 個到倒數第 2 個；0-based 所以是 7:-1）===
label_cols = df.columns[8:-1].tolist()
Y = df.loc[:, label_cols].astype('uint8').to_numpy(copy=True)   # n x d, 0/1

# === 2) 用 NLL 排除離群（選中間 10%~90% 分位）===
p = Y.mean(axis=0).clip(1e-6, 1-1e-6)
logp, log1mp = np.log(p), np.log(1 - p)
nll = -(Y @ logp + (1 - Y) @ log1mp)          # 每列的「不尋常度」
q_lo, q_hi = np.quantile(nll, [0.10, 0.90])   # 可調，例如 0.2~0.8 更嚴
cand_idx = np.where((nll >= q_lo) & (nll <= q_hi))[0]

# （可選）若資料超大，先隨機下採樣候選集，計算更快
max_candidates = 100_000
if cand_idx.size > max_candidates:
    rng = np.random.default_rng(0)
    cand_idx = rng.choice(cand_idx, size=max_candidates, replace=False)

X = Y[cand_idx].astype(bool)   # 用 bool 讓 XOR 高效
n_cand = X.shape[0]

# === 3) 在候選中做 farthest-point sampling（Hamming 距離）===
k = 60
# 從「最接近 NLL 中位數」的點開始（代表性較高）
start_local = np.argmin(np.abs(nll[cand_idx] - np.median(nll[cand_idx])))
selected_local = [start_local]

min_dist = (X ^ X[start_local]).sum(axis=1)
div_score_at_pick = [np.nan]  

for _ in range(k - 1):
    nxt = np.argmax(min_dist)              
    selected_local.append(nxt)
    div_score_at_pick.append(min_dist[nxt])
    d = (X ^ X[nxt]).sum(axis=1)
    min_dist = np.minimum(min_dist, d)

selected_idx = cand_idx[np.array(selected_local)]

result = (df.iloc[selected_idx, :].copy()
            .assign(nll=nll[selected_idx],
                    min_hamming_at_pick=div_score_at_pick))

result


Unnamed: 0,article_id,article_text,abstract_text,labels,section_names,sections,PMID,ids,autobiography,bibliography,...,scientific_integrity_review,systematic_review,systematic_reviews_as_topic,twin_study,validation_study,veterinary_clinical_trial,veterinary_randomized_controlled_trial,dp,nll,min_hamming_at_pick
1,PMC5330001,"['tardive dystonia ( td ) , a rarer side effec...",['<S> tardive dystonia ( td ) is a serious sid...,,"[INTRODUCTION, CASE REPORT, DISCUSSION, Declar...","[[tardive dystonia ( td ) , a rarer side effec...",28250568,28250568,0,0,...,0,0,0,0,0,0,0,2017,1.458817,
84,PMC4968912,['this was an observational cohort study of he...,['<S> purposethe purpose of this study was to ...,,"[Methods, Study Population, Standard Automated...",[[this was an observational cohort study of he...,27409505,27409505,0,0,...,0,0,0,0,0,0,0,2016,8.367222,3.0
98,PMC4893752,['early diagnosis of rib fractures can rapidly...,['<S> introduction : the potential benefit of ...,,"[Introduction:, Methods:, Results:, Discussion...",[[early diagnosis of rib fractures can rapidly...,27274514,27274514,0,0,...,0,1,0,0,0,0,0,2016,6.939461,3.0
155,PMC4309867,['this retrospective study included 343 glauco...,['<S> purposeto compare the clinical character...,,"[Materials and Methods, Subjects, Glaucoma pro...",[[this retrospective study included 343 glauco...,25646059,25646059,0,0,...,0,0,0,0,0,0,0,2015,7.787892,3.0
254,PMC5115213,"['the main causes of snhl are advanced age , t...",['<S> backgroundthe purpose of our study was t...,,"[Background, Material and Methods, Subjects, E...","[[the main causes of snhl are advanced age , t...",27846196,27846196,0,0,...,0,0,0,0,0,0,0,2016,8.405738,3.0
5,PMC2778185,['agriculture has been one of the primary econ...,['<S> this was a cross - sectional study that ...,,"[1. Introduction, 2. Methodology, 3. Results a...",[[agriculture has been one of the primary econ...,20041012,20041012,0,0,...,0,0,0,0,0,0,0,2009,5.160773,2.0
8,PMC3155785,['the centers for disease control and preventi...,['<S> \n objective . to examine risk factors f...,,"[1. Introduction, 2. Materials and Methods, 3....",[[the centers for disease control and preventi...,21860798,21860798,0,0,...,0,0,0,0,0,0,0,2012,5.750929,2.0
45,PMC4828917,[''],['<S> mypro is a software pipeline for high - ...,,[Supplementary Material],[[]],25911337,25911337,0,0,...,0,0,0,0,1,0,0,2015,7.006896,2.0
105,PMC3364427,['all study subjects were recruited in a conse...,['<S> purposethe aim of this study was to inve...,,"[Materials and Methods, Optical coherence tomo...",[[all study subjects were recruited in a conse...,22670072,22670072,0,0,...,0,0,0,0,0,0,0,2012,8.36629,2.0
109,PMC3866000,"['between 1903 and 1906 , oswaldo cruz , direc...",['<S> this article addresses the discussion ab...,,[Malaria Campaigns in Brazil and Resistance to...,"[[between 1903 and 1906 , oswaldo cruz , direc...",24331212,24331212,0,0,...,0,0,0,0,0,0,0,2014,6.196268,2.0


In [4]:
for i, row in result.iterrows():
    if row['article_id'] == 'PMC2481468':
        for c in result.columns:
            if row[c] == 1:
                print(c)


comment


In [5]:
# 需要檢查的欄位
cols_to_check = [
    'review',
    'scientific_integrity_review',
    'systematic_review',
    'historical_article',
    'comment'
]

# 建立布林遮罩：只要任一欄位為 1
mask = result[cols_to_check].eq(1).any(axis=1)

# 保留不是這些的
result_filtered = result[~mask].copy()


In [6]:
result_filtered

Unnamed: 0,article_id,article_text,abstract_text,labels,section_names,sections,PMID,ids,autobiography,bibliography,...,scientific_integrity_review,systematic_review,systematic_reviews_as_topic,twin_study,validation_study,veterinary_clinical_trial,veterinary_randomized_controlled_trial,dp,nll,min_hamming_at_pick
1,PMC5330001,"['tardive dystonia ( td ) , a rarer side effec...",['<S> tardive dystonia ( td ) is a serious sid...,,"[INTRODUCTION, CASE REPORT, DISCUSSION, Declar...","[[tardive dystonia ( td ) , a rarer side effec...",28250568,28250568,0,0,...,0,0,0,0,0,0,0,2017,1.458817,
84,PMC4968912,['this was an observational cohort study of he...,['<S> purposethe purpose of this study was to ...,,"[Methods, Study Population, Standard Automated...",[[this was an observational cohort study of he...,27409505,27409505,0,0,...,0,0,0,0,0,0,0,2016,8.367222,3.0
155,PMC4309867,['this retrospective study included 343 glauco...,['<S> purposeto compare the clinical character...,,"[Materials and Methods, Subjects, Glaucoma pro...",[[this retrospective study included 343 glauco...,25646059,25646059,0,0,...,0,0,0,0,0,0,0,2015,7.787892,3.0
254,PMC5115213,"['the main causes of snhl are advanced age , t...",['<S> backgroundthe purpose of our study was t...,,"[Background, Material and Methods, Subjects, E...","[[the main causes of snhl are advanced age , t...",27846196,27846196,0,0,...,0,0,0,0,0,0,0,2016,8.405738,3.0
5,PMC2778185,['agriculture has been one of the primary econ...,['<S> this was a cross - sectional study that ...,,"[1. Introduction, 2. Methodology, 3. Results a...",[[agriculture has been one of the primary econ...,20041012,20041012,0,0,...,0,0,0,0,0,0,0,2009,5.160773,2.0
8,PMC3155785,['the centers for disease control and preventi...,['<S> \n objective . to examine risk factors f...,,"[1. Introduction, 2. Materials and Methods, 3....",[[the centers for disease control and preventi...,21860798,21860798,0,0,...,0,0,0,0,0,0,0,2012,5.750929,2.0
45,PMC4828917,[''],['<S> mypro is a software pipeline for high - ...,,[Supplementary Material],[[]],25911337,25911337,0,0,...,0,0,0,0,1,0,0,2015,7.006896,2.0
105,PMC3364427,['all study subjects were recruited in a conse...,['<S> purposethe aim of this study was to inve...,,"[Materials and Methods, Optical coherence tomo...",[[all study subjects were recruited in a conse...,22670072,22670072,0,0,...,0,0,0,0,0,0,0,2012,8.36629,2.0
112,PMC3176397,"['in recent years , the results after surgery ...",['<S> backgrounda national surveillance progra...,,"[Introduction, Material and methods, Surgery, ...","[[in recent years , the results after surgery ...",21562744,21562744,0,0,...,0,0,0,0,0,0,0,2011,5.606926,2.0
123,PMC4484657,['chronic obstructive pulmonary disease ( copd...,['<S> objectivethe nutritional status of chron...,,"[Introduction, Materials and methods, Patients...",[[chronic obstructive pulmonary disease ( copd...,26150712,26150712,0,0,...,0,0,0,0,0,0,0,2015,5.68371,2.0


In [7]:
keep = df.columns[0:8].tolist()
result_filtered = result_filtered[keep]
result_filtered

Unnamed: 0,article_id,article_text,abstract_text,labels,section_names,sections,PMID,ids
1,PMC5330001,"['tardive dystonia ( td ) , a rarer side effec...",['<S> tardive dystonia ( td ) is a serious sid...,,"[INTRODUCTION, CASE REPORT, DISCUSSION, Declar...","[[tardive dystonia ( td ) , a rarer side effec...",28250568,28250568
84,PMC4968912,['this was an observational cohort study of he...,['<S> purposethe purpose of this study was to ...,,"[Methods, Study Population, Standard Automated...",[[this was an observational cohort study of he...,27409505,27409505
155,PMC4309867,['this retrospective study included 343 glauco...,['<S> purposeto compare the clinical character...,,"[Materials and Methods, Subjects, Glaucoma pro...",[[this retrospective study included 343 glauco...,25646059,25646059
254,PMC5115213,"['the main causes of snhl are advanced age , t...",['<S> backgroundthe purpose of our study was t...,,"[Background, Material and Methods, Subjects, E...","[[the main causes of snhl are advanced age , t...",27846196,27846196
5,PMC2778185,['agriculture has been one of the primary econ...,['<S> this was a cross - sectional study that ...,,"[1. Introduction, 2. Methodology, 3. Results a...",[[agriculture has been one of the primary econ...,20041012,20041012
8,PMC3155785,['the centers for disease control and preventi...,['<S> \n objective . to examine risk factors f...,,"[1. Introduction, 2. Materials and Methods, 3....",[[the centers for disease control and preventi...,21860798,21860798
45,PMC4828917,[''],['<S> mypro is a software pipeline for high - ...,,[Supplementary Material],[[]],25911337,25911337
105,PMC3364427,['all study subjects were recruited in a conse...,['<S> purposethe aim of this study was to inve...,,"[Materials and Methods, Optical coherence tomo...",[[all study subjects were recruited in a conse...,22670072,22670072
112,PMC3176397,"['in recent years , the results after surgery ...",['<S> backgrounda national surveillance progra...,,"[Introduction, Material and methods, Surgery, ...","[[in recent years , the results after surgery ...",21562744,21562744
123,PMC4484657,['chronic obstructive pulmonary disease ( copd...,['<S> objectivethe nutritional status of chron...,,"[Introduction, Materials and methods, Patients...",[[chronic obstructive pulmonary disease ( copd...,26150712,26150712


## section annotation

In [8]:
import re
import numpy as np
import pickle

with open('../../section_normalization/structured_abstract_sections.pkl', 'rb') as f:
    normalized_sections = pickle.load(f)

def map_normalized_sections(row, norm_sections):
    mapping = {'background': 'introduction', 'objective': 'introduction', 'methods': 'methods', 'results': 'results', 'conclusions': 'discussion'}
    
    return_list = []
    for section_name in row['section_names']:
        added = False
        for norm_sec in norm_sections:
            compare = section_name.lower()
            compare = re.sub(r'^\s*\d+(?:\.\d+)*[)\.\u3001\uFF0E]?\s*', '', compare).strip()
            if compare in norm_sections[norm_sec]:
                return_list.append(mapping[norm_sec]) 
                added = True
                break
        if not added:
            return_list.append(None)
            
    return return_list

result_filtered['labels'] = df.apply(lambda row: map_normalized_sections(row, normalized_sections), axis=1)
result_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_filtered['labels'] = df.apply(lambda row: map_normalized_sections(row, normalized_sections), axis=1)


Unnamed: 0,article_id,article_text,abstract_text,labels,section_names,sections,PMID,ids
1,PMC5330001,"['tardive dystonia ( td ) , a rarer side effec...",['<S> tardive dystonia ( td ) is a serious sid...,"[introduction, methods, discussion, None, None...","[INTRODUCTION, CASE REPORT, DISCUSSION, Declar...","[[tardive dystonia ( td ) , a rarer side effec...",28250568,28250568
84,PMC4968912,['this was an observational cohort study of he...,['<S> purposethe purpose of this study was to ...,"[methods, methods, None, None, None, methods, ...","[Methods, Study Population, Standard Automated...",[[this was an observational cohort study of he...,27409505,27409505
155,PMC4309867,['this retrospective study included 343 glauco...,['<S> purposeto compare the clinical character...,"[methods, methods, None, methods, results, dis...","[Materials and Methods, Subjects, Glaucoma pro...",[[this retrospective study included 343 glauco...,25646059,25646059
254,PMC5115213,"['the main causes of snhl are advanced age , t...",['<S> backgroundthe purpose of our study was t...,"[introduction, methods, methods, methods, None...","[Background, Material and Methods, Subjects, E...","[[the main causes of snhl are advanced age , t...",27846196,27846196
5,PMC2778185,['agriculture has been one of the primary econ...,['<S> this was a cross - sectional study that ...,"[introduction, methods, discussion, discussion...","[1. Introduction, 2. Methodology, 3. Results a...",[[agriculture has been one of the primary econ...,20041012,20041012
8,PMC3155785,['the centers for disease control and preventi...,['<S> \n objective . to examine risk factors f...,"[introduction, methods, results, discussion]","[1. Introduction, 2. Materials and Methods, 3....",[[the centers for disease control and preventi...,21860798,21860798
45,PMC4828917,[''],['<S> mypro is a software pipeline for high - ...,[None],[Supplementary Material],[[]],25911337,25911337
105,PMC3364427,['all study subjects were recruited in a conse...,['<S> purposethe aim of this study was to inve...,"[methods, None, methods, results, discussion]","[Materials and Methods, Optical coherence tomo...",[[all study subjects were recruited in a conse...,22670072,22670072
112,PMC3176397,"['in recent years , the results after surgery ...",['<S> backgrounda national surveillance progra...,"[introduction, methods, methods, results, meth...","[Introduction, Material and methods, Surgery, ...","[[in recent years , the results after surgery ...",21562744,21562744
123,PMC4484657,['chronic obstructive pulmonary disease ( copd...,['<S> objectivethe nutritional status of chron...,"[introduction, methods, methods, None, None, N...","[Introduction, Materials and methods, Patients...",[[chronic obstructive pulmonary disease ( copd...,26150712,26150712


## limitation annotation

In [9]:
result_filtered

Unnamed: 0,article_id,article_text,abstract_text,labels,section_names,sections,PMID,ids
1,PMC5330001,"['tardive dystonia ( td ) , a rarer side effec...",['<S> tardive dystonia ( td ) is a serious sid...,"[introduction, methods, discussion, None, None...","[INTRODUCTION, CASE REPORT, DISCUSSION, Declar...","[[tardive dystonia ( td ) , a rarer side effec...",28250568,28250568
84,PMC4968912,['this was an observational cohort study of he...,['<S> purposethe purpose of this study was to ...,"[methods, methods, None, None, None, methods, ...","[Methods, Study Population, Standard Automated...",[[this was an observational cohort study of he...,27409505,27409505
155,PMC4309867,['this retrospective study included 343 glauco...,['<S> purposeto compare the clinical character...,"[methods, methods, None, methods, results, dis...","[Materials and Methods, Subjects, Glaucoma pro...",[[this retrospective study included 343 glauco...,25646059,25646059
254,PMC5115213,"['the main causes of snhl are advanced age , t...",['<S> backgroundthe purpose of our study was t...,"[introduction, methods, methods, methods, None...","[Background, Material and Methods, Subjects, E...","[[the main causes of snhl are advanced age , t...",27846196,27846196
5,PMC2778185,['agriculture has been one of the primary econ...,['<S> this was a cross - sectional study that ...,"[introduction, methods, discussion, discussion...","[1. Introduction, 2. Methodology, 3. Results a...",[[agriculture has been one of the primary econ...,20041012,20041012
8,PMC3155785,['the centers for disease control and preventi...,['<S> \n objective . to examine risk factors f...,"[introduction, methods, results, discussion]","[1. Introduction, 2. Materials and Methods, 3....",[[the centers for disease control and preventi...,21860798,21860798
45,PMC4828917,[''],['<S> mypro is a software pipeline for high - ...,[None],[Supplementary Material],[[]],25911337,25911337
105,PMC3364427,['all study subjects were recruited in a conse...,['<S> purposethe aim of this study was to inve...,"[methods, None, methods, results, discussion]","[Materials and Methods, Optical coherence tomo...",[[all study subjects were recruited in a conse...,22670072,22670072
112,PMC3176397,"['in recent years , the results after surgery ...",['<S> backgrounda national surveillance progra...,"[introduction, methods, methods, results, meth...","[Introduction, Material and methods, Surgery, ...","[[in recent years , the results after surgery ...",21562744,21562744
123,PMC4484657,['chronic obstructive pulmonary disease ( copd...,['<S> objectivethe nutritional status of chron...,"[introduction, methods, methods, None, None, N...","[Introduction, Materials and methods, Patients...",[[chronic obstructive pulmonary disease ( copd...,26150712,26150712


### SAL_Type_Classification

In [1]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pdt = pd.read_csv('SAL_Type_Classification/limitation_identification/large_scale_data_with_predictions.csv')
pdt

Unnamed: 0,pmcids,sentence,prediction,sid,section
0,PMC498926,"[['in', 'the', 'above-mentioned', 'article,', ...","[[0], [0]]",[0],['Body']
1,PMC516104,"[['the', 'demand', 'for', 'chemical', 'energy'...","[[0], [0], [0], [0], [0], [0], [0], [0], [0], ...",[1],"['Introduction', 'Biofuel Feedstocks', 'Engine..."
2,PMC503397,"[['cardiomyopathies', 'represent', 'a', 'colle...","[[0], [0], [0], [0], [0], [0], [0], [0], [0], ...",[2],"['Introduction', 'From code to message', 'The ..."
3,PMC831342,"[['results', 'of', 'a', 'bivariate', 'regressi...","[[0], [0]]",[3],['Body']
4,PMC878323,"[['O'], ['the', 'first', 'scientific', 'journa...","[[0], [0], [0]]",[4],['O']
...,...,...,...,...,...
71916,PMC746268,"[['the', 'pharmacological', 'management', 'of'...","[[0], [0], [0], [0], [0], [0], [0], [0], [0], ...",[71916],"['INTRODUCTION', 'Concept', 'Epidemiology of a..."
71917,PMC875428,"[['the', 'most', 'important', 'factor', 'that'...","[[0], [0], [0], [0], [0], [0]]",[71917],"['T', 'B', 'D']"
71918,PMC343773,"[['to', 'the', 'editor:', 'the', 'primary', 'm...","[[0], [0]]",[71918],['Body']
71919,PMC986711,"[['natural', 'products', 'provide', 'an', 'int...","[[0], [0]]",[71919],['Body']


In [2]:
import pandas as pd
import ast

# 假設你的 DataFrame 叫做 df
# df = pd.read_csv(...) 

def check_length_match(row):
    try:
        # 1. 嘗試 eval (解析) sentence 和 prediction
        # 先檢查是否為字串，如果是字串才做 eval，避免原本已經是 list 報錯
        sent_list = ast.literal_eval(row['sentence']) if isinstance(row['sentence'], str) else row['sentence']
        pred_list = ast.literal_eval(row['prediction']) if isinstance(row['prediction'], str) else row['prediction']
        
        # 2. 檢查兩者長度是否相同
        return len(sent_list) == len(pred_list)
        
    except Exception as e:
        # 如果解析失敗 (例如格式錯誤或 NaN)，回傳 False
        return False

# 3. 應用到 DataFrame 並建立 'debug' column
pdt['debug'] = pdt.apply(check_length_match, axis=1)

# 查看結果：印出 debug 為 False 的行（如果有長度不一致的情況）
print("不一致的行數：", len(pdt[pdt['debug'] == False]))
print(pdt[['sentence', 'prediction', 'debug']].head())

不一致的行數： 3
                                            sentence  \
0  [['in', 'the', 'above-mentioned', 'article,', ...   
1  [['the', 'demand', 'for', 'chemical', 'energy'...   
2  [['cardiomyopathies', 'represent', 'a', 'colle...   
3  [['results', 'of', 'a', 'bivariate', 'regressi...   
4  [['O'], ['the', 'first', 'scientific', 'journa...   

                                          prediction  debug  
0                                         [[0], [0]]   True  
1  [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...   True  
2  [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...   True  
3                                         [[0], [0]]   True  
4                                    [[0], [0], [0]]   True  


In [3]:
pdt[:30]

Unnamed: 0,pmcids,sentence,prediction,sid,section,debug
0,PMC498926,"[['in', 'the', 'above-mentioned', 'article,', ...","[[0], [0]]",[0],['Body'],True
1,PMC516104,"[['the', 'demand', 'for', 'chemical', 'energy'...","[[0], [0], [0], [0], [0], [0], [0], [0], [0], ...",[1],"['Introduction', 'Biofuel Feedstocks', 'Engine...",True
2,PMC503397,"[['cardiomyopathies', 'represent', 'a', 'colle...","[[0], [0], [0], [0], [0], [0], [0], [0], [0], ...",[2],"['Introduction', 'From code to message', 'The ...",True
3,PMC831342,"[['results', 'of', 'a', 'bivariate', 'regressi...","[[0], [0]]",[3],['Body'],True
4,PMC878323,"[['O'], ['the', 'first', 'scientific', 'journa...","[[0], [0], [0]]",[4],['O'],True
5,PMC878323,"[['O'], ['the', 'first', 'scientific', 'journa...","[[0], [0], [0]]",[4],['Body'],True
6,PMC997282,"[['data', 'sharing', 'is', 'not', 'applicable'...","[[1], [0], [0]]",[5],"['DATA AVAILABILITY STATEMENT', 'CONFLICT OF I...",True
7,PMC922807,"[['before', 'the', 'mid-1980s,', 'microbiologi...","[[0], [0], [0], [0], [0], [0], [0], [0], [0], ...",[6],['1. Application of the Next-Generation Sequen...,True
8,PMC530132,"[['rs', 'was', 'responsible', 'for', 'the', 'c...","[[0], [0], [0], [0], [0], [0], [0], [0]]",[7],"['Funding', 'Availability of data and material...",True
9,PMC392874,"[[""meckel's"", 'diverticulum', '(md)', 'is', 't...","[[0], [0], [0], [0], [0], [0]]",[8],"['INTRODUCTION', 'CASE REPORT', 'DISCUSSION']",True


### to auto annotator

In [10]:

def to_lines(section_name, sentences):
    
    lines = []
    # lines = [f"# {section_name} \n"]
    lines += [s.strip().replace("\n"," ").strip() for s in sentences if str(s).strip()]
    lines.append("")  # 空行分隔 section
    return lines


import pandas as pd
from pathlib import Path
import json

out_dir = Path("limitation-recognizer/my_papers/"); out_dir.mkdir(exist_ok=True)

for i, row in result_filtered.iterrows():
    sec_names = row["section_names"]
    sec_lists = row["sections"]  
    lines = []
    for name, sent_list in zip(sec_names, sec_lists):
        lines += to_lines(name, sent_list)
    
    # print(f"Writing doc_{i+1}.txt with {len(lines)} lines.")
    (out_dir / f"{row['article_id']}.txt").write_text("\n".join(lines), encoding="utf-8")


### mapping

In [11]:
file_path = 'limitation-recognizer/output.json'

try:
    with open(file_path, 'r', encoding='utf-8') as f:
        mapping = json.load(f)
except FileNotFoundError:
    print(f"File {file_path} not found.")

mapping = {item['docId']: item['sents'] for item in mapping}
mapping

{'PMC4985425': [],
 'PMC2908860': [],
 'PMC3536953': ['this report has limitations that should be considered in interpreting these data .',
  'as noted above , the rate of physician growth relative to the population was significant'],
 'PMC2948082': [],
 'PMC4523068': ['these limitations significantly weaken the statistical power of the findings .',
  'secondly , to develop an effective cell therapy strategy , several factors , including eligibility criteria of the patients , timing , and route and dose of cell transplantation , should be considered in clinical practice .'],
 'PMC3524718': [],
 'PMC4368710': [],
 'PMC4600110': ['perhaps the greatest limitation is that microbiological fields tend to be defined by common interests rather than by theoretical concepts .',
  'in microbiology , field definitions are often microbe - centric rather than focused on specific problems .',
  'the social organization of human enterprises that includes scientific fields may therefore be a direct ref

In [12]:
import json


def calculate_lcs_length(s1, s2):
    m, n = len(s1), len(s2)
    
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if s1[i - 1] == s2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

    return dp[m][n]


def equal(s1, s2, threshold=0.8):
    lcs_len = calculate_lcs_length(s1, s2)
    
    len1 = len(s1)
    len2 = len(s2)
    
    if len1 == 0 and len2 == 0:
        return True  
    if len1 == 0 or len2 == 0:
        return False

    if (lcs_len / len1) >= threshold or (lcs_len / len2) >= threshold:
        return True
    else:
        return False

In [13]:
section_labels = []

for i, row in result_filtered.iterrows():
    row_labels = []
    doc_id = row['article_id']
    for section in row['sections']:
        sec_labels = []
        for sent in section:
            ref_sentences = mapping[doc_id]
            added = False
            for ref_sent in ref_sentences:
                if equal(sent, ref_sent):
                    sec_labels.append('limitation')
                    added = True
                    break
            if not added:
                sec_labels.append(None)
        row_labels.append(sec_labels)
    section_labels.append(row_labels)

result_filtered['section_labels'] = section_labels
result_filtered
            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_filtered['section_labels'] = section_labels


Unnamed: 0,article_id,article_text,abstract_text,labels,section_names,sections,PMID,ids,section_labels
1,PMC5330001,"['tardive dystonia ( td ) , a rarer side effec...",['<S> tardive dystonia ( td ) is a serious sid...,"[introduction, methods, discussion, None, None...","[INTRODUCTION, CASE REPORT, DISCUSSION, Declar...","[[tardive dystonia ( td ) , a rarer side effec...",28250568,28250568,"[[None, None, None, None, None, None], [None, ..."
84,PMC4968912,['this was an observational cohort study of he...,['<S> purposethe purpose of this study was to ...,"[methods, methods, None, None, None, methods, ...","[Methods, Study Population, Standard Automated...",[[this was an observational cohort study of he...,27409505,27409505,"[[None, None, None, None, None, None, None, No..."
155,PMC4309867,['this retrospective study included 343 glauco...,['<S> purposeto compare the clinical character...,"[methods, methods, None, methods, results, dis...","[Materials and Methods, Subjects, Glaucoma pro...",[[this retrospective study included 343 glauco...,25646059,25646059,"[[None, None, None, None, None, None, None, No..."
254,PMC5115213,"['the main causes of snhl are advanced age , t...",['<S> backgroundthe purpose of our study was t...,"[introduction, methods, methods, methods, None...","[Background, Material and Methods, Subjects, E...","[[the main causes of snhl are advanced age , t...",27846196,27846196,"[[None, None, None, None, None, None, None, No..."
5,PMC2778185,['agriculture has been one of the primary econ...,['<S> this was a cross - sectional study that ...,"[introduction, methods, discussion, discussion...","[1. Introduction, 2. Methodology, 3. Results a...",[[agriculture has been one of the primary econ...,20041012,20041012,"[[None, None, None, None, None, None, None, No..."
8,PMC3155785,['the centers for disease control and preventi...,['<S> \n objective . to examine risk factors f...,"[introduction, methods, results, discussion]","[1. Introduction, 2. Materials and Methods, 3....",[[the centers for disease control and preventi...,21860798,21860798,"[[None, None, None, None, None, None, None, No..."
45,PMC4828917,[''],['<S> mypro is a software pipeline for high - ...,[None],[Supplementary Material],[[]],25911337,25911337,[[None]]
105,PMC3364427,['all study subjects were recruited in a conse...,['<S> purposethe aim of this study was to inve...,"[methods, None, methods, results, discussion]","[Materials and Methods, Optical coherence tomo...",[[all study subjects were recruited in a conse...,22670072,22670072,"[[None, None, None, None, None, None, None, No..."
112,PMC3176397,"['in recent years , the results after surgery ...",['<S> backgrounda national surveillance progra...,"[introduction, methods, methods, results, meth...","[Introduction, Material and methods, Surgery, ...","[[in recent years , the results after surgery ...",21562744,21562744,"[[None, None, None, None, None, None, None, No..."
123,PMC4484657,['chronic obstructive pulmonary disease ( copd...,['<S> objectivethe nutritional status of chron...,"[introduction, methods, methods, None, None, N...","[Introduction, Materials and methods, Patients...",[[chronic obstructive pulmonary disease ( copd...,26150712,26150712,"[[limitation, None, None], [None, None, None, ..."


In [14]:
result_filtered.to_pickle('annotated_so_far.pkl')

## pubmed claim annotation

@article{achakulvisut2019claim,
  title={Claim Extraction in Biomedical Publications using Deep Discourse Model and Transfer Learning},
  author={Achakulvisut, Titipat and Bhagavatula, Chandra and Acuna, Daniel and Kording, Konrad},
  journal={arXiv preprint arXiv:1907.00962},
  year={2019}
}

In [1]:
import pandas as pd
result_filtered = pd.read_pickle('annotated_so_far.pkl')
result_filtered.to_json('annotated_so_far.json', orient='records', lines=True, force_ascii=False)

In [1]:
import pandas as pd
result_filtered = pd.read_json('annotated_so_far.json', orient='records', lines=True)
result_filtered

Unnamed: 0,article_id,article_text,abstract_text,labels,section_names,sections,PMID,ids,section_labels
0,PMC5330001,"['tardive dystonia ( td ) , a rarer side effec...",['<S> tardive dystonia ( td ) is a serious sid...,"[introduction, methods, discussion, None, None...","[INTRODUCTION, CASE REPORT, DISCUSSION, Declar...","[[tardive dystonia ( td ) , a rarer side effec...",28250568,28250568,"[[None, None, None, None, None, None], [None, ..."
1,PMC4968912,['this was an observational cohort study of he...,['<S> purposethe purpose of this study was to ...,"[methods, methods, None, None, None, methods, ...","[Methods, Study Population, Standard Automated...",[[this was an observational cohort study of he...,27409505,27409505,"[[None, None, None, None, None, None, None, No..."
2,PMC4309867,['this retrospective study included 343 glauco...,['<S> purposeto compare the clinical character...,"[methods, methods, None, methods, results, dis...","[Materials and Methods, Subjects, Glaucoma pro...",[[this retrospective study included 343 glauco...,25646059,25646059,"[[None, None, None, None, None, None, None, No..."
3,PMC5115213,"['the main causes of snhl are advanced age , t...",['<S> backgroundthe purpose of our study was t...,"[introduction, methods, methods, methods, None...","[Background, Material and Methods, Subjects, E...","[[the main causes of snhl are advanced age , t...",27846196,27846196,"[[None, None, None, None, None, None, None, No..."
4,PMC2778185,['agriculture has been one of the primary econ...,['<S> this was a cross - sectional study that ...,"[introduction, methods, discussion, discussion...","[1. Introduction, 2. Methodology, 3. Results a...",[[agriculture has been one of the primary econ...,20041012,20041012,"[[None, None, None, None, None, None, None, No..."
5,PMC3155785,['the centers for disease control and preventi...,['<S> \n objective . to examine risk factors f...,"[introduction, methods, results, discussion]","[1. Introduction, 2. Materials and Methods, 3....",[[the centers for disease control and preventi...,21860798,21860798,"[[None, None, None, None, None, None, None, No..."
6,PMC4828917,[''],['<S> mypro is a software pipeline for high - ...,[None],[Supplementary Material],[[]],25911337,25911337,[[None]]
7,PMC3364427,['all study subjects were recruited in a conse...,['<S> purposethe aim of this study was to inve...,"[methods, None, methods, results, discussion]","[Materials and Methods, Optical coherence tomo...",[[all study subjects were recruited in a conse...,22670072,22670072,"[[None, None, None, None, None, None, None, No..."
8,PMC3176397,"['in recent years , the results after surgery ...",['<S> backgrounda national surveillance progra...,"[introduction, methods, methods, results, meth...","[Introduction, Material and methods, Surgery, ...","[[in recent years , the results after surgery ...",21562744,21562744,"[[None, None, None, None, None, None, None, No..."
9,PMC4484657,['chronic obstructive pulmonary disease ( copd...,['<S> objectivethe nutritional status of chron...,"[introduction, methods, methods, None, None, N...","[Introduction, Materials and methods, Patients...",[[chronic obstructive pulmonary disease ( copd...,26150712,26150712,"[[limitation, None, None], [None, None, None, ..."


In [None]:
# --- 步驟 1：(可選) 解除安裝不相容的 torch ---
!pip uninstall torch -y

# --- 步驟 2：安裝 PyTorch 0.4.1 (CPU版) 和 AllenNLP 0.6.1 ---
# (這會從 PyTorch 官方存檔中尋找 torch==0.4.1)
!pip install --isolated --index-url https://pypi.org/simple "torch==0.4.1" -f https://download.pytorch.org/whl/cpu/torch_stable.html && pip install --isolated --index-url https://pypi.org/simple "allennlp==0.6.1"


Found existing installation: torch 1.10.2
Uninstalling torch-1.10.2:
  Successfully uninstalled torch-1.10.2
Looking in links: https://download.pytorch.org/whl/cpu/torch_stable.html
Collecting torch==0.4.1
  Using cached https://download.pytorch.org/whl/cpu/torch-0.4.1-cp36-cp36m-linux_x86_64.whl (91.1 MB)
Installing collected packages: torch
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pytorch-transformers 1.1.0 requires torch>=1.0.0, but you have torch 0.4.1 which is incompatible.
allennlp 0.9.0 requires torch>=1.2.0, but you have torch 0.4.1 which is incompatible.[0m
Successfully installed torch-0.4.1
Collecting allennlp==0.6.1
  Using cached allennlp-0.6.1-py3-none-any.whl (6.9 MB)
Collecting spacy<2.1,>=2.0
  Using cached spacy-2.0.18-cp36-cp36m-manylinux1_x86_64.whl (25.2 MB)
Collecting gevent==1.3.5
  Using cached gevent-1.3.5-cp36-cp36m-manylin

In [3]:

# --- 步驟 3：(在步驟 2 成功後) 重新安裝主套件 ---
!pip install --isolated --index-url https://pypi.org/simple git+https://github.com/titipata/detecting-scientific-claim.git --no-build-isolation

Collecting git+https://github.com/titipata/detecting-scientific-claim.git
  Cloning https://github.com/titipata/detecting-scientific-claim.git to /tmp/pip-req-build-0gqlgjgy
  Running command git clone --filter=blob:none -q https://github.com/titipata/detecting-scientific-claim.git /tmp/pip-req-build-0gqlgjgy
  Resolved https://github.com/titipata/detecting-scientific-claim.git to commit 7a6547c907e20c6df734e956b7fd04f9dd0faac5
  Preparing metadata (setup.py) ... [?25ldone


In [3]:
import subprocess
import sys
import os

# --- 步驟 0：定義我們要的「正確」版本 ---
# (這些是 2018 年 allennlp 0.6.1 所需的古老版本)
REQUIRED_VERSIONS = {
    "torch": "0.4.1",
    "allennlp": "0.6.1"
}

try:
    import torch
    import allennlp
    
    # 檢查 torch 版本
    if torch.__version__.startswith(REQUIRED_VERSIONS["torch"]):
        print(f"Torch 版本正確 (v{torch.__version__}).")
    else:
        print(f"偵測到錯誤的 Torch 版本 (v{torch.__version__}). 正在強制修復...")
        raise ImportError # 觸發修復
        
    # 檢查 allennlp 版本
    if allennlp.__version__.startswith(REQUIRED_VERSIONS["allennlp"]):
        print(f"AllenNLP 版本正確 (v{allennlp.__version__}).")
    else:
        print(f"偵測到錯誤的 AllenNLP 版本 (v{allennlp.__version__}). 正在強制修復...")
        raise ImportError # 觸發修復

except ImportError:
    print("="*50)
    print("偵測到環境不符或套件毀損。正在啟動強制修復...")
    print("這會需要幾分鐘時間來下載和安裝...")
    
    # 我們將使用 subprocess 來執行 pip，這比 !pip 更可靠
    pip_executable = sys.executable.replace("python", "pip")

    try:
        # 1. 解除安裝所有衝突的套件
        print("\n[步驟 1/4] 正在解除安裝舊套件 (torch, allennlp)...")
        subprocess.check_call([pip_executable, "uninstall", "torch", "allennlp", "detecting-scientific-claim", "-y"])

        # 2. 安裝正確的 torch 0.4.1 (CPU 版)
        print("\n[步驟 2/4] 正在安裝 torch==0.4.1 (這會需要一點時間)...")
        subprocess.check_call([
            pip_executable, "install", "--isolated", 
            "--index-url", "https://pypi.org/simple",
            "torch==0.4.1", 
            "-f", "https://download.pytorch.org/whl/cpu/torch_stable.html"
        ])

        # 3. 安裝正確的 allennlp 0.6.1
        print("\n[步驟 3/4] 正在安裝 allennlp==0.6.1...")
        subprocess.check_call([
            pip_executable, "install", "--isolated",
            "--index-url", "https://pypi.org/simple",
            "allennlp==0.6.1"
        ])

        # 4. 安裝主套件 (使用 --no-dependencies 避免它再次覆蓋 torch)
        print("\n[步驟 4/4] 正在安裝主套件 (detecting-scientific-claim)...")
        subprocess.check_call([
            pip_executable, "install", "--isolated",
            "--index-url", "https://pypi.org/simple",
            "git+https://github.com/titipata/detecting-scientific-claim.git",
            "--no-build-isolation",
            "--no-dependencies" # 
        ])

        print("\n環境修復完成！正在重新載入套件...")
        # 重新整理 sys.path 來載入新套件
        import site
        from importlib import reload
        reload(site)
        
    except subprocess.CalledProcessError as e:
        print(f"安裝過程中發生嚴重錯誤: {e}")
        print("請檢查您的網路連線並重試。")
        sys.exit(1)
    except Exception as e:
        print(f"發生未預期錯誤: {e}")
        sys.exit(1)

print("="*50)
print("環境檢查通過。")
print("="*50)

偵測到錯誤的 Torch 版本 (v1.10.2+cu102). 正在強制修復...
偵測到環境不符或套件毀損。正在啟動強制修復...
這會需要幾分鐘時間來下載和安裝...

[步驟 1/4] 正在解除安裝舊套件 (torch, allennlp)...


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



安裝過程中發生嚴重錯誤: Command '['/ocean/projects/cis230089p/slin23/miniconda3/envs/claim_env/bin/pip', 'uninstall', 'torch', 'allennlp', 'detecting-scientific-claim', '-y']' returned non-zero exit status 1.
請檢查您的網路連線並重試。
Traceback (most recent call last):
  File "<ipython-input-3-070dc649bf96>", line 21, in <module>
    raise ImportError # 觸發修復
ImportError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<ipython-input-3-070dc649bf96>", line 41, in <module>
    subprocess.check_call([pip_executable, "uninstall", "torch", "allennlp", "detecting-scientific-claim", "-y"])
  File "/ocean/projects/cis230089p/slin23/miniconda3/envs/claim_env/lib/python3.6/subprocess.py", line 311, in check_call
    raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['/ocean/projects/cis230089p/slin23/miniconda3/envs/claim_env/bin/pip', 'uninstall', 'torch', 'allennlp', 'detecting-scientific-claim', '-y']' returned non-zero exi

TypeError: object of type 'NoneType' has no len()

In [1]:
try:
    import discourse
    from allennlp.predictors.predictor import Predictor
    import torch
except ImportError as e:
    print("="*50)
    print(f"錯誤：套件匯入失敗: {e}")
    print("您似乎還沒有成功安裝好環境。")
    print("請您在終端機 (Terminal) 中執行本檔案最上方的「環境修復指令」。")
    print("="*50)
    sys.exit(1) # 退出程式

錯誤：套件匯入失敗: No module named 'discourse'
您似乎還沒有成功安裝好環境。
請您在終端機 (Terminal) 中執行本檔案最上方的「環境修復指令」。


NameError: name 'sys' is not defined