In [1]:
import pandas as pd
from pathlib import Path

import json

# Load the training corpus: one JSON document per line (JSON Lines).
# The encoding is given explicitly because open() otherwise falls back to the
# platform locale, which can mis-decode or fail on non-ASCII text.
data = []
with open("../data/train.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at end of file); json.loads
        # would raise on them.
        if line.strip():
            data.append(json.loads(line))

df = pd.DataFrame(data)

# Directory that receives the plain-text export written further below.
out_dir = Path("for_limitation_txt")
out_dir.mkdir(exist_ok=True)
df

Unnamed: 0,article_id,article_text,abstract_text,labels,section_names,sections
0,PMC3872579,[a recent systematic analysis showed that in 2...,[<S> background : the present study was carrie...,,"[INTRODUCTION, MATERIALS AND METHODS, Particip...",[[a recent systematic analysis showed that in ...
1,PMC3770628,[it occurs in more than 50% of patients and ma...,[<S> backgroundanemia in patients with cancer ...,,"[Introduction, Patients and methods, Study des...",[[it occurs in more than 50% of patients and m...
2,PMC5330001,"[tardive dystonia ( td ) , a rarer side effect...",[<S> tardive dystonia ( td ) is a serious side...,,"[INTRODUCTION, CASE REPORT, DISCUSSION, Declar...","[[tardive dystonia ( td ) , a rarer side effec..."
3,PMC4386667,"[lepidoptera include agricultural pests that ,...",[<S> many lepidopteran insects are agricultura...,,"[1. Introduction, 2. Insect Immunity, 3. Signa...",[[lepidoptera include agricultural pests that ...
4,PMC4307954,[syncope is caused by transient diffuse cerebr...,[<S> we present an unusual case of recurrent c...,,"[Introduction, Case report, Discussion, Confli...",[[syncope is caused by transient diffuse cereb...
...,...,...,...,...,...,...
119919,PMC3502213,[eukaryotic cells depend on vesicle - mediated...,[<S> long - distance trafficking of membranous...,,"[Introduction, Motor-Dependent Transport of Ra...",[[eukaryotic cells depend on vesicle - mediate...
119920,PMC3198562,[as regards the selection criteria of the post...,[<S> aims and objectives : to study the stress...,,"[INTRODUCTION, MATERIALS AND METHODS, Modeling...",[[fiber post systems are routinely used in res...
119921,PMC4436536,[in most of the peer review publications in th...,[<S> abstractbackgroundthe objective of this s...,,"[Introduction, Methods, Results, Discussion, L...",[[in most of the peer review publications in t...
119922,PMC4251613,[the reveal registry is a longitudinal registr...,[<S> background : patients with pulmonary arte...,,"[TRIAL REGISTRY:, Materials and Methods, REVEA...","[[], [the reveal registry is a longitudinal re..."


In [2]:
import re
import numpy as np
import pickle

# Load the section-heading lookup used by map_normalized_sections below, which
# iterates its keys and tests membership in each value.
# NOTE(review): pickle.load can execute arbitrary code from the file; this
# presumably is a project-generated artifact -- confirm it is trusted.
sections_pickle_path = '../../section_normalization/structured_abstract_sections.pkl'
with open(sections_pickle_path, mode='rb') as pickle_file:
    normalized_sections = pickle.load(pickle_file)

def map_normalized_sections(row, norm_sections):
    """Map each section heading of one article to a coarse section label.

    Every heading in ``row['section_names']`` is lower-cased, stripped of a
    leading outline number (e.g. ``"1. "``, ``"2.1) "``) and surrounding
    whitespace, then looked up in ``norm_sections``. The first category whose
    collection contains the normalized heading decides the label.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing
            ``'section_names'`` as an iterable of headings. Non-string
            headings are converted with ``str()`` before normalization.
        norm_sections: Mapping from category name ('background', 'objective',
            'methods', 'results', 'conclusions') to a collection of
            normalized heading strings.

    Returns:
        list: One entry per heading, in order: 'introduction', 'methods',
        'results' or 'discussion' when a category matches, otherwise ``None``.

    Raises:
        KeyError: If a heading matches a category of ``norm_sections`` that
            has no entry in the internal category-to-label mapping.
    """
    mapping = {'background': 'introduction', 'objective': 'introduction', 'methods': 'methods', 'results': 'results', 'conclusions': 'discussion'}

    return_list = []
    for section_name in row['section_names']:
        # Normalize once per heading; the result does not depend on the
        # category being tested, so it is computed outside the inner loop.
        compare = str(section_name).lower()
        compare = re.sub(r'^\s*\d+(?:\.\d+)*[)\.\u3001\uFF0E]?\s*', '', compare).strip()

        label = None
        for norm_sec in norm_sections:
            if compare in norm_sections[norm_sec]:
                label = mapping[norm_sec]
                break  # first matching category wins
        return_list.append(label)

    return return_list

# Label every article row-wise: each row's section headings are mapped to
# coarse labels using the lookup loaded above, passed as an extra argument.
df['labels'] = df.apply(map_normalized_sections, axis=1, args=(normalized_sections,))
df

Unnamed: 0,article_id,article_text,abstract_text,labels,section_names,sections
0,PMC3872579,[a recent systematic analysis showed that in 2...,[<S> background : the present study was carrie...,"[introduction, methods, methods, methods, meth...","[INTRODUCTION, MATERIALS AND METHODS, Particip...",[[a recent systematic analysis showed that in ...
1,PMC3770628,[it occurs in more than 50% of patients and ma...,[<S> backgroundanemia in patients with cancer ...,"[introduction, methods, methods, methods, resu...","[Introduction, Patients and methods, Study des...",[[it occurs in more than 50% of patients and m...
2,PMC5330001,"[tardive dystonia ( td ) , a rarer side effect...",[<S> tardive dystonia ( td ) is a serious side...,"[introduction, methods, discussion, None, None...","[INTRODUCTION, CASE REPORT, DISCUSSION, Declar...","[[tardive dystonia ( td ) , a rarer side effec..."
3,PMC4386667,"[lepidoptera include agricultural pests that ,...",[<S> many lepidopteran insects are agricultura...,"[introduction, None, None, None, None, None, N...","[1. Introduction, 2. Insect Immunity, 3. Signa...",[[lepidoptera include agricultural pests that ...
4,PMC4307954,[syncope is caused by transient diffuse cerebr...,[<S> we present an unusual case of recurrent c...,"[introduction, methods, discussion, introduction]","[Introduction, Case report, Discussion, Confli...",[[syncope is caused by transient diffuse cereb...
...,...,...,...,...,...,...
119919,PMC3502213,[eukaryotic cells depend on vesicle - mediated...,[<S> long - distance trafficking of membranous...,"[introduction, None, None, discussion]","[Introduction, Motor-Dependent Transport of Ra...",[[eukaryotic cells depend on vesicle - mediate...
119920,PMC3198562,[as regards the selection criteria of the post...,[<S> aims and objectives : to study the stress...,"[introduction, methods, None, None, results, d...","[INTRODUCTION, MATERIALS AND METHODS, Modeling...",[[fiber post systems are routinely used in res...
119921,PMC4436536,[in most of the peer review publications in th...,[<S> abstractbackgroundthe objective of this s...,"[introduction, methods, results, discussion, d...","[Introduction, Methods, Results, Discussion, L...",[[in most of the peer review publications in t...
119922,PMC4251613,[the reveal registry is a longitudinal registr...,[<S> background : patients with pulmonary arte...,"[None, methods, None, methods, methods, result...","[TRIAL REGISTRY:, Materials and Methods, REVEA...","[[], [the reveal registry is a longitudinal re..."


In [3]:
def to_lines(section_name, sentences):
    """Flatten one section's sentences into plain-text output lines.

    Each sentence is converted to text, stripped, and has embedded newlines
    replaced by spaces so that it occupies exactly one output line. ``None``
    entries and sentences that are empty or whitespace-only are dropped.
    An empty string is always appended last so that consecutive sections are
    separated by a blank line once the lines are joined with newlines.

    Args:
        section_name: Heading of the section. Not used in the output; kept so
            existing callers do not have to change.
        sentences: Iterable of sentences (strings; other values are converted
            with ``str()``).

    Returns:
        list[str]: The cleaned sentences followed by one empty string.
    """
    lines = []
    for sentence in sentences:
        if sentence is None:
            continue
        # Coerce before stripping so a non-string value cannot pass the
        # emptiness check and then fail on a missing .strip() method.
        text = str(sentence).strip().replace("\n", " ").strip()
        if text:
            lines.append(text)
    lines.append("")  # blank line separating sections
    return lines

# Write one plain-text file per article: the sentences of every section, one
# per line, with a blank line after each section.
for i, row in df.iterrows():
    doc_lines = [
        text_line
        for name, sent_list in zip(row["section_names"], row["sections"])
        for text_line in to_lines(name, sent_list)
    ]

    # File names are 1-based: row index 0 becomes doc_1.txt.
    target_path = out_dir / f"doc_{i+1}.txt"
    target_path.write_text("\n".join(doc_lines), encoding="utf-8")


In [None]:
import ast
import pandas as pd
from pathlib import Path
import re 

def to_webanno_tsv_full(section_name, sentences, section_label, sent_offset, token_offset):
    """Render one section as token-per-line TSV rows.

    The section heading is emitted as the first "sentence", followed by the
    body sentences. Each token becomes one row of the form
    ``SENT_ID-TOKEN_ID <Tab> TOKEN_TEXT <Tab> SPAN_ANNOTATION``, and every
    sentence is followed by an empty string acting as a separator. Token ids
    restart at 1 in every sentence; sentence ids continue from ``sent_offset``.

    Only the heading tokens are annotated, and only when a label is given:
    the first token gets ``B-<label>|<label>``, the rest ``I-<label>``.
    All other rows have an empty annotation field.

    NOTE(review): the rows carry no ``#Text=`` line and no character-offset
    column; confirm against the WebAnno TSV 3.x specification that the target
    importer accepts this layout.

    Args:
        section_name: Section heading; converted with ``str()`` and stripped.
        sentences: Iterable of body sentences. Blank sentences and sentences
            whose stripped text equals the heading are skipped.
        section_label: Label for the heading, or ``None`` when the section is
            unmapped, in which case the heading is left unannotated.
        sent_offset: Sentence id to assign to the first emitted sentence.
        token_offset: Unused; kept for backward compatibility with callers.

    Returns:
        tuple[list[str], int]: The TSV lines and the sentence id to use for
        the next section.
    """
    name = str(section_name).strip()
    # An unmapped section has label None; str(None) would yield the truthy
    # text "None" and produce bogus "B-None|None" tags, so treat it as empty.
    label = "" if section_label is None else str(section_label).strip()

    # Heading first, then body sentences that are non-blank and do not merely
    # repeat the heading.
    body = [s for s in sentences if str(s).strip() and str(s).strip() != name]

    lines = []
    current_sent_idx = sent_offset
    for position, sent in enumerate([name] + body):
        sent_clean = str(sent).strip().replace("\n", " ").strip()
        if not sent_clean:
            continue

        # The heading is identified by position rather than by comparing
        # text, so a heading containing a newline is still annotated.
        is_section_name = position == 0

        # Whitespace tokenization; token ids start at 1 in every sentence.
        for token_idx, token in enumerate(sent_clean.split(), start=1):
            tag = ""
            if is_section_name and label:
                tag = f"B-{label}|{label}" if token_idx == 1 else f"I-{label}"
            lines.append(f"{current_sent_idx}-{token_idx}\t{token}\t{tag}")

        lines.append("")  # sentence separator
        current_sent_idx += 1

    return lines, current_sent_idx

# --- 主迴圈部分 ---

# --- Main loop: export every article as one TSV file ---

out_dir = Path("./output_webanno_final_v3")
out_dir.mkdir(exist_ok=True)

for i, row in df.iterrows():
    # Format declaration and span-layer declaration go at the very top of the
    # file, followed by one blank line.
    header_lines = ["#FORMAT=WebAnno TSV 3.3", "#T_SP: section-level|label", ""]

    body_lines = []
    next_sent_id = 1  # sentence ids start at 1 in every document

    # zip stops at the shortest of the three per-article lists, so sections
    # without a matching heading or label are ignored.
    for name, sent_list, label in zip(row["section_names"], row["sections"], row["labels"]):
        section_lines, next_sent_id = to_webanno_tsv_full(name, sent_list, label, next_sent_id, 1)
        body_lines.extend(section_lines)

    # Drop trailing blank separator lines before writing.
    while body_lines and not body_lines[-1].strip():
        body_lines.pop()

    # File names are 1-based: row index 0 becomes doc_1.tsv.
    output_filename = out_dir / f"doc_{i+1}.tsv"
    output_filename.write_text("\n".join(header_lines + body_lines), encoding="utf-8")

print(f"✅ 檔案已修正為 WebAnno TSV 3.3 完整 Token 格式 (SENT_ID-TOKEN_ID)，請再次導入。輸出至 {out_dir}")

✅ 檔案已修正為 WebAnno TSV 3.3 完整 Token 格式 (SENT_ID-TOKEN_ID)，請再次導入。輸出至 output_webanno_final_v3
