In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("../../data/all_articles5-v2.csv")
        
df = df.rename(columns={
    "full_text": "article",
    "abstract": "abstract"
})
df = df.dropna(subset=["article", "abstract"])
df = df[df["article"].str.strip().astype(bool)]
df = df[df["abstract"].str.strip().astype(bool)]
df["article"] = df["article"].astype(str)
df["abstract"] = df["abstract"].astype(str)


In [2]:
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [23]:
import ast
test_df['section_names'] = test_df['section_names'].apply(ast.literal_eval)
test_df['sections'] = test_df['sections'].apply(ast.literal_eval)

In [36]:
import re

def distribute_paragraphs_to_six_facets(text):
    facets = [""] * 6
    if not isinstance(text, str) or not text.strip():
        return facets
    
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text.strip()) if p.strip()]
    # print(f"Total paragraphs: {len(paragraphs)}")
    if not paragraphs:
        return facets
    
    total_len = sum(len(p) for p in paragraphs)
    cutoffs = [(total_len * i) // 6 for i in range(1, 6)]
    
    current_facet_idx = 0
    cumulative_len = 0
    facet_contents = [[] for _ in range(6)]
    
    for p in paragraphs:
        p_len = len(p.strip())
        if current_facet_idx < 5 and cumulative_len >= cutoffs[current_facet_idx] and facet_contents[current_facet_idx]:
            current_facet_idx += 1
            
        facet_contents[current_facet_idx].append(p.strip())
        cumulative_len += p_len
    
    for i in range(6):
        facets[i] = "\n\n".join(facet_contents[i])
        
    return facets



In [32]:
import numpy as np
def clean_for_json(data_dict):
    cleaned = {}
    for k, v in data_dict.items():
        if pd.isna(v): # 如果是 NaN 或 None
            cleaned[k] = "" # 轉成空字串
        elif isinstance(v, (np.int64, np.int32)): # 修正 numpy int 問題
            cleaned[k] = int(v)
        elif isinstance(v, (np.float64, np.float32)): # 修正 numpy float 問題
            cleaned[k] = float(v)
        else:
            cleaned[k] = v
    return cleaned

In [33]:
total_rows = len(test_df) 

In [37]:
import json
output_file = "../dataset/simple_split_stream.jsonl"
for index, row in test_df.iterrows():
    # print(len(row['sections']))
    six_facets = distribute_paragraphs_to_six_facets('\n\n'.join(row['sections']))
    dict_row = {
        "first_facet": six_facets[0].strip() if six_facets[0].strip() else "",
        "second_facet": six_facets[1].strip() if six_facets[1].strip() else "",
        "third_facet": six_facets[2].strip() if six_facets[2].strip() else "",
        "fourth_facet": six_facets[3].strip() if six_facets[3].strip() else "",
        "fifth_facet": six_facets[4].strip() if six_facets[4].strip() else "",
        "sixth_facet": six_facets[5].strip() if six_facets[5].strip() else "",
    }
    
    save_data = row.to_dict()
    del save_data['sections']      
    del save_data['section_names']
    save_data.update(dict_row)
    lean_data = clean_for_json(save_data)
    clean_data = clean_for_json(save_data)
    with open(output_file, 'a', encoding='utf-8') as f_out:
        json.dump(clean_data, f_out, ensure_ascii=False)
        f_out.write('\n') 
        f_out.flush() 
    
    if (index + 1) % 10 == 0:
        print(f"Processed {index + 1}/{total_rows} rows...")
    
    

Processed 38490/4631 rows...
Processed 23400/4631 rows...
Processed 33920/4631 rows...
Processed 18680/4631 rows...
Processed 34130/4631 rows...
Processed 32790/4631 rows...
Processed 62470/4631 rows...
Processed 68110/4631 rows...
Processed 17370/4631 rows...
Processed 71860/4631 rows...
Processed 14490/4631 rows...
Processed 36230/4631 rows...
Processed 21220/4631 rows...
Processed 19980/4631 rows...
Processed 31360/4631 rows...
Processed 68230/4631 rows...
Processed 68410/4631 rows...
Processed 61530/4631 rows...
Processed 52160/4631 rows...
Processed 60010/4631 rows...
Processed 41190/4631 rows...
Processed 55620/4631 rows...
Processed 46810/4631 rows...
Processed 51450/4631 rows...
Processed 30380/4631 rows...
Processed 9280/4631 rows...
Processed 7240/4631 rows...
Processed 21040/4631 rows...
Processed 40190/4631 rows...
Processed 19130/4631 rows...
Processed 38590/4631 rows...
Processed 50160/4631 rows...
Processed 16340/4631 rows...
Processed 19490/4631 rows...
Processed 65460/

In [3]:
import pandas as pd
df = pd.read_json("../dataset/simple_split_stream.jsonl", lines=True)
df

Unnamed: 0,article_id,pmid,title,abstract,article,missing_fields,first_facet,second_facet,third_facet,fourth_facet,fifth_facet,sixth_facet
0,PMC696377,31842374,Plasmodesmata Conductivity Regulation: A Mecha...,<S> plant cells form a multicellular symplast ...,"due to their immobile lifestyle, plants are ex...",[],"due to their immobile lifestyle, plants are ex...",this group of proteins include non-secreted cy...,the cytoskeletal proteins actin and myosin hav...,"reticulons (rtnlb), as family of membrane er-t...",the callose contents at pd depends on the acti...,in the selection of proteins that bind to pd-l...
1,PMC958938,36324544,Abderrazak Hajjioui: shining a light on rehabi...,<S> abderrazak hajjioui talks to tatum anderso...,i noticed that moroccan practitioners tended t...,[],i noticed that moroccan practitioners tended t...,,,,,
2,PMC561342,28970690,Successful surgical procedure in a patient of ...,<S> this case marks the beginning of issuing c...,"cci may be done at 10 min, 1 h, or 24 h after ...",[],"cci may be done at 10 min, 1 h, or 24 h after ...",frequent transfusions were required over the y...,"when hla antibodies are present, patients may ...","this case shows us that, when we are facing a ...",,
3,PMC642025,30875402,Assessment of early goal-directed therapy guid...,<S> background\nassessing adherence to early g...,assessing adherence to early goal-directed the...,[],assessing adherence to early goal-directed the...,a panel of five pediatric critical care expert...,a panel of five pediatric critical care expert...,the 28 components were divided into 18 general...,the 28 components were divided into 18 general...,adherence determination based on selected clin...
4,PMC579575,29172801,The new European guideline on cardiovascular d...,<S> abstract\nthe new guideline on cardiovascu...,"in the new guideline , the accessible and simp...",[],"in the new guideline , the accessible and simp...","in the new guideline , the accessible and simp...",the new guideline acknowledges that especially...,organizing broad cvd prevention is still a hug...,to improve the translation of the recommendati...,the new european guideline on cardiovascular d...
...,...,...,...,...,...,...,...,...,...,...,...,...
4626,PMC883360,35158828,"Alternative Splicing, Epigenetic Modifications...",<S> simple summary\nepigenetics studies the al...,epigenetics studies the alteration of gene exp...,[],epigenetics studies the alteration of gene exp...,when these islands are located in gene promote...,due to the already mentioned coupling with tra...,since the launching of the encode project in 2...,remodeling of chromatin is often considered as...,"in a certain sense, present section covers an ..."
4627,PMC424766,25403234,Let it grow—the open market solution to mariju...,<S> this commentary evaluates regulatory frame...,this article presents the case for legalizing ...,[],this article presents the case for legalizing ...,this article presents the case for legalizing ...,marijuana’s prospective legalization should be...,marijuana’s legalization raises numerous criti...,the current regulatory model for marijuana is ...,what is missing from most analyses is recognit...
4628,PMC766392,33167486,Gated Dehazing Network via Least Square Advers...,"<S> in a hazy environment, visibility is reduc...","especially, haze reduces the visibility of the...",[],"especially, haze reduces the visibility of the...",a haze image ix acquired by a digital image se...,"to remove the haze, the proposed generator tak...",the loss function of the proposed method consi...,"for example, in an outdoor environment, the de...",in order to evaluate the qualitative performan...
4629,PMC436037,21147631,Lung mass in a 28-year-old male: A case report...,<S> a twenty eight-year-old male presented wit...,it has distinct pathologic features and occurs...,[],it has distinct pathologic features and occurs...,hepatocellular carcinoma is a highly malignant...,,,,
