In [33]:
import pandas as pd
import json
import os

def process_claims_data(json_file_path: str) -> pd.DataFrame:
    """
    Flatten one paper's JSON file into a claim/figure table and save it as CSV.

    The JSON object is read for three keys: 'paper_id', 'figures' (a mapping
    from figure key to figure details) and 'claims' (a list of claim objects).
    One row is produced for every (claim sentence, referenced figure) pair
    whose figure key exists in 'figures'. Exact duplicate rows are removed and
    the result is written to '<paper_id>.csv' in the same directory as the
    input file.

    Args:
        json_file_path: Path to the input JSON file.

    Returns:
        The de-duplicated DataFrame with columns 'paper_id', 'claim',
        'figure_id', 'title', 'caption', 'local_image_path' and 'url'.
        An empty, column-less DataFrame is returned (and nothing is written)
        when the file is missing, is not valid JSON, does not hold a JSON
        object, or yields no valid rows.
    """
    # --- Step 1: Load and Validate JSON ---
    try:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: The file '{json_file_path}' was not found.")
        return pd.DataFrame() # Return empty DataFrame on error
    except (json.JSONDecodeError, UnicodeDecodeError):
        print(f"Error: The file '{json_file_path}' is not a valid JSON file.")
        return pd.DataFrame()

    # Valid JSON can still be a list/number/string, which has no .get().
    if not isinstance(data, dict):
        print(f"Error: The file '{json_file_path}' does not contain a JSON object.")
        return pd.DataFrame()

    # --- Step 2: Process the Data ---
    records = []
    paper_id = data.get('paper_id', 'unknown_paper')
    # `or` also covers keys that are present but explicitly null.
    figures_data = data.get('figures') or {}
    claims_data = data.get('claims') or []

    # Iterate through each claim, as each claim is a row
    for claim in claims_data:
        claim_sentence = claim.get('sentence')
        if not claim_sentence:
            continue # Skip if the claim has no sentence

        # A claim can reference multiple figures, so create a row for each reference
        for fig_ref in claim.get('figure_references') or []:
            figure_key = fig_ref.get('figure_key')
            if not figure_key or figure_key not in figures_data:
                continue # Skip if the reference is invalid

            figure_info = figures_data[figure_key]

            # Create a dictionary for this specific claim-figure pair
            records.append({
                'paper_id': paper_id,
                'claim': claim_sentence,
                'figure_id': f"{paper_id}_{figure_key}",
                'title': figure_info.get('title'),
                'caption': figure_info.get('caption'),
                'local_image_path': figure_info.get('local_image_path'),
                'url': figure_info.get('final_url')
            })

    if not records:
        print("Warning: No valid records were created from the input file.")
        return pd.DataFrame()

    # Create the initial DataFrame
    df = pd.DataFrame(records)

    # --- Step 3: Clean Duplicates and Save ---
    # Remove fully duplicated rows (same claim pointing to the same figure)
    df_cleaned = df.drop_duplicates().reset_index(drop=True)

    # Write next to the input file. Deriving the directory with os.path keeps
    # this correct whatever the input file is called, and leaves directory
    # names untouched.
    output_filename = os.path.join(os.path.dirname(json_file_path), f"{paper_id}.csv")
    df_cleaned.to_csv(output_filename, index=False)

    print(f"✅ Processed {len(claims_data)} claims into {len(df_cleaned)} unique rows.")
    print(f"✅ Data saved to '{output_filename}'")

    return df_cleaned

In [35]:
from pathlib import Path

# Discover every per-paper input file below the data directory.
root_dir = Path("./data/")
json_files = list(root_dir.rglob("*/complete_data.json"))
if not json_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No 'complete_data.json' files found under '{root_dir}'.")

# A claim is kept only if it is longer than this many characters.
MIN_CLAIM_LENGTH = 100

all_dfs = []
count = 0

for j_file in json_files:
    df_result = process_claims_data(str(j_file))
    count += df_result.shape[0]
    # process_claims_data returns an empty, column-less frame when a file could
    # not be used; keep those out of the concat so they cannot affect dtypes.
    if not df_result.empty:
        all_dfs.append(df_result)

if not all_dfs:
    raise ValueError("None of the input files produced any claim rows.")

# Build the SUPPORT set in one chain: keep the first row per distinct claim,
# drop short claims, then label every remaining row. `.assign` returns a new
# frame, so no column is set on a filtered slice (no SettingWithCopyWarning).
combined_df = (
    pd.concat(all_dfs, ignore_index=True)
    .drop_duplicates(subset='claim', keep='first')
    .loc[lambda frame: frame['claim'].str.len() > MIN_CLAIM_LENGTH]
    .assign(**{'class': 'SUPPORT'})
)
combined_df.to_csv("all_support_data.csv", index=False)

print(f"Total number of rows processed across all files: {count}")
print(f"Combined DataFrame shape: {combined_df.shape}")

✅ Processed 130 claims into 9 unique rows.
✅ Data saved to 'data/PMC4870313/PMC4870313.csv'
✅ Processed 66 claims into 8 unique rows.
✅ Data saved to 'data/PMC12386565/PMC12386565.csv'
✅ Processed 23 claims into 4 unique rows.
✅ Data saved to 'data/PMC10444918/PMC10444918.csv'
✅ Processed 48 claims into 7 unique rows.
✅ Data saved to 'data/PMC12316108/PMC12316108.csv'
✅ Processed 20 claims into 2 unique rows.
✅ Data saved to 'data/PMC10171250/PMC10171250.csv'
✅ Processed 100 claims into 10 unique rows.
✅ Data saved to 'data/PMC8943845/PMC8943845.csv'
✅ Processed 20 claims into 3 unique rows.
✅ Data saved to 'data/PMC9872512/PMC9872512.csv'
✅ Processed 40 claims into 6 unique rows.
✅ Data saved to 'data/PMC6435624/PMC6435624.csv'
✅ Processed 98 claims into 11 unique rows.
✅ Data saved to 'data/PMC6264471/PMC6264471.csv'
✅ Processed 19 claims into 3 unique rows.
✅ Data saved to 'data/PMC8786220/PMC8786220.csv'
✅ Processed 19 claims into 15 unique rows.
✅ Data saved to 'data/PMC6141386/PM

In [36]:
# Preview the first two rows of the combined frame (every row is labelled SUPPORT).
combined_df.head(2)

Unnamed: 0,paper_id,claim,figure_id,title,caption,local_image_path,url,class
0,PMC4870313,"Accordingly, the number of citable articles ha...",PMC4870313_figure_1,Fig. 2. Number of funded and unfunded articles...,,./data/PMC4870313/images/figure_1.jpg,https://cdn.ncbi.nlm.nih.gov/pmc/blobs/1141/48...,SUPPORT
1,PMC4870313,The proportion of funded research has fluctuat...,PMC4870313_figure_2,Fig. 3. Countries of editorial board members o...,,./data/PMC4870313/images/figure_2.jpg,https://cdn.ncbi.nlm.nih.gov/pmc/blobs/1141/48...,SUPPORT


In [1]:
import pandas as pd
# Load the perturbed-claims table and report which columns it carries.
df = pd.read_csv("all_perturbed_data.csv")
print(df.columns)

# Narrow the frame to the two claim columns, the figure id, the caption and the label.
kept_columns = ['claim', 'new_claim', 'figure_id', 'caption', 'class']
df = df[kept_columns]
df.head()

Index(['paper_id', 'claim', 'figure_id', 'title', 'caption',
       'local_image_path', 'url', 'class', 'new_claim'],
      dtype='object')


Unnamed: 0,claim,new_claim,figure_id,caption,class
0,"As shown in Figure 1, the workflow concluded w...","As shown in Figure 1, the workflow started by ...",PMC10001304_figure_1,Research workflow.,CONTRADICTION
1,The research framework does not illustrate ana...,Research Framework The research framework illu...,PMC10001304_figure_2,Research framework.,CONTRADICTION
2,The research framework does not illustrate the...,The research framework illustrates the analysi...,PMC10001304_figure_2,Research framework.,CONTRADICTION
3,"“As shown in Figure 3, the earliest studies on...","As shown in Figure 3, the earliest studies on ...",PMC10001304_figure_3,Emergence of each type of intervention program.,CONTRADICTION
4,South Korea had the highest research volume in...,China had the highest research volume in terms...,PMC10001304_figure_4,Countries/regions engaged in academic research.,CONTRADICTION


In [2]:
# Print the first 20 (claim, new_claim) pairs, one blank line between pairs.
# head(20) bounds the iteration without a manual counter, and selecting the two
# columns by name avoids depending on the positional order of df's columns.
for claim, new_claim in df[['claim', 'new_claim']].head(20).itertuples(index=False, name=None):
    print(claim)
    print(new_claim, '\n')

As shown in Figure 1, the workflow concluded with collecting journal articles from June to August 2022, excluding irrelevant and incompletely published studies, and performing bibliometric analyses prior to data collection.
As shown in Figure 1, the workflow started by collecting journal articles from June to August 2022, excluding irrelevant and incompletely published studies, and performing bibliometric analyses. 

The research framework does not illustrate analysis results based on scale, time, space, or composition; instead, it presents a single-dimensional summary without reference to descriptive or LDA analyses, the timeline of the 104 studies, country-level research capacity distribution, or network visualization (Figure 2).
Research Framework The research framework illustrates the analysis results based on scale, time, space, and composition. “Scale” includes descriptive and LDA analyses. “Time” refers to the analysis of the timeline of the 104 studies. “Space” refers to the di

In [3]:
# List the distinct labels present in the 'class' column.
df['class'].unique()

array(['CONTRADICTION'], dtype=object)

In [12]:
# One CSV per label; they are stacked into a single frame.
files = ['all_support_data.csv', 'all_perturbed_data_without_judge.csv', 'all_neutral_data.csv']

# Read every file first and concatenate once. Growing a frame with concat
# inside a loop re-copies all earlier rows on each pass, and seeding it with an
# empty pd.DataFrame() relies on pandas ignoring empty frames when it picks
# result dtypes.
df = pd.concat([pd.read_csv(file) for file in files], ignore_index=True)
df.shape

(11472, 9)

In [13]:
# Count how many rows carry each label in the 'class' column.
df['class'].value_counts()

class
SUPPORT          3990
CONTRADICTION    3990
NEUTRAL          3492
Name: count, dtype: int64

In [14]:
# Preview the leading rows of df.
df.head()

Unnamed: 0,paper_id,claim,figure_id,title,caption,local_image_path,url,class,new_claim
0,PMC4870313,"Accordingly, the number of citable articles ha...",PMC4870313_figure_1,Fig. 2. Number of funded and unfunded articles...,,./data/PMC4870313/images/figure_1.jpg,https://cdn.ncbi.nlm.nih.gov/pmc/blobs/1141/48...,SUPPORT,"Accordingly, the number of citable articles ha..."
1,PMC4870313,The proportion of funded research has fluctuat...,PMC4870313_figure_2,Fig. 3. Countries of editorial board members o...,,./data/PMC4870313/images/figure_2.jpg,https://cdn.ncbi.nlm.nih.gov/pmc/blobs/1141/48...,SUPPORT,The proportion of funded research has fluctuat...
2,PMC4870313,The editorial board consists of 61 specialists...,PMC4870313_figure_3,Fig. 4. Countries of authors of 392 articles p...,,./data/PMC4870313/images/figure_3.jpg,https://cdn.ncbi.nlm.nih.gov/pmc/blobs/1141/48...,SUPPORT,The editorial board consists of 61 specialists...
3,PMC4870313,Authors of CiOS were from 20 countries: most o...,PMC4870313_figure_4,Fig. 5. Countries of authors who cited Clinics...,,./data/PMC4870313/images/figure_4.jpg,https://cdn.ncbi.nlm.nih.gov/pmc/blobs/1141/48...,SUPPORT,Authors of CiOS were from 20 countries: most o...
4,PMC4870313,The largest number of citing authors was in th...,PMC4870313_figure_5,Fig. 6. Changing pattern of total citations of...,,./data/PMC4870313/images/figure_5.jpg,https://cdn.ncbi.nlm.nih.gov/pmc/blobs/1141/48...,SUPPORT,The largest number of citing authors was in th...


In [15]:
# Write df to CSV; index=False leaves the row index out of the file.
df.to_csv('all_data_without_judge_with_fig_ref.csv', index=False)