In [3]:
import os
from pathlib import Path
from pprint import pprint
import json

In [13]:
# Step 1: Open and load the JSON file.
# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding, so non-ASCII characters in the input load identically on every OS.
with open('outputOCR.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# Step 2: Re-shape the loaded sections into a fixed set of top-level categories.
#
# Reads the module-level `data` dict (loaded from the JSON file in the previous
# cell) and expects data['sections'] to be a list of dicts with an optional
# 'heading' string and an optional 'content' list of strings.

# Initialize structured format
structured_data = {
    "Introduction": [],
    "Methods": [],
    "Results and Discussion": [],
    "Conclusion": [],
    "Tables": [],
    "Figures": [],
    "Other": []
}

# Canonical category names paired with the lowercase keywords that select them.
# Order matters: the first entry with a keyword found in the heading wins.
SECTION_KEYWORDS = (
    ("Introduction", ("introduction",)),
    ("Methods", ("method",)),
    ("Results and Discussion", ("results", "discussion")),
    ("Conclusion", ("conclusion",)),
)


def _match_table_heading(lines, start):
    """Try to interpret lines[start] (a line starting with "Table") as a table title.

    Returns:
        (heading, body_index): the table heading text and the index of the first
        "|" row, or (None, start) when no "|" row follows within two lines.
    """
    following = start + 1
    # Title directly followed by the table body. The bounds check (rather than
    # requiring two lines of lookahead) also catches a one-row table that is
    # the last thing in the section.
    if following < len(lines) and lines[following].startswith("|"):
        return lines[start], following

    # Account for the possibility of table headings being two lines long
    second = start + 2
    if second < len(lines) and lines[second].startswith("|"):
        # Any line that starts with "<!--" is not part of the heading
        if lines[following].startswith("<!--"):
            return lines[start], second
        return lines[start] + " " + lines[following], second

    return None, start


def _extract_tables_and_figures(lines):
    """Separate figure captions and tables from a section's content lines.

    Heuristics:
      * a line starting with "Fig" is a figure caption;
      * a line of the form "<!-- ... -->" is a marker and is dropped;
      * a line starting with "Table" followed (within two lines) by a line
        starting with "|" is a table title; all consecutive "|" lines after it
        form the table body.

    Returns:
        (text, figures, tables): remaining content lines, figure caption
        strings, and a list of {"heading": str, "content": [rows]} dicts.
    """
    text, figures, tables = [], [], []
    i = 0
    while i < len(lines):
        line = lines[i]
        if line.startswith("Fig"):
            figures.append(line)
        elif line.startswith("<!--") and line.endswith("-->"):
            # Remove "<!-- image -->", etc demarcations
            pass
        elif line.startswith("Table"):
            table_heading, body_index = _match_table_heading(lines, i)
            if table_heading is not None:
                i = body_index
                rows = []
                # Assume a line starts with "|" if and only if it is a table row;
                # collect all consecutive rows.
                while i < len(lines) and lines[i].startswith("|"):
                    rows.append(lines[i])
                    i += 1
                tables.append({
                    "heading": table_heading,
                    "content": rows
                })
                continue  # i already points past the table
            # A "Table..." line with no table after it is ordinary text.
            text.append(line)
        else:
            text.append(line)
        i += 1
    return text, figures, tables


# Iterate through sections to categorize them.
# `previous` is the category of the last recognised heading; sections with an
# unrecognised heading are filed under it (starting with "Other").
previous = "Other"
for section in data['sections']:
    # Keep the original heading for output; match keywords case-insensitively.
    # A missing or null heading is treated as the empty string.
    original_heading = section.get('heading') or ""
    heading_key = original_heading.lower()

    # `or []` also covers an explicit null 'content' value.
    text, figures, tables = _extract_tables_and_figures(section.get('content') or [])
    structured_data["Figures"].extend(figures)
    structured_data["Tables"].extend(tables)

    entry = {
        "heading": original_heading,
        "content": text
    }

    category = next(
        (name for name, keywords in SECTION_KEYWORDS
         if any(keyword in heading_key for keyword in keywords)),
        None,
    )
    if category is not None:
        structured_data[category].append(entry)
        previous = category
    elif "table" in heading_key:
        # A section whose heading mentions a table is filed under Tables;
        # it does not change `previous`.
        structured_data["Tables"].append(entry)
    elif previous != "Conclusion":
        structured_data[previous].append(entry)
    # else: unrecognised sections that come after the Conclusion are dropped.


# Save the structured JSON
with open('standardized_output.json', 'w', encoding='utf-8') as outfile:
    json.dump(structured_data, outfile, indent=2)

print("Standardization complete. JSON saved as 'standardized_output.json'")

Standardization complete. JSON saved as 'standardized_output.json'
