In [None]:
import os
import sys
import glob
import logging
import json
import xml.etree.ElementTree as ET

from grobid_client.grobid_client import GrobidClient

In [None]:
# Directory layout for the run: input_dir is handed to GROBID as the source
# folder, output_dir receives its results, and json_output_dir (a JSON
# sub-folder of output_dir) holds the JSON Lines conversions.
input_dir = "/home/kali/Documents/Grobid"
output_dir = f"{input_dir}/Output"
json_output_dir = f"{output_dir}/JSON"

In [None]:
# Make sure both output directories exist before anything is written to them.
# exist_ok=True lets os.makedirs deal with the "already there" case itself, so
# there is no gap between a separate existence check and the creation call in
# which another process could create the directory and cause a failure.
for directory in (output_dir, json_output_dir):
    os.makedirs(directory, exist_ok=True)

In [None]:
# Create the GROBID client, passing the config.json file in the current
# working directory as its configuration path.
# NOTE(review): the file presumably holds the GROBID server settings —
# confirm against the grobid_client documentation.
client = GrobidClient(config_path="./config.json")

In [None]:
# Configure logging to write to pdf_extraction.log at ERROR level, with each
# record formatted as "<timestamp> <level name>: <message>".
logging.basicConfig(
    level=logging.ERROR,
    filename='pdf_extraction.log',
    format='%(asctime)s %(levelname)s: %(message)s',
)

In [None]:
# Number of extra attempts made after the first one fails
MAX_RETRIES = 1
total_attempts = MAX_RETRIES + 1

for attempt in range(1, total_attempts + 1):
    try:
        # Ask the client to run "processFulltextDocument" on input_dir, with
        # output_dir as the output location. Only this call is guarded so that
        # unrelated errors are not mistaken for extraction failures.
        client.process(
            "processFulltextDocument",
            input_dir,
            output=output_dir,
            consolidate_citations=True,
            tei_coordinates=True,
            force=True,
        )
    except Exception as e:
        # logging.exception records the message at ERROR level together with
        # the traceback, so the log file shows where the failure happened.
        if attempt == total_attempts:
            # Out of attempts: log and let the exception propagate
            logging.exception("PDF extraction failed after %d attempts: %s", total_attempts, e)
            raise
        # Attempts remain: log, tell the user, and go around again
        logging.exception("PDF extraction failed in attempt %d/%d: %s", attempt, total_attempts, e)
        print(f"PDF extraction failed in attempt {attempt}/{total_attempts}. Retrying...")
    else:
        print(f"PDF extraction successful in attempt {attempt}/{total_attempts}")
        break

In [None]:
def _group_text_by_tag(root):
    """Collect the ``.text`` of every element under ``root``, grouped by tag.

    Args:
        root: An ``xml.etree.ElementTree.Element`` to walk (the element itself
            is included in the walk).

    Returns:
        A dict mapping each tag name to a list with one entry per element
        carrying that tag, in document order. Each entry is the element's
        ``.text`` (a string, or ``None`` when the element has no leading text).

    Note:
        ``Element.text`` only holds the text that precedes the element's first
        child; text that follows a child element is stored on that child's
        ``.tail`` and is not captured here.
    """
    grouped = {}
    for elem in root.iter():
        grouped.setdefault(elem.tag, []).append(elem.text)
    return grouped


# Convert XML files to JSON Lines files.
# sorted() makes the processing order (and therefore the printed progress and
# log output) reproducible; os.listdir returns names in arbitrary order.
xml_suffix = ".xml"
for filename in sorted(os.listdir(output_dir)):
    if not filename.endswith(xml_suffix):
        continue

    xml_file_path = os.path.join(output_dir, filename)
    # Swap the trailing ".xml" for ".jsonl", keeping the rest of the name
    base_name = filename[:-len(xml_suffix)]
    json_file_path = os.path.join(json_output_dir, f"{base_name}.jsonl")

    try:
        # Parse the XML file and flatten it into a JSON-serializable dict
        root = ET.parse(xml_file_path).getroot()
        data = _group_text_by_tag(root)

        # Write the dict as a single JSON document followed by a newline
        # (one record per file, JSON Lines style)
        with open(json_file_path, "w", encoding="utf-8") as f:
            json.dump(data, f)
            f.write("\n")
        print(f"Converted {xml_file_path} to {json_file_path}")
    except Exception as e:
        # Best effort: a file that fails to convert is logged (with traceback)
        # and skipped so the remaining files are still processed.
        logging.exception("Error converting %s to JSON: %s", xml_file_path, e)