In [11]:
import json
import os

def is_valid_json(file_path):
    """Report whether the file at ``file_path`` parses as JSON.

    The file is opened as UTF-8 text and handed to ``json.load``; the parsed
    value itself is discarded.

    Args:
        file_path: Path of the file to check.

    Returns:
        True if the content parses as JSON. False if the content is malformed
        JSON or cannot be decoded as UTF-8.

    Raises:
        OSError: If the file cannot be opened; this is not caught here.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            json.load(file)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Undecodable bytes are just another way of not being valid JSON.
        # Returning False keeps one bad file from aborting a directory scan.
        return False
    return True

def fix_json_file(file_path):
    """Strip a Markdown code fence from a JSON file and rewrite it in place.

    The file is read as UTF-8 text. Blank lines at the start and end are
    ignored, an opening fence line ("```json" or a bare "```") and a closing
    fence line ("```") are removed if present, and the remainder is parsed as
    JSON. On success the file is overwritten with the parsed data serialised
    at a 4-space indent; on failure the file is left untouched and the error
    is printed.

    Args:
        file_path: Path of the file to repair.

    Returns:
        True if the file was parsed and rewritten, False if it could not be
        decoded as UTF-8 or still was not valid JSON after removing fences.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except UnicodeDecodeError as e:
        print(f"Error fixing {file_path}: {e}")
        return False

    # Blank lines around the payload carry no JSON meaning, but they would
    # hide a fence that is not literally the first or last line of the file.
    while lines and not lines[0].strip():
        del lines[0]
    while lines and not lines[-1].strip():
        del lines[-1]

    # Guard each index with `lines and ...`: an empty file, or one holding
    # nothing but a fence, would otherwise raise IndexError.
    if lines and lines[0].strip().lower() in ('```json', '```'):
        lines = lines[1:]  # Remove the opening fence
    if lines and lines[-1].strip() == '```':
        lines = lines[:-1]  # Remove the closing fence

    # Try to join back the lines and parse as JSON
    try:
        json_data = json.loads(''.join(lines))
    except json.JSONDecodeError as e:
        print(f"Error fixing {file_path}: {e}")
        return False

    # Only reached when parsing succeeded, so a failed repair never truncates
    # the original file.
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(json_data, file, indent=4)
    print(f"Fixed and saved valid JSON to {file_path}")
    return True

def process_directory(directory_path):
    """Validate each ``.json`` file in a directory and try to repair bad ones.

    Every directory entry whose name ends in ``.json`` is checked with
    ``is_valid_json``. Entries that pass are reported as valid; entries that
    fail are reported and handed to ``fix_json_file``.

    Args:
        directory_path: Directory whose immediate entries are scanned.
    """
    json_names = (
        entry for entry in os.listdir(directory_path) if entry.endswith('.json')
    )
    for filename in json_names:
        target = os.path.join(directory_path, filename)
        if is_valid_json(target):
            print(f"{filename} is a valid JSON file.")
            continue
        # Anything that did not parse gets one repair attempt.
        print(f"Invalid JSON detected: {filename}, attempting to fix...")
        fix_json_file(target)

# Directory whose .json files are validated and, where needed, repaired in place.
# Change this value to point at a different directory of JSON files.
directory_path = 'leaflet_jsons'
process_directory(directory_path)


Invalid JSON detected: 13831.json, attempting to fix...
Fixed and saved valid JSON to leaflet_jsons/13831.json
Invalid JSON detected: 28267.json, attempting to fix...
Fixed and saved valid JSON to leaflet_jsons/28267.json
19164.json is a valid JSON file.
Invalid JSON detected: 23719.json, attempting to fix...
Fixed and saved valid JSON to leaflet_jsons/23719.json
Invalid JSON detected: 16611.json, attempting to fix...
Fixed and saved valid JSON to leaflet_jsons/16611.json
Invalid JSON detected: 15137.json, attempting to fix...
Fixed and saved valid JSON to leaflet_jsons/15137.json
18784.json is a valid JSON file.
8409.json is a valid JSON file.
30677.json is a valid JSON file.
26053.json is a valid JSON file.
Invalid JSON detected: 29684.json, attempting to fix...
Fixed and saved valid JSON to leaflet_jsons/29684.json
Invalid JSON detected: 33345.json, attempting to fix...
Fixed and saved valid JSON to leaflet_jsons/33345.json
Invalid JSON detected: 9956.json, attempting to fix...
Fixe

In [12]:
import json
import os
import shutil  # For moving files

def check_json_structure(file_path):
    """Check whether a JSON file matches the expected leaflet structure.

    The file is read as UTF-8 and parsed as JSON, then compared against a
    nested template of required keys and value types.

    Matching rules, as implemented by the inner ``check_structure`` helper:
      * every key in the template must be present; keys in the data that the
        template does not mention are ignored;
      * leaf values must have exactly the template's type (``str`` or
        ``list``), so a JSON ``null`` in a ``str`` slot does not match;
      * where the template holds a nested dict, the data must hold a dict too.

    Args:
        file_path: Path of the JSON file to check.

    Returns:
        True if the file parses and matches the template. False if it does
        not match, is not valid JSON, or cannot be decoded as UTF-8 (the last
        two cases also print the decoding error).

    Raises:
        OSError: If the file cannot be opened; this is not caught here.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # A file that is not UTF-8 is reported like any other unparseable
        # file, so a single bad file cannot abort a batch part-way through.
        print(f"Error decoding JSON in {file_path}: {e}")
        return False  # Indicating that this file does not adhere due to a decoding error

    # Define the expected structure: leaves are the required Python types,
    # nested dicts describe required nested objects.
    expected_structure = {
        "leaflet": {
            "candidateName": str,
            "constituency": str,
            "politicalParty": str,
            "electionDate": str,
            "messages": list,
            "keyPolicies": list,
            "mentions": {
                "candidate": str,
                "otherPartyLeaderCandidate": str,
                "partyLeader": str,
                "tacticalSituation": str
            },
            "issues": {
                "brexitEurope": str,
                "economy": str,
                "education": str,
                "environment": str,
                "governance": str,
                "health": str,
                "immigration": str,
                "socialWelfare": str,
                "housing": list
            },
            "personalStatement": str,
            "politicalExperience": list,
            "contactInformation": {
                "address": str,
                "phone": str,
                "email": str,
                "website": str,
                "socialMedia": {
                    "facebook": str,
                    "twitter": str,
                    "instagram": str,
                    "linkedin": str
                }
            },
            "endorsements": list,
            "quotes": list,
            "campaignMaterial": {
                "images": list
            },
            "additionalNotes": str
        }
    }

    def check_structure(data, structure):
        """Recursively compare ``data`` against the ``structure`` template."""
        if type(data) is not dict or type(structure) is not dict:
            # Leaf comparison. When exactly one side is a dict this is False,
            # because a type never equals a dict instance and a dict's type
            # is never one of the leaf types (str, list) used above.
            return type(data) == structure
        for key, value_type in structure.items():
            if key not in data or not check_structure(data[key], value_type):
                return False
        return True

    return check_structure(data, expected_structure)

def process_directory_for_structure(directory_path):
    """Move structurally non-conforming ``.json`` files into a ``faulty`` folder.

    A ``faulty`` subdirectory is created under ``directory_path`` if nothing
    exists at that path yet. Each entry whose name ends in ``.json`` is tested
    with ``check_json_structure``; entries that fail are reported and moved
    into ``faulty``. A final line reports how many files were moved.

    Args:
        directory_path: Directory whose immediate entries are scanned.
    """
    faulty_dir = os.path.join(directory_path, "faulty")
    if not os.path.exists(faulty_dir):
        os.makedirs(faulty_dir)

    moved = []
    for entry in os.listdir(directory_path):
        if not entry.endswith('.json'):
            continue
        source = os.path.join(directory_path, entry)
        if check_json_structure(source):
            continue
        print(f"{entry} does not adhere to the specified format.")
        # Relocate the offending file so later passes over the directory skip it.
        shutil.move(source, os.path.join(faulty_dir, entry))
        moved.append(entry)
    print(f"Moved {len(moved)} files to the faulty directory.")

# Directory whose .json files are checked against the expected leaflet structure;
# files that fail the check are moved into its "faulty" subdirectory.
# Change this value to point at a different directory of JSON files.
directory_path = 'leaflet_jsons'
process_directory_for_structure(directory_path)


19761.json does not adhere to the specified format.
Error decoding JSON in leaflet_jsons/13185.json: Invalid control character at: line 87 column 113 (char 5701)
13185.json does not adhere to the specified format.
29155.json does not adhere to the specified format.
31737.json does not adhere to the specified format.
12989.json does not adhere to the specified format.
10788.json does not adhere to the specified format.
3296.json does not adhere to the specified format.
Error decoding JSON in leaflet_jsons/37885.json: Expecting value: line 1 column 1 (char 0)
37885.json does not adhere to the specified format.
10811.json does not adhere to the specified format.
28626.json does not adhere to the specified format.
15823.json does not adhere to the specified format.
Error decoding JSON in leaflet_jsons/17298.json: Expecting value: line 1 column 1 (char 0)
17298.json does not adhere to the specified format.
29439.json does not adhere to the specified format.
38926.json does not adhere to the