In [2]:
import json
import os

def is_valid_json(file_path):
    """Return True if the file at ``file_path`` parses as JSON, else False.

    The file is opened as UTF-8 text and handed to ``json.load``; the parsed
    value itself is discarded.

    Args:
        file_path: Path of the file to check.

    Returns:
        True when the content parses as JSON. False when the content is not
        syntactically valid JSON or when its bytes cannot be decoded as UTF-8.

    Raises:
        OSError: Propagated from ``open`` (for example, a missing file).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            json.load(file)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # JSONDecodeError: the text is not valid JSON.
        # UnicodeDecodeError: the bytes are not valid UTF-8, so the file
        # cannot be read as JSON text at all; report it as invalid rather
        # than letting the exception escape a yes/no check.
        return False
    return True

def fix_json_file(file_path):
    """Strip a Markdown ``json`` code fence from a file and rewrite it in place.

    The file is read as UTF-8 text. Blank lines at the very start and end are
    ignored, then a first line equal to three backticks followed by ``json``
    and a last line equal to three backticks are removed if present. If the
    remaining text parses as JSON, the file is overwritten with that data
    re-serialised using a 4-space indent; otherwise the file is left
    untouched and an error message is printed.

    Args:
        file_path: Path of the file to repair.

    Returns:
        None. The outcome is reported via ``print``.

    Raises:
        OSError: Propagated from ``open`` (for example, a missing file).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except UnicodeDecodeError as e:
        # Not UTF-8 text: nothing can be repaired here, report and bail out.
        print(f"Error fixing {file_path}: {e}")
        return

    # Ignore blank padding lines so the fences are still found when the file
    # starts or ends with empty lines. Whitespace around a JSON document is
    # insignificant, so dropping these lines cannot change the parsed value.
    while lines and not lines[0].strip():
        lines.pop(0)
    while lines and not lines[-1].strip():
        lines.pop()

    # Check and remove the fence lines if they exist. The ``lines and``
    # guards keep an empty file (or one holding only the opening fence) from
    # raising IndexError.
    if lines and lines[0].strip() == '```json':
        lines = lines[1:]  # Remove the opening fence
    if lines and lines[-1].strip() == '```':
        lines = lines[:-1]  # Remove the closing fence

    # Try to join back the lines and parse as JSON
    try:
        json_data = json.loads(''.join(lines))
    except json.JSONDecodeError as e:
        print(f"Error fixing {file_path}: {e}")
        return

    # Parsing succeeded: write back the corrected JSON
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(json_data, file, indent=4)
    print(f"Fixed and saved valid JSON to {file_path}")

def process_directory(directory_path):
    """Check each ``.json`` entry of a directory and try to repair bad ones.

    Entries are visited in the order ``os.listdir`` returns them. Names not
    ending in ``.json`` are skipped. For every remaining entry a status line
    is printed; entries that fail ``is_valid_json`` are passed on to
    ``fix_json_file``.

    Args:
        directory_path: Directory whose direct entries are examined.
    """
    for filename in os.listdir(directory_path):
        # Only names carrying the .json suffix are of interest.
        if not filename.endswith('.json'):
            continue

        target = os.path.join(directory_path, filename)

        if is_valid_json(target):
            print(f"{filename} is a valid JSON file.")
            continue

        print(f"Invalid JSON detected: {filename}, attempting to fix...")
        fix_json_file(target)

# Directory containing the JSON files to validate and repair.
# Change this path to point at your own data; files that get fixed are overwritten in place.
directory_path = 'leaflet_jsons_2017_2019'
process_directory(directory_path)


28267.json is a valid JSON file.
30677.json is a valid JSON file.
29684.json is a valid JSON file.
33345.json is a valid JSON file.
33130.json is a valid JSON file.
32143.json is a valid JSON file.
31111.json is a valid JSON file.
32080.json is a valid JSON file.
38845.json is a valid JSON file.
31465.json is a valid JSON file.
31669.json is a valid JSON file.
34237.json is a valid JSON file.
27279.json is a valid JSON file.
33856.json is a valid JSON file.
31244.json is a valid JSON file.
34095.json is a valid JSON file.
29746.json is a valid JSON file.
35533.json is a valid JSON file.
30202.json is a valid JSON file.
34802.json is a valid JSON file.
32532.json is a valid JSON file.
26994.json is a valid JSON file.
29398.json is a valid JSON file.
33724.json is a valid JSON file.
29085.json is a valid JSON file.
33051.json is a valid JSON file.
27946.json is a valid JSON file.
29128.json is a valid JSON file.
38775.json is a valid JSON file.
34983.json is a valid JSON file.
29155.json

In [3]:
import json
import os
import shutil

def check_json_structure(file_path):
    """Check if JSON file adheres to the specified structure.

    The file is read as UTF-8 JSON and compared against the leaflet template
    defined below.

    Args:
        file_path: Path of the JSON file to check.

    Returns:
        True when the parsed document contains every key of the template with
        a value of the expected shape and type. False otherwise, including
        when the file is not valid JSON or cannot be decoded as UTF-8 (a
        message is printed in those two cases).

    Raises:
        OSError: Propagated from ``open`` (for example, a missing file).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # A file that is not UTF-8 text cannot hold the expected document
        # either, so it is treated like a JSON syntax error instead of
        # letting the exception escape.
        print(f"Error decoding JSON in {file_path}: {e}")
        return False  # Indicating that this file does not adhere due to a JSON error

    # Define the expected structure.
    # Leaves are Python types; a one-element list describes the shape every
    # item of the corresponding JSON array must have.
    expected_structure = {
        "leaflet": {
            "candidateName": str,
            "candidateGender": str,
            "constituency": str,
            "politicalParty": str,
            "electionDate": str,
            "messages": [
                {
                    "title": str,
                    "content": str
                }
            ],
            "keyPolicies": [
                {
                    "policyTitle": str,
                    "policyDescription": str
                }
            ],
            "mentions": {
                "candidate": str,
                "otherPartyLeaderCandidate": str,
                "partyLeader": str,
                "tacticalSituation": str
            },
            "issues": {
                "brexitEurope": str,
                "defence": str,
                "economy": str,
                "education": str,
                "environment": str,
                "governance": str,
                "government": str,
                "health": str,
                "immigration": str,
                "labourEmployment": str,
                "lawCrime": str,
                "transport": str,
                "socialWelfare": str,
                "community": [
                    {
                        "content": str,
                        "yimbyNimby": str
                    }
                ]
            },
            "personalStatement": str,
            "politicalExperience": [
                {
                    "role": str,
                    "duration": str,
                    "achievements": str
                }
            ],
            "contactInformation": {
                "address": str,
                "phone": str,
                "email": str,
                "website": str,
                "socialMedia": {
                    "facebook": str,
                    "twitter": str,
                    "instagram": str,
                    "linkedin": str
                }
            },
            "endorsements": [
                {
                    "endorser": str,
                    "endorsement": str
                }
            ],
            "quotes": [
                {
                    "text": str,
                    "source": str,
                    "date": str
                }
            ],
            "campaignMaterial": {
                "images": [
                    {
                        "caption": str
                    }
                ]
            },
            "additionalNotes": str,
            "analysis": {
                "statementAnalysis": {
                    "quantitative": {
                        "count": int,
                        "examples": [str]
                    },
                    "qualitative": {
                        "count": int,
                        "examples": [str]
                    }
                },
                "photoAnalysis": {
                    "localPlaces": int,
                    "people": int,
                    "candidateAlone": int,
                    "candidateWithOthers": int
                },
                "issueFocus": {
                    "local": {
                        "count": int,
                        "examples": [str]
                    },
                    "national": {
                        "count": int,
                        "examples": [str]
                    }
                },
                "personalStories": [
                    {
                        "story": str,
                        "issueRelated": str
                    }
                ],
                "endorsementsUsed": {
                    "count": int,
                    "examples": [str]
                },
                "negativeMessaging": {
                    "localIssues": {
                        "count": int,
                        "examples": [str]
                    },
                    "nationalIssues": {
                        "count": int,
                        "examples": [str]
                    },
                    "leaderOfOpponentsParty": {
                        "count": int,
                        "examples": [str]
                    },
                    "oppositeCandidateNoName": {
                        "count": int,
                        "examples": [str]
                    },
                    "oppositeCandidateWithName": {
                        "count": int,
                        "examples": [str]
                    }
                },
                "nonPolicyAttributes": {
                    "localness": str,
                    "employment": str,
                    "education": str,
                    "family": str
                },
                "localCandidate": {
                    "isLocal": bool,
                    "examples": [str]
                }
            }
        }
    }

    def check_structure(data, structure):
        """Recursively compare parsed JSON ``data`` against ``structure``.

        - list data: the template must be a non-empty list, and every item
          must match its first element (an empty data list therefore passes).
        - dict data: the template must be a dict, and every template key must
          be present with a matching value; extra keys in the data are
          ignored.
        - anything else: the exact type of the value must equal the template
          leaf. Because the comparison is exact, a JSON boolean does not
          satisfy ``int`` and a JSON null satisfies nothing.
        """
        if type(data) is list:
            if type(structure) is list and len(structure) > 0:
                return all(check_structure(item, structure[0]) for item in data)
            else:
                return False
        elif type(data) is dict:
            if type(structure) is dict:
                for key, value in structure.items():
                    if key not in data or not check_structure(data[key], value):
                        return False
                return True
            else:
                return False
        else:
            return type(data) == structure

    return check_structure(data, expected_structure)

def process_directory_for_structure(directory_path):
    """Move every non-conforming ``.json`` file into a ``faulty`` subdirectory.

    A ``faulty`` folder is created inside ``directory_path`` when it does not
    exist yet. Each direct entry whose name ends in ``.json`` is checked with
    ``check_json_structure``; entries that fail are reported, moved into the
    ``faulty`` folder under the same name, and counted. A summary line with
    the number of moved files is printed at the end.

    Args:
        directory_path: Directory whose direct entries are examined.
    """
    quarantine = os.path.join(directory_path, "faulty")
    if not os.path.exists(quarantine):
        os.makedirs(quarantine)

    # Snapshot of the candidate names, in os.listdir order.
    json_names = [name for name in os.listdir(directory_path) if name.endswith('.json')]

    num_bad = 0
    for filename in json_names:
        source = os.path.join(directory_path, filename)
        if check_json_structure(source):
            continue

        print(f"{filename} does not adhere to the specified format.")
        # Relocate the offending file into the "faulty" directory.
        shutil.move(source, os.path.join(quarantine, filename))
        num_bad = num_bad + 1

    print(f"Moved {num_bad} files to the faulty directory.")

# Directory containing the JSON files to check against the expected structure.
# Change this path to point at your own data; non-conforming files are moved into its "faulty" subdirectory.
directory_path = 'leaflet_jsons_2017_2019'
process_directory_for_structure(directory_path)


28267.json does not adhere to the specified format.
33130.json does not adhere to the specified format.
32143.json does not adhere to the specified format.
33856.json does not adhere to the specified format.
30202.json does not adhere to the specified format.
34526.json does not adhere to the specified format.
27056.json does not adhere to the specified format.
29777.json does not adhere to the specified format.
28875.json does not adhere to the specified format.
27841.json does not adhere to the specified format.
32529.json does not adhere to the specified format.
27191.json does not adhere to the specified format.
29188.json does not adhere to the specified format.
28311.json does not adhere to the specified format.
34500.json does not adhere to the specified format.
38799.json does not adhere to the specified format.
27245.json does not adhere to the specified format.
27257.json does not adhere to the specified format.
30596.json does not adhere to the specified format.
30072.json d