In [3]:
import zipfile
import json
from pathlib import Path 

In [4]:
##############################################################################
# Infer the data type  
# -- Replace the value with a placeholder naming its data type
#       ("string", "boolean" or "number"); a list becomes ["array"] and a
#       dictionary keeps its keys, with every value replaced recursively
# -- If the value is None, None is returned
# -- If the data type is not one of the data types specified in this function,
#       'unknown' is returned
###############################################################################

def infer_placeholder(value):
    """Return a placeholder describing the data type of ``value``.

    * ``None`` is returned unchanged.
    * A dict keeps its keys; every value is replaced by its own placeholder.
    * Any list becomes ``["array"]`` (its contents are not inspected).
    * Strings, booleans and ints/floats become ``"string"``, ``"boolean"``
      and ``"number"`` respectively.
    * Every other type becomes ``"unknown"``.
    """
    if value is None:
        return None

    if isinstance(value, dict):
        placeholders = {}
        for key, item in value.items():
            placeholders[key] = infer_placeholder(item)
        return placeholders

    if isinstance(value, list):
        return ["array"]

    # Order matters: bool is a subclass of int, so it has to be tested
    # before the numeric entry or True/False would be labelled "number".
    scalar_labels = (
        (str, "string"),
        (bool, "boolean"),
        ((int, float), "number"),
    )
    for python_type, label in scalar_labels:
        if isinstance(value, python_type):
            return label

    return "unknown"

############################################################################################
# Function to iterate over all dictionaries and lists to apply the infer_placeholder function
#############################################################################################

def simplify_json_structure(data):
    """Recursively reduce parsed JSON to a skeleton of type placeholders.

    Dictionaries keep their keys and have every value simplified.  A
    non-empty list has every item simplified; when all simplified items are
    equal only one of them is kept, otherwise all are returned.  An empty
    list becomes ``["array"]``.  Anything that is neither a dict nor a list
    is handed to ``infer_placeholder``.
    """
    if isinstance(data, dict):
        simplified = {}
        for key, value in data.items():
            simplified[key] = simplify_json_structure(value)
        return simplified

    # Leaves: everything that is not a container.
    if not isinstance(data, list):
        return infer_placeholder(data)

    # Nothing to inspect in an empty list, so fall back to the generic marker.
    if not data:
        return ["array"]

    children = list(map(simplify_json_structure, data))
    first = children[0]

    # Collapse a homogeneous list to a single representative to reduce noise.
    if any(child != first for child in children):
        return children
    return [first]

    
def save_structure(output_structure, zip_path):
    """Write the structure inferred from one archive to a JSON file.

    The file is named ``IG_structure_<stem of zip_path>.json`` and is stored
    in ``Instagram/Input_test`` below the module-level ``main_path``.  The
    folder is created when it does not exist yet.

    Parameters
    ----------
    output_structure :
        JSON-serialisable object to store (``structure_from_zip`` passes a
        dict keyed by archive member name).
    zip_path : str or Path
        Path of the archive the structure came from; only its stem is used
        to build the output file name.

    Raises
    ------
    TypeError
        If ``output_structure`` cannot be serialised by ``json.dumps``.
    OSError
        If the output folder or file cannot be created or written.
    """
    # Serialise before touching the disk so a serialisation error cannot
    # leave a truncated file behind.  ensure_ascii=False keeps the file
    # identical to the string returned by structure_from_zip.
    json_object = json.dumps(output_structure, indent=2, ensure_ascii=False)
    zip_name = Path(zip_path).stem

    # main_path is joined by plain string concatenation, so it has to end
    # with a path separator.
    output_file = Path(f"{main_path}Instagram/Input_test/IG_structure_{zip_name}.json")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8: the platform default encoding may not be able to
    # represent non-ASCII keys.
    with open(output_file, "w", encoding="utf-8") as outfile:
        outfile.write(json_object)

def structure_from_zip(zip_path):
    """Extract the simplified structure of every JSON file in a zip archive.

    Each archive member whose name ends in ``.json`` is parsed and passed
    through ``simplify_json_structure``.  Members that cannot be parsed are
    recorded as the string ``"Invalid JSON"`` instead of aborting the whole
    archive.  The collected result is written to disk via ``save_structure``.

    Parameters
    ----------
    zip_path : str or Path
        Path of the zip archive to inspect.

    Returns
    -------
    str
        The collected structure (member name -> simplified content) as an
        indented JSON string with non-ASCII characters left unescaped.

    Raises
    ------
    zipfile.BadZipFile
        If ``zip_path`` is not a valid zip archive.
    """
    output_structure = {}

    with zipfile.ZipFile(zip_path, 'r') as z:
        for file_info in z.infolist():
            if file_info.is_dir() or not file_info.filename.endswith('.json'):
                continue

            with z.open(file_info.filename) as f:
                try:
                    content = json.load(f)
                except (json.JSONDecodeError, UnicodeDecodeError):
                    # json.load decodes the raw bytes itself; members that
                    # are not valid UTF-8/16/32 text (e.g. binary files that
                    # merely carry a .json name) raise UnicodeDecodeError
                    # rather than JSONDecodeError.
                    output_structure[file_info.filename] = "Invalid JSON"
                    continue

            output_structure[file_info.filename] = simplify_json_structure(content)

    save_structure(output_structure, zip_path)
    return json.dumps(output_structure, indent=2, ensure_ascii=False)

In [5]:
# Root folder of the processed structure donations.  The trailing slash is
# required: other paths are built from it by plain string concatenation.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
main_path = "/home/rvissche/Nextcloud/What-If/what-if-data-donation/what-if-data-donation/structure_donations/Processed_structure_donations/"

# Folder containing the raw Instagram archives to process.
input_directory = Path(main_path + "Instagram/Raw")

In [6]:
# Process every zip archive in the raw-data folder.
# - sorted() makes the processing order reproducible (iterdir() order is arbitrary).
# - Non-zip files (hidden OS files, notes, ...) are reported and skipped so a
#   single stray file does not stop the run with zipfile.BadZipFile.
for file in sorted(input_directory.iterdir()):
    if not file.is_file():
        continue
    if zipfile.is_zipfile(file):
        structure_from_zip(file)
    else:
        print(f"Skipping {file.name}: not a zip archive")