In [2]:
import zipfile
import os
import pandas as pd
import json
from tqdm import tqdm
from IPython.display import display

In [6]:
# Base directory that the data paths built with os.path.join in later cells hang off;
# "./" is a relative path, so it resolves against the kernel's current working directory.
root_directory = "./"

In [None]:
# Where the archive should be unpacked, and where the archive itself lives.
target_directory = os.path.join(root_directory, 'data/')
zip_path = os.path.join(root_directory, 'data/characters-2024-11-05.zip')

try:
    # Make sure the destination folder is there (no-op when it already exists).
    os.makedirs(target_directory, exist_ok=True)

    # Unpack every member of the archive into the destination folder.
    with zipfile.ZipFile(zip_path) as zip_file:
        zip_file.extractall(path=target_directory)

    print('Extraction OK')  # the JSON files end up in the "characters" folder
except Exception as e:
    # Any failure is reported on stdout instead of being raised, so the cell itself never errors.
    print(f"[ERROR] {e}")

In [5]:
def collect_json_keys(character_data, all_keys, parent_key=''):
    """Record the flattened path of every leaf value found in a nested JSON value.

    Dict entries extend the path with ``.key`` (or just ``key`` when the path
    is still empty) and list elements extend it with ``[index]``. Anything
    that is neither a dict nor a list counts as a leaf, and its path is added
    to ``all_keys``. Empty dicts and empty lists therefore contribute nothing.

    Args:
        character_data: The (sub)structure to walk: dict, list, or leaf value.
        all_keys: Collection with an ``add`` method (a set in this notebook)
            that is updated in place with the leaf paths.
        parent_key: Path accumulated so far; '' for the top-level call.

    Returns:
        None. Results are delivered through ``all_keys``.
    """
    is_mapping = isinstance(character_data, dict)

    # Leaf value: its accumulated path is the flattened key.
    if not is_mapping and not isinstance(character_data, list):
        all_keys.add(parent_key)
        return

    if is_mapping:
        children = (
            (name if parent_key == '' else f"{parent_key}.{name}", child)
            for name, child in character_data.items()
        )
    else:
        children = (
            (f"{parent_key}[{position}]", child)
            for position, child in enumerate(character_data)
        )

    # Descend into each child with its extended path.
    for child_path, child in children:
        collect_json_keys(child, all_keys, child_path)

In [11]:
# Folder holding the character JSON files, and the set that will accumulate
# every flattened leaf path seen across all of them.
character_json_path = './data/characters'
all_keys = set()

for character_json_filename in tqdm(os.listdir(character_json_path)):
    character_json_filepath = os.path.join(character_json_path, character_json_filename)

    try:
        # Read in binary so json.load detects the JSON encoding (UTF-8/16/32) itself
        # instead of relying on the platform's default text encoding.
        with open(character_json_filepath, 'rb') as character_json_file:
            character_data = json.load(character_json_file)

        # The file is already closed here; only the parsed data is walked.
        collect_json_keys(character_data, all_keys)

    except Exception as e:
        # Best effort: report the failure and carry on with the remaining files.
        print(f"[ERROR] {e}")

print('ALL KEYS COLLECTED!')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110466/110466 [03:15<00:00, 565.40it/s]

ALL KEYS COLLECTED!





In [13]:
# Template record: one entry per flattened key collected above, every value None.
master_character_json = dict.fromkeys(all_keys)

In [18]:
def populate_character_json(character_data, parent_key='', flat_character_data=None):
    """Flatten one nested JSON value into a dict keyed by path strings.

    Dict entries extend the path with ``.key`` (or just ``key`` when the path
    is still empty), list elements extend it with ``[index]``, and every other
    value is stored under its accumulated path.

    Args:
        character_data: The (sub)structure to flatten: dict, list, or leaf value.
        parent_key: Path accumulated so far; '' for the top-level call.
        flat_character_data: Dict to write the flattened values into. When
            omitted, a fresh copy of the module-level ``master_character_json``
            template is used, so keys missing from this character keep the
            template's value.

    Returns:
        The dict that received the flattened values.
    """
    if flat_character_data is None:
        # Copy the template rather than writing into it: filling the shared dict
        # directly made values from previously processed characters show up in
        # later ones wherever the later character lacked that key.
        flat_character_data = dict(master_character_json)

    if isinstance(character_data, dict):
        for key, value in character_data.items():
            new_key = f"{parent_key}.{key}" if parent_key != '' else key
            populate_character_json(value, new_key, flat_character_data)

    elif isinstance(character_data, list):
        for index, element in enumerate(character_data):
            new_key = f"{parent_key}[{index}]"
            populate_character_json(element, new_key, flat_character_data)

    else:
        # Leaf value: store it under its flattened path.
        flat_character_data[parent_key] = character_data

    return flat_character_data

In [25]:
flat_character_json_path = './data/flat_characters'

# Create the output folder up front; without it every write below fails with
# FileNotFoundError on a fresh checkout.
os.makedirs(flat_character_json_path, exist_ok=True)

for character_json_filename in tqdm(os.listdir(character_json_path)):
    character_json_filepath = os.path.join(character_json_path, character_json_filename)

    try:
        # Read in binary so json.load detects the JSON encoding (UTF-8/16/32) itself
        # instead of relying on the platform's default text encoding.
        with open(character_json_filepath, 'rb') as character_json_file:
            character_data = json.load(character_json_file)

        # Give every character its own all-None record. Reusing one shared dict
        # would carry values over from previously processed characters into
        # keys this character does not have.
        new_character_data = populate_character_json(
            character_data,
            flat_character_data=dict.fromkeys(master_character_json),
        )

        flat_character_json_filename = "flat_" + character_json_filename
        flat_character_json_filepath = os.path.join(flat_character_json_path, flat_character_json_filename)

        # The input file is already closed; only the output is open while dumping.
        with open(flat_character_json_filepath, "w") as json_file:
            json.dump(new_character_data, json_file)
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining files.
        print(f"[ERROR] {e}")

print('ALL FLAT JSON CREATED!')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110466/110466 [21:59<00:00, 83.74it/s]

ALL FLAT JSON CREATED!



