RANDOM FOREST TO COMPLETE NULL VALUES

In [1]:
import zipfile
import os
import pandas as pd
import json
from tqdm import tqdm
from IPython.display import display
import re

In [2]:
# Locations of the raw archive, the directory it is unpacked into, and the
# directory of per-character JSON files that the later cells read.
zip_path = './data/characters-2024-11-05.zip'
target_directory = './data/'
character_json_path = './data/characters'

try:
    # The presence of the characters directory is used as the marker that the
    # archive was already unpacked, so the extraction is skipped on re-runs.
    if os.path.exists(character_json_path):
        print(f"[INFO] '{character_json_path}' ALREADY EXISTS! SKIP EXTRACTION!")
    else:
        os.makedirs(target_directory, exist_ok=True)

        # Open the .zip file and extract all its contents
        with zipfile.ZipFile(zip_path, 'r') as zip_file:
            zip_file.extractall(target_directory)

        print('EXTRACTION OK!')  # JSON files are in characters

except Exception as e:
    # Keep the short one-line message, but do not swallow the failure: the
    # following cells read `character_json_path`, and letting them run after
    # a failed extraction would make them work on missing or partial data.
    print(f"[ERROR] {e}")
    raise

[INFO] './data/characters' ALREADY EXISTS! SKIP EXTRACTION!


In [3]:
def collect_json_keys(character_data, all_keys, parent_key=''):
    """Recursively collect the feature names present in one character's JSON.

    The structure is walked depth-first and `all_keys` is updated in place:

    * numeric and boolean leaves contribute their flattened path
      (``"a.b"`` for nested dict keys, ``"a[0]"`` for list positions);
    * string leaves contribute the string itself, with every run of digits
      replaced by ``#`` so that texts differing only in their numbers share
      one key;
    * any other leaf (e.g. ``None``) contributes nothing.

    Dict entries whose flattened path ends with ``id``, ``icon`` or
    ``passiveTreeUrl`` are skipped together with everything below them.

    Args:
        character_data: The JSON value to inspect (dict, list or scalar).
        all_keys: Set that receives the collected keys (mutated in place).
        parent_key: Flattened path of `character_data`; empty at the root.

    Returns:
        None. Results are delivered through `all_keys`.
    """
    if isinstance(character_data, dict):
        for key, value in character_data.items():
            new_key = f"{parent_key}.{key}" if parent_key != '' else key
            if not new_key.endswith(('id', 'icon', 'passiveTreeUrl')):
                collect_json_keys(value, all_keys, new_key)

    elif isinstance(character_data, list):
        # A list path always ends with "]", so the suffix filter applied to
        # dict keys can never match here and is not repeated.
        for index, element in enumerate(character_data):
            collect_json_keys(element, all_keys, f"{parent_key}[{index}]")

    elif isinstance(character_data, (int, float)):
        # bool is a subclass of int, so this branch also covers True/False.
        all_keys.add(parent_key)

    elif isinstance(character_data, str):
        # re.sub returns the text unchanged when it contains no digits, so a
        # single call covers both the numbered and the plain-text case.
        all_keys.add(re.sub(r'\d+', '#', character_data))

In [4]:
# Accumulator for every feature key seen across all character files, and the
# path of the master JSON whose existence marks the collection as already done.
all_keys = set()
master_character_json_path = './data/master_character_json.json'

try:
    if os.path.exists(master_character_json_path):
        print(f"[INFO] '{master_character_json_path}' ALREADY EXISTS! SKIP KEY COLLECTION!")
    else:
        for character_json_filename in tqdm(os.listdir(character_json_path)):
            character_json_filepath = os.path.join(character_json_path, character_json_filename)

            with open(character_json_filepath) as character_json_file:
                character_data = json.load(character_json_file)

            # The file is fully parsed at this point, so it no longer needs to
            # stay open while its keys are collected.
            collect_json_keys(character_data, all_keys)

        print('ALL KEYS COLLECTED!')

except Exception as e:
    # Report the problem, then stop: if the error were swallowed, `all_keys`
    # would hold only the keys gathered before the failure, and the master
    # JSON built from it would be written to disk and reused on later runs.
    print(f"[ERROR] {e}")
    raise

[INFO] './data/master_character_json.json' ALREADY EXISTS! SKIP KEY COLLECTION!


In [5]:
# Template record: one entry per collected key, each starting at 0.
master_character_dictionary = dict.fromkeys(all_keys, 0)

# The template is written only once; an existing file is left untouched.
if not os.path.exists(master_character_json_path):
    with open(master_character_json_path, 'w') as master_character_json:
        json.dump(master_character_dictionary, master_character_json)
else:
    print(f"[INFO] '{master_character_json_path}' ALREADY EXISTS! SKIP JSON CREATION!")

[INFO] './data/master_character_json.json' ALREADY EXISTS! SKIP JSON CREATION!


In [6]:
def _add_to_feature(flat_character_data, feature_key, amount):
    """Store `amount` under `feature_key` if it is still 0, otherwise add to it."""
    if flat_character_data[feature_key] == 0:
        flat_character_data[feature_key] = amount
    else:
        flat_character_data[feature_key] += amount


def populate_character_json(character_data, master_character_json, parent_key='', flat_character_data=None):
    """Flatten one character's JSON into a copy of the master template.

    The structure is walked depth-first and each leaf updates one entry of
    the flat record:

    * bool leaf: stored under its flattened path if that entry is still 0;
    * int/float leaf: added to the entry at its flattened path;
    * string containing digits: the digits are replaced by ``#`` to obtain
      the key, and the number found (or the mean of the numbers, when there
      are several) is added to that entry;
    * string without digits: the entry named by the string is incremented
      by 1, i.e. it counts occurrences;
    * any other leaf (e.g. ``None``) is ignored.

    Dict entries whose flattened path ends with ``id``, ``icon`` or
    ``passiveTreeUrl`` are skipped together with everything below them.

    Args:
        character_data: The JSON value to flatten (dict, list or scalar).
        master_character_json: Template dict mapping every known key to its
            initial value. It is never modified by this function.
        parent_key: Flattened path of `character_data`; empty at the root.
        flat_character_data: Accumulator used by the recursive calls. When a
            caller passes its own dict, that dict is updated in place and
            returned.

    Returns:
        The flat record for this character.

    Raises:
        KeyError: If a key derived from `character_data` is not present in
            the flat record.
    """
    if flat_character_data is None:
        # Work on a private copy of the template. Aliasing the template here
        # would leave every character's values in it, so each later call
        # would start from the totals of all characters processed before.
        flat_character_data = dict(master_character_json)

    if isinstance(character_data, dict):
        for key, value in character_data.items():
            new_key = f"{parent_key}.{key}" if parent_key != '' else key
            if not new_key.endswith(('id', 'icon', 'passiveTreeUrl')):
                populate_character_json(value, master_character_json, new_key, flat_character_data)

    elif isinstance(character_data, list):
        # A list path always ends with "]", so the suffix filter applied to
        # dict keys can never match here and is not repeated.
        for index, element in enumerate(character_data):
            populate_character_json(element, master_character_json, f"{parent_key}[{index}]", flat_character_data)

    elif isinstance(character_data, bool):
        # Checked before the numeric branch because bool is a subclass of int.
        # A boolean is only recorded while the entry is still 0; it is never
        # summed.
        if flat_character_data[parent_key] == 0:
            flat_character_data[parent_key] = character_data

    elif isinstance(character_data, (int, float)):
        _add_to_feature(flat_character_data, parent_key, character_data)

    elif isinstance(character_data, str):
        extracted_values = [int(digits) for digits in re.findall(r'\d+', character_data)]

        if extracted_values:
            generalized_text = re.sub(r'\d+', '#', character_data)

            # A single number is kept as an int; several numbers are reduced
            # to their mean.
            if len(extracted_values) == 1:
                new_value = extracted_values[0]
            else:
                new_value = sum(extracted_values) / len(extracted_values)

            _add_to_feature(flat_character_data, generalized_text, new_value)
        else:
            # Plain text acts as a categorical feature: count its occurrences.
            _add_to_feature(flat_character_data, character_data, 1)

    return flat_character_data

In [7]:
flat_character_json_path = './data/flat_characters'
# Files are first written to this staging directory and moved into place only
# after every character was processed. The existence check below therefore
# never mistakes the leftovers of an interrupted run for a finished result.
flat_character_json_staging_path = flat_character_json_path + '.partial'

try:
    if os.path.exists(flat_character_json_path):
        print(f"[INFO] '{flat_character_json_path}' ALREADY EXISTS! SKIP FLATTENING!")
    else:
        os.makedirs(flat_character_json_staging_path, exist_ok=True)

        with open(master_character_json_path, 'r') as master_character_json_file:
            master_character_json = json.load(master_character_json_file)

        for character_json_filename in tqdm(os.listdir(character_json_path)):
            character_json_filepath = os.path.join(character_json_path, character_json_filename)

            with open(character_json_filepath) as character_json_file:
                character_data = json.load(character_json_file)

            # Hand each character its own fresh copy of the template so that
            # values from previously processed characters cannot leak into
            # this character's record.
            new_character_data = populate_character_json(
                character_data,
                master_character_json,
                flat_character_data=dict(master_character_json),
            )

            flat_character_json_filename = "flat_" + character_json_filename
            flat_character_json_filepath = os.path.join(flat_character_json_staging_path, flat_character_json_filename)

            with open(flat_character_json_filepath, "w") as json_file:
                json.dump(new_character_data, json_file)

        # Publish the complete result under its final name.
        os.rename(flat_character_json_staging_path, flat_character_json_path)
        print('ALL FLAT JSON CREATED!')

except Exception as e:
    # Report the problem and stop instead of continuing with partial output.
    print(f"[ERROR] {e}")
    raise

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110466/110466 [7:42:35<00:00,  3.98it/s]

ALL FLAT JSON CREATED!



