Import required dependencies

In [2]:
import os
import json
import requests
import pandas as pd
import pickle           # Save and load data
import time
from datetime import datetime, timedelta

# Pandas display settings, applied in one pass from a single table.
# "display.max_rows": use None instead of 50 to show every row of a data frame.
_PANDAS_DISPLAY_OPTIONS = {
    "display.max_rows": 50,
    "display.max_columns": None,
    "display.max_info_columns": 100,
    "display.max_info_rows": 1000000,
    "display.precision": 2,
    # "styler.format.precision": 2,   # left disabled, as before
}
for _option_name, _option_value in _PANDAS_DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

Load the configuration file

In [3]:
# Load API Key and download directory from config file
CONFIG_FILE = "../config/api-call.json"

def load_config(config_path):
    """Load configuration from a JSON file.

    Parameters
    ----------
    config_path : str or path-like
        Location of the JSON configuration file.

    Returns
    -------
    object
        The parsed JSON document, exactly as produced by ``json.load``.

    Raises
    ------
    OSError
        If the file cannot be opened (missing, unreadable, ...).
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # locale encoding (and matches how the JSON data files are read below).
    with open(config_path, "r", encoding="utf-8") as config_file:
        return json.load(config_file)

Config parameters for loading JSON files

In [4]:
# Parse the configuration once; the values below all come from its "ned" section.
config = load_config(CONFIG_FILE)
# print(config)

# The download directory is exposed under two names: DOWNLOAD_DIR (used to
# create the directory) and folder_path (used when building file paths).
DOWNLOAD_DIR = folder_path = config["ned"]["ned_download_dir"]

# Make sure the download directory is present before anything reads from it.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Prefix shared by the data files (adjust to your naming convention) ...
file_pattern = "power-gen-type-0"
# ... and the single file this notebook loads.
file = "power-gen-type-0.json"

Load one JSON file and build the capacity table (the loop over multiple files is currently disabled)

In [5]:

# Type	    What is the type of energy carrier?	0 All, 1 Wind, 2 Solar, 3 Biogas, 4 HeatPump, 8 Cofiring, 9 Geothermal, 10 Other, 11 Waste, 12 BioOil, 13 Biomass
# 14 Wood, 17 WindOffshore, 18 FossilGasPower, 19 FossilHardCoal, 20 Nuclear, 21 WastePower, 22 WindOffshoreB, 23 NaturalGas, 24 Biomethane, 25 BiomassPower
# 26 OtherPower, 27 ElectricityMix, 28 GasMix, 31 GasDistribution, 35 CHP Total, 50 SolarThermal, 51 WindOffshoreC, 53 IndustrialConsumersGasCombination
# 54 IndustrialConsumersPowerGasCombination, 55 LocalDistributionCompaniesCombination, 56 AllConsumingGas


# Define an array of n values (Custom values instead of a fixed range)
# n_values = [4, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 35, 50, 51, 53, 54, 55, 56]  # Type values

# Create an empty list to store DataFrames
# df_list = []

# Dictionary to store dynamically named DataFrames
# df_dict = {}

print("starting the processing")

# Full path of the JSON file to load, and its name without the extension.
file_path = os.path.join(folder_path, file)
df_name = os.path.splitext(file)[0]  # Removes .json extension

try:
    with open(file_path, "r", encoding="utf-8") as f:
        # Report the path of the file, not the repr of the file object.
        print(f"File name is : {f.name}")
        data = json.load(f)  # Load JSON data
except (OSError, ValueError) as e:
    # OSError: the file is missing or unreadable.
    # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
    print(f"Error reading file: {e}")
    # Re-raise instead of calling exit(): exit() would shut the notebook kernel
    # down, while re-raising stops this cell and keeps the traceback visible.
    raise

# Step 2: Extract 'hydra:member'
# if "hydra:member" in data and isinstance(data["hydra:member"], list) and len(data["hydra:member"]) > 0:
    # print(df_dict[df_name])
    # df_dict[df_name] = pd.DataFrame(data["hydra:member"])

# Turn the parsed JSON payload into a DataFrame and report its size.
df = pd.DataFrame(data)
print("rows in df are : ", len(df.index))
print("\n DataFrame created successfully!\n")
# print(df.head())  # Uncomment to preview the first few rows

# Columns with a numeric dtype, and their names.
df_orig_num = df.select_dtypes(include='number')
l_df_num_names = df_orig_num.columns.tolist()
print(l_df_num_names)
num_count = len(l_df_num_names)
print(f"\nNumber of numerical variables: {num_count}")

# Columns with object dtype (reported as "categorical"), and their names.
df_orig_cat = df.select_dtypes(include='object')
l_df_cat_names = [col_name for col_name in df_orig_cat.columns]

cat_count = len(l_df_cat_names)
print(f"\nNumber of categorical variables: {cat_count}")
print(l_df_cat_names)

# Extract the 'hydra:member' array which contains the utilization data
# utilization_data = data.get('hydra:member', [])  # Handle missing key

# Build a slim table with one row per JSON record, keeping only the fields
# of interest: id, power-gen-type, capacity and validto.
rows = []
for item in data:
    # 'type' is split on "/" and only the last segment is kept.
    # NOTE(review): presumably a path-like reference ending in the numeric
    # type code — confirm against the API payload.
    # A record without a string 'type' keeps the raw value (None when absent)
    # instead of failing with AttributeError on .split().
    type_ref = item.get('type')
    if isinstance(type_ref, str):
        power_gen_type = type_ref.split("/")[-1]
    else:
        power_gen_type = type_ref

    rows.append({
        'id': item.get('id'),
        'power-gen-type': power_gen_type,
        'capacity': item.get('capacity'),
        'validto': item.get('validto'),
    })

# Create the DataFrame in one step; naming the columns explicitly keeps the
# expected schema even when there are no records at all.
formatted_df = pd.DataFrame(rows, columns=['id', 'power-gen-type', 'capacity', 'validto'])

# Optional follow-up steps, currently disabled:
# Convert 'capacity' to numeric, handling errors by setting non-numeric values to NaN
# formatted_df['capacity'] = pd.to_numeric(formatted_df['capacity'], errors='coerce')

# Filter, excluding rows where capacity is NaN
# filtered_df = formatted_df[formatted_df['capacity'] > 0].dropna(subset=['capacity'])

print(formatted_df.shape[0])
print(formatted_df)

# Loop through all files in the folder
# for file in os.listdir(folder_path):
    # if file.startswith(file_pattern) and file.endswith(".json"):  # Adjust for other formats if needed
#    if file.startswith(file_pattern) and file.endswith(".json"):  # Adjust for other formats if needed
        

#        else:
#            # print(df)
#            print("\nNo valid data found in 'hydra:member'.")
#        print(f"Data stored in: {df_name}")
        


starting the processing
File name is : <_io.TextIOWrapper name='../data/powergen/power-gen-type-0.json' mode='r' encoding='utf-8'>
rows in df are :  34944

 DataFrame created successfully!

['id', 'capacity', 'volume', 'percentage', 'emission', 'emissionfactor']

Number of numerical variables: 6

Number of categorical variables: 11
['@id', '@type', 'point', 'type', 'granularity', 'granularitytimezone', 'activity', 'classification', 'validfrom', 'validto', 'lastupdate']
34944
                id power-gen-type  capacity                    validto
0      23791261744              0   8134680  2021-12-31T23:15:00+00:00
1      23791591475              0   8039900  2021-12-31T23:30:00+00:00
2      23792244041              0   8128668  2021-12-31T23:45:00+00:00
3      23792573621              0   8090804  2022-01-01T00:00:00+00:00
4      23794423050              0   8101220  2022-01-01T00:15:00+00:00
...            ...            ...       ...                        ...
34939  51525834909     

In [None]:
# def filter_response_data(json_dict):

    # Convert the list of utilizations into a DataFrame
    # json_data = pd.DataFrame(json_dict["hydra:member"])
    
    # f_describe(df, 10)
    #df_orig_num    = json_data.select_dtypes(include='number')
    #l_df_num_names = df_orig_num.columns.tolist()

    # print(l_df_num_names)
    # print(f"\nNumber of numerical variables: {len(l_df_num_names)}")

    #df_orig_cat    = json_data.select_dtypes(include='object')
    #l_df_cat_names = list(df_orig_cat.columns)

    # print(f"\nNumber of categorical variables: {len(l_df_cat_names)}")
    # print(l_df_cat_names)

    # formatted_df = json_to_table(json_data)

    # Convert 'capacity' to numeric, handling errors by setting non-numeric values to NaN
    # formatted_df['capacity'] = pd.to_numeric(formatted_df['capacity'], errors='coerce')

    # Filter, excluding rows where capacity is NaN
    # filtered_df = formatted_df[formatted_df['capacity'] > 0].dropna(subset=['capacity'])

    # print(filtered_df.shape[0])
    # print(filtered_df)

    # Create dictionary 'dc_ned_json_data_1' with objects that will be used in the next exercises.
    #dc_ned_json_data_1 = {
    #    'df_orig': json_data    
    #}

    # Save dc_exercise_1_2_3 as 'dc_ned_json_data_1.pkl'
    #with open('../data/dc-ned-json-data-1.pkl', 'wb') as pickle_file:
    #    pickle.dump(dc_ned_json_data_1, pickle_file)

    # Display the DataFrame as a table
    # print("\nConverted JSON Payload to Table Format:\n")
    # print(json_data.to_string(index=False))

    #if formatted_df is not None:
    #    # Print the DataFrame (table format)
    #    print(formatted_df.head(3))
    # return filtered_df