import required packages

In [1]:
# Third party packages.
import pandas as pd             # Data handling
import numpy as np              # Numeric calculations
import pickle                   # Save and load data
import altair as alt            # Visualize data
# import seaborn as sns           # Visualize data
import matplotlib.pyplot as plt # Visualize data

from scipy.stats import pearsonr # Correlation

load pkl file

In [2]:
# with open('../data/dc-ned-json-data-1.pkl', 'rb') as pickle_file:
with open('../data/powergen/power-gen-type-0.pkl', 'rb') as pickle_file:
    dc_ned_json_data_1 = pickle.load(pickle_file)

# df_ned_json_data = dc_ned_json_data_1['df_orig']
df_ned_json_data = pd.DataFrame(dc_ned_json_data_1)  # Convert dictionary to DataFrame

set pandas dataframe settings

In [3]:
# Setting Pandas options.
pd.set_option("display.max_rows", 50) # How to display all rows from data frame using pandas. Setting value to None to show all rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_info_columns", 100)
pd.set_option("display.max_info_rows", 1000000)
pd.set_option("display.precision", 2)
#pd.set_option("styler.format.precision", 2)

Descriptive analytics

In [4]:
# Show a sample of first 3 rows.
df_ned_json_data.head(3)

Unnamed: 0,@id,@type,id,point,type,granularity,granularitytimezone,activity,classification,capacity,volume,percentage,emission,emissionfactor,validfrom,validto,lastupdate
0,/v1/utilizations/23791261744,Utilization,23791261744,/v1/points/0,/v1/types/0,/v1/granularities/4,/v1/granularity_time_zones/0,/v1/activities/1,/v1/classifications/2,8134680,2033670,0,0,0,2021-12-31T23:00:00+00:00,2021-12-31T23:15:00+00:00,2024-11-10T19:01:35+00:00
1,/v1/utilizations/23791591475,Utilization,23791591475,/v1/points/0,/v1/types/0,/v1/granularities/4,/v1/granularity_time_zones/0,/v1/activities/1,/v1/classifications/2,8039900,2009975,0,0,0,2021-12-31T23:15:00+00:00,2021-12-31T23:30:00+00:00,2024-11-10T19:01:35+00:00
2,/v1/utilizations/23792244041,Utilization,23792244041,/v1/points/0,/v1/types/0,/v1/granularities/4,/v1/granularity_time_zones/0,/v1/activities/1,/v1/classifications/2,8128668,2032167,0,0,0,2021-12-31T23:30:00+00:00,2021-12-31T23:45:00+00:00,2024-11-10T19:01:35+00:00


Data Understanding

a. Which variables are numerical and which can categorical? How many variables are of both types?

In [5]:
print(
    f"Number of columns in the original data: "
    f"{df_ned_json_data.shape[1]}"
    "\n"
)

df_orig_num    = df_ned_json_data.select_dtypes(include='number')
l_df_num_names = df_orig_num.columns.tolist()

print(f"\nNumber of numerical variables: {len(l_df_num_names)}")
print(l_df_num_names)


df_orig_cat    = df_ned_json_data.select_dtypes(include='object')
l_df_cat_names = list(df_orig_cat.columns)

print(f"\nNumber of categorical variables: {len(l_df_cat_names)}")
print(l_df_cat_names)


Number of columns in the original data: 17


Number of numerical variables: 6
['id', 'capacity', 'volume', 'percentage', 'emission', 'emissionfactor']

Number of categorical variables: 11
['@id', '@type', 'point', 'type', 'granularity', 'granularitytimezone', 'activity', 'classification', 'validfrom', 'validto', 'lastupdate']


Next to check for missing values, variable completeness

df_orig_num.info()

Create a frequency table counting number of missing values per variable

In [6]:
# Pandas Series with type of each variable (variable, column) in df_orig.
ps_missing_type    = df_ned_json_data.dtypes

# Number of missing data per variable.
ps_missing_total   = df_ned_json_data.isnull().sum()

# Percentage of missing per variable.
ps_missing_percent = round(100 * ps_missing_total / df_ned_json_data.shape[0], 1)

In [7]:
# Create table (Pandas DataFrame).
df_missing_data = pd.DataFrame({

    'data_type':   ps_missing_type,
    'empty_total': ps_missing_total,
    'empty_perc':  ps_missing_percent
})

# Sort table by number of missing data in descending order.
df_missing_data.sort_values(
    by        = 'empty_total',
    ascending = False,
    inplace   = True
)

# Remove variables that have no missing values.
df_missing_data = df_missing_data.query("empty_total > 0")

# Show table.
print(
    f"Number of variables having missing data: "
    f"{df_missing_data.shape[0]} (out of {df_ned_json_data.shape[1]})"
)

df_missing_data

Number of variables having missing data: 0 (out of 17)


Unnamed: 0,data_type,empty_total,empty_perc


Conduct descriptive / summary statistics

In [8]:
df_orig_num.describe()
df_orig_cat.describe()

Unnamed: 0,@id,@type,point,type,granularity,granularitytimezone,activity,classification,validfrom,validto,lastupdate
count,34944,34944,34944,34944,34944,34944,34944,34944,34944,34944,34944
unique,34944,1,1,1,1,1,1,1,34944,34944,2
top,/v1/utilizations/51516174042,Utilization,/v1/points/0,/v1/types/0,/v1/granularities/4,/v1/granularity_time_zones/0,/v1/activities/1,/v1/classifications/2,2022-12-30T18:45:00+00:00,2022-12-30T19:00:00+00:00,2024-11-10T18:39:07+00:00
freq,1,34944,34944,34944,34944,34944,34944,34944,1,1,34848


data preparation phase

In [9]:
df_orig_num.median()

id                3.76e+10
capacity          8.61e+06
volume            2.15e+06
percentage        0.00e+00
emission          0.00e+00
emissionfactor    0.00e+00
dtype: float64

Impute missing values

In [10]:
df_imputed_num = df_orig_num.replace(np.nan, df_orig_num.median())

print("Median values in original numerical data (first five variables):")
print(df_orig_num.median().head(5))

print("\nMedian values in imputed numerical data (first five variables):")
print(df_imputed_num.median().head(5))


print(
    f"\nNumber of missing data in imputed data: "
    f"{df_imputed_num.isna().sum().sum()}"
)

Median values in original numerical data (first five variables):
id            3.76e+10
capacity      8.61e+06
volume        2.15e+06
percentage    0.00e+00
emission      0.00e+00
dtype: float64

Median values in imputed numerical data (first five variables):
id            3.76e+10
capacity      8.61e+06
volume        2.15e+06
percentage    0.00e+00
emission      0.00e+00
dtype: float64

Number of missing data in imputed data: 0


Imput missing values for categorical variable

In [11]:
df_orig_cat.mode()

Unnamed: 0,@id,@type,point,type,granularity,granularitytimezone,activity,classification,validfrom,validto,lastupdate
0,/v1/utilizations/23791261744,Utilization,/v1/points/0,/v1/types/0,/v1/granularities/4,/v1/granularity_time_zones/0,/v1/activities/1,/v1/classifications/2,2021-12-31T23:00:00+00:00,2021-12-31T23:15:00+00:00,2024-11-10T18:39:07+00:00
1,/v1/utilizations/23791591475,,,,,,,,2021-12-31T23:15:00+00:00,2021-12-31T23:30:00+00:00,
2,/v1/utilizations/23792244041,,,,,,,,2021-12-31T23:30:00+00:00,2021-12-31T23:45:00+00:00,
3,/v1/utilizations/23792573621,,,,,,,,2021-12-31T23:45:00+00:00,2022-01-01T00:00:00+00:00,
4,/v1/utilizations/23794423050,,,,,,,,2022-01-01T00:00:00+00:00,2022-01-01T00:15:00+00:00,
...,...,...,...,...,...,...,...,...,...,...,...
34939,/v1/utilizations/51525834909,,,,,,,,2022-12-30T21:45:00+00:00,2022-12-30T22:00:00+00:00,
34940,/v1/utilizations/51527700058,,,,,,,,2022-12-30T22:00:00+00:00,2022-12-30T22:15:00+00:00,
34941,/v1/utilizations/51528040982,,,,,,,,2022-12-30T22:15:00+00:00,2022-12-30T22:30:00+00:00,
34942,/v1/utilizations/51528716097,,,,,,,,2022-12-30T22:30:00+00:00,2022-12-30T22:45:00+00:00,


Intereative Plot

In [12]:
import plotly.express as px

df = df_orig_num

# 1. Interactive Time Series Plot
fig_line = px.line(
    df,
    x=df.index,
    y="capacity",
    title="Interactive Power Generation Capacity Time Series",
    labels={"x": "Time", "Capacity": "Wh)"},
    template="plotly_dark"
)
fig_line.update_xaxes(rangeslider_visible=True)
fig_line.show()

# 2. Interactive Histogram of Energy Price
fig_hist = px.histogram(
    df,
    x="capacity",
    nbins=50,
    title="Distribution of Power generation Capacity",
    labels={"Capacity": "Wh)"},
    template="plotly_dark"
)
fig_hist.update_layout(bargap=0.1)
fig_hist.show()

# 3. Interactive Box Plot for Energy Price
fig_box = px.box(
    df,
    y="capacity",
    title="Box Plot of Capacity",
    labels={"Capacity": "Wh)"},
    template="plotly_dark"
)
fig_box.show()