import required packages

In [56]:
# Third party packages.
import pandas as pd             # Data handling
import numpy as np              # Numeric calculations
import pickle                   # Save and load data
import altair as alt            # Visualize data
# import seaborn as sns           # Visualize data
import matplotlib.pyplot as plt # Visualize data

from scipy.stats import pearsonr # Correlation

load pkl file

In [57]:
# Load the pickled object from disk; it is indexed by key below, so it is
# presumably a dict.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# file must only ever come from a trusted source.
with open('../data/dc-ned-json-data-1.pkl', 'rb') as pickle_file:
    dc_ned_json_data_1 = pickle.load(pickle_file)

# Pull out the entry stored under 'df_orig'; the cells below use it as a
# pandas DataFrame.
df_ned_json_data = dc_ned_json_data_1['df_orig']

set pandas dataframe settings

In [58]:
# Pandas display options, kept in one table and applied in a single pass.
_PANDAS_DISPLAY_OPTIONS = {
    "display.max_rows": 50,            # Rows printed before truncation; None would show all rows.
    "display.max_columns": None,       # None: never truncate the columns of a printed frame.
    "display.max_info_columns": 100,   # Column limit for the per-column detail of DataFrame.info().
    "display.max_info_rows": 1000000,  # Row limit for the null counts of DataFrame.info().
    "display.precision": 2,            # Decimal places shown for floating point values.
    # "styler.format.precision": 2,    # Left disabled, as in the original cell.
}

for _option_name, _option_value in _PANDAS_DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

Descriptive analytics

In [59]:
# Preview the first three records to get a feel for the columns and their values.
df_ned_json_data.head(n=3)

Unnamed: 0,@id,@type,id,point,type,granularity,granularitytimezone,activity,classification,capacity,volume,percentage,emission,emissionfactor,validfrom,validto,lastupdate
0,/v1/utilizations/68092561339,Utilization,68092561339,/v1/points/0,/v1/types/2,/v1/granularities/3,/v1/granularity_time_zones/0,/v1/activities/1,/v1/classifications/2,0,0,0.0,0,0,2025-01-31T23:00:00+00:00,2025-01-31T23:10:00+00:00,2025-02-02T23:52:51+00:00
1,/v1/utilizations/68092561340,Utilization,68092561340,/v1/points/0,/v1/types/2,/v1/granularities/3,/v1/granularity_time_zones/0,/v1/activities/1,/v1/classifications/2,0,0,0.0,0,0,2025-01-31T23:10:00+00:00,2025-01-31T23:20:00+00:00,2025-02-02T23:52:51+00:00
2,/v1/utilizations/68092623134,Utilization,68092623134,/v1/points/0,/v1/types/2,/v1/granularities/3,/v1/granularity_time_zones/0,/v1/activities/1,/v1/classifications/2,0,0,0.0,0,0,2025-01-31T23:20:00+00:00,2025-01-31T23:30:00+00:00,2025-02-02T23:52:51+00:00


Data Understanding

a. Which variables are numerical and which are categorical? How many variables are there of each type?

In [60]:
# Total number of variables (columns) in the original data.
print(
    "Number of columns in the original data: "
    f"{df_ned_json_data.shape[1]}"
    "\n"
)

# Numerical variables: every column with a numeric dtype.
df_orig_num    = df_ned_json_data.select_dtypes(include='number')
l_df_num_names = df_orig_num.columns.tolist()

print(f"\nNumber of numerical variables: {len(l_df_num_names)}")
print(l_df_num_names)


# Categorical variables: every column that is NOT numeric. Taking the complement
# (instead of include='object') guarantees that the two groups together cover
# all columns; include='object' would silently drop bool, category, datetime or
# dedicated string-dtype columns from the analysis.
df_orig_cat    = df_ned_json_data.select_dtypes(exclude='number')
l_df_cat_names = df_orig_cat.columns.tolist()

print(f"\nNumber of categorical variables: {len(l_df_cat_names)}")
print(l_df_cat_names)


Number of columns in the original data: 17


Number of numerical variables: 6
['id', 'capacity', 'volume', 'percentage', 'emission', 'emissionfactor']

Number of categorical variables: 11
['@id', '@type', 'point', 'type', 'granularity', 'granularitytimezone', 'activity', 'classification', 'validfrom', 'validto', 'lastupdate']


Next, check for missing values (variable completeness).

df_orig_num.info()

Create a frequency table counting number of missing values per variable

In [61]:
# Dtype of every column in the original frame.
ps_missing_type    = df_ned_json_data.dtypes

# Count of missing cells per column.
ps_missing_total   = df_ned_json_data.isna().sum()

# Share of missing cells per column, as a percentage rounded to one decimal.
ps_missing_percent = ps_missing_total.mul(100).div(len(df_ned_json_data)).round(1)

In [62]:
# Build the overview table in one chain: assemble the three per-column Series,
# order by the number of missing values (largest first) and keep only the
# variables that actually have missing values.
df_missing_data = (
    pd.DataFrame({
        'data_type':   ps_missing_type,
        'empty_total': ps_missing_total,
        'empty_perc':  ps_missing_percent,
    })
    .sort_values(by='empty_total', ascending=False)
    .query("empty_total > 0")
)

# Report how many variables are affected, then render the table itself.
print(
    "Number of variables having missing data: "
    f"{len(df_missing_data)} (out of {len(df_ned_json_data.columns)})"
)

df_missing_data

Number of variables having missing data: 0 (out of 17)


Unnamed: 0,data_type,empty_total,empty_perc


Conduct descriptive / summary statistics

In [63]:
# Summary statistics for the numerical variables. A notebook cell only renders
# its last expression automatically, so this first table has to be displayed
# explicitly; otherwise its result is computed and thrown away.
display(df_orig_num.describe())

# Summary statistics for the categorical variables (count, unique, top, freq).
df_orig_cat.describe()

Unnamed: 0,@id,@type,point,type,granularity,granularitytimezone,activity,classification,validfrom,validto,lastupdate
count,144,144,144,144,144,144,144,144,144,144,144
unique,144,1,1,1,1,1,1,1,144,144,24
top,/v1/utilizations/68092561339,Utilization,/v1/points/0,/v1/types/2,/v1/granularities/3,/v1/granularity_time_zones/0,/v1/activities/1,/v1/classifications/2,2025-01-31T23:00:00+00:00,2025-01-31T23:10:00+00:00,2025-02-02T23:52:51+00:00
freq,1,144,144,144,144,144,144,144,1,1,6


data preparation phase

In [64]:
# Column-wise median of every numerical variable.
df_orig_num.median(axis=0)

id                6.81e+10
capacity          0.00e+00
volume            0.00e+00
percentage        0.00e+00
emission          0.00e+00
emissionfactor    0.00e+00
dtype: float64

Impute missing values

In [65]:
# Fill every missing numerical value with the median of its own column.
# fillna() matches the Series of medians to the columns by label; this is the
# documented way to fill NaN per column, rather than relying on replace()
# treating a Series value as a column mapping.
df_imputed_num = df_orig_num.fillna(df_orig_num.median())

# Sanity check: median imputation must leave the column medians unchanged.
print("Median values in original numerical data (first five variables):")
print(df_orig_num.median().head(5))

print("\nMedian values in imputed numerical data (first five variables):")
print(df_imputed_num.median().head(5))

# Total count of missing cells that remain after imputation.
print(
    f"\nNumber of missing data in imputed data: "
    f"{df_imputed_num.isna().sum().sum()}"
)

Median values in original numerical data (first five variables):
id            6.81e+10
capacity      0.00e+00
volume        0.00e+00
percentage    0.00e+00
emission      0.00e+00
dtype: float64

Median values in imputed numerical data (first five variables):
id            6.81e+10
capacity      0.00e+00
volume        0.00e+00
percentage    0.00e+00
emission      0.00e+00
dtype: float64

Number of missing data in imputed data: 0


Impute missing values for categorical variables

In [66]:
# Most frequent value(s) of every categorical variable. mode() returns one row
# per tied value, so columns whose values are all unique list every value, while
# columns with a single mode are padded with NaN below the first row.
df_orig_cat.mode(axis=0)

Unnamed: 0,@id,@type,point,type,granularity,granularitytimezone,activity,classification,validfrom,validto,lastupdate
0,/v1/utilizations/68092561339,Utilization,/v1/points/0,/v1/types/2,/v1/granularities/3,/v1/granularity_time_zones/0,/v1/activities/1,/v1/classifications/2,2025-01-31T23:00:00+00:00,2025-01-31T23:10:00+00:00,2025-02-02T23:52:51+00:00
1,/v1/utilizations/68092561340,,,,,,,,2025-01-31T23:10:00+00:00,2025-01-31T23:20:00+00:00,2025-02-03T00:52:44+00:00
2,/v1/utilizations/68092623134,,,,,,,,2025-01-31T23:20:00+00:00,2025-01-31T23:30:00+00:00,2025-02-03T01:52:42+00:00
3,/v1/utilizations/68092667002,,,,,,,,2025-01-31T23:30:00+00:00,2025-01-31T23:40:00+00:00,2025-02-03T02:52:40+00:00
4,/v1/utilizations/68092710979,,,,,,,,2025-01-31T23:40:00+00:00,2025-01-31T23:50:00+00:00,2025-02-03T03:52:41+00:00
...,...,...,...,...,...,...,...,...,...,...,...
139,/v1/utilizations/68103762862,,,,,,,,2025-02-01T22:10:00+00:00,2025-02-01T22:20:00+00:00,
140,/v1/utilizations/68103824413,,,,,,,,2025-02-01T22:20:00+00:00,2025-02-01T22:30:00+00:00,
141,/v1/utilizations/68103868013,,,,,,,,2025-02-01T22:30:00+00:00,2025-02-01T22:40:00+00:00,
142,/v1/utilizations/68103929818,,,,,,,,2025-02-01T22:40:00+00:00,2025-02-01T22:50:00+00:00,
