Python notebook file for descriptive analytics

In [45]:
# Third party packages.
import pandas as pd     # Data handling
import numpy as np      # Numeric calculations
import pickle           # Save and load data
import openpyxl         # for excel manupulations
import os
import json             # for JSON config file read


Set pandas DataFrame display settings

In [46]:
# Pandas display options, applied one by one from a single table of
# (option name, value) pairs.
for _option_name, _option_value in (
    ("display.max_rows", 50),           # set to None to show all rows
    ("display.max_columns", None),
    ("display.max_info_columns", 100),
    ("display.max_info_rows", 1000000),
    ("display.precision", 2),
    # ("styler.format.precision", 2),   # left disabled
):
    pd.set_option(_option_name, _option_value)

Load the configuration and read the CSV file

In [47]:
# Path to the JSON configuration file (relative path). The notebook reads the
# "weather_file" entry from it to locate the data CSV.
# NOTE(review): an earlier comment mentioned an API key and a download
# directory; the visible cells only use "weather_file" — confirm what else the
# file contains.
CONFIG_FILE = "config/weather-data-config.json"

def load_config(config_path):
    """Load configuration from a JSON file.

    Parameters
    ----------
    config_path : str or path-like
        Location of the JSON configuration file.

    Returns
    -------
    object
        The parsed JSON document (a dict when the top-level JSON value is an
        object).

    Raises
    ------
    FileNotFoundError
        If ``config_path`` does not exist.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII values (e.g. paths) are read the
    # same way on every platform instead of depending on the locale encoding.
    with open(config_path, "r", encoding="utf-8") as file:
        return json.load(file)

# Parse the JSON configuration file into `config`.
config = load_config(CONFIG_FILE)
# Earlier hard-coded paths, kept commented out for reference (not used):
# CSV_DIR = "../../src/data/pv-gen-huis-01"  # Change to your folder name
# OUTPUT_FILE = "../../src/data/pv-gen-huis-01/merged_output.csv"  # Name of the merged CSV file


In [48]:
# (I) Path of the weather-data CSV file, taken from the configuration.
data_file = config["weather_file"]

# (II) Load the CSV into a pandas DataFrame. The file is read exactly once.
df_orig = pd.read_csv(data_file)

# (III) Show a sample of the first 3 rows.
df_orig.head(3)

Unnamed: 0,STN,YYYYMMDD,HH,DD,FH,FF,FX,T,T10N,TD,SQ,Q,DR,RH,P,VV,N,U,WW,IX,M,R,S,O,Y
0,370,20230101,1,220,110,120,190,163,,59,0,0,0,-1,10087,82,8,49,23.0,7,0,1,0,0,0
1,370,20230101,2,210,100,90,180,156,,57,0,0,0,0,10093,83,8,51,,5,0,0,0,0,0
2,370,20230101,3,210,100,100,170,153,,60,0,0,0,0,10095,82,8,53,2.0,7,0,0,0,0,0


Data Understanding

a. Which variables are numerical and which are categorical? How many variables are there of each type?

In [49]:
# Report the total column count, followed by an empty line.
print(f"Number of columns in the original data: {len(df_orig.columns)}\n")

# Numerical variables: all columns with a numeric dtype.
df_orig_num    = df_orig.select_dtypes(include='number')
l_df_num_names = list(df_orig_num.columns)

print(
    l_df_num_names,
    f"\nNumber of numerical variables: {len(l_df_num_names)}",
    sep="\n",
)

# Categorical variables: all columns stored with the object dtype.
df_orig_cat    = df_orig.select_dtypes(include='object')
l_df_cat_names = df_orig_cat.columns.tolist()

print(
    l_df_cat_names,
    f"\nNumber of categorical variables: {len(l_df_cat_names)}",
    sep="\n",
)


Number of columns in the original data: 25

['STN', 'YYYYMMDD', 'HH', 'DD', 'FH', 'FF', 'FX', 'T', 'T10N', 'TD', 'SQ', 'Q', 'DR', 'RH', 'P', 'VV', 'N', 'U', 'WW', 'IX', 'M', 'R', 'S', 'O', 'Y']

Number of numerical variables: 25
[]

Number of categorical variables: 0


Next, check for missing values and variable completeness

In [50]:
# Print a concise summary of the numeric columns: non-null count and dtype per column.
df_orig_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3192 entries, 0 to 3191
Data columns (total 25 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   STN       3192 non-null   int64  
 1   YYYYMMDD  3192 non-null   int64  
 2   HH        3192 non-null   int64  
 3   DD        3192 non-null   int64  
 4   FH        3192 non-null   int64  
 5   FF        3192 non-null   int64  
 6   FX        3192 non-null   int64  
 7   T         3192 non-null   int64  
 8   T10N      532 non-null    float64
 9   TD        3192 non-null   int64  
 10  SQ        3192 non-null   int64  
 11  Q         3192 non-null   int64  
 12  DR        3192 non-null   int64  
 13  RH        3192 non-null   int64  
 14  P         3192 non-null   int64  
 15  VV        3192 non-null   int64  
 16  N         3192 non-null   int64  
 17  U         3192 non-null   int64  
 18  WW        1731 non-null   float64
 19  IX        3192 non-null   int64  
 20  M         3192 non-null   int6

Create a frequency table counting the number of missing values per variable

In [51]:
# Dtype of every variable (column) in df_orig, as a pandas Series.
ps_missing_type    = df_orig.dtypes

# Count of missing entries per variable.
ps_missing_total   = df_orig.isna().sum()

# Share of missing entries per variable, in percent, rounded to one decimal.
ps_missing_percent = ps_missing_total.mul(100).div(len(df_orig)).round(1)

In [52]:
# Build the missing-data overview in one chain:
#   1. combine dtype, missing count and missing percentage per variable,
#   2. order by missing count, largest first,
#   3. keep only the variables that have at least one missing value.
df_missing_data = (
    pd.DataFrame(
        {
            'data_type':   ps_missing_type,
            'empty_total': ps_missing_total,
            'empty_perc':  ps_missing_percent,
        }
    )
    .sort_values(by='empty_total', ascending=False)
    .query("empty_total > 0")
)

# Report how many variables are affected, then display the table.
print(
    "Number of variables having missing data: "
    f"{len(df_missing_data)} (out of {len(df_orig.columns)})"
)

df_missing_data

Number of variables having missing data: 2 (out of 25)


Unnamed: 0,data_type,empty_total,empty_perc
T10N,float64,2660,83.3
WW,float64,1461,45.8


Conduct descriptive / summary statistics

In [53]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df_orig_num.describe()

Unnamed: 0,STN,YYYYMMDD,HH,DD,FH,FF,FX,T,T10N,TD,SQ,Q,DR,RH,P,VV,N,U,WW,IX,M,R,S,O,Y
count,3192.0,3190.0,3192.0,3192.0,3192.0,3192.0,3192.0,3192.0,532.0,3192.0,3192.0,3192.0,3192.0,3192.0,3192.0,3192.0,3192.0,3192.0,1731.0,3192.0,3192.0,3192.0,3192.0,3190.0,3192.0
mean,370.0,20200000.0,12.5,194.25,41.98,42.13,71.11,43.91,21.52,22.43,0.95,12.41,1.16,1.05,10182.41,58.92,6.67,86.26,31.09,6.08,0.06,0.25,0.03,0.000627,0.04
std,0.0,8130.0,6.92,111.32,25.59,25.96,41.16,45.43,52.3,45.18,2.63,25.57,2.87,4.11,151.6,19.69,2.76,11.04,25.33,1.0,0.24,0.43,0.18,0.025,0.19
min,370.0,20200000.0,1.0,0.0,0.0,0.0,0.0,-83.0,-118.0,-132.0,0.0,0.0,0.0,-1.0,9769.0,0.0,0.0,29.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0
25%,370.0,20200000.0,6.75,150.0,20.0,20.0,40.0,7.0,-13.25,-7.0,0.0,0.0,0.0,0.0,10065.0,56.0,7.0,80.0,10.0,5.0,0.0,0.0,0.0,0.0,0.0
50%,370.0,20200000.0,12.5,210.0,40.0,40.0,60.0,44.0,22.0,23.0,0.0,0.0,0.0,0.0,10207.0,64.0,8.0,89.0,23.0,7.0,0.0,0.0,0.0,0.0,0.0
75%,370.0,20300000.0,18.25,230.0,60.0,60.0,100.0,81.25,64.25,56.0,0.0,13.0,0.0,0.0,10305.25,70.0,8.0,95.0,52.0,7.0,0.0,0.0,0.0,0.0,0.0
max,370.0,20300000.0,24.0,990.0,150.0,160.0,250.0,163.0,137.0,117.0,10.0,151.0,10.0,69.0,10433.0,83.0,9.0,100.0,86.0,7.0,1.0,1.0,1.0,1.0,1.0


In [54]:
# Skipped: no categorical (object) columns were found above, so there is nothing to describe.
# df_orig_cat.describe()

Data Preparation phase

In [55]:
# Median of each numeric column; the imputation step below uses these as fill values.
df_orig_num.median()

STN         3.70e+02
YYYYMMDD    2.02e+07
HH          1.25e+01
DD          2.10e+02
FH          4.00e+01
FF          4.00e+01
FX          6.00e+01
T           4.40e+01
T10N        2.20e+01
TD          2.30e+01
SQ          0.00e+00
Q           0.00e+00
DR          0.00e+00
RH          0.00e+00
P           1.02e+04
VV          6.40e+01
N           8.00e+00
U           8.90e+01
WW          2.30e+01
IX          7.00e+00
M           0.00e+00
R           0.00e+00
S           0.00e+00
O           0.00e+00
Y           0.00e+00
dtype: float64

Impute missing values

In [56]:
# Column medians of the original numeric data, computed once and reused both
# as fill values and for the report below.
ps_median_orig = df_orig_num.median()

# Impute: fill each column's missing values with that column's median.
# fillna aligns the Series index with the DataFrame's column names.
df_imputed_num = df_orig_num.fillna(ps_median_orig)

print("Median values in original numerical data (first five variables):")
print(ps_median_orig.head(5))

print("\nMedian values in imputed numerical data (first five variables):")
print(df_imputed_num.median().head(5))


# Sanity check: total number of missing entries left after imputation.
print(
    f"\nNumber of missing data in imputed data: "
    f"{df_imputed_num.isna().sum().sum()}"
)

Median values in original numerical data (first five variables):
STN         3.70e+02
YYYYMMDD    2.02e+07
HH          1.25e+01
DD          2.10e+02
FH          4.00e+01
dtype: float64

Median values in imputed numerical data (first five variables):
STN         3.70e+02
YYYYMMDD    2.02e+07
HH          1.25e+01
DD          2.10e+02
FH          4.00e+01
dtype: float64

Number of missing data in imputed data: 0
