code to predict pv generation

In [2]:
import numpy as np
import pandas as pd
from scipy.stats import norm, gamma
import matplotlib.pyplot as plt
import scipy.stats as stats
from matplotlib.ticker import MaxNLocator # To ensure demand axis are integer.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error

# Third party packages.
import os                       # Execute tasks related to your operating system.

import polars as pl             # Data handling ('Blazingly fast DataFrames') and is a drop-in replacement for pandas.
import pickle                   # Save and load data to and from pickle files.
import time
import json
import os

from icecream import ic         # Print variables.


Data Understanding

Load data set. Split with 80% and 20% split

In [3]:
# Load API Key and download directory from config file
CONFIG_FILE = "../config/api-call.json"

def load_config(config_path):
    """Load configuration from a JSON file."""
    with open(config_path, "r") as file:
        return json.load(file)

# Load config values
config = load_config(CONFIG_FILE)

# print(config)
DOWNLOAD_DIR = config["ned"]["ned_test_download_dir"]
print("DOWNLOAD_DIR :", DOWNLOAD_DIR)

# Step 1: Read JSON data from a file
json_file_path = os.path.join(DOWNLOAD_DIR, f"model-gen-test-data.json")

with open(json_file_path, "r") as json_file:
    dc_ned_json_data_1 = json.load(json_file)

# df_ned_json_data = dc_ned_json_data_1['df_orig']
df_ned_json_data = pd.DataFrame(dc_ned_json_data_1)  # Convert dictionary to DataFrame

df_pd_orig = df_ned_json_data
# Show a sample of first 3 rows.
df_pd_orig.head(3)


DOWNLOAD_DIR : ../data/


Unnamed: 0,ID,Date,Value
0,1,2025-01-01T00:00:00+00:00,102.8
1,2,2025-01-01T00:15:00+00:00,105.9
2,3,2025-01-01T00:30:00+00:00,110.1


Descriptive Statistics

In [4]:
# Using Pandas.
df_pd_num = df_pd_orig.select_dtypes(include='number')

# Column names (Pandas: df_pd_orig_num.columns.tolist()).
l_df_num_names = df_pd_num.columns
print(f"We have {len(l_df_num_names)} numerical variables:\n{l_df_num_names}")

# Convert to datetime format
df_pd_orig_str    = df_pd_orig.select_dtypes(include='object')
print("Number of rows:", df_pd_orig_str.shape[0], "and column names are:", df_pd_orig_str.columns, "and column types are:", df_pd_orig_str.dtypes)

We have 2 numerical variables:
Index(['ID', 'Value'], dtype='object')
Number of rows: 50 and column names are: Index(['Date'], dtype='object') and column types are: Date    object
dtype: object


To check if the output column has unique values

In [5]:
l_col = ['Value']

for s_col in l_col:
    print(f"{l_col}: {df_pd_orig[l_col]}")
    print(f"{s_col}: {df_pd_orig[s_col].unique()}")

['Value']:     Value
0   102.8
1   105.9
2   110.1
3   115.3
4   121.0
5   127.2
6   133.7
7   140.2
8   146.5
9   152.2
10  157.0
11  161.0
12  164.2
13  166.8
14  168.9
15  170.6
16  172.0
17  173.2
18  174.4
19  175.5
20  176.5
21  177.6
22  178.6
23  179.7
24  180.8
25  181.9
26  183.0
27  184.0
28  185.1
29  186.2
30  187.3
31  188.4
32  189.5
33  190.6
34  191.7
35  192.8
36  193.9
37  195.0
38  196.1
39  197.2
40  198.3
41  199.4
42  200.5
43  201.6
44  202.7
45  203.8
46  204.9
47  206.0
48  207.1
49  208.2
Value: [102.8 105.9 110.1 115.3 121.  127.2 133.7 140.2 146.5 152.2 157.  161.
 164.2 166.8 168.9 170.6 172.  173.2 174.4 175.5 176.5 177.6 178.6 179.7
 180.8 181.9 183.  184.  185.1 186.2 187.3 188.4 189.5 190.6 191.7 192.8
 193.9 195.  196.1 197.2 198.3 199.4 200.5 201.6 202.7 203.8 204.9 206.
 207.1 208.2]


Contruct a descriptive summary

In [6]:
df_pd_orig.describe()

Unnamed: 0,ID,Value
count,50.0,50.0
mean,25.5,174.138
std,14.57738,28.429367
min,1.0,102.8
25%,13.25,164.85
50%,25.5,181.35
75%,37.75,194.725
max,50.0,208.2


Prepare daya for prediction

In [7]:
# Convert timestamp to datetime format
df_pd_orig["Date"] = pd.to_datetime(df_pd_orig["Date"])

# Extract Features from Timestamp
df_pd_orig["year"] = df_pd_orig["Date"].dt.year
df_pd_orig["month"] = df_pd_orig["Date"].dt.month
df_pd_orig["day"] = df_pd_orig["Date"].dt.day
df_pd_orig["hour"] = df_pd_orig["Date"].dt.hour
df_pd_orig["minute"] = df_pd_orig["Date"].dt.minute

# Select Features (X) and Target Variable (y)
X = df_pd_orig[["year", "month", "day", "hour", "minute"]]  # Features
y = df_pd_orig["Value"]  # Target (Numerical column to predict)


# Split Data into Training & Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


Predict using linear regression model

In [8]:
# Step 4: Train a Model (Linear Regression)


# If you need numeric values (e.g., Unix timestamp or ordinal)
# df_cleaned_datetime_PVW["timestamp"] = df_cleaned_datetime_PVW["datetime_parsed"].astype("int64") // 10**9  # Unix timestamp (seconds since 1970)
# selected_columns_df = df_pd_orig[["validto", "capacity"]]
# 02-02-2025 23:30

# X = selected_columns_df[["validto"]]  # Features
# y = selected_columns_df["capacity"]  # Target variable

model = LinearRegression()
model.fit(X_train, y_train)

#  Step 5: Make Predictions
y_pred = model.predict(X_test)

# Step 6: Evaluate the Model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Step 7: Convert Predictions Back to Polars (Optional)
df_pred = pl.DataFrame({"X Values": X_test.values, "Actual": y_test.values, "Predicted": y_pred})
print(df_pred)

Mean Absolute Error: 8.680518601514354
shape: (13, 3)
┌─────────────────┬────────┬────────────┐
│ X Values        ┆ Actual ┆ Predicted  │
│ ---             ┆ ---    ┆ ---        │
│ array[i32, 5]   ┆ f64    ┆ f64        │
╞═════════════════╪════════╪════════════╡
│ [2025, 1, … 15] ┆ 166.8  ┆ 151.555233 │
│ [2025, 1, … 45] ┆ 197.2  ┆ 201.229184 │
│ [2025, 1, … 30] ┆ 187.3  ┆ 183.885108 │
│ [2025, 1, … 15] ┆ 203.8  ┆ 211.49843  │
│ [2025, 1, … 15] ┆ 173.2  ┆ 159.048133 │
│ …               ┆ …      ┆ …          │
│ [2025, 1, … 0]  ┆ 189.5  ┆ 186.661454 │
│ [2025, 1, … 45] ┆ 175.5  ┆ 163.764686 │
│ [2025, 1, … 0]  ┆ 164.2  ┆ 149.196957 │
│ [2025, 1, … 0]  ┆ 121.0  ┆ 134.211158 │
│ [2025, 1, … 15] ┆ 195.0  ┆ 196.512631 │
└─────────────────┴────────┴────────────┘


Predict using LASSO model

In [10]:
# Train Lasso Regression Model
lasso = Lasso(alpha=0.1)  # Adjust alpha for regularization strength
lasso.fit(X_train, y_train)
# Make Predictions
y_pred = lasso.predict(X_test)

# Evaluate Model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Step 7: Convert Predictions Back to Polars (Optional)
df_pred = pl.DataFrame({"X Values": X_test.values, "Actual": y_test.values, "Predicted": y_pred})
print(df_pred)


Mean Absolute Error: 8.670693727998536
shape: (13, 3)
┌─────────────────┬────────┬────────────┐
│ X Values        ┆ Actual ┆ Predicted  │
│ ---             ┆ ---    ┆ ---        │
│ array[i32, 5]   ┆ f64    ┆ f64        │
╞═════════════════╪════════╪════════════╡
│ [2025, 1, … 15] ┆ 166.8  ┆ 151.57793  │
│ [2025, 1, … 45] ┆ 197.2  ┆ 201.193651 │
│ [2025, 1, … 30] ┆ 187.3  ┆ 183.871323 │
│ [2025, 1, … 15] ┆ 203.8  ┆ 211.462194 │
│ [2025, 1, … 15] ┆ 173.2  ┆ 159.063463 │
│ …               ┆ …      ┆ …          │
│ [2025, 1, … 0]  ┆ 189.5  ┆ 186.654334 │
│ [2025, 1, … 45] ┆ 175.5  ┆ 163.765986 │
│ [2025, 1, … 0]  ┆ 164.2  ┆ 149.226669 │
│ [2025, 1, … 0]  ┆ 121.0  ┆ 134.255602 │
│ [2025, 1, … 15] ┆ 195.0  ┆ 196.491128 │
└─────────────────┴────────┴────────────┘
