code to predict pv generation

In [24]:
import numpy as np
import pandas as pd
from scipy.stats import norm, gamma
import matplotlib.pyplot as plt
import scipy.stats as stats
from matplotlib.ticker import MaxNLocator # To ensure demand axis are integer.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Third party packages.
import os                       # Execute tasks related to your operating system.

import polars as pl             # Data handling ('Blazingly fast DataFrames') and is a drop-in replacement for pandas.
import pickle                   # Save and load data to and from pickle files.
import time

from icecream import ic         # Print variables.


Data Understanding

Load data set. Split with 80% and 20% split

In [25]:
# (I) URL to repo on GitHub.
data_file = "../../src/data/pv-gen-huis-01/merged_output.csv"  # Name of the merged CSV file


# (III) Load data from GitHub using Polars.
df_pl_orig = (
    pl.read_csv(data_file)
)

df_pd_orig = pd.read_csv(data_file)
# Show a sample of first 3 rows.
df_pd_orig.head(3)


Unnamed: 0,Time,PV Productie (W)
0,21-07-2024 0:00,0
1,21-07-2024 0:15,0
2,21-07-2024 0:30,0


Descriptive Statistics

In [26]:
# Using Pandas.
df_pd_num = df_pd_orig.select_dtypes(include='number')

# Column names (Pandas: df_pd_orig_num.columns.tolist()).
l_df_num_names = df_pd_num.columns
print(f"We have {len(l_df_num_names)} numerical variables:\n{l_df_num_names}")

# In Pandas.
# Convert to datetime format
df_pd_orig_str    = df_pd_orig.select_dtypes(include='object')
# Convert to datetime with errors='coerce' (invalid values become NaT)
df_pd_orig_str["datetime_parsed"] = pd.to_datetime(df_pd_orig_str["Time"], format="%d-%m-%Y %H:%M", errors="coerce")
print("Number of rows:", df_pd_orig_str.shape[0], "and column names are:", df_pd_orig_str.columns, "and column types are:", df_pd_orig_str.dtypes)

# Display invalid datetime values
#invalid_dates = df_pd_orig_str[df_pd_orig_str["datetime_parsed"].isna()]
#print(invalid_dates)

# Drop rows with invalid datetime values (NaT)
df_datetime_cleaned = df_pd_orig_str.dropna(subset=["datetime_parsed"])
print("Number of rows:", df_datetime_cleaned.shape[0], "and column names are:", df_datetime_cleaned.columns)

df_cleaned_datetime_PVW = df_datetime_cleaned[df_datetime_cleaned["PV Productie (W)"] != "0"]
print("Number of rows:", df_cleaned_datetime_PVW.shape[0], "and column names are:", df_cleaned_datetime_PVW.columns)
print(df_cleaned_datetime_PVW.head(3))

# Check data types
df_pd_obj    = df_cleaned_datetime_PVW.select_dtypes(include='object')
# Column names (Pandas: df_pd_orig_str.columns.tolist()).
l_df_str_names = df_cleaned_datetime_PVW.columns

print(f"We have {len(l_df_str_names)} string variables:\n{l_df_str_names}")

print(
    f"The data set has  {df_cleaned_datetime_PVW.shape[1]} columns. The number and string data "
    f"have total of {df_cleaned_datetime_PVW.shape[1] + df_cleaned_datetime_PVW.shape[1]} columns."
)


# We use the number of observations more often, so we define a variable.
n_obs = df_cleaned_datetime_PVW.shape[0]


We have 0 numerical variables:
Index([], dtype='object')
Number of rows: 16974 and column names are: Index(['Time', 'PV Productie (W)', 'datetime_parsed'], dtype='object') and column types are: Time                        object
PV Productie (W)            object
datetime_parsed     datetime64[ns]
dtype: object
Number of rows: 16800 and column names are: Index(['Time', 'PV Productie (W)', 'datetime_parsed'], dtype='object')
Number of rows: 7961 and column names are: Index(['Time', 'PV Productie (W)', 'datetime_parsed'], dtype='object')
               Time PV Productie (W)     datetime_parsed
24  21-07-2024 6:00           2.5608 2024-07-21 06:00:00
25  21-07-2024 6:15          12.0613 2024-07-21 06:15:00
26  21-07-2024 6:30          20.8753 2024-07-21 06:30:00
We have 3 string variables:
Index(['Time', 'PV Productie (W)', 'datetime_parsed'], dtype='object')
The data set has  3 columns. The number and string data have total of 6 columns.


Look for missing data

In [27]:
l_col = ['PV Productie (W)']

# Pandas
print("Pandas:")

for s_col in l_col:
    # print(f"{l_col}: {df_cleaned_datetime_PVW[l_col]}")
    print(f"{s_col}: {df_cleaned_datetime_PVW[s_col].unique()}")

# Comms
print("")
# print(f"Pandas: {list(df_cleaned_datetime_PVW['PV Productie (W)'][53:58])}")

Pandas:
PV Productie (W): ['2.5608' '12.0613' '20.8753' ... '110.7054' '90.8471' '17.8264']



Contruct a descriptive summary

In [28]:
df_cleaned_datetime_PVW.describe()

Unnamed: 0,datetime_parsed
count,7961
mean,2024-09-18 09:38:46.026881024
min,2024-07-01 05:30:00
25%,2024-08-02 10:00:00
50%,2024-09-07 07:00:00
75%,2024-10-20 17:15:00
max,2025-02-09 17:30:00


Predict

In [29]:
df_pd_orig_str_copy = df_cleaned_datetime_PVW


# Step 4: Train a Model (Linear Regression)


# If you need numeric values (e.g., Unix timestamp or ordinal)
df_cleaned_datetime_PVW["timestamp"] = df_cleaned_datetime_PVW["datetime_parsed"].astype("int64") // 10**9  # Unix timestamp (seconds since 1970)
selected_columns_df = df_cleaned_datetime_PVW[["timestamp", "PV Productie (W)"]]
# 02-02-2025 23:30

X = selected_columns_df[["timestamp"]]  # Features
y = selected_columns_df["PV Productie (W)"]  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

#  Step 5: Make Predictions
y_pred = model.predict(X_test)

# Step 6: Evaluate the Model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Step 7: Convert Predictions Back to Polars (Optional)
df_pred = pl.DataFrame({"X Values": X_test.values, "Actual": y_test.values, "Predicted": y_pred})
print(df_pred)

Mean Absolute Error: 579.4446337943965
shape: (80, 3)
┌───────────────┬───────────┬─────────────┐
│ X Values      ┆ Actual    ┆ Predicted   │
│ ---           ┆ ---       ┆ ---         │
│ array[i64, 1] ┆ str       ┆ f64         │
╞═══════════════╪═══════════╪═════════════╡
│ [1730103300]  ┆ 224.8311  ┆ 629.0541    │
│ [1725816600]  ┆ 499.5489  ┆ 900.880887  │
│ [1725784200]  ┆ 626.1318  ┆ 902.935425  │
│ [1732188600]  ┆ 136.987   ┆ 496.821754  │
│ [1724936400]  ┆ 2277.678  ┆ 956.695835  │
│ …             ┆ …         ┆ …           │
│ [1730552400]  ┆ 503.8647  ┆ 600.575921  │
│ [1737204300]  ┆ 281.4701  ┆ 178.767862  │
│ [1723380300]  ┆ 3244.9167 ┆ 1055.370728 │
│ [1721335500]  ┆ 62.158    ┆ 1185.034902 │
│ [1724688000]  ┆ 844.0526  ┆ 972.447293  │
└───────────────┴───────────┴─────────────┘


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned_datetime_PVW["timestamp"] = df_cleaned_datetime_PVW["datetime_parsed"].astype("int64") // 10**9  # Unix timestamp (seconds since 1970)
