In [1]:
import itertools

import matplotlib
import xgboost as xgb
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import numpy as np
import time

from dateutil.parser import parse
import pytz

In [2]:
# process data
data_file_name = "/kaggle/input/sol-scada/2023-07-30_to_2024-06-04.csv"

# Load the solar data
solar_data = pd.read_csv(data_file_name)
data = pd.DataFrame()
data[['date_time', 'power', 'ambient_temparature', 'temparature', 'irradiance']] = solar_data[[
    'DateTime', 
    'Active Power(kW)', 
    'Ambient Temperature(°C)', 
    'Temperature Module1(°C)', 
    'Irradiance(W/m2)'
]]

# remove row where power = 0
data = data[(data['power'] > 0) & (data['power'] < 1000) & (data['irradiance'] > 0)]
MAX = data['power'].max()
print(MAX)
data['power'] = data['power'] / MAX

868.3506


In [3]:
# from sklearn.model_selection import train_test_split
# train, test = train_test_split(
#     data, test_size=0.2, random_state=42
# )
# Split the data into a test/train split
data.dropna(how="any", inplace=True)
data = data.sample(frac=1).reset_index(drop=True)
length = len(data)
split_length = int(length * 0.2)
train, test = data[: (length - split_length)], data[(length - split_length) :]
print(train)

                date_time     power  ambient_temparature  temparature  \
0     01/09/2024 15:00:00  0.514848                 37.6         48.0   
1     09/09/2023 08:30:00  0.147496                 30.8         39.4   
2     08/30/2023 09:00:00  0.373607                 30.5         40.3   
3     10/01/2023 06:00:00  0.025845                 25.5         24.0   
4     02/20/2024 07:00:00  0.076739                 24.1         23.0   
...                   ...       ...                  ...          ...   
5640  04/21/2024 10:30:00  0.545487                 36.2         53.1   
5641  01/07/2024 10:30:00  0.622510                 35.6         50.2   
5642  09/24/2023 16:00:00  0.079403                 25.3         25.7   
5643  11/18/2023 08:30:00  0.381776                 26.4         31.9   
5644  12/25/2023 14:30:00  0.438123                 36.2         54.4   

      irradiance  
0            494  
1            122  
2            297  
3             24  
4             80  
...      

In [4]:
# train model
# Pull apart the independent and dependent variables
train_target = train["power"]
test_target = test["power"]

# Create copies of the DataFrames without modifying the originals
train_copy = train.drop(["date_time", "power"], axis=1).copy()
test_copy = test.drop(["date_time", "power"], axis=1).copy()

# Build XGBoost DMatrix for both train and test with their target variables
train_dmatrix = xgb.DMatrix(train_copy, train_target)
test_dmatrix = xgb.DMatrix(test_copy, test_target)

# Build the XGB parameter list
param = {
    "booster": "gbtree",
    "verbosity": 1,
    "nthread": 12,
    "learning_rate": 0.005,
    "max_depth": 6,
    "subsample": 0.8,
    "lambda": 1.0,
    "objective": "reg:logistic",
    "eval_metric": ["auc", "mae"]
}

# Train the model
eval_list = [(test_dmatrix, "eval"), (train_dmatrix, "train")]
num_round = 10_000
start = time.time()
bst = xgb.train(
    param,
    train_dmatrix,
    num_boost_round=num_round,
    evals=eval_list,
    early_stopping_rounds=20,
)

print(f"Duration: {time.time() - start}s")

[0]	eval-auc:0.84187	eval-mae:0.26004	train-auc:0.83687	train-mae:0.25280
[1]	eval-auc:0.84259	eval-mae:0.25884	train-auc:0.83768	train-mae:0.25162
[2]	eval-auc:0.84309	eval-mae:0.25764	train-auc:0.83809	train-mae:0.25044
[3]	eval-auc:0.84309	eval-mae:0.25646	train-auc:0.83812	train-mae:0.24928
[4]	eval-auc:0.84316	eval-mae:0.25528	train-auc:0.83818	train-mae:0.24811
[5]	eval-auc:0.84314	eval-mae:0.25410	train-auc:0.83827	train-mae:0.24696
[6]	eval-auc:0.84324	eval-mae:0.25293	train-auc:0.83835	train-mae:0.24581
[7]	eval-auc:0.84327	eval-mae:0.25176	train-auc:0.83838	train-mae:0.24467
[8]	eval-auc:0.84333	eval-mae:0.25060	train-auc:0.83843	train-mae:0.24353
[9]	eval-auc:0.84331	eval-mae:0.24944	train-auc:0.83842	train-mae:0.24240
[10]	eval-auc:0.84329	eval-mae:0.24830	train-auc:0.83843	train-mae:0.24127
[11]	eval-auc:0.84331	eval-mae:0.24715	train-auc:0.83845	train-mae:0.24015
[12]	eval-auc:0.84332	eval-mae:0.24601	train-auc:0.83845	train-mae:0.23904
[13]	eval-auc:0.84335	eval-mae:0.24

In [5]:
pickle.dump(bst, open("/kaggle/working/bst_model_v2.pck", "wb"))

In [6]:
def plot_model(bst):

    # Plot setup
    sns.set(font_scale=1.25, rc={"figure.figsize": (15, 10)})
    sns.set_style("ticks")  # ticks
    sns.set_palette("colorblind")

    # Plot the importance of each feature
    xgb.plot_importance(bst)
    plt.gcf().subplots_adjust(left=0.15)
    plt.savefig("/kaggle/working/importance.png", bbox_inches="tight")
    clear_plots()

    # Plot setup
    sns.set(font_scale=1.25, rc={"figure.figsize": (20, 10)})
    sns.set_style("whitegrid")  # ticks
    sns.set_palette("colorblind")

    # Plot a decision tree
    matplotlib.rcParams["figure.dpi"] = 1080
    xgb.plot_tree(bst, num_trees=4, rankdir="LR")
    plt.savefig("/kaggle/working/tree.png")
    clear_plots()

In [7]:
def clear_plots():
    plt.clf()
    import importlib

    importlib.reload(matplotlib)
    importlib.reload(plt)
    importlib.reload(sns)

In [8]:
origin_test = test

In [9]:
origin_test = origin_test.sort_values(by='date_time', ascending=False)
print(origin_test)

                date_time     power  ambient_temparature  temparature  \
6247  12/31/2023 16:30:00  0.196038                 25.8         27.2   
5671  12/31/2023 14:30:00  0.144172                 32.8         37.0   
6321  12/31/2023 13:00:00  0.557625                 35.2         51.9   
6932  12/31/2023 09:00:00  0.501718                 35.8         47.5   
6551  12/30/2023 11:00:00  0.834078                 37.5         62.2   
...                   ...       ...                  ...          ...   
6870  01/02/2024 16:00:00  0.208776                 31.4         34.3   
6557  01/02/2024 15:00:00  0.137659                 33.3         39.0   
6950  01/02/2024 13:00:00  0.218821                 32.8         40.2   
6611  01/02/2024 10:30:00  0.670992                 34.0         52.8   
6173  01/02/2024 09:00:00  0.535827                 31.2         45.9   

      irradiance  
6247         178  
5671         127  
6321         771  
6932         467  
6551         802  
...      

In [10]:
test = origin_test

# How much of a moving average to have
smoothing_step = 750

# Resetting the index to ensure it starts from 0
test.reset_index(drop=True, inplace=True)

# Store dates for plotting output
start_date = test["date_time"][0]
# end_date_index = len(test) - smoothing_step
# end_date = test["date_time"].iloc[end_date_index]
end_date = test["date_time"][smoothing_step - 1]
dates = test["date_time"]

# Format data
test_target = test["power"]
test_copy = test.drop(["date_time", "power"], axis=1).copy()
test_dmatrix = xgb.DMatrix(test_copy)

# Load model and plot it
bst = pickle.load(open("/kaggle/working/bst_model_v2.pck", "rb"))
plot_model(bst)

# Create target vs prediction
pred = bst.predict(test_dmatrix)
actual = test_target

# Scale data from 0-1 to 0-MAX
actual *= MAX
pred *= MAX
print(actual)

0       170.2299
1       125.1918
2       484.2136
3       435.6673
4       724.2724
          ...   
1406    181.2909
1407    119.5360
1408    190.0133
1409    582.6565
1410    465.2853
Name: power, Length: 1411, dtype: float64


<Figure size 1500x1000 with 0 Axes>

<Figure size 21600x10800 with 0 Axes>

In [11]:
# Create a smoothing average
smooth_pred = []
smooth_actual = []
error = 0
for i in range(len(pred)):
    smooth_pred.append(np.mean(pred[i : i + smoothing_step]))
    smooth_actual.append(np.mean(actual[i : i + smoothing_step]))
    error += abs(pred[i] - actual[i])
error /= len(smooth_pred)

In [12]:
print(error)

39.66913512378436


In [13]:
# Line plot setup
sns.set(font_scale=1.25, rc={"figure.figsize": (20, 10)})
sns.set_style("whitegrid")  # ticks
sns.set_palette("colorblind")

# Decrease tick quantity
ax = plt.gca()
ax.xaxis.set_major_locator(plt.MaxNLocator(6))

# Plot lines
sns.lineplot(
    x=dates[: len(dates)], y=smooth_pred, label="Prediction"
)
sns.lineplot(
    x=dates[: len(dates)], y=smooth_actual, label="Actual"
)
plt.xlabel(f"Date")
plt.ylabel(f"Energy Generation (kWh)")
plt.title(
    f"Average Energy Generation Per Day From {end_date} to {start_date}\nAbsolute Error: {error:.2f} kWh"
)
plt.savefig("/kaggle/working/error.png")
clear_plots()

# Scatter plot setup
sns.set(font_scale=1.25, rc={"figure.figsize": (10, 10)})
sns.set_style("whitegrid")  # ticks
sns.set_palette("colorblind")

sns.scatterplot(x=actual, y=pred)
# plt.title("Solar Energy Generation In 30 Minute Intervals")
plt.xlabel("Actual")
plt.ylabel("Predict")
plt.xlim(0.0, MAX)
plt.ylim(0.0, MAX)
x = [np.min(actual), np.max(actual)]
y = x
plt.plot(x, y, 'r')
plt.savefig("/kaggle/working/scatterplot.png")
clear_plots()

In [14]:
# Line plot setup
sns.set(font_scale=1.25, rc={"figure.figsize": (20, 10)})
sns.set_style("whitegrid")  # ticks
sns.set_palette("colorblind")

# Decrease tick quantity
ax = plt.gca()
ax.xaxis.set_major_locator(plt.MaxNLocator(6))

# Plot lines
sns.lineplot(
    x=dates[: len(dates)], y=pred, label="Prediction"
)
sns.lineplot(
    x=dates[: len(dates)], y=actual, label="Actual"
)
plt.xlabel(f"Date")
plt.ylabel(f"Energy Generation (kWh)")
plt.title("Solar Energy Generation In 30 Minute Intervals")
plt.savefig("/kaggle/working/lineplot.png")
clear_plots()

In [15]:
# Line plot setup
sns.set(font_scale=1.25, rc={"figure.figsize": (20, 10)})
sns.set_style("whitegrid")  # ticks
sns.set_palette("colorblind")

# Decrease tick quantity
ax = plt.gca()
ax.xaxis.set_major_locator(plt.MaxNLocator(6))

# Plot lines
sns.lineplot(
    x=dates[: 100], y=pred[: 100], label="Prediction"
)
sns.lineplot(
    x=dates[: 100], y=actual[: 100], label="Actual"
)
plt.xlabel(f"Date")
plt.ylabel(f"Energy Generation (kWh)")
plt.title("Solar Energy Generation In 30 Minute Intervals")
plt.savefig("/kaggle/working/lineplot_100.png")
clear_plots()

In [17]:
from sklearn.metrics import roc_auc_score, roc_curve
# Verification using sklearn's roc_auc_score
def calculate_auc(y_true, y_scores):
    """
    Calculate the AUC given the true labels and predicted scores.

    Parameters:
    y_true (array-like): True binary labels.
    y_scores (array-like): Target scores, can either be probability estimates of the positive class.

    Returns:
    float: AUC score
    """
    # Sort the scores and corresponding true labels
    desc_score_indices = np.argsort(y_scores)[::-1]
    y_true_sorted = np.array(y_true)[desc_score_indices]
    y_scores_sorted = np.array(y_scores)[desc_score_indices]
    
    # Calculate the ROC curve points
    fpr, tpr, _ = roc_curve(y_true_sorted, y_scores_sorted)
    
    # Calculate the AUC using the trapezoidal rule
    auc = np.trapz(tpr, fpr)
    
    return auc

# Example usage
y_true = [0, 0, 1, 1]
y_scores = [0.1, 0.4, 0.35, 0.8]
auc = calculate_auc(y_true, y_scores)
print("AUC:", auc)

AUC: 0.75
