In [1]:
# In this file, we will add one new column to our predictor data set, Day_of_Week, to see how that impacts the performance
# of our ML models for predicting solar output using weather.

import pandas as pd

In [2]:
# Read the generation data file and parse DATE_TIME as a datetime object
df_generation = pd.read_csv('Plant_1_Generation_Data.csv', parse_dates=['DATE_TIME'])

# Read the weather data
df_weather = pd.read_csv('Plant_1_Weather_Sensor_Data.csv', parse_dates=['DATE_TIME'])

# Merge the two dataframes on the DATE_TIME column
df_merged = pd.merge(df_generation, df_weather, on='DATE_TIME', how='left', suffixes=('_gen', '_weather'))

# Now df_merged contains the combined data
print(df_merged.head())

# Save the merged dataframe to an Excel file so we can inspect it
df_merged.to_excel('Merged_Data.xlsx', index=False)

  df_generation = pd.read_csv('Plant_1_Generation_Data.csv', parse_dates=['DATE_TIME'])


   DATE_TIME  PLANT_ID_gen   SOURCE_KEY_gen  DC_POWER  AC_POWER  DAILY_YIELD  \
0 2020-05-15       4135001  1BY6WEcLGh8j5v7       0.0       0.0          0.0   
1 2020-05-15       4135001  1IF53ai7Xc0U56Y       0.0       0.0          0.0   
2 2020-05-15       4135001  3PZuoBAID5Wc2HD       0.0       0.0          0.0   
3 2020-05-15       4135001  7JYdWkrLSPkdwr4       0.0       0.0          0.0   
4 2020-05-15       4135001  McdE0feGgRqW7Ca       0.0       0.0          0.0   

   TOTAL_YIELD  PLANT_ID_weather SOURCE_KEY_weather  AMBIENT_TEMPERATURE  \
0    6259559.0         4135001.0    HmiyD2TTLFNqkNe            25.184316   
1    6183645.0         4135001.0    HmiyD2TTLFNqkNe            25.184316   
2    6987759.0         4135001.0    HmiyD2TTLFNqkNe            25.184316   
3    7602960.0         4135001.0    HmiyD2TTLFNqkNe            25.184316   
4    7158964.0         4135001.0    HmiyD2TTLFNqkNe            25.184316   

   MODULE_TEMPERATURE  IRRADIATION  
0           22.857507    

In [3]:
# Filter the dataframe for a specific Source Key, which represents one inverter for a group of panels
df_specific_panel = df_merged[df_merged['SOURCE_KEY_gen'] == '1BY6WEcLGh8j5v7']

# Sort by DATE_TIME
df_specific_panel.sort_values('DATE_TIME', inplace=True)

# Create a DatetimeIndex with the expected frequency range with a timestamp every 15 minutes ('15T')
date_range = pd.date_range(start=df_specific_panel['DATE_TIME'].min(),
                           end=df_specific_panel['DATE_TIME'].max(),
                           freq='15T')

# Find the difference between this ideal range and the actual timestamps in the dataframe
missing_timestamps = date_range.difference(df_specific_panel['DATE_TIME'])

# show which timestamps are missing in the data for this panel/inverter
print(missing_timestamps)

DatetimeIndex(['2020-05-15 23:15:00', '2020-05-15 23:30:00',
               '2020-05-15 23:45:00', '2020-05-16 00:00:00',
               '2020-05-16 00:15:00', '2020-05-16 00:30:00',
               '2020-05-16 00:45:00', '2020-05-16 01:00:00',
               '2020-05-16 01:15:00', '2020-05-16 01:30:00',
               ...
               '2020-05-29 04:30:00', '2020-05-29 04:45:00',
               '2020-05-29 05:00:00', '2020-05-29 05:15:00',
               '2020-05-29 05:30:00', '2020-05-29 05:45:00',
               '2020-05-29 06:00:00', '2020-06-03 14:00:00',
               '2020-06-17 06:15:00', '2020-06-17 06:30:00'],
              dtype='datetime64[ns]', length=110, freq=None)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_specific_panel.sort_values('DATE_TIME', inplace=True)


In [4]:
# Create a new dataframe with missing timestamps
missing_df = pd.DataFrame(missing_timestamps, columns=['DATE_TIME'])

# Fill in the blanks on columns we know the value for
missing_df['PLANT_ID_gen'] = df_specific_panel['PLANT_ID_gen'].iloc[0]
missing_df['SOURCE_KEY_gen'] = '1BY6WEcLGh8j5v7'
missing_df['PLANT_ID_weather'] = df_specific_panel['PLANT_ID_weather'].iloc[0]
missing_df['SOURCE_KEY_weather'] = df_specific_panel['SOURCE_KEY_weather'].iloc[0]

# Concatenate the original dataframe with the dataframe of missing timestamps
df_complete = pd.concat([df_specific_panel, missing_df], ignore_index=True)

# Sort the complete DataFrame by DATE_TIME
df_complete.sort_values('DATE_TIME', inplace=True)

# Save the new dataframe to excel so we can look at it
df_complete.to_excel('Complete_Data_for_Panel.xlsx', index=False)

In [5]:
# Let's impute values for the new rows we created for the missing timestamps
# Create a new column 'imputed' and set it to 0 for all rows initially

df_complete['imputed'] = 0

# Here are the columns I want to impute values for, and I'll use linear interpolation for that
columns_to_interpolate = [
    'DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD', 
    'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION'
]

# Run a loop that adds a value of 1 for all imputed rows so I know which they are later on
for column in columns_to_interpolate:
    mask = df_complete[column].isna()
    df_complete.loc[mask, 'imputed'] = 1

    # Interpolate the missing values using linear interpolation
    df_complete[column] = df_complete[column].interpolate(method='linear')

# Save the new dataframe to excel so we can look at it again
df_complete.to_excel('Complete_Data_for_Panel.xlsx', index=False)

# 1. Adding Day_of Week (done in-class)

In [6]:
# Now let's use time series feature engineering to add a new column called Day_of_Week as a value from 0 to 6
df_complete['Day_of_Week'] = df_complete['DATE_TIME'].dt.dayofweek

df_complete

Unnamed: 0,DATE_TIME,PLANT_ID_gen,SOURCE_KEY_gen,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,PLANT_ID_weather,SOURCE_KEY_weather,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,imputed,Day_of_Week
0,2020-05-15 00:00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0,4135001.0,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0,0,4
1,2020-05-15 00:15:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0,4135001.0,HmiyD2TTLFNqkNe,25.084589,22.761668,0.0,0,4
2,2020-05-15 00:30:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0,4135001.0,HmiyD2TTLFNqkNe,24.935753,22.592306,0.0,0,4
3,2020-05-15 00:45:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0,4135001.0,HmiyD2TTLFNqkNe,24.846130,22.360852,0.0,0,4
4,2020-05-15 01:00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0,4135001.0,HmiyD2TTLFNqkNe,24.621525,22.165423,0.0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3149,2020-06-17 22:45:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,5521.0,6485319.0,4135001.0,HmiyD2TTLFNqkNe,22.150570,21.480377,0.0,0,2
3150,2020-06-17 23:00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,5521.0,6485319.0,4135001.0,HmiyD2TTLFNqkNe,22.129816,21.389024,0.0,0,2
3151,2020-06-17 23:15:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,5521.0,6485319.0,4135001.0,HmiyD2TTLFNqkNe,22.008275,20.709211,0.0,0,2
3152,2020-06-17 23:30:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,5521.0,6485319.0,4135001.0,HmiyD2TTLFNqkNe,21.969495,20.734963,0.0,0,2


In [7]:
# The rest of the steps are the same except we now add Day_of_Week into our X feature matrix
# Now we will calculate a naive baseline. A naive baseline is a very simple model of our data such as the average or median
# The naive baseline gives us something to compare our other models against. If they are any good, we should 
# easily be able to beat it. We will use the median value as the naive baseline for our solar data.

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Prepare the feature matrix (X), which now includes our Day_of_Week column, and the target vector (y)
X = df_complete[['AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION','Day_of_Week']]
y = df_complete['AC_POWER']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Naive baseline: Use the median AC_POWER from the training set as the prediction
y_train_median = y_train.median()
y_naive_pred = np.full(shape=y_test.shape, fill_value=y_train_median)

# Calculate the MAE for the naive baseline
mae_naive = mean_absolute_error(y_test, y_naive_pred)
print('Naive Baseline MAE:', mae_naive)

# Calculate the MSE for the naive baseline
mse_naive = mean_squared_error(y_test, y_naive_pred)
print('Naive Baseline MSE:', mse_naive)

# Calculate the RMSE for the naive baseline
rmse_naive = np.sqrt(mse_naive)
print('Naive Baseline RMSE:', rmse_naive)

Naive Baseline MAE: 266.0781670367449
Naive Baseline MSE: 183122.3930413372
Naive Baseline RMSE: 427.9280232017263


In [8]:
# Now let's try linear regression and see how it performs

from sklearn.linear_model import LinearRegression

# Initialize the Linear Regression model
linear_model = LinearRegression()

# Fit the model on the training data
linear_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = linear_model.predict(X_test)

# Calculate the MAE, MSE, and RMSE for the Linear Regression model
mae_linear = mean_absolute_error(y_test, y_pred)
print('Linear Regression MAE:', mae_linear)

mse_linear = mean_squared_error(y_test, y_pred)
print('Linear Regression MSE:', mse_linear)

rmse_linear = np.sqrt(mse_linear)
print('Linear Regression RMSE:', rmse_linear)

Linear Regression MAE: 34.6605922185422
Linear Regression MSE: 10388.706936698569
Linear Regression RMSE: 101.92500643462608


In [9]:
from sklearn.ensemble import RandomForestRegressor

# Train a Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_rf_pred = rf_model.predict(X_test)

# Calculate the MAE, MSE, and RMSE for the Random Forest model
mae_rf = mean_absolute_error(y_test, y_rf_pred)
print('Random Forest MAE:', mae_rf)

mse_rf = mean_squared_error(y_test, y_rf_pred)
print('Random Forest MSE:', mse_rf)

rmse_rf = np.sqrt(mse_rf)
print('Random Forest RMSE:', rmse_rf)

Random Forest MAE: 29.752905015865007
Random Forest MSE: 8431.536215790293
Random Forest RMSE: 91.82339688657947


In [10]:
from sklearn.svm import SVR

# Initialize the SVR model
# Here we are using the default 'rbf' kernel, we can try other configurations later
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)

# Fit the model on the training data
svr_model.fit(X_train, y_train)

# Make predictions on the test set
y_svr_pred = svr_model.predict(X_test)

# Calculate the MAE, MSE, and RMSE for the SVR model
mae_svr = mean_absolute_error(y_test, y_svr_pred)
print('SVR MAE:', mae_svr)

mse_svr = mean_squared_error(y_test, y_svr_pred)
print('SVR MSE:', mse_svr)

rmse_svr = np.sqrt(mse_svr)
print('SVR RMSE:', rmse_svr)

SVR MAE: 97.769593438954
SVR MSE: 25649.8732213019
SVR RMSE: 160.1557779828811


In [11]:
from sklearn.ensemble import GradientBoostingRegressor

# Initialize the Gradient Boosting Regressor
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Fit the model on the training data
gbr_model.fit(X_train, y_train)

# Make predictions on the test set
y_gbr_pred = gbr_model.predict(X_test)

# Calculate the MAE, MSE, and RMSE for the Gradient Boosting model
mae_gbr = mean_absolute_error(y_test, y_gbr_pred)
print('Gradient Boosting Regressor MAE:', mae_gbr)

mse_gbr = mean_squared_error(y_test, y_gbr_pred)
print('Gradient Boosting Regressor MSE:', mse_gbr)

rmse_gbr = np.sqrt(mse_gbr)
print('Gradient Boosting Regressor RMSE:', rmse_gbr)

Gradient Boosting Regressor MAE: 30.35497475111342
Gradient Boosting Regressor MSE: 8623.015445977704
Gradient Boosting Regressor RMSE: 92.86019301066364


In [12]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

# Standardize the features (important for KNN)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the KNN regressor, starting with 5 neighbors and can try other configurations later
knn_model = KNeighborsRegressor(n_neighbors=5)

# Fit the model on the training data
knn_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_knn_pred = knn_model.predict(X_test_scaled)

# Calculate the MAE, MSE, and RMSE for the KNN model
mae_knn = mean_absolute_error(y_test, y_knn_pred)
print('KNN Regressor MAE:', mae_knn)

mse_knn = mean_squared_error(y_test, y_knn_pred)
print('KNN Regressor MSE:', mse_knn)

rmse_knn = np.sqrt(mse_knn)
print('KNN Regressor RMSE:', rmse_knn)

KNN Regressor MAE: 34.276523300210705
KNN Regressor MSE: 9386.461245185095
KNN Regressor RMSE: 96.88375119278307


In [13]:
import tensorflow as tf

# Standardize the features (also important for NN). I'm commenting this out but left it in to note we would do it
# if we hadn't already done it above.
#scaler = StandardScaler()
#X_train_scaled = scaler.fit_transform(X_train)
#X_test_scaled = scaler.transform(X_test)

# Create a simple neural network model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)  # Output layer
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Predict on the test set
y_nn_pred = model.predict(X_test_scaled).flatten()

# Calculate the MAE, MSE, and RMSE for the neural network model
mae_nn = mean_absolute_error(y_test, y_nn_pred)
print('Neural Network MAE:', mae_nn)

mse_nn = mean_squared_error(y_test, y_nn_pred)
print('Neural Network MSE:', mse_nn)

rmse_nn = np.sqrt(mse_nn)
print('Neural Network RMSE:', rmse_nn)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network MAE: 62.74037184818832
Neural Network MSE: 12257.1451552284
Neural Network RMSE: 110.71199192150956


In [14]:
# Now let's compare all our models and see how they performed based on these metrics

results = {
    'Model': ['Naive Baseline', 'Linear Regression', 'Random Forest', 'SVR', 'Gradient Boosting', 'KNN', 'Neural Network'],
    'MAE': [mae_naive, mae_linear, mae_rf, mae_svr, mae_gbr, mae_knn, mae_nn],
    'MSE': [mse_naive, mse_linear, mse_rf, mse_svr, mse_gbr, mse_knn, mse_nn],
    'RMSE': [rmse_naive, rmse_linear, rmse_rf, rmse_svr, rmse_gbr, rmse_knn, rmse_nn]
}

results_df = pd.DataFrame(results)

results_df.set_index('Model', inplace=True)

# Sort the DataFrame by the RMSE column in ascending order to view results
results_df_sorted_by_rmse = results_df.sort_values(by='RMSE', ascending=True)

print(results_df_sorted_by_rmse)

                          MAE            MSE        RMSE
Model                                                   
Random Forest       29.752905    8431.536216   91.823397
Gradient Boosting   30.354975    8623.015446   92.860193
KNN                 34.276523    9386.461245   96.883751
Linear Regression   34.660592   10388.706937  101.925006
Neural Network      62.740372   12257.145155  110.711992
SVR                 97.769593   25649.873221  160.155778
Naive Baseline     266.078167  183122.393041  427.928023


# 1) Add Season
I added the season as a dummy variable by grouping the datatime data by month, where winter (Dec, Jan, Feb) is 1, spring (Mar, Apr, May) is 2, summer (June, July, Aug) is 3, and fall (Sep, Oct, Nov) is 4. Adding season improved the RMSE of Random Forest and KNN a lot. It is not so much for Gradient Boosting and Linear. And it worsen the Neural Network and SVR. 

Additionally, I noticed that the data only have May and June data, which could weaken the effect of the season grouping. 

In [15]:
season = df_complete['DATE_TIME'].dt.month%12 // 3 + 1
df_complete1 = df_complete
df_complete1['Season'] = season
df_complete1.to_excel('new_Data_for_Panel.xlsx', index=False)

In [16]:
# Prepare the feature matrix (X), which now includes our Day_of_Week column, and the target vector (y)
X = df_complete1[['AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION','Day_of_Week','Season']]
y = df_complete1['AC_POWER']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# naive baseline
y_train_median = y_train.median()
y_naive_pred = np.full(shape=y_test.shape, fill_value=y_train_median)
mae_naive = mean_absolute_error(y_test, y_naive_pred)
mse_naive = mean_squared_error(y_test, y_naive_pred)
rmse_naive = np.sqrt(mse_naive)
# linear regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)
mae_linear = mean_absolute_error(y_test, y_pred)
mse_linear = mean_squared_error(y_test, y_pred)
rmse_linear = np.sqrt(mse_linear)
# random forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_rf_pred = rf_model.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_rf_pred)
mse_rf = mean_squared_error(y_test, y_rf_pred)
rmse_rf = np.sqrt(mse_rf)
# SVR model
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
svr_model.fit(X_train, y_train)
y_svr_pred = svr_model.predict(X_test)
mae_svr = mean_absolute_error(y_test, y_svr_pred)
mse_svr = mean_squared_error(y_test, y_svr_pred)
rmse_svr = np.sqrt(mse_svr)
# gradient boosting regressor
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbr_model.fit(X_train, y_train)
y_gbr_pred = gbr_model.predict(X_test)
mae_gbr = mean_absolute_error(y_test, y_gbr_pred)
mse_gbr = mean_squared_error(y_test, y_gbr_pred)
rmse_gbr = np.sqrt(mse_gbr)
# KNeighbors regressor
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)
y_knn_pred = knn_model.predict(X_test_scaled)
mae_knn = mean_absolute_error(y_test, y_knn_pred)
mse_knn = mean_squared_error(y_test, y_knn_pred)
rmse_knn = np.sqrt(mse_knn)
# tensorflow
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)  # Output layer
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.2)
y_nn_pred = model.predict(X_test_scaled).flatten()
mae_nn = mean_absolute_error(y_test, y_nn_pred)
mse_nn = mean_squared_error(y_test, y_nn_pred)
rmse_nn = np.sqrt(mse_nn)

# results
results1 = {
    'Model': ['Naive Baseline', 'Linear Regression', 'Random Forest', 'SVR', 'Gradient Boosting', 'KNN', 'Neural Network'],
    'MAE': [mae_naive, mae_linear, mae_rf, mae_svr, mae_gbr, mae_knn, mae_nn],
    'MSE': [mse_naive, mse_linear, mse_rf, mse_svr, mse_gbr, mse_knn, mse_nn],
    'RMSE': [rmse_naive, rmse_linear, rmse_rf, rmse_svr, rmse_gbr, rmse_knn, rmse_nn]
}
results1_df = pd.DataFrame(results1)
results1_df.set_index('Model', inplace=True)
results1_df_sorted_by_rmse = results1_df.sort_values(by='RMSE', ascending=True)

print(results1_df_sorted_by_rmse)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
                          MAE            MSE        RMSE
Model                                                   
KNN                 32.655306    7444.056202   86.278944
Random Forest       28.453975    7537.135154   86.816676
Gradient Boosting   31.065471    8635.286592   92.926243
Linear Regression   35.102106   10379.621479  101.880427
Neural Network      63.080012   12758.916332  112.955373
SVR                101.637483   27289.308176  165.194758
Naive Baseline     266.078167  183122.393041  427.928023


# 1) Group Temperature
I added the temperature groups using bins. Grouping temperature improved the RMSE of Random Forest and KNN a lot, but not as good as adding season. The improvement is not so much for Gradient Boosting and Linear, but the Gradient Boosting one is better than the adding season alone. And it also worsens the SVR. 

I also noticed that changing the bins will alter the results, but I only trial and error for how the bins are set. I might consider clustering using the k-means to see how it might improve the result.

In [17]:
df_complete2 = df_complete1
bins = [0, 25, 45, float('inf')]
labels = ['low', 'Med1', 'high']
df_complete2['temp_category'] = pd.cut(df_complete2['MODULE_TEMPERATURE'], bins=bins, labels=labels)
bin_counts = df_complete2['temp_category'].value_counts()
print("Counts per Temperature Category:")
print(bin_counts)
df_complete2 = pd.get_dummies(df_complete2, columns=['temp_category'], drop_first=True)

Counts per Temperature Category:
temp_category
low     1664
Med1     992
high     608
Name: count, dtype: int64


In [18]:
df_complete3 = df_complete2
bins = [0, 25, 30, float('inf')]
labels = ['low', 'med', 'high']
df_complete3['A_temp_category'] = pd.cut(df_complete3['AMBIENT_TEMPERATURE'], bins=bins, labels=labels)
bin_counts = df_complete2['A_temp_category'].value_counts()
print("Counts per Ambient Temperature Category:")
print(bin_counts)
df_complete3 = pd.get_dummies(df_complete3, columns=['A_temp_category'], drop_first=True)
df_complete3

Counts per Ambient Temperature Category:
A_temp_category
low     1750
med     1108
high     406
Name: count, dtype: int64


Unnamed: 0,DATE_TIME,PLANT_ID_gen,SOURCE_KEY_gen,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,PLANT_ID_weather,SOURCE_KEY_weather,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,imputed,Day_of_Week,Season,temp_category_Med1,temp_category_high,A_temp_category_med,A_temp_category_high
0,2020-05-15 00:00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0,4135001.0,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0,0,4,2,False,False,True,False
1,2020-05-15 00:15:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0,4135001.0,HmiyD2TTLFNqkNe,25.084589,22.761668,0.0,0,4,2,False,False,True,False
2,2020-05-15 00:30:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0,4135001.0,HmiyD2TTLFNqkNe,24.935753,22.592306,0.0,0,4,2,False,False,False,False
3,2020-05-15 00:45:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0,4135001.0,HmiyD2TTLFNqkNe,24.846130,22.360852,0.0,0,4,2,False,False,False,False
4,2020-05-15 01:00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0,4135001.0,HmiyD2TTLFNqkNe,24.621525,22.165423,0.0,0,4,2,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3149,2020-06-17 22:45:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,5521.0,6485319.0,4135001.0,HmiyD2TTLFNqkNe,22.150570,21.480377,0.0,0,2,3,False,False,False,False
3150,2020-06-17 23:00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,5521.0,6485319.0,4135001.0,HmiyD2TTLFNqkNe,22.129816,21.389024,0.0,0,2,3,False,False,False,False
3151,2020-06-17 23:15:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,5521.0,6485319.0,4135001.0,HmiyD2TTLFNqkNe,22.008275,20.709211,0.0,0,2,3,False,False,False,False
3152,2020-06-17 23:30:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,5521.0,6485319.0,4135001.0,HmiyD2TTLFNqkNe,21.969495,20.734963,0.0,0,2,3,False,False,False,False


In [19]:
X = df_complete3[['AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION','Day_of_Week','Season','temp_category_Med1','temp_category_high','A_temp_category_med','A_temp_category_high']]
y = df_complete3['AC_POWER']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# naive baseline
y_train_median = y_train.median()
y_naive_pred = np.full(shape=y_test.shape, fill_value=y_train_median)
mae_naive = mean_absolute_error(y_test, y_naive_pred)
mse_naive = mean_squared_error(y_test, y_naive_pred)
rmse_naive = np.sqrt(mse_naive)
# linear regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)
mae_linear = mean_absolute_error(y_test, y_pred)
mse_linear = mean_squared_error(y_test, y_pred)
rmse_linear = np.sqrt(mse_linear)
# random forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_rf_pred = rf_model.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_rf_pred)
mse_rf = mean_squared_error(y_test, y_rf_pred)
rmse_rf = np.sqrt(mse_rf)
# SVR model
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
svr_model.fit(X_train, y_train)
y_svr_pred = svr_model.predict(X_test)
mae_svr = mean_absolute_error(y_test, y_svr_pred)
mse_svr = mean_squared_error(y_test, y_svr_pred)
rmse_svr = np.sqrt(mse_svr)
# gradient boosting regressor
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbr_model.fit(X_train, y_train)
y_gbr_pred = gbr_model.predict(X_test)
mae_gbr = mean_absolute_error(y_test, y_gbr_pred)
mse_gbr = mean_squared_error(y_test, y_gbr_pred)
rmse_gbr = np.sqrt(mse_gbr)
# KNeighbors regressor
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)
y_knn_pred = knn_model.predict(X_test_scaled)
mae_knn = mean_absolute_error(y_test, y_knn_pred)
mse_knn = mean_squared_error(y_test, y_knn_pred)
rmse_knn = np.sqrt(mse_knn)
# tensorflow
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)  # Output layer
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.2)
y_nn_pred = model.predict(X_test_scaled).flatten()
mae_nn = mean_absolute_error(y_test, y_nn_pred)
mse_nn = mean_squared_error(y_test, y_nn_pred)
rmse_nn = np.sqrt(mse_nn)

# results
results2 = {
    'Model': ['Naive Baseline', 'Linear Regression', 'Random Forest', 'SVR', 'Gradient Boosting', 'KNN', 'Neural Network'],
    'MAE': [mae_naive, mae_linear, mae_rf, mae_svr, mae_gbr, mae_knn, mae_nn],
    'MSE': [mse_naive, mse_linear, mse_rf, mse_svr, mse_gbr, mse_knn, mse_nn],
    'RMSE': [rmse_naive, rmse_linear, rmse_rf, rmse_svr, rmse_gbr, rmse_knn, rmse_nn]
}
results2_df = pd.DataFrame(results2)
results2_df.set_index('Model', inplace=True)
results2_df_sorted_by_rmse = results2_df.sort_values(by='RMSE', ascending=True)

print(results2_df_sorted_by_rmse)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
                          MAE            MSE        RMSE
Model                                                   
Random Forest       28.686205    7657.136435   87.505065
KNN                 35.795729    8347.307574   91.363601
Gradient Boosting   30.680555    8352.942790   91.394435
Linear Regression   35.082407   10342.934046  101.700217
Neural Network      60.847808   12125.315313  110.115009
SVR                112.248116   32623.268516  180.619126
Naive Baseline     266.078167  183122.393041  427.928023
