In [1]:
import pandas as pd

solar_df_plant1 = pd.read_csv('Plant_1_Generation_Data.csv')
weather_df_plant1 = pd.read_csv('Plant_1_Weather_Sensor_Data.csv')

solar_df_plant2 = pd.read_csv('Plant_2_Generation_Data.csv')
weather_df_plant2 = pd.read_csv('Plant_2_Weather_Sensor_Data.csv')

solar_df_plant1['DATE_TIME'] = pd.to_datetime(solar_df_plant1['DATE_TIME'])
weather_df_plant1['DATE_TIME'] = pd.to_datetime(weather_df_plant1['DATE_TIME'])

solar_df_plant2['DATE_TIME'] = pd.to_datetime(solar_df_plant2['DATE_TIME'])
weather_df_plant2['DATE_TIME'] = pd.to_datetime(weather_df_plant2['DATE_TIME'])

# merge
merged_df_plant1 = pd.merge(solar_df_plant1, weather_df_plant1, on='DATE_TIME', how='inner')
merged_df_plant2 = pd.merge(solar_df_plant2, weather_df_plant2, on='DATE_TIME', how='inner')

# combine
final_df = pd.concat([merged_df_plant1, merged_df_plant2], ignore_index=True)

print("Final Merged Dataframe Head:")
print(final_df.head())
print("\nFinal Dataframe Shape:", final_df.shape)

Final Merged Dataframe Head:
   DATE_TIME  PLANT_ID_x     SOURCE_KEY_x  DC_POWER  AC_POWER  DAILY_YIELD  \
0 2020-05-15     4135001  1BY6WEcLGh8j5v7       0.0       0.0          0.0   
1 2020-05-15     4135001  1IF53ai7Xc0U56Y       0.0       0.0          0.0   
2 2020-05-15     4135001  3PZuoBAID5Wc2HD       0.0       0.0          0.0   
3 2020-05-15     4135001  7JYdWkrLSPkdwr4       0.0       0.0          0.0   
4 2020-05-15     4135001  McdE0feGgRqW7Ca       0.0       0.0          0.0   

   TOTAL_YIELD  PLANT_ID_y     SOURCE_KEY_y  AMBIENT_TEMPERATURE  \
0    6259559.0     4135001  HmiyD2TTLFNqkNe            25.184316   
1    6183645.0     4135001  HmiyD2TTLFNqkNe            25.184316   
2    6987759.0     4135001  HmiyD2TTLFNqkNe            25.184316   
3    7602960.0     4135001  HmiyD2TTLFNqkNe            25.184316   
4    7158964.0     4135001  HmiyD2TTLFNqkNe            25.184316   

   MODULE_TEMPERATURE  IRRADIATION  
0           22.857507          0.0  
1           22.8575

  solar_df_plant1['DATE_TIME'] = pd.to_datetime(solar_df_plant1['DATE_TIME'])


In [2]:
df_final = final_df.copy()

# fix the DATE_TIME
df_final['DATE_TIME'] = pd.to_datetime(df_final['DATE_TIME'], format='%d-%m-%Y %H:%M')

# set the 'DATE_TIME' column as the dataframe index
df_final.set_index('DATE_TIME', inplace=True)

# drop redundant or unnecessary columns
df_final.drop(['PLANT_ID_x', 'SOURCE_KEY_x', 'PLANT_ID_y', 'SOURCE_KEY_y'], axis=1, inplace=True)

# filter out nighttime hours where solar power is zero
df_final = df_final[df_final['IRRADIATION'] > 0]

print("Cleaned Dataframe Head:")
print(df_final.head())
print("\nCleaned Dataframe Shape:", df_final.shape)

Cleaned Dataframe Head:
                     DC_POWER  AC_POWER  DAILY_YIELD  TOTAL_YIELD  \
DATE_TIME                                                           
2020-05-15 05:45:00       0.0       0.0          0.0    6259559.0   
2020-05-15 05:45:00       0.0       0.0          0.0    6183645.0   
2020-05-15 05:45:00       0.0       0.0          0.0    6987759.0   
2020-05-15 05:45:00       0.0       0.0          0.0    7602960.0   
2020-05-15 05:45:00       0.0       0.0          0.0    7158964.0   

                     AMBIENT_TEMPERATURE  MODULE_TEMPERATURE  IRRADIATION  
DATE_TIME                                                                  
2020-05-15 05:45:00            24.289211           23.096692     0.000863  
2020-05-15 05:45:00            24.289211           23.096692     0.000863  
2020-05-15 05:45:00            24.289211           23.096692     0.000863  
2020-05-15 05:45:00            24.289211           23.096692     0.000863  
2020-05-15 05:45:00            24.28

In [3]:
# create new time-based features from the index
df_final['hour'] = df_final.index.hour
df_final['dayofyear'] = df_final.index.dayofyear
df_final['month'] = df_final.index.month
df_final['dayofweek'] = df_final.index.dayofweek

# create a lagged feature for AC_POWER from the previous time step
df_final['AC_POWER_lag1'] = df_final['AC_POWER'].shift(1)

# the first row will now have a NaN value
df_final.dropna(inplace=True)

print("Dataframe with new features:")
print(df_final.head())
print("\nFinal Dataframe Shape:", df_final.shape)

Dataframe with new features:
                     DC_POWER  AC_POWER  DAILY_YIELD  TOTAL_YIELD  \
DATE_TIME                                                           
2020-05-15 05:45:00       0.0       0.0          0.0    6183645.0   
2020-05-15 05:45:00       0.0       0.0          0.0    6987759.0   
2020-05-15 05:45:00       0.0       0.0          0.0    7602960.0   
2020-05-15 05:45:00       0.0       0.0          0.0    7158964.0   
2020-05-15 05:45:00       0.0       0.0          0.0    7206408.0   

                     AMBIENT_TEMPERATURE  MODULE_TEMPERATURE  IRRADIATION  \
DATE_TIME                                                                   
2020-05-15 05:45:00            24.289211           23.096692     0.000863   
2020-05-15 05:45:00            24.289211           23.096692     0.000863   
2020-05-15 05:45:00            24.289211           23.096692     0.000863   
2020-05-15 05:45:00            24.289211           23.096692     0.000863   
2020-05-15 05:45:00      

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# define our features (X) and target (y)
features = ['AC_POWER_lag1', 'AMBIENT_TEMPERATURE', 'IRRADIATION', 'hour', 'dayofyear']
X = df_final[features]
y = df_final['AC_POWER']

# perform a sequential split for time-series data(80% of the data for training and 20% for testing)
split_point = int(len(df_final) * 0.8)
X_train = X[:split_point]
X_test = X[split_point:]
y_train = y[:split_point]
y_test = y[split_point:]

# initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# print the shapes to confirm the split was correct
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Model training complete.")

Shape of X_train: (61677, 5)
Shape of X_test: (15420, 5)
Model training complete.


In [5]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

predictions = model.predict(X_test)

# Evaluate the Linear Regression Model
print("--- Linear Regression Model Evaluation ---")
mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
r2 = r2_score(y_test, predictions)

print(f"Mean Absolute Error (MAE): {mae:.2f} kW")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} kW")
print(f"R-squared (R2): {r2:.2f}")

# Evaluate Baseline
print("\n--- Persistence Model Evaluation (Baseline) ---")
baseline_predictions = X_test['AC_POWER_lag1']

baseline_mae = mean_absolute_error(y_test, baseline_predictions)
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_predictions))
baseline_r2 = r2_score(y_test, baseline_predictions)

print(f"Mean Absolute Error (MAE): {baseline_mae:.2f} kW")
print(f"Root Mean Squared Error (RMSE): {baseline_rmse:.2f} kW")
print(f"R-squared (R2): {baseline_r2:.2f}")

--- Linear Regression Model Evaluation ---
Mean Absolute Error (MAE): 151.87 kW
Root Mean Squared Error (RMSE): 285.93 kW
R-squared (R2): 0.33

--- Persistence Model Evaluation (Baseline) ---
Mean Absolute Error (MAE): 145.36 kW
Root Mean Squared Error (RMSE): 345.59 kW
R-squared (R2): 0.02


In [6]:
# find the index of the highest predicted value
peak_solar_index = predictions.argmax()

# get the corresponding timestamp from the test set
peak_solar_time = X_test.index[peak_solar_index]

print(f"The hour with the highest predicted solar output is: {peak_solar_time.hour}:00")

The hour with the highest predicted solar output is: 12:00


In [7]:
df_final.to_csv('solar_power_data.csv')