In [43]:
# Step 1 — Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [33]:
# Step 2 — Load all four datasets
# One generation table and one weather-sensor table per plant. The files are
# read in the same order as the names on the left-hand side, from paths
# relative to the notebook's working directory.
plant_1_generation, plant_1_weather, plant_2_generation, plant_2_weather = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Plant_1_Generation_Data.csv",
        "Plant_1_Weather_Sensor_Data.csv",
        "Plant_2_Generation_Data.csv",
        "Plant_2_Weather_Sensor_Data.csv",
    )
)

In [None]:
# Step 3 — Convert DATE_TIME column to datetime
# Each frame is updated in place. The parsed column is what the later merge
# joins on and what the `.dt` accessor in the feature step reads from.
# NOTE(review): no explicit `format=` is given, so parsing relies on pandas'
# inference — confirm all four files use a format pandas reads consistently.
frames_to_parse = (
    plant_1_generation,
    plant_1_weather,
    plant_2_generation,
    plant_2_weather,
)
for df in frames_to_parse:
    df['DATE_TIME'] = pd.to_datetime(df['DATE_TIME'])


In [35]:
# Step 4 — Merge generation + weather by DATE_TIME only, then stack both plants
# Inner joins keep only the timestamps that appear in both the generation and
# the weather table of a plant. Any other column name shared by the two inputs
# comes out with pandas' default `_x` / `_y` suffixes.
plant1 = pd.merge(plant_1_generation, plant_1_weather, on='DATE_TIME', how='inner')
plant2 = pd.merge(plant_2_generation, plant_2_weather, on='DATE_TIME', how='inner')

# Every later step reads `combined_data`, but no cell in the notebook defined
# it, so a fresh kernel stopped with NameError at the feature-creation step.
# Build it here by stacking the two merged plant tables row-wise.
# NOTE(review): the original definition was lost with a deleted/unsaved cell;
# a plain concatenation is presumably what it did — confirm the row count
# still matches the recorded "Data shape" output.
combined_data = pd.concat([plant1, plant2], ignore_index=True)

In [36]:
# Step 5 — Feature creation (time-based features)
# Pull hour-of-day, day-of-month and month number out of the timestamp and
# store each one as a new column on combined_data.
timestamp_parts = combined_data['DATE_TIME'].dt
combined_data['hour'] = timestamp_parts.hour
combined_data['day'] = timestamp_parts.day
combined_data['month'] = timestamp_parts.month

In [37]:

# Step 6 — Handle missing or invalid values
# Discard every row that has a missing value in any column. Only missing
# values are removed here; no other validity check is applied.
combined_data = combined_data.dropna(axis=0, how='any')

In [42]:
# Step 7 — Select features and target variable
available_cols = combined_data.columns
possible_features = [
    'DC_POWER',
    'AMBIENT_TEMPERATURE',
    'MODULE_TEMPERATURE',
    'IRRADIATION',
    'hour',
    'day',
    'month',
]

# Keep the candidates in their listed order; a candidate that is not a column
# of combined_data is skipped without warning.
# NOTE(review): DC_POWER is presumably a near-direct proxy for AC_POWER, and
# the recorded R² of 1.0000 points at target leakage — confirm whether it
# belongs in the feature list.
features = []
for candidate in possible_features:
    if candidate in available_cols:
        features.append(candidate)

# Predict AC_POWER when the column exists, otherwise fall back to DAILY_YIELD.
if 'AC_POWER' in available_cols:
    target = 'AC_POWER'
else:
    target = 'DAILY_YIELD'

X = combined_data[features]
y = combined_data[target]

# Report what was actually selected so silent fallbacks are visible.
print(f" Features used: {features}")
print(f" Target: {target}")
print(f"Data shape → X: {X.shape}, y: {y.shape}")

 Features used: ['DC_POWER', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION', 'hour', 'day', 'month']
 Target: AC_POWER
Data shape → X: (136472, 7), y: (136472,)


In [39]:
# Step 8 — Split dataset
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [41]:
# Step 9 — Train and evaluate Random Forest model
# A 100-tree forest with a fixed seed is fitted on the training split and
# then scored on the held-out test split.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n Model Evaluation Results:")
print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"R² Score: {r2:.4f}")

# Show the first 20 test rows, actual value beside prediction, as a spot check.
preview_rows = slice(None, 20)
result_df = pd.DataFrame({
    'Actual': y_test.values[preview_rows],
    'Predicted': y_pred[preview_rows],
})
print("\n Sample Predictions:")
print(result_df)


 Model Evaluation Results:
Mean Absolute Error (MAE): 0.207
R² Score: 1.0000

 Sample Predictions:
         Actual    Predicted
0    430.040000   429.681562
1      0.000000     0.000000
2      0.000000     0.000000
3      0.000000     0.000000
4    166.537500   166.812500
5      0.000000     0.000000
6      0.000000     0.000000
7     96.314286    96.372196
8    287.360000   287.769605
9      0.000000     0.000000
10     0.000000     0.000000
11     0.000000     0.000000
12     0.000000     0.000000
13   178.600000   178.446125
14   480.071429   479.995161
15  1198.550000  1198.222532
16     0.000000     0.000000
17  1001.757143  1001.532446
18     0.000000     0.000000
19     0.000000     0.000000
