In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the renewable energy data
df_renewable = pd.read_csv('data/renewable_power_data.csv')

# Load the household consumption data
df_household = pd.read_csv('data/household_power_consumption.csv', sep=';', low_memory=False)

print("Renewable Data Head:")
print(df_renewable.head())
print("\nHousehold Data Head:")
print(df_household.head())

Renewable Data Head:
         Date  Consumption  Wind  Solar  Wind+Solar
0  01-01-2006     1069.184   NaN    NaN         NaN
1  01-02-2006     1380.521   NaN    NaN         NaN
2  01-03-2006     1442.533   NaN    NaN         NaN
3  01-04-2006     1457.217   NaN    NaN         NaN
4  01-05-2006     1477.131   NaN    NaN         NaN

Household Data Head:
         Date      Time Global_active_power Global_reactive_power  Voltage  \
0  16/12/2006  17:24:00               4.216                 0.418  234.840   
1  16/12/2006  17:25:00               5.360                 0.436  233.630   
2  16/12/2006  17:26:00               5.374                 0.498  233.290   
3  16/12/2006  17:27:00               5.388                 0.502  233.740   
4  16/12/2006  17:28:00               3.666                 0.528  235.680   

  Global_intensity Sub_metering_1 Sub_metering_2  Sub_metering_3  
0           18.400          0.000          1.000            17.0  
1           23.000          0.000         

In [2]:
# --- Cleaning the Renewable Energy DataFrame ---

# Convert the 'Date' column to a proper datetime format
df_renewable['Date'] = pd.to_datetime(df_renewable['Date'], format='mixed')

# Set the 'Date' column as the index of the DataFrame
df_renewable = df_renewable.set_index('Date')

# UPDATED: Using the modern .ffill() method instead of the old one
df_renewable['Solar'] = df_renewable['Solar'].ffill()
df_renewable['Wind'] = df_renewable['Wind'].ffill()

# Let's also fill any remaining missing values at the beginning with 0
df_renewable = df_renewable.fillna(0)

# Verify the changes
print("Renewable Data Info after cleaning:")
df_renewable.info()

print("\nMissing values check:")
print(df_renewable.isnull().sum())

Renewable Data Info after cleaning:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4383 entries, 2006-01-01 to 2017-12-31
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Consumption  4383 non-null   float64
 1   Wind         4383 non-null   float64
 2   Solar        4383 non-null   float64
 3   Wind+Solar   4383 non-null   float64
dtypes: float64(4)
memory usage: 171.2 KB

Missing values check:
Consumption    0
Wind           0
Solar          0
Wind+Solar     0
dtype: int64


In [4]:
# --- Cleaning the Household Consumption DataFrame ---

# Combine 'Date' and 'Time' into a single datetime index
df_household['datetime'] = pd.to_datetime(df_household['Date'] + ' ' + df_household['Time'], dayfirst=True)
df_household = df_household.set_index('datetime')

# The numerical columns have missing values represented as '?'. We'll replace them with NaN
# and convert the columns to a numeric type.
df_household['Global_active_power'] = pd.to_numeric(df_household['Global_active_power'], errors='coerce')

# UPDATED: Using the modern .ffill() method
df_household['Global_active_power'] = df_household['Global_active_power'].ffill()

# The data is per-minute. Let's resample it to hourly sums.
# UPDATED: Using 'h' (lowercase) for hourly resampling as suggested
df_household_hourly = df_household['Global_active_power'].resample('h').mean()

# Verify the changes
print("Household Data Info after cleaning and resampling:")
print(df_household_hourly.head())

print("\nMissing values check:")
print(df_household_hourly.isnull().sum())

Household Data Info after cleaning and resampling:
datetime
2006-12-16 17:00:00    4.222889
2006-12-16 18:00:00    3.632200
2006-12-16 19:00:00    3.400233
2006-12-16 20:00:00    3.268567
2006-12-16 21:00:00    3.056467
Freq: h, Name: Global_active_power, dtype: float64

Missing values check:
0


In [5]:
# Create features for the renewable energy data
def create_features(df):
    df['hour'] = df.index.hour
    df['dayofweek'] = df.index.dayofweek
    df['month'] = df.index.month
    df['year'] = df.index.year
    return df

df_renewable = create_features(df_renewable)

# Create features for the household consumption data
# We need to convert the Series back to a DataFrame first
df_household_hourly = df_household_hourly.to_frame()
df_household_hourly = create_features(df_household_hourly)

print("Renewable data with new features:")
print(df_renewable.head())

print("\nHousehold data with new features:")
print(df_household_hourly.head())

Renewable data with new features:
            Consumption  Wind  Solar  Wind+Solar  hour  dayofweek  month  year
Date                                                                          
2006-01-01     1069.184   0.0    0.0         0.0     0          6      1  2006
2006-01-02     1380.521   0.0    0.0         0.0     0          0      1  2006
2006-01-03     1442.533   0.0    0.0         0.0     0          1      1  2006
2006-01-04     1457.217   0.0    0.0         0.0     0          2      1  2006
2006-01-05     1477.131   0.0    0.0         0.0     0          3      1  2006

Household data with new features:
                     Global_active_power  hour  dayofweek  month  year
datetime                                                              
2006-12-16 17:00:00             4.222889    17          5     12  2006
2006-12-16 18:00:00             3.632200    18          5     12  2006
2006-12-16 19:00:00             3.400233    19          5     12  2006
2006-12-16 20:00:00    

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib # For saving the model

# --- Train Solar Model ---

# Define our features (X) and our target (y)
FEATURES = ['hour', 'dayofweek', 'month', 'year']
TARGET_SOLAR = 'Solar'

X_solar = df_renewable[FEATURES]
y_solar = df_renewable[TARGET_SOLAR]

# Split data into training and testing sets
X_train_solar, X_test_solar, y_train_solar, y_test_solar = train_test_split(X_solar, y_solar, test_size=0.2, random_state=42)

# Initialize and train the model
solar_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
solar_model.fit(X_train_solar, y_train_solar)

# Save the trained model to a file
joblib.dump(solar_model, 'solar_production_model.joblib')

print("Solar model trained and saved successfully!")

Solar model trained and saved successfully!


In [7]:
# --- Train Wind Model ---

TARGET_WIND = 'Wind'
X_wind = df_renewable[FEATURES]
y_wind = df_renewable[TARGET_WIND]

X_train_wind, X_test_wind, y_train_wind, y_test_wind = train_test_split(X_wind, y_wind, test_size=0.2, random_state=42)

wind_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
wind_model.fit(X_train_wind, y_train_wind)

joblib.dump(wind_model, 'wind_production_model.joblib')
print("Wind model trained and saved successfully!")

Wind model trained and saved successfully!


In [8]:
# --- Train Household Demand Model ---

TARGET_DEMAND = 'Global_active_power'
X_demand = df_household_hourly[FEATURES]
y_demand = df_household_hourly[TARGET_DEMAND]

X_train_demand, X_test_demand, y_train_demand, y_test_demand = train_test_split(X_demand, y_demand, test_size=0.2, random_state=42)

demand_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
demand_model.fit(X_train_demand, y_train_demand)

joblib.dump(demand_model, 'household_demand_model.joblib')
print("Household demand model trained and saved successfully!")

Household demand model trained and saved successfully!
