LOADING THE DATASET 

In [None]:
import pandas as pd

# Read the household power-consumption CSV into `df` and preview the first rows.
csv_path = '/content/household_power_consumption.csv'
df = pd.read_csv(csv_path)
print(df.head())

PERFORMING EDA 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Measurement columns that have to be numeric before they can be plotted.
numerical_features = ['Global_active_power', 'Global_reactive_power', 'Voltage',
                      'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']


def _draw_boxplot(feature):
    """Show a box plot of one `df` column so outliers are visible."""
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=df[feature])
    plt.title(f'Boxplot of {feature}')
    plt.show()


# Coerce each column to a numeric dtype; entries that cannot be parsed become NaN.
for feature in numerical_features:
    coerced = pd.to_numeric(df[feature], errors='coerce')
    df[feature] = coerced

# One figure per measurement column.
for feature in numerical_features:
    _draw_boxplot(feature)

DATA PREPROCESSING

In [None]:
# Count missing entries per column and express them as a percentage of all rows.
missing_values = df.isna().sum()
missing_percentage = (missing_values / len(df)) * 100

# Display missing value percentages
print(missing_percentage)

# HANDLING MISSING OR INCONSISTENT DATA

# Drop the sub-metering columns (treated as too sparse to keep).
# errors='ignore' keeps this cell re-runnable: a second execution would
# otherwise raise KeyError because the columns are already gone.
df = df.drop(columns=['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3'], errors='ignore')

# Make sure the remaining measurement columns are numeric (unparseable entries
# become NaN), then replace every NaN with that column's mean.
# The result is assigned back to df[col]: calling fillna(..., inplace=True) on a
# column selection is a chained in-place update, which warns on pandas 2.x and
# leaves `df` untouched when Copy-on-Write is enabled.
power_cols = ['Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity']
for col in power_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df[col] = df[col].fillna(df[col].mean())

In [None]:
# Combine the separate Date and Time columns into one timestamp column.

import pandas as pd

# dayfirst=True makes ambiguous dates parse as day/month rather than month/day.
parsed_dates = pd.to_datetime(df['Date'], dayfirst=True)

# Force Time to plain strings so it can be concatenated with the date text
# (covers the case where it was loaded as datetime.time objects).
time_strings = df['Time'].astype(str)

df['Date'] = parsed_dates
df['Time'] = time_strings

# "<date> <time>" text parsed back into a full timestamp.
df['Datetime'] = pd.to_datetime(parsed_dates.astype(str) + ' ' + time_strings)
df.head()

In [None]:
# Derived features: calendar parts, per-day mean power, a peak-hour flag and a
# short rolling mean.

# Calendar components taken from the combined timestamp.
timestamp_parts = df['Datetime'].dt
df['Year'] = timestamp_parts.year
df['Month'] = timestamp_parts.month
df['Day'] = timestamp_parts.day
df['Hour'] = timestamp_parts.hour  # needed for the peak-hour flag below

# Mean Global_active_power of each calendar day, broadcast to every row of that day.
daily_groups = df.groupby(['Year', 'Month', 'Day'])
df['Daily_avg_power'] = daily_groups['Global_active_power'].transform('mean')

# Peak-hour flag: 1 when Hour is 6-9 or 17-21 (both ends inclusive, so the
# flag covers 06:00-09:59 and 17:00-21:59), otherwise 0.
morning_peak = df['Hour'].between(6, 9)
evening_peak = df['Hour'].between(17, 21)
df['Peak_Hour'] = (morning_peak | evening_peak).astype('int64')

# Rolling mean over the current row and the two rows before it.
# NOTE(review): the window is measured in rows, not hours; it only spans three
# hours if the data has one row per hour. TODO confirm the sampling interval.
power_window = df['Global_active_power'].rolling(window=3, min_periods=1)
df['Rolling_avg_power'] = power_window.mean()

df.head()

In [None]:
# Standardize the continuous columns (zero mean, unit variance).

from sklearn.preprocessing import StandardScaler

numerical_cols = ['Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity', 'Daily_avg_power', 'Rolling_avg_power']

# NOTE(review): the scaler is fitted on every row of `df`, and the list includes
# Global_active_power. Rows that are later held out for testing therefore
# contribute to the scaling statistics, and error metrics on that column are
# in standardized units. Consider fitting on the training rows only.
scaler = StandardScaler()
scaler.fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])
df.head()

FEATURE EXTRACTION

In [None]:
# Additional calendar flags derived from the timestamp.

# Day of week as an integer: Monday=0 ... Sunday=6.
df['Weekday'] = df['Datetime'].dt.weekday

# 1 for Saturday/Sunday (weekday 5 or 6), 0 otherwise.
df['Is_Weekend'] = (df['Weekday'] >= 5).astype('int64')

# 1 when Hour is 9-17 inclusive (09:00-17:59), 0 otherwise.
df['Is_Working_Hour'] = df['Hour'].between(9, 17).astype('int64')

In [None]:
# Lagged copies of Global_active_power: each column holds the value from
# N rows earlier (the first N rows become NaN).
# NOTE(review): shift() counts rows, not hours; the lags equal 1 / 3 / 24 hours
# only if the data has one row per hour. TODO confirm the sampling interval.
lag_steps = {'Lag_1': 1, 'Lag_3': 3, 'Lag_24': 24}
for lag_column, n_rows in lag_steps.items():
    df[lag_column] = df['Global_active_power'].shift(n_rows)

In [None]:
# Trailing means of Global_active_power over the last 6 and 24 rows.
# Each window includes the current row; min_periods=1 avoids NaN at the start.
# NOTE(review): windows are counted in rows, so they span 6 / 24 hours only if
# the data has one row per hour. TODO confirm the sampling interval.
rolling_windows = (('Rolling_Mean_6', 6), ('Rolling_Mean_24', 24))
for mean_column, window_rows in rolling_windows:
    df[mean_column] = df['Global_active_power'].rolling(window=window_rows, min_periods=1).mean()


In [None]:
# Element-wise product of active power and voltage.
df['Power_Voltage_Interaction'] = df['Global_active_power'] * df['Voltage']

# Reactive-to-active power ratio; the 1e-6 offset avoids dividing by exactly zero.
# NOTE(review): if these columns were standardized earlier in the notebook, the
# denominator can be negative or arbitrarily close to zero, so the offset no
# longer bounds the ratio. Confirm whether raw values are intended here.
df['Power_Ratio'] = df['Global_reactive_power'] / (df['Global_active_power'] + 1e-6)

# Replace every remaining NaN in the frame (e.g. the leading rows of shifted
# columns) with 0.
df.fillna(0, inplace=True)

print(df.head())  # Preview the dataset

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()  # Check data types and missing values


MODEL TRAINING

In [None]:
from sklearn.model_selection import train_test_split

# Columns that are not model inputs: the target itself and the raw timestamps.
non_feature_cols = ['Global_active_power', 'Datetime', 'Date', 'Time']

# Columns computed from the target's value in the SAME row (or from later rows
# of the same day). Keeping them lets a model reconstruct the target directly,
# which inflates every score and cannot be reproduced at prediction time, so
# they are excluded. The Lag_* columns only use earlier rows and are kept.
leaky_cols = [
    'Daily_avg_power',            # mean over the whole day, current row included
    'Rolling_avg_power',          # rolling mean whose window includes the current row
    'Rolling_Mean_6',             # same, 6-row window
    'Rolling_Mean_24',            # same, 24-row window
    'Power_Voltage_Interaction',  # target * Voltage
    'Power_Ratio',                # Global_reactive_power / target
]
# NOTE(review): Global_intensity is measured alongside the target and is kept
# as a feature; confirm it is available at prediction time for this use case.

features = df.drop(columns=non_feature_cols + leaky_cols)
target = df['Global_active_power']

# Chronological 80/20 split: shuffle=False keeps the first 80% of rows for
# training and the last 20% for testing. random_state is omitted because it
# has no effect without shuffling. This assumes `df` is in time order.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, shuffle=False)

# Give every split a fresh 0..n-1 index (optional but cleaner).
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True) for part in (X_train, X_test, y_train, y_test)
)

# Display dataset shapes
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

In [None]:
# Baseline model: ordinary least-squares linear regression.

from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained.
lr_model = LinearRegression().fit(X_train, y_train)
lr_model

In [None]:
# Tree ensemble: random forest regressor.

from sklearn.ensemble import RandomForestRegressor

# 100 trees; the fixed seed makes the fitted forest reproducible.
rf_settings = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestRegressor(**rf_settings)
rf_model.fit(X_train, y_train)


In [None]:
# Gradient-boosted trees via XGBoost.

from xgboost import XGBRegressor

# 100 boosting rounds at learning rate 0.1, seeded for reproducibility.
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

In [None]:
# Feed-forward neural network (scikit-learn multi-layer perceptron).

from sklearn.neural_network import MLPRegressor

# Two hidden layers (64 then 32 units), at most 500 training iterations, fixed seed.
hidden_layout = (64, 32)
nn_model = MLPRegressor(hidden_layer_sizes=hidden_layout, max_iter=500, random_state=42)
nn_model.fit(X_train, y_train)


EVALUATION

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Function to evaluate models
def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted model on held-out data.

    Args:
        model: Object exposing ``predict(X)``.
        X_test: Feature rows passed to ``model.predict``.
        y_test: True target values aligned with ``X_test``.
        model_name: Label stored under the "Model" key of the result.

    Returns:
        dict with keys "Model", "RMSE", "MAE" and "R² Score".
    """
    predictions = model.predict(X_test)

    # RMSE is taken as the square root of the mean squared error.
    mse = mean_squared_error(y_test, predictions)

    metrics = {"Model": model_name}
    metrics["RMSE"] = mse ** 0.5
    metrics["MAE"] = mean_absolute_error(y_test, predictions)
    metrics["R² Score"] = r2_score(y_test, predictions)
    return metrics

# Score every fitted model on the held-out split, in the order listed here.
fitted_models = [
    (lr_model, "Linear Regression"),
    (rf_model, "Random Forest Regressor"),
    (xgb_model, "XGBoost Regressor"),
    (nn_model, "MLP Regressor (Neural Network)"),
]
results = [evaluate_model(estimator, X_test, y_test, label) for estimator, label in fitted_models]

# Tabulate the metrics with the lowest RMSE (best model) first.
import pandas as pd
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="RMSE")
print("Model Performance Summary:")
print(results_df)