In [133]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import umap
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import Ridge

In [134]:
df = pd.read_csv('/content/ElectricityBill.csv')


numerical_cols = ['Construction_Year', 'Number_of_Floors', 'Energy_Consumption_Per_SqM',
                   'Water_Usage_Per_Building', 'Waste_Recycled_Percentage', 'Occupancy_Rate',
                   'Indoor_Air_Quality', 'Smart_Devices_Count', 'Maintenance_Resolution_Time',
                   'Energy_Per_SqM', 'Number_of_Residents']

categorical_cols = ['Building_Type', 'Green_Certified', 'Building_Status', 'Maintenance_Priority']

# Fill missing values....... fill with mean median mode depending on data type

# For numerical columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())
# For categorical columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# One-Hot Encoding for categorical columns
df = pd.get_dummies(df, columns=categorical_cols)

# Defining the features and target variable
X = df.drop('Electricity_Bill', axis=1)
Y = df['Electricity_Bill']

# Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [136]:
# Initialize Ridge Regression model
ridge_model = Ridge(alpha=1.0)  # You can adjust alpha as needed

# Train the model
ridge_model.fit(X_train, Y_train)

# Make predictions
Y_train_pred = ridge_model.predict(X_train)
Y_test_pred = ridge_model.predict(X_test)

# Evaluate the model
def evaluate_model(y_true, y_pred, X):
    mse = mean_squared_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    n = len(y_true)
    p = X.shape[1]
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    return mse, rmse, mae, r2, adj_r2

# Training data metrics
train_mse, train_rmse, train_mae, train_r2, train_adj_r2 = evaluate_model(Y_train, Y_train_pred, X_train)
print(f"---- Training Data ----\nMSE: {train_mse}\nRMSE: {train_rmse}\nMAE: {train_mae}\nR2 Score: {train_r2}\nAdjusted R2 Score: {train_adj_r2}")

# Test data metrics
test_mse, test_rmse, test_mae, test_r2, test_adj_r2 = evaluate_model(Y_test, Y_test_pred, X_test)
print(f"---- Test Data ----\nMSE: {test_mse}\nRMSE: {test_rmse}\nMAE: {test_mae}\nR2 Score: {test_r2}\nAdjusted R2 Score: {test_adj_r2}")


---- Training Data ----
MSE: 24188931.451950934
RMSE: 4918.224420657411
MAE: 3976.711053643824
R2 Score: 0.025448510061495067
Adjusted R2 Score: 0.0024826450322065208
---- Test Data ----
MSE: 24129617.719904963
RMSE: 4912.190725114911
MAE: 3797.5717657896607
R2 Score: 0.00614957783259007
Adjusted R2 Score: -0.0949944916800225
