In [153]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import umap
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import FastICA
from sklearn.linear_model import ElasticNet

In [154]:
df = pd.read_csv('/content/ElectricityBill.csv')


numerical_cols = ['Construction_Year', 'Number_of_Floors', 'Energy_Consumption_Per_SqM',
                   'Water_Usage_Per_Building', 'Waste_Recycled_Percentage', 'Occupancy_Rate',
                   'Indoor_Air_Quality', 'Smart_Devices_Count', 'Maintenance_Resolution_Time',
                   'Energy_Per_SqM', 'Number_of_Residents']

categorical_cols = ['Building_Type', 'Green_Certified', 'Building_Status', 'Maintenance_Priority']

# Fill missing values....... fill with mean median mode depending on data type

# For numerical columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())
# For categorical columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Label encoding for categorical columns
label_encoders = {}
for cat_col in categorical_cols:
    le = LabelEncoder()
    df[cat_col] = le.fit_transform(df[cat_col])
    label_encoders[cat_col] = le


# Defining the features and target variable
X = df.drop('Electricity_Bill', axis=1)
Y = df['Electricity_Bill']

# Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [155]:
#initialising gbr
gbr = GradientBoostingRegressor(random_state=42)

# train the model on the training data
gbr.fit(X_train, Y_train)

# Predicting on the training data
Y_train_pred = gbr.predict(X_train)

# Predicting on the testing data
Y_test_pred = gbr.predict(X_test)

In [156]:
# Training metrics
mse_train = mean_squared_error(Y_train, Y_train_pred)
rmse_train = mean_squared_error(Y_train, Y_train_pred, squared=False)
mae_train = mean_absolute_error(Y_train, Y_train_pred)
r2_train = r2_score(Y_train, Y_train_pred)
adj_r2_train = 1 - (1 - r2_train) * (len(Y_train) - 1) / (len(Y_train) - X_train.shape[1] - 1)

# Testing metrics
mse_test = mean_squared_error(Y_test, Y_test_pred)
rmse_test = mean_squared_error(Y_test, Y_test_pred, squared=False)
mae_test = mean_absolute_error(Y_test, Y_test_pred)
r2_test = r2_score(Y_test, Y_test_pred)
adj_r2_test = 1 - (1 - r2_test) * (len(Y_test) - 1) / (len(Y_test) - X_test.shape[1] - 1)

# Print the results
print("---- Training Data ----")
print(f"MSE: {mse_train}")
print(f"RMSE: {rmse_train}")
print(f"MAE: {mae_train}")
print(f"R2 Score: {r2_train}")
print(f"Adjusted R2 Score: {adj_r2_train}")
print()

print("---- Test Data ----")
print(f"MSE: {mse_test}")
print(f"RMSE: {rmse_test}")
print(f"MAE: {mae_test}")
print(f"R2 Score: {r2_test}")
print(f"Adjusted R2 Score: {adj_r2_test}")

---- Training Data ----
MSE: 14926446.25730777
RMSE: 3863.4759294329465
MAE: 3092.748188686501
R2 Score: 0.398626166333897
Adjusted R2 Score: 0.38945888228410885

---- Test Data ----
MSE: 24405496.61674575
RMSE: 4940.1919615279885
MAE: 3813.630549423027
R2 Score: -0.005213319055167753
Adjusted R2 Score: -0.06965007027665293
