In [130]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import umap
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import RFE

In [131]:
df = pd.read_csv('/content/ElectricityBill.csv')


numerical_cols = ['Construction_Year', 'Number_of_Floors', 'Energy_Consumption_Per_SqM',
                   'Water_Usage_Per_Building', 'Waste_Recycled_Percentage', 'Occupancy_Rate',
                   'Indoor_Air_Quality', 'Smart_Devices_Count', 'Maintenance_Resolution_Time',
                   'Energy_Per_SqM', 'Number_of_Residents']

categorical_cols = ['Building_Type', 'Green_Certified', 'Building_Status', 'Maintenance_Priority']

# Fill missing values....... fill with mean median mode depending on data type

# For numerical columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())
# For categorical columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Label encoding for categorical columns
label_encoders = {}
for cat_col in categorical_cols:
    le = LabelEncoder()
    df[cat_col] = le.fit_transform(df[cat_col])
    label_encoders[cat_col] = le


# Defining the features and target variable
X = df.drop('Electricity_Bill', axis=1)
Y = df['Electricity_Bill']

# Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [132]:
# Initialize the Linear Regression model
lr_model = LinearRegression()

# Train the model on the training data
lr_model.fit(X_train, Y_train)

# On train and test sets
Y_train_pred = lr_model.predict(X_train)
Y_test_pred = lr_model.predict(X_test)

# Mean Squared Error (MSE)
train_mse = mean_squared_error(Y_train, Y_train_pred)
test_mse = mean_squared_error(Y_test, Y_test_pred)

# Root Mean Squared Error (RMSE)
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

# Mean Absolute Error (MAE)
train_mae = mean_absolute_error(Y_train, Y_train_pred)
test_mae = mean_absolute_error(Y_test, Y_test_pred)

# R2 Score
train_r2 = r2_score(Y_train, Y_train_pred)
test_r2 = r2_score(Y_test, Y_test_pred)

# Adjusted R2 Score
def adjusted_r2(r2, X):
    return 1 - (1 - r2) * (len(Y_train) - 1) / (len(Y_train) - X.shape[1] - 1)

train_adjusted_r2 = adjusted_r2(train_r2, X_train)
test_adjusted_r2 = adjusted_r2(test_r2, X_test)

# Print all the metrics
print("---- Training Data ----")
print(f"MSE: {train_mse}")
print(f"RMSE: {train_rmse}")
print(f"MAE: {train_mae}")
print(f"R2 Score: {train_r2}")
print(f"Adjusted R2 Score: {train_adjusted_r2}\n")

print("---- Test Data ----")
print(f"MSE: {test_mse}")
print(f"RMSE: {test_rmse}")
print(f"MAE: {test_mae}")
print(f"R2 Score: {test_r2}")
print(f"Adjusted R2 Score: {test_adjusted_r2}")

---- Training Data ----
MSE: 24475013.16847547
RMSE: 4947.222773281538
MAE: 4006.32846932936
R2 Score: 0.013922520844610209
Adjusted R2 Score: -0.0011091480449536562

---- Test Data ----
MSE: 24278016.155742623
RMSE: 4927.272689403604
MAE: 3842.4093125585155
R2 Score: 3.7344733075372893e-05
Adjusted R2 Score: -0.015205988426481465
