In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import RFE

ModuleNotFoundError: No module named 'matplotlib'

In [9]:
df = pd.read_csv('/Users/shamiksinha/Desktop/ML assignment/ElectricityBill.csv')


numerical_cols = ['Construction_Year', 'Number_of_Floors', 'Energy_Consumption_Per_SqM',
                   'Water_Usage_Per_Building', 'Waste_Recycled_Percentage', 'Occupancy_Rate',
                   'Indoor_Air_Quality', 'Smart_Devices_Count', 'Maintenance_Resolution_Time',
                   'Energy_Per_SqM', 'Number_of_Residents']

categorical_cols = ['Building_Type', 'Green_Certified', 'Building_Status', 'Maintenance_Priority']

# Fill missing values....... fill with mean median mode depending on data type

# For numerical columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())
# For categorical columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Label encoding for categorical columns
label_encoders = {}
for cat_col in categorical_cols:
    le = LabelEncoder()
    df[cat_col] = le.fit_transform(df[cat_col])
    label_encoders[cat_col] = le

# Defining the features and target variable
X = df.drop('Electricity_Bill', axis=1)
Y = df['Electricity_Bill']

# Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [10]:
model = LinearRegression()

# Initialize RFE with the model and specify the number of features to select
rfe = RFE(estimator=model, n_features_to_select=3)

# Fit RFE to the data
rfe.fit(X, Y)

# Get the ranking of features (1 means selected)
ranking = rfe.ranking_

# Get the support (True if the feature is selected)
support = rfe.support_

# Get the top 3 features
selected_features = X.columns[support]

In [11]:
#only the selected features
X_selected_train = X_train[selected_features]
X_selected_test = X_test[selected_features]

# Train the Linear Regression model using the selected features
model.fit(X_selected_train, Y_train)

# Predict on both train and test sets
Y_train_pred = model.predict(X_selected_train)
Y_test_pred = model.predict(X_selected_test)

# Calculate metrics
train_mse = mean_squared_error(Y_train, Y_train_pred)
test_mse = mean_squared_error(Y_test, Y_test_pred)
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)
train_mae = mean_absolute_error(Y_train, Y_train_pred)
test_mae = mean_absolute_error(Y_test, Y_test_pred)
train_r2 = r2_score(Y_train, Y_train_pred)
test_r2 = r2_score(Y_test, Y_test_pred)
train_adjusted_r2 = 1 - (1 - train_r2) * (len(Y_train) - 1) / (len(Y_train) - X_selected_train.shape[1] - 1)
test_adjusted_r2 = 1 - (1 - test_r2) * (len(Y_test) - 1) / (len(Y_test) - X_selected_test.shape[1] - 1)

# Print all the metrics
print("\n---- Training Data ----")
print(f"MSE: {train_mse}")
print(f"RMSE: {train_rmse}")
print(f"MAE: {train_mae}")
print(f"R2 Score: {train_r2}")
print(f"Adjusted R2 Score: {train_adjusted_r2}\n")

print("---- Test Data ----")
print(f"MSE: {test_mse}")
print(f"RMSE: {test_rmse}")
print(f"MAE: {test_mae}")
print(f"R2 Score: {test_r2}")
print(f"Adjusted R2 Score: {test_adjusted_r2}")


---- Training Data ----
MSE: 24673540.31152836
RMSE: 4967.246753638112
MAE: 4006.784035347106
R2 Score: 0.005924030979948536
Adjusted R2 Score: 0.0029298262539845243

---- Test Data ----
MSE: 24181190.647202764
RMSE: 4917.437406536332
MAE: 3825.6515746669897
R2 Score: 0.004025392685427787
Adjusted R2 Score: -0.008120639111091288
