In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Read the raw electricity-bill records from disk into a DataFrame.
raw_csv_path = "data/raw/electricity_bill_dataset.csv"
df = pd.read_csv(raw_csv_path)

# Quick sanity check: show the first five rows (column names and sample values).
first_rows = df.head()
print(first_rows)

   Fan  Refrigerator  AirConditioner  Television  Monitor  MotorPump  Month  \
0   16          23.0             2.0         6.0      1.0          0     10   
1   19          22.0             2.0         3.0      1.0          0      5   
2    7          20.0             2.0         6.0      7.0          0      7   
3    7          22.0             3.0        21.0      1.0          0      6   
4   11          23.0             2.0        11.0      1.0          0      2   

        City                                    Company  MonthlyHours  \
0  Hyderabad                    Tata Power Company Ltd.           384   
1   Vadodara                                       NHPC           488   
2     Shimla                            Jyoti Structure           416   
3     Mumbai                            Power Grid Corp           475   
4     Mumbai  Ratnagiri Gas and Power Pvt. Ltd. (RGPPL)           457   

   TariffRate  ElectricityBill  
0         8.4           3225.6  
1         7.8       

In [8]:
# Element-wise missing-value mask: True wherever a cell is missing.
# NOTE(review): the displayed frame is truncated, so it only shows the first and
# last rows; a per-column count would be needed to rule out missing values.
df.isna()

Unnamed: 0,Fan,Refrigerator,AirConditioner,Television,Monitor,MotorPump,Month,City,Company,MonthlyHours,TariffRate,ElectricityBill
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
45340,False,False,False,False,False,False,False,False,False,False,False,False
45341,False,False,False,False,False,False,False,False,False,False,False,False
45342,False,False,False,False,False,False,False,False,False,False,False,False
45343,False,False,False,False,False,False,False,False,False,False,False,False


In [9]:
# Element-wise complement of the missing-value mask: True wherever a cell holds a value.
df.notna()

Unnamed: 0,Fan,Refrigerator,AirConditioner,Television,Monitor,MotorPump,Month,City,Company,MonthlyHours,TariffRate,ElectricityBill
0,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...
45340,True,True,True,True,True,True,True,True,True,True,True,True
45341,True,True,True,True,True,True,True,True,True,True,True,True
45342,True,True,True,True,True,True,True,True,True,True,True,True
45343,True,True,True,True,True,True,True,True,True,True,True,True


In [10]:
import os
import joblib
from sklearn.preprocessing import LabelEncoder

# Directory that receives every persisted encoder/model in this notebook.
os.makedirs("models", exist_ok=True)


def _encode_column_once(frame, column, encoder_path):
    """Label-encode ``frame[column]`` in place and return the encoder used.

    The column is overwritten with integer codes, so running this cell a second
    time would otherwise refit the encoder on those codes and overwrite the
    saved pickle with an encoder that no longer knows the original labels.
    To keep the cell safe to re-run, a column that is already numeric is left
    untouched and the encoder saved by the earlier run is loaded instead.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose column is encoded in place.
    column : str
        Name of the categorical column to encode.
    encoder_path : str
        Where the fitted encoder is written (first run) or read from (re-run).

    Returns
    -------
    sklearn.preprocessing.LabelEncoder
        The encoder whose classes map the integer codes back to the labels.

    Raises
    ------
    FileNotFoundError
        If the column is already numeric but no encoder exists at ``encoder_path``.
    """
    if pd.api.types.is_numeric_dtype(frame[column]):
        # Already encoded by a previous run of this cell: reuse the saved
        # encoder. Only load pickles this notebook wrote itself; unpickling
        # untrusted files can execute arbitrary code.
        return joblib.load(encoder_path)

    encoder = LabelEncoder()
    frame[column] = encoder.fit_transform(frame[column])
    joblib.dump(encoder, encoder_path)
    return encoder


le_city = _encode_column_once(df, 'City', "models/le_city.pkl")
le_company = _encode_column_once(df, 'Company', "models/le_company.pkl")

# Features & target
X = df[['Fan', 'Refrigerator', 'AirConditioner', 'Television', 'Monitor', 
        'MotorPump', 'Month', 'City', 'Company', 'MonthlyHours', 'TariffRate']]
y = df['ElectricityBill']

In [11]:
# Preview the frame after City/Company were replaced by integer codes.
encoded_preview = df.head()
print(encoded_preview)

# (rows, columns) of the working frame.
frame_shape = df.shape
print("Shape:", frame_shape)

# Status line only; this cell does not write anything itself.
print("Encoders saved in models/")


   Fan  Refrigerator  AirConditioner  Television  Monitor  MotorPump  Month  \
0   16          23.0             2.0         6.0      1.0          0     10   
1   19          22.0             2.0         3.0      1.0          0      5   
2    7          20.0             2.0         6.0      7.0          0      7   
3    7          22.0             3.0        21.0      1.0          0      6   
4   11          23.0             2.0        11.0      1.0          0      2   

   City  Company  MonthlyHours  TariffRate  ElectricityBill  
0     5       27           384         8.4           3225.6  
1    15       13           488         7.8           3806.4  
2    14        8           416         7.7           3203.2  
3     7       19           475         9.2           4370.0  
4     7       20           457         9.2           4204.4  
Shape: (45345, 12)
Encoders saved in models/


In [12]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate regressors, keyed by the label used in the report below.
# NOTE(review): in the rows previewed earlier, ElectricityBill equals
# MonthlyHours * TariffRate, and both are features — confirm whether the
# near-perfect scores simply reflect the models recovering that product.
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
}

# Per-model metrics on the held-out rows: {label: {"MAE", "RMSE", "R2"}}.
results = {}

for name, model in models.items():
    # fit() returns the estimator itself, so fitting and predicting can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results[name] = dict(zip(("MAE", "RMSE", "R2"), (mae, rmse, r2)))

# One summary line per model, in the order the models were declared.
for name in results:
    metrics = results[name]
    print(f"{name}: MAE={metrics['MAE']:.2f}, RMSE={metrics['RMSE']:.2f}, R2={metrics['R2']:.4f}")


LinearRegression: MAE=49.20, RMSE=70.50, R2=0.9956
RandomForest: MAE=1.40, RMSE=5.26, R2=1.0000
GradientBoosting: MAE=17.91, RMSE=22.78, R2=0.9995


In [13]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import numpy as np

# Estimators to cross-validate, both seeded for repeatable results.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
gb = GradientBoostingRegressor(random_state=42)

# Shuffled, seeded 5-fold splitter; the same object is passed to both
# cross_val_score calls below.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Random forest: R2 on each of the five folds, then the average.
rf_scores = cross_val_score(estimator=rf, X=X, y=y, scoring='r2', cv=kf)
print("RandomForest R2 CV Scores:", rf_scores)
print("RandomForest Mean R2:", rf_scores.mean())

# Gradient boosting: same protocol.
gb_scores = cross_val_score(estimator=gb, X=X, y=y, scoring='r2', cv=kf)
print("GradientBoosting R2 CV Scores:", gb_scores)
print("GradientBoosting Mean R2:", gb_scores.mean())


RandomForest R2 CV Scores: [0.99997522 0.99997185 0.9999636  0.99995791 0.99997222]
RandomForest Mean R2: 0.9999681593354112
GradientBoosting R2 CV Scores: [0.99954467 0.99952471 0.99956988 0.99956757 0.9995626 ]
GradientBoosting Mean R2: 0.9995538870127992


In [14]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Create bill category
def bill_category(x, low_threshold=2000, high_threshold=4000):
    """Bucket a bill amount into "Low", "Medium" or "High".

    Parameters
    ----------
    x : float
        Bill amount to classify.
    low_threshold : float, default 2000
        Amounts strictly below this value are "Low".
    high_threshold : float, default 4000
        Amounts at or above ``low_threshold`` and strictly below this value
        are "Medium"; everything else is "High".

    Returns
    -------
    str
        One of "Low", "Medium", "High".

    Notes
    -----
    NaN compares False against both thresholds and therefore falls through
    to "High"; filter missing bills beforehand if that is not wanted.
    """
    if x < low_threshold:
        return "Low"
    if x < high_threshold:
        return "Medium"
    return "High"

# Label every row with its bill bucket ("Low" / "Medium" / "High").
df['BillCategory'] = df['ElectricityBill'].map(bill_category)

# Step 2: Encode target
# Integer codes for the bucket names; le_bill.classes_ holds the names in code order.
le_bill = LabelEncoder()
df['BillCategoryEncoded'] = le_bill.fit_transform(df['BillCategory'])

# Step 3: Split data
# Same 80/20 seeded split as the regression cell, but with the encoded bucket
# as the target. This rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, df['BillCategoryEncoded'], test_size=0.2, random_state=42
)

# Step 4: Train classifier
# fit() returns the estimator, so construction and training are chained.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Step 5: Predict & evaluate
y_pred = clf.predict(X_test)
report_text = classification_report(y_test, y_pred, target_names=le_bill.classes_)
print("Classification Report:\n", report_text)
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion)

# Step 6: Save model & encoder
# The final dump is left as the cell's last expression so its return value is displayed.
import joblib
joblib.dump(clf, "models/random_forest_classifier.pkl")
joblib.dump(le_bill, "models/le_bill.pkl")


Classification Report:
               precision    recall  f1-score   support

        High       1.00      1.00      1.00      5492
         Low       0.99      0.87      0.92        84
      Medium       0.99      1.00      1.00      3493

    accuracy                           1.00      9069
   macro avg       0.99      0.95      0.97      9069
weighted avg       1.00      1.00      1.00      9069

Confusion Matrix:
 [[5479    0   13]
 [   0   73   11]
 [   6    1 3486]]


['models/le_bill.pkl']

In [15]:
import joblib

# Fit the random-forest regressor on the full feature matrix and target
# (no hold-out split here), then persist it next to the encoders.
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)
joblib.dump(rf_regressor, "models/random_forest_regressor.pkl")


['models/random_forest_regressor.pkl']