In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [18]:
# Load the water-demand dataset from a CSV in the current working directory.
df = pd.read_csv(filepath_or_buffer="water_demand.csv")

In [19]:
# Replace each categorical column with integer codes, keeping the fitted
# encoder per column so the same mapping can be reused later.
label_encoders = {}
for col in ("res_state", "res_district", "res_month"):
    le = LabelEncoder().fit(df[col])
    label_encoders[col] = le
    df[col] = le.transform(df[col])

In [20]:
# Drop rows whose target value lies outside the 1.5 * IQR fences.
demand_values = df["water demand (MCM)"]
Q1 = demand_values.quantile(0.25)
Q3 = demand_values.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Both fences are inclusive: values exactly on a fence are kept.
within_fences = (demand_values >= lower_fence) & (demand_values <= upper_fence)
df = df[within_fences]

In [27]:
# Rank every column by its correlation with the target column.
# (numpy is already imported in the first cell and is not used here, so the
# redundant re-import has been dropped.)
#
# NOTE(review): the saved output of this cell lists rainfall_population,
# frl_liv_cap and prev_month_demand, which are only created by the
# "Feature Engineering" cell. Run that cell first to reproduce the output.
# NOTE(review): a NaN correlation (as shown for res_state) presumably means the
# column is constant after filtering -- confirm and consider dropping it.
corr_matrix = df.corr()
print(corr_matrix["water demand (MCM)"].sort_values(ascending=False))


water demand (MCM)     1.000000
prev_month_demand      0.204071
population             0.186126
res_frl                0.110700
rainfall_population    0.026758
res_district           0.013046
res_month             -0.007772
rainfall              -0.024351
res_year              -0.048033
frl_liv_cap           -0.366456
liv_cap               -0.386806
res_state                   NaN
Name: water demand (MCM), dtype: float64


In [21]:
# Feature Engineering
# Interaction term: rainfall multiplied by population.
df["rainfall_population"] = df["rainfall"] * df["population"]
# Interaction term: res_frl multiplied by liv_cap.
df["frl_liv_cap"] = df["res_frl"] * df["liv_cap"]
# Lag feature: the target value of the preceding row. The first row has no
# predecessor, so it is back-filled from the next available value.
# `Series.bfill()` replaces the deprecated `fillna(method='bfill')`, which
# emits a FutureWarning on recent pandas versions.
# NOTE(review): the shift is positional over the whole frame; it assumes rows
# are ordered chronologically and does not group by reservoir/district --
# confirm against the dataset.
df["prev_month_demand"] = df["water demand (MCM)"].shift(1).bfill()

  df["prev_month_demand"] = df["water demand (MCM)"].shift(1).fillna(method='bfill')


In [22]:

# Prepare data for training
# Features are every column except the target.
X = df.drop(columns=["water demand (MCM)"])
y = df["water demand (MCM)"]

# Split BEFORE scaling so that the scaler's mean/variance are estimated from
# the training rows only. Fitting the scaler on the full dataset would leak
# test-set statistics into preprocessing (and into the persisted scaler).
# The same random_state keeps the row assignment identical to splitting the
# scaled array.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with any later cell that reads X_scaled:
# the full feature matrix transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)

In [23]:
# Random Forest: exhaustive grid search over a small hyperparameter grid,
# scored by R² with 3-fold cross-validation on the training split.
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[10, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)
rf = RandomForestRegressor(random_state=42)
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    scoring='r2',
    cv=3,
    n_jobs=-1,  # use all available cores
)
# fit() returns the search object itself, so the refit best model can be
# read straight off the result.
best_rf = grid_rf.fit(X_train, y_train).best_estimator_

In [24]:
# XGBoost regressor with fixed (untuned) hyperparameters, fitted on the
# training split.
xgb = XGBRegressor(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
)
xgb.fit(X_train, y_train)


In [25]:
# Evaluate Models
def evaluate_model(model, name, X=None, y=None):
    """Print RMSE and R² for a fitted regressor and return the model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    name : str
        Label used as the prefix of the printed line.
    X : array-like, optional
        Features to score on. Defaults to the notebook-level ``X_test``.
    y : array-like, optional
        True target values. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    The ``model`` argument, unchanged.
    """
    # Fall back to the held-out split so existing two-argument calls keep
    # their behaviour; explicit X/y allow scoring on any other dataset.
    features = X_test if X is None else X
    targets = y_test if y is None else y

    y_pred = model.predict(features)
    rmse = np.sqrt(mean_squared_error(targets, y_pred))
    r2 = r2_score(targets, y_pred)
    print(f"{name} - RMSE: {rmse:.4f}, R²: {r2:.4f}")
    return model

evaluate_model(best_rf, "Random Forest")
evaluate_model(xgb, "XGBoost")

Random Forest - RMSE: 0.0111, R²: 0.0738
XGBoost - RMSE: 0.0117, R²: -0.0200


In [26]:
import joblib

# Persist everything needed to reproduce predictions later: the tuned
# Random Forest, the fitted feature scaler, and the per-column label encoders.
joblib.dump(value=best_rf, filename="random_forest_model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")
joblib.dump(value=label_encoders, filename="label_encoders.pkl")

['label_encoders.pkl']