In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [None]:
# Load datasets
train = pd.read_csv('dataset/train.csv')
train


Unnamed: 0,Timestamp,Residents,Apartment_Type,Temperature,Humidity,Water_Price,Period_Consumption_Index,Income_Level,Guests,Amenities,Appliance_Usage,Water_Consumption
0,01/01/2002 00,1,Studio,15.31,46.61,1.06,0.970000,Low,0,Swimming Pool,0.0,64.85
1,01/01/2002 08,4,,21.01,66.11,2.98,0.910000,Upper Middle,1,Swimming Pool,1.0,192.50
2,01/01/2002 16,2,Cottage,12.86,60.86,1.44,1.430000,Middle,0,,1.0,116.62
3,02/01/2002 00,2,1BHK,20.16,50.58,1.48,0.910000,Middle,-1,Garden,0.0,76.96
4,02/01/2002 08,2,Cottage,16.23,52.25,1.14,1.110000,Middle,0,Fountain,0.0,104.70
...,...,...,...,...,...,...,...,...,...,...,...,...
13995,10/10/2014 00,2,1BHK,25.61,61.5,1.70,0.940000,Low,0,,0.0,78.59
13996,10/10/2014 08,5,2BHK,13.27,52.58,1.88,1.030000,Upper Middle,0,Garden,1.0,185.50
13997,10/10/2014 16,4,2BHK,,46.93,1.22,1.100000,Middle,0,,1.0,180.28
13998,11/10/2014 00,4,3BHK,11.62,64.48,2.86,1.120000,Upper Middle,1,Swimming Pool,0.0,212.19


In [4]:
test = pd.read_csv('dataset/test.csv')
test

Unnamed: 0,Timestamp,Residents,Apartment_Type,Temperature,Humidity,Water_Price,Period_Consumption_Index,Income_Level,Guests,Amenities,Appliance_Usage
0,11/10/2014 16,5,Bungalow,11.89,57.88,2.77,1.480000,Upper Middle,1,Jacuzzi,0.0
1,12/10/2014 00,4,Bungalow,29.22,61.41,2.85,1.300000,Upper Middle,0,Garden,
2,12/10/2014 08,3,Cottage,10.27,64.9,1.66,0.970000,Middle,0,,0.0
3,12/10/2014 16,3,1BHK,27.03,52.67,1.48,1.310000,Low,0,,0.0
4,13/10/2014 00,2,Cottage,12.32,55.23,1.19,1.300000,Middle,0,Swimming Pool,0.0
...,...,...,...,...,...,...,...,...,...,...,...
5995,01/04/2020 00,5,2BHK,16.22,54.79,1.09,1.430000,Middle,0,,1.0
5996,01/04/2020 08,3,1BHK,15.65,52.18,1.61,0.900000,Low,0,Garden,0.0
5997,01/04/2020 16,3,Bungalow,16.97,33.14,2.62,1.490000,Rich,0,Fountain,0.0
5998,02/04/2020 00,4,2BHK,19.86,36.94,1.42,1.882674,Middle,0,,0.0


In [6]:
# Preprocessing
le = LabelEncoder()
categorical_cols = ['Apartment_Type', 'Income_Level', 'Amenities', 'Appliance_Usage']
for col in categorical_cols:
    if train[col].dtype == 'object':
        train[col] = train[col].fillna('Missing')
        test[col] = test[col].fillna('Missing')
        le.fit(pd.concat([train[col], test[col]]))
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])
   

In [7]:
# Features and target
X = train.drop(['Timestamp', 'Water_Consumption'], axis=1)
y = train['Water_Consumption']

In [8]:
X

Unnamed: 0,Residents,Apartment_Type,Temperature,Humidity,Water_Price,Period_Consumption_Index,Income_Level,Guests,Amenities,Appliance_Usage
0,1,6,15.31,46.61,1.06,0.970000,193,0,4,0.0
1,4,7,21.01,66.11,2.98,0.910000,238,1,4,1.0
2,2,4,12.86,60.86,1.44,1.430000,196,0,3,1.0
3,2,0,20.16,50.58,1.48,0.910000,196,-1,1,0.0
4,2,4,16.23,52.25,1.14,1.110000,196,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...
13995,2,0,25.61,61.5,1.70,0.940000,193,0,3,0.0
13996,5,1,13.27,52.58,1.88,1.030000,238,0,1,1.0
13997,4,1,,46.93,1.22,1.100000,196,0,3,1.0
13998,4,2,11.62,64.48,2.86,1.120000,238,1,4,0.0


In [9]:
y

0         64.85
1        192.50
2        116.62
3         76.96
4        104.70
          ...  
13995     78.59
13996    185.50
13997    180.28
13998    212.19
13999    303.59
Name: Water_Consumption, Length: 14000, dtype: float64

In [11]:
# Convert to numeric
X = X.apply(pd.to_numeric, errors='coerce')
test_features = test.drop(['Timestamp'], axis=1).apply(pd.to_numeric, errors='coerce')

In [18]:
test_features

Unnamed: 0,Residents,Apartment_Type,Temperature,Humidity,Water_Price,Period_Consumption_Index,Income_Level,Guests,Amenities,Appliance_Usage
0,5,3,11.89,57.88,2.77,1.480000,,1,2,0.0
1,4,3,29.22,61.41,2.85,1.300000,,0,1,
2,3,4,10.27,64.90,1.66,0.970000,,0,3,0.0
3,3,0,27.03,52.67,1.48,1.310000,,0,3,0.0
4,2,4,12.32,55.23,1.19,1.300000,,0,4,0.0
...,...,...,...,...,...,...,...,...,...,...
5995,5,1,16.22,54.79,1.09,1.430000,,0,3,1.0
5996,3,0,15.65,52.18,1.61,0.900000,,0,1,0.0
5997,3,3,16.97,33.14,2.62,1.490000,,0,0,0.0
5998,4,1,19.86,36.94,1.42,1.882674,,0,3,0.0


In [12]:
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test_features)

In [13]:
# Split data
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [14]:
# Model training
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [15]:
# Validation
y_pred = model.predict(X_val)
mse = metrics.mean_squared_error(y_val, y_pred)
score = max(0, 100 - np.sqrt(mse))
print(f"Validation Score: {score}")


Validation Score: 79.887845794786


In [16]:
# Prediction
test_predictions = model.predict(test_scaled)
submission = pd.DataFrame({'Timestamp': test['Timestamp'], 'Water_Consumption': test_predictions})
submission.to_csv('submission.csv', index=False)

In [20]:

# Model training and hyperparameter tuning
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20]}
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_



In [29]:
# Model comparison
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

models = {
    'RandomForest': best_rf,
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}


In [32]:
# Check for NaNs in X_train and y_train
print("Missing values in X_train:", np.isnan(X_train).sum())
print("Missing values in y_train:", np.isnan(y_train).sum())



Missing values in X_train: 997
Missing values in y_train: 0


In [33]:
X_train = np.nan_to_num(X_train, nan=np.nanmedian(X_train))
X_val = np.nan_to_num(X_val, nan=np.nanmedian(X_val))


In [34]:
X_train = np.nan_to_num(X_train, nan=np.nanmean(X_train))
X_val = np.nan_to_num(X_val, nan=np.nanmean(X_val))


In [35]:
mask = ~np.isnan(X_train).any(axis=1)
X_train = X_train[mask]
y_train = y_train[mask]


In [36]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mse = metrics.mean_squared_error(y_val, y_pred)
    score = max(0, 100 - np.sqrt(mse))
    print(f"{name} Validation Score: {score}")

RandomForest Validation Score: 80.15032533465447
GradientBoosting Validation Score: 82.87438766883764
XGBoost Validation Score: 86.83070011667424


In [37]:
# Prediction with best model
test_predictions = best_rf.predict(test_scaled)
submission = pd.DataFrame({'Timestamp': test['Timestamp'], 'Water_Consumption': test_predictions})
submission.to_csv('submission.csv', index=False)