In [2]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, StackingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import IterativeImputer
from sklearn.linear_model import Ridge
import pandas as pd
import numpy as np

In [3]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test-2.csv")

In [4]:
test_uid = test_df["uid"]
train_df.drop(columns=['uid'], inplace=True)
test_df.drop(columns=['uid'], inplace=True)

In [5]:
target_col = "output_electricity_generation"
X_train_df = train_df.drop(columns=[target_col])
y_train = train_df[target_col]


In [7]:
categorical_cols = ['day']
for col in categorical_cols:
    le = LabelEncoder()
    X_train_df[col] = le.fit_transform(X_train_df[col])
    test_df[col] = le.transform(test_df[col])

In [8]:
imputer = IterativeImputer(random_state=42)
X_train_df[:] = imputer.fit_transform(X_train_df)
test_df[:] = imputer.transform(test_df)

  X_train_df[:] = imputer.fit_transform(X_train_df)
  test_df[:] = imputer.transform(test_df)


In [9]:
scaler = StandardScaler()
X_train, X_val, y_train, y_val = train_test_split(X_train_df, y_train, test_size=0.2, random_state=42)

In [10]:
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(test_df)

In [11]:
models = {
    "Random Forest": RandomForestRegressor(n_estimators=200, max_depth=15, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', random_state=42, n_jobs=-1),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42),
    "AdaBoost": AdaBoostRegressor(n_estimators=200, learning_rate=0.1, random_state=42),
    "Extra Trees": ExtraTreesRegressor(n_estimators=200, max_depth=15, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', random_state=42, n_jobs=-1)
}

In [12]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f"{name} RMSE: {rmse:.4f}")

Random Forest RMSE: 3.4780
Gradient Boosting RMSE: 5.7547
AdaBoost RMSE: 20.5211
Extra Trees RMSE: 5.2016


In [13]:
base_learners = [("Random Forest", models["Random Forest"]), ("Gradient Boosting", models["Gradient Boosting"]), ("Extra Trees", models["Extra Trees"])]
stacking_model = StackingRegressor(estimators=base_learners, final_estimator=Ridge(), n_jobs=-1)
stacking_model.fit(X_train, y_train)
y_pred_stack = stacking_model.predict(X_val)
stacking_rmse = np.sqrt(mean_squared_error(y_val, y_pred_stack))
print(f"Stacking Ensemble RMSE: {stacking_rmse:.4f}")

Stacking Ensemble RMSE: 3.6207


In [None]:
#I have tried to better the model using hyperparameter optimisation.