In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore') # ignore warnings

In [2]:
df = pd.read_csv('preprocessed_dataset.csv')

In [3]:
df.columns

Index(['distance-to-solar-noon', 'temperature', 'wind-direction', 'wind-speed',
       'sky-cover', 'visibility', 'humidity', 'average-wind-speed-(period)',
       'average-pressure-(period)', 'power-generated'],
      dtype='object')

In [4]:
X = df.drop(columns=['power-generated'])
y = df['power-generated']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "Support Vector Regression": SVR(),
    "XGBoost":XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "LightGBM":LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "R² Score": r2_score(y_test, y_pred)
    }

# Convert results to a DataFrame
results_df = pd.DataFrame(results).T
results_df

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000362 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 2336, number of used features: 9
[LightGBM] [Info] Start training from score 7003.485445


Unnamed: 0,MAE,MSE,R² Score
Linear Regression,4981.20148,39495180.0,0.625059
Ridge Regression,4980.978996,39495450.0,0.625057
Lasso Regression,4981.299737,39494670.0,0.625064
Decision Tree,2000.255137,20001730.0,0.810117
Random Forest,1546.435942,11572450.0,0.890139
Gradient Boosting,1677.653644,10426190.0,0.901021
Support Vector Regression,6775.595439,145415000.0,-0.380473
XGBoost,1588.283275,10627380.0,0.899111
LightGBM,1683.983245,11852780.0,0.887478


In [7]:
# Define hyperparameter grids
param_grids = {
    "RandomForest": {
        "n_estimators": [100, 200],
        "max_depth": [10, 20, None],
        "min_samples_split": [2, 5]
    },
    "GradientBoosting": {
        "n_estimators": [100, 200],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5]
    },
    "XGBoost": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 10]
    },
    "LightGBM": {
        "n_estimators": [100, 200],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [-1, 5, 10]
    }
}

# Models dictionary
models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
    "LightGBM": LGBMRegressor(random_state=42)
}

# Store best models and results
best_models = {}
results = {}

# Hyperparameter tuning
for name, model in models.items():
    grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Best model and parameters
    best_models[name] = grid_search.best_estimator_
    best_params = grid_search.best_params_
    
    # Evaluate on test data
    y_pred = best_models[name].predict(X_test)
    
    results[name] = {
        "Best Params": best_params,
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "R² Score": r2_score(y_test, y_pred)
    }

# Convert results to DataFrame
results_df = pd.DataFrame(results).T
print(results_df)

# Select the best model (highest R² and lowest RMSE)
best_model_name = results_df["R² Score"].idxmax()
best_model = best_models[best_model_name]

print(f"\nBest Model for Deployment: {best_model_name}")
print(f"Best Hyperparameters: {results[best_model_name]['Best Params']}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000463 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 2336, number of used features: 9
[LightGBM] [Info] Start training from score 7003.485445
                                                        Best Params  \
RandomForest      {'max_depth': 20, 'min_samples_split': 5, 'n_e...   
GradientBoosting  {'learning_rate': 0.1, 'max_depth': 3, 'n_esti...   
XGBoost           {'learning_rate': 0.1, 'max_depth': 5, 'n_esti...   
LightGBM          {'learning_rate': 0.1, 'max_depth': 5, 'n_esti...   

                          MAE              MSE  R² Score  
RandomForest      1558.504173  11676156.235772  0.889154  
GradientBoosting  1677.653644  10426185.226077  0.901021  
XGBoost           1592.866847   9849999.733499  0.906491  
LightGBM          1617.686491  11393714.110108  0.891836  

Best M

In [8]:
import pickle

# Dump the best model to a file using pickle
with open("model.pkl", "wb") as f:
    pickle.dump(best_model, f)

print("Best model saved successfully as 'model.pkl'")

Best model saved successfully as 'model.pkl'
