In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

# from kmodes.kprototypes import KPrototypes
from sklearn.manifold import TSNE
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sb

In [2]:
## Load the subcritical coal power plant table and pull out the columns used below

sub_critical = pd.read_csv('CEEW_subcritical_with_ws_price.csv')

# Columns that are used exactly as they are stored in the CSV.
(all_capacity,
 all_age,
 all_region,
 all_PLF,
 all_water_stress,
 all_price) = (
    sub_critical[column]
    for column in ('Capacity', 'Age', 'Region', 'Actual avg PLF', 'bws_score', 'coal_price')
)

# Heat rate is the 'Actual SHR' column rescaled by 3.96567 / 1000.
# NOTE(review): 3.96567 looks like a kcal -> BTU conversion factor; confirm the
# source and target units of this column.
all_heat_rate = sub_critical['Actual SHR'] * 3.96567 / 1000


1. Build the feature table and one-hot encode the region

In [3]:
# Gather the per-plant series into a single table, one column per feature.
feature_columns = {
    'heat_rate': all_heat_rate,
    'capacity': all_capacity,
    'age': all_age,
    'PLF': all_PLF,
    'water stress': all_water_stress,
    'region': all_region,
    'price': all_price,
}
feature_frame = pd.DataFrame(feature_columns)

# One-hot encode the categorical 'region' column into region_* indicator columns.
# TODO (carried over from the original note): aggregate the states here.
data = pd.get_dummies(feature_frame, columns=['region'])

print(data.head())

   heat_rate  capacity       age       PLF  water stress  price  region_ER  \
0  10.971691     600.0  1.670089  0.328097      0.480176   1.72      False   
1  10.979451     600.0  2.116359  0.332492      0.373747   3.43      False   
2  10.321183     600.0  3.425051  0.864876      0.250206   2.04      False   
3  10.628501     600.0  2.338125  0.635221      0.139977   1.59      False   
4  10.816011     600.0  4.175222  0.504753      0.591333   2.04      False   

   region_NER  region_NR  region_SR  region_WR  
0       False      False      False       True  
1       False      False       True      False  
2       False      False       True      False  
3       False      False      False       True  
4       False       True      False      False  


2. Fit the prediction model using data

In [4]:
# The heat rate is the target; every other column is a predictor.
y = data['heat_rate']
X = data.drop(columns='heat_rate')

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise using statistics learned from the training rows only.
# NOTE(review): the scaler is fit on all training rows before the cross-validated
# search runs on X_train, so the CV folds share scaling statistics.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Prepend a column of ones (constant term) to both matrices.
X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_test = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

In [5]:
# Show the column names of the encoded table (target, numeric features and region_* indicators)
print(data.columns)


Index(['heat_rate', 'capacity', 'age', 'PLF', 'water stress', 'price',
       'region_ER', 'region_NER', 'region_NR', 'region_SR', 'region_WR'],
      dtype='object')


In [6]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge

# Candidate regressors, keyed by the name that is also used to look up each
# model's hyperparameter grid.
# The estimators that accept a seed are given a fixed random_state (the same
# value as the train/test split) so that re-running the search reproduces the
# same scores instead of drifting from run to run.
models = {
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'RandomForestRegressor': RandomForestRegressor(random_state=42), # depth
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42), # depth
    'XGBRegressor': XGBRegressor(random_state=42),
    'LinearRegression': LinearRegression(), # no hyperparameters
    'RidgeRegression': Ridge(), # alpha
    'SVR': SVR(),
    'KNeighborsRegressor': KNeighborsRegressor(),
}

from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

# Cross-validation splitter: 5 folds, with no shuffle argument passed.
# It is handed to the grid search as its `cv` argument.
kf = KFold(n_splits=5)
# Empty dict; none of the cells shown here fill it.
# NOTE(review): confirm it is still needed.
res = {}

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.metrics import mean_absolute_percentage_error

# MAPE is an error metric (lower is better). greater_is_better=False makes the
# scorer report it negated, which is why the best scores printed after the
# search are negative numbers.
scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# Search space per model, keyed by the same names as the `models` dict.
param_grids = {}

# Gradient boosting: ensemble size, step size, tree depth and loss function.
param_grids['GradientBoostingRegressor'] = {
    'n_estimators': [100, 150, 200, 300],
    'learning_rate': [1, 0.5, 0.1, 0.01],
    'max_depth': [10, 15, 30, 40],
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
}

# Single-candidate grids: only max_depth=None is tried for these two.
param_grids['RandomForestRegressor'] = {'max_depth': [None]}
param_grids['DecisionTreeRegressor'] = {'max_depth': [None]}

# XGBoost: ensemble size, step size and tree depth.
param_grids['XGBRegressor'] = {
    'n_estimators': [100, 150, 300, 400],
    'learning_rate': [1, 0.5, 0.1, 0.01],
    'max_depth': [2, 5, 10],
}

# Nothing to tune: an empty grid evaluates the model once with its defaults.
param_grids['LinearRegression'] = {}

# Ridge: regularisation strength.
param_grids['RidgeRegression'] = {'alpha': [0.1, 1, 10, 100]}

# SVR: penalty, kernel coefficient and kernel type.
param_grids['SVR'] = {
    'C': [0.01, 0.1, 1],
    'gamma': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
}

# k-nearest neighbours: neighbourhood size and vote weighting.
param_grids['KNeighborsRegressor'] = {
    'n_neighbors': [2, 3, 5, 6],
    'weights': ['uniform', 'distance'],
}

# Run an exhaustive grid search for every candidate model and record the outcome
param_search_results = {}

for name, model in models.items():
    print(f"Performing hyperparameter search for {name}")

    # A model without an entry in param_grids falls back to an empty grid.
    param_grid = param_grids.get(name, {})

    # Cross-validated search over the grid, using the shared splitter and scorer
    # and running on all available cores.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scorer,
        cv=kf,
        n_jobs=-1,
        verbose=1,
    ).fit(X_train, y_train)

    # Keep the winning parameters, their score, and the full CV table.
    param_search_results[name] = {
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'cv_results': grid_search.cv_results_,
    }

Performing hyperparameter search for GradientBoostingRegressor
Fitting 5 folds for each of 256 candidates, totalling 1280 fits


Performing hyperparameter search for RandomForestRegressor
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Performing hyperparameter search for DecisionTreeRegressor
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Performing hyperparameter search for XGBRegressor
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Performing hyperparameter search for LinearRegression
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Performing hyperparameter search for RidgeRegression
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Performing hyperparameter search for SVR
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Performing hyperparameter search for KNeighborsRegressor
Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [8]:
# Print one line per model: name, best hyperparameters, and best CV score
# (the score is the negated MAPE, so values closer to zero are better)
for name in param_search_results:
    result = param_search_results[name]
    print(name, result['best_params'], result['best_score'])

GradientBoostingRegressor {'learning_rate': 0.1, 'loss': 'absolute_error', 'max_depth': 15, 'n_estimators': 200} -0.008004271191169745
RandomForestRegressor {'max_depth': None} -0.010236167071831389
DecisionTreeRegressor {'max_depth': None} -0.012403305607540122
XGBRegressor {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 400} -0.009292294672785317
LinearRegression {} -0.009387017514369642
RidgeRegression {'alpha': 1} -0.009383860544574286
SVR {'C': 0.1, 'gamma': 0.01, 'kernel': 'linear'} -0.008142652208678644
KNeighborsRegressor {'n_neighbors': 2, 'weights': 'distance'} -0.011820006937484936
