In [5]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

# from kmodes.kprototypes import KPrototypes
from sklearn.manifold import TSNE
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sb

In [2]:
## Read the supercritical coal power plant data
super_critical = pd.read_csv('CEEW_supercritical_with_ws_price.csv')

# Heat rate, rescaled by 3.96567 / 1000.
# NOTE(review): 3.96567 looks like the kcal -> BTU factor, which would turn
# kcal/kWh into MMBtu/MWh -- confirm the source units of 'Actual SHR'.
all_heat_rate = super_critical['Actual SHR'] * 3.96567 / 1000

# Remaining columns, pulled out one by one for the feature table built later.
(all_capacity,
 all_age,
 all_region,
 all_PLF,
 all_water_stress,
 all_price) = (
    super_critical[column]
    for column in ('Capacity', 'Age', 'Region', 'Actual avg PLF', 'bws_score', 'coal_price')
)


1. Find the correlation between the features

In [3]:
# Gather the target (heat_rate) and the candidate predictors into one frame.
feature_columns = {
    'heat_rate': all_heat_rate,
    'capacity': all_capacity,
    'age': all_age,
    'PLF': all_PLF,
    'water stress': all_water_stress,
    'region': all_region,
    'price': all_price,
}

# One-hot encode the categorical 'region' column: get_dummies removes it and
# appends one indicator column per region value after the other columns.
data = pd.get_dummies(pd.DataFrame(feature_columns), columns=['region'])

print(data.head())

   heat_rate  capacity       age       PLF  water stress  price  region_ER  \
0  10.619654     800.0  0.342231  0.005858      0.083791   1.66       True   
1  10.423931     800.0  1.067762  0.171927      0.480176   1.72      False   
2   9.845797     800.0  1.960301  0.633565      0.250206   2.04      False   
3  10.429779     800.0  2.091718  0.191133      0.139977   1.59      False   
4  10.657335     800.0  3.091034  0.040648      0.331244   3.46      False   

   region_NR  region_SR  region_WR  
0      False      False      False  
1      False      False       True  
2      False       True      False  
3      False      False       True  
4      False       True      False  


2. Fit the prediction models on the data

In [7]:
# Target is the heat rate; every other column of `data` is a predictor.
y = data['heat_rate']
X = data.drop(columns='heat_rate')

# Hold out 20% of the rows for testing (fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the predictors with statistics learned from the training split only.
# NOTE(review): the scaler sees the whole training split, so the folds used by
# the later cross-validation share its mean/std -- confirm this is acceptable.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Prepend a column of ones (constant term) to both design matrices.
X_train = np.column_stack((np.ones(X_train.shape[0]), X_train))
X_test = np.column_stack((np.ones(X_test.shape[0]), X_test))

In [8]:
# List the column labels of `data`: the target, the numeric features and the
# one-hot region indicator columns.
column_labels = data.columns
print(column_labels)


Index(['heat_rate', 'capacity', 'age', 'PLF', 'water stress', 'price',
       'region_ER', 'region_NR', 'region_SR', 'region_WR'],
      dtype='object')


In [11]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge

# Seed shared by every estimator that accepts `random_state`, so repeated runs
# of the hyperparameter search give the same scores. Same value as the one
# passed to train_test_split.
RANDOM_STATE = 42

# Candidate regressors, keyed by the name used to look up their search grid.
models = {
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'RandomForestRegressor': RandomForestRegressor(random_state=RANDOM_STATE),  # depth
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=RANDOM_STATE),  # depth
    'XGBRegressor': XGBRegressor(random_state=RANDOM_STATE),
    'LinearRegression': LinearRegression(),  # no hyperparameters
    'RidgeRegression': Ridge(),  # alpha
    'SVR': SVR(),
    'KNeighborsRegressor': KNeighborsRegressor(),
}

from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

# Cross validation: 5 folds; no shuffling is requested here, so the folds are
# taken in the row order of the (already shuffled) training split.
kf = KFold(n_splits=5)
res = {}

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.metrics import mean_absolute_percentage_error

# Scorer based on mean absolute percentage error. greater_is_better=False makes
# the reported score the negated error, so a larger (closer to 0) score is better.
scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# Hyperparameter grids, one per model; an empty grid means "defaults only".
gradient_boosting_grid = {
    'n_estimators': [100, 150, 200, 300],
    'learning_rate': [1, 0.5, 0.1, 0.01],
    'max_depth': [30, 40, 60, 80],
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
}
random_forest_grid = {'max_depth': [None]}
decision_tree_grid = {'max_depth': [None]}
xgboost_grid = {
    'n_estimators': [150, 400, 600],
    'learning_rate': [1, 0.5, 0.1, 0.01],
    'max_depth': [1, 2, 5, 10],
}
linear_regression_grid = {}
ridge_grid = {'alpha': [0.1, 1, 10, 100]}
svr_grid = {
    'C': [0.001, 0.01, 0.1],
    'gamma': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
}
knn_grid = {
    'n_neighbors': [1, 2, 3, 5],
    'weights': ['uniform', 'distance'],
}

# Keys match the names used in the `models` dict.
param_grids = {
    'GradientBoostingRegressor': gradient_boosting_grid,
    'RandomForestRegressor': random_forest_grid,
    'DecisionTreeRegressor': decision_tree_grid,
    'XGBRegressor': xgboost_grid,
    'LinearRegression': linear_regression_grid,
    'RidgeRegression': ridge_grid,
    'SVR': svr_grid,
    'KNeighborsRegressor': knn_grid,
}

def _summarise_search(search):
    """Collect the best parameters, the best score and the full CV results of a fitted search."""
    return {
        'best_params': search.best_params_,
        'best_score': search.best_score_,
        'cv_results': search.cv_results_,
    }


# Run a grid search for every candidate model on the training split.
param_search_results = {}

for name, model in models.items():
    print(f"Performing hyperparameter search for {name}")

    # Models without an entry in `param_grids` fall back to an empty grid.
    param_grid = param_grids.get(name, {})

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scorer,
        cv=kf,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)
    param_search_results[name] = _summarise_search(grid_search)

Performing hyperparameter search for GradientBoostingRegressor
Fitting 5 folds for each of 256 candidates, totalling 1280 fits
Performing hyperparameter search for RandomForestRegressor
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Performing hyperparameter search for DecisionTreeRegressor
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Performing hyperparameter search for XGBRegressor
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Performing hyperparameter search for LinearRegression
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Performing hyperparameter search for RidgeRegression
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Performing hyperparameter search for SVR
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Performing hyperparameter search for KNeighborsRegressor
Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [16]:
# Report, one line per model, the winning hyperparameters and the matching CV
# score (negated MAPE from `scorer`, so values closer to 0 are better).
for name in param_search_results:
    result = param_search_results[name]
    print(name, result['best_params'], result['best_score'])

GradientBoostingRegressor {'learning_rate': 0.1, 'loss': 'absolute_error', 'max_depth': 30, 'n_estimators': 300} -0.011141867841009286
RandomForestRegressor {'max_depth': None} -0.016043291723334657
DecisionTreeRegressor {'max_depth': None} -0.018258438903471112
XGBRegressor {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 600} -0.01235900730327035
LinearRegression {} -0.017552964596504118
RidgeRegression {'alpha': 10} -0.01727625738123621
SVR {'C': 0.01, 'gamma': 1, 'kernel': 'poly'} -0.00975251989649267
KNeighborsRegressor {'n_neighbors': 1, 'weights': 'uniform'} -0.008725328230836011
