In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
import joblib


### 2. Loading and Inspecting the Data

In [None]:
# Source: WRI Global Power Plant Database (CSV fetched directly from GitHub).
url = 'https://raw.githubusercontent.com/wri/global-power-plant-database/master/output_database/global_power_plant_database.csv'
data = pd.read_csv(url)

# Every later cell operates on `df`. Previously only `data` was defined, so the
# notebook failed with NameError on a fresh kernel. `data` is kept as the
# untouched raw frame; `df` is the working copy that later cells transform.
df = data.copy()

df.head()


In [None]:
# Summary statistics for the numeric columns.
print(df.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that emitted a stray "None" line).
df.info()

In [None]:
# Count of missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:

# Histogram of every numeric column.
df.hist(figsize=(10, 8))
plt.show()


# Correlation heatmap. Correlation is only defined for numeric data, and
# recent pandas versions raise on string columns instead of silently dropping
# them, so select the numeric columns explicitly before calling corr().
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.show()

In [None]:

# Remove exact duplicate rows.
df = df.drop_duplicates()
# Impute missing numeric values with the column mean. numeric_only=True is
# required: taking the mean over string columns raises TypeError on recent
# pandas. Columns without a computed mean (non-numeric ones) are left untouched
# by this step. Reassigning instead of inplace=True keeps the cell re-runnable.
df = df.fillna(df.mean(numeric_only=True))


In [None]:

# Encode the classification target as integer class ids. The fitted encoder is
# kept (it used to be discarded) so predictions can be mapped back to fuel
# names with fuel_encoder.inverse_transform(...).
fuel_encoder = LabelEncoder()
df['primary_fuel'] = fuel_encoder.fit_transform(df['primary_fuel'])
# One-hot encode the remaining object/category columns, dropping the first
# level of each to avoid perfectly collinear indicator columns.
# NOTE(review): if the frame still holds identifier or free-text columns this
# produces a very wide matrix -- confirm which columns should be kept first.
df = pd.get_dummies(df, drop_first=True)

In [None]:

# Standardize capacity_mw in place (zero mean, unit variance).
# NOTE(review): the statistics are computed on the full frame, i.e. before the
# train/test split performed later in the notebook -- confirm this is intended.
scaler = StandardScaler()
capacity = df[['capacity_mw']]
df[['capacity_mw']] = scaler.fit(capacity).transform(capacity)

In [None]:

# Targets: fuel type (classification) and capacity (regression).
y_class = df['primary_fuel']
y_reg = df['capacity_mw']
# Features: everything except the two target columns.
target_columns = ['primary_fuel', 'capacity_mw']
X = df.drop(columns=target_columns)

In [None]:

# 80/20 hold-out split, once per task. Both calls share the same seed and
# test fraction.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_class, **split_options)
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_reg, **split_options)


In [None]:

# Logistic Regression
# The feature matrix is not standardized (only capacity_mw is scaled, and that
# column is dropped from X), so the solver's default budget of 100 iterations
# is likely to stop before convergence. Give it more room.
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_c, y_train_c)
y_pred_log = logistic_model.predict(X_test_c)

# Weighted F1 averages per-class F1 by class support.
print("Logistic Regression - Accuracy:", accuracy_score(y_test_c, y_pred_log))
print("Logistic Regression - F1 Score:", f1_score(y_test_c, y_pred_log, average='weighted'))

In [None]:

# Decision Tree Classifier
# Seeded so the fitted tree (and therefore the reported metrics) is the same
# on every run; 42 matches the seed used for the train/test split.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train_c, y_train_c)
y_pred_tree = tree_model.predict(X_test_c)

print("Decision Tree - Accuracy:", accuracy_score(y_test_c, y_pred_tree))
print("Decision Tree - F1 Score:", f1_score(y_test_c, y_pred_tree, average='weighted'))

In [None]:

# Random Forest Classifier
# Seeded so bootstrap sampling and feature sub-sampling are reproducible;
# 42 matches the seed used for the train/test split.
forest_model = RandomForestClassifier(random_state=42)
forest_model.fit(X_train_c, y_train_c)
y_pred_forest = forest_model.predict(X_test_c)

print("Random Forest - Accuracy:", accuracy_score(y_test_c, y_pred_forest))
print("Random Forest - F1 Score:", f1_score(y_test_c, y_pred_forest, average='weighted'))


### Model Building & Evaluation - Regression Task (Capacity MW)

In [None]:

# Linear Regression
# Ordinary least squares baseline for the capacity target.
linear_model = LinearRegression()
linear_model.fit(X_train_r, y_train_r)
y_pred_lin = linear_model.predict(X_test_r)

# Report the three regression metrics on the held-out set.
for label, metric in (
    ("MAE", mean_absolute_error),
    ("MSE", mean_squared_error),
    ("R2 Score", r2_score),
):
    print(f"Linear Regression - {label}:", metric(y_test_r, y_pred_lin))

In [None]:

# Decision Tree Regressor
# Seeded so the fitted tree (and therefore the reported metrics) is the same
# on every run; 42 matches the seed used for the train/test split.
tree_regressor = DecisionTreeRegressor(random_state=42)
tree_regressor.fit(X_train_r, y_train_r)
y_pred_tree_reg = tree_regressor.predict(X_test_r)

# Errors are in the units of y_reg (capacity_mw after standardization).
print("Decision Tree Regressor - MAE:", mean_absolute_error(y_test_r, y_pred_tree_reg))
print("Decision Tree Regressor - MSE:", mean_squared_error(y_test_r, y_pred_tree_reg))
print("Decision Tree Regressor - R2 Score:", r2_score(y_test_r, y_pred_tree_reg))

In [None]:

# Random Forest Regressor
# Seeded so bootstrap sampling and feature sub-sampling are reproducible;
# 42 matches the seed used for the train/test split.
forest_regressor = RandomForestRegressor(random_state=42)
forest_regressor.fit(X_train_r, y_train_r)
y_pred_forest_reg = forest_regressor.predict(X_test_r)

# Errors are in the units of y_reg (capacity_mw after standardization).
print("Random Forest Regressor - MAE:", mean_absolute_error(y_test_r, y_pred_forest_reg))
print("Random Forest Regressor - MSE:", mean_squared_error(y_test_r, y_pred_forest_reg))
print("Random Forest Regressor - R2 Score:", r2_score(y_test_r, y_pred_forest_reg))


### Hyperparameter Tuning

In [None]:

# Hyperparameter tuning for Random Forest Classifier
# Exhaustive search over 3 x 3 = 9 combinations with 5-fold cross-validation.
param_grid_c = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7]}
# The base estimator is seeded so that differences between candidates come
# from the hyperparameters rather than from random forest noise, and so the
# selected (and later saved) model is the same on every run.
grid_c = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_c, cv=5)
grid_c.fit(X_train_c, y_train_c)

print("Best parameters for Random Forest Classifier:", grid_c.best_params_)

In [None]:
# Hyperparameter tuning for Random Forest Regressor
# Exhaustive search over 3 x 3 = 9 combinations with 5-fold cross-validation.
param_grid_r = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7]}
# The base estimator is seeded so that differences between candidates come
# from the hyperparameters rather than from random forest noise, and so the
# selected (and later saved) model is the same on every run.
grid_r = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_r, cv=5)
grid_r.fit(X_train_r, y_train_r)

print("Best parameters for Random Forest Regressor:", grid_r.best_params_)


In [None]:

# Persist the refit winners of both grid searches with joblib.
# NOTE(review): the fitted `scaler` used on capacity_mw is not saved here, so
# the regression model's predictions stay in standardized units -- confirm
# whether it should be persisted alongside the models.
best_classifier = grid_c.best_estimator_
best_regressor = grid_r.best_estimator_
joblib.dump(best_classifier, 'best_classification_model.pkl')
joblib.dump(best_regressor, 'best_regression_model.pkl')
