In [6]:
# Libraries & packages
import pandas as pd
import numpy as np

# Import pre-processed data

In [7]:
# Import and read the pre-processed data
data = pd.read_csv('/Users/tiagovhp/Ironhack/Final_Project/dataset/data_pre-processed.csv')

# Inspect the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2520 entries, 0 to 2519
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   LATITUDE                 2520 non-null   float64
 1   LONGITUDE                2520 non-null   float64
 2   MIN_MAXELECTRIC_POWER_W  2520 non-null   int64  
 3   MAX_MAXELECTRIC_POWER_W  2520 non-null   int64  
 4   CHARGING_CAPACITY        2520 non-null   int64  
 5   CHARGE_PERCENTAGE        2520 non-null   float64
 6   motorway                 2520 non-null   int64  
 7   trunk                    2520 non-null   int64  
 8   primary                  2520 non-null   int64  
 9   secondary                2520 non-null   int64  
 10  POPULATION               2520 non-null   int64  
 11  POP_DENSITY              2520 non-null   float64
 12  AGE_20                   2520 non-null   float64
 13  AGE_40                   2520 non-null   float64
 14  AGE_60                  

# Train-Test split

In [8]:
# Split the data into training and test for final model
from sklearn.model_selection import train_test_split

# Define the number of bins
n_bins = 10

# Create bins for population density
data['POP_DENSITY_bin'] = pd.qcut(data['POP_DENSITY'], q=n_bins, labels=False)

# Perform stratified train-test split
train, test = train_test_split(data, test_size=0.2, stratify=data['POP_DENSITY_bin'], random_state=42)

# Drop the bin column after the split if it's no longer needed
train = train.drop(columns=['POP_DENSITY_bin'])
test = test.drop(columns=['POP_DENSITY_bin'])


# Features and labels

In [9]:
# Features
X_train = train.drop(['CHARGE_PERCENTAGE'],axis=1)
X_test = test.drop(['CHARGE_PERCENTAGE'],axis=1)

# Label
y_train = train['CHARGE_PERCENTAGE']
y_test = test['CHARGE_PERCENTAGE']



# Feature scaling

In [10]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Standardize train features (fit_transform)
X_train_scaled = scaler.fit_transform(X_train)

# Standardize test features using stats of train features (transform)
X_test_scaled = scaler.transform(X_test)

# Convert back into pandas DataFrame
X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled,  columns = X_test.columns)

# Model Selection : Regressor

Compare
- Linear Regression
- Decision Tree Regressor
- Random Forest Regreesor
- Gradient Boosting Regressor
- Support Vector Regressor (SVR)
- k-Nearest Neighbors Regressor (KNN)
we will compare general performance metrics for all these models and choose two with the best score.

In [11]:
# Import necessary packages for each model

# Linear Regression
from sklearn.linear_model import LinearRegression

# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Gradient Boosting Regressor (from scikit-learn)
from sklearn.ensemble import GradientBoostingRegressor

# Support Vector Regressor (SVR)
from sklearn.svm import SVR

# k-Nearest Neighbors Regressor (KNN)
from sklearn.neighbors import KNeighborsRegressor

# cross valiadation package
from sklearn.model_selection import cross_val_score

# performance metrics package
from sklearn.metrics import  mean_absolute_error, mean_squared_error,r2_score


In [12]:
# Dictionary for models
models = {"Linear Regression": LinearRegression(),
          "Decision Tree Regressor" : DecisionTreeRegressor(max_depth=10),
          "Random Forest Regreesor" : RandomForestRegressor(n_estimators=100),
          "Gradient Boosting Regressor" : GradientBoostingRegressor(n_estimators=100),
          "Support Vector Regressor" : SVR(C=1.0),
          "k-Nearest Neighbors Regressor" : KNeighborsRegressor(n_neighbors=5)}

In [13]:
# Cross validation
for name, model in models.items():
    score = cross_val_score(estimator = model, X = X_train_scaled, y = y_train, cv = 5, scoring='r2')
    print(f'{name} mean score : {np.mean(score)}')
    print('-' * 40)

Linear Regression mean score : 0.7383190516185417
----------------------------------------
Decision Tree Regressor mean score : 0.84429002853468
----------------------------------------
Random Forest Regreesor mean score : 0.8721891985315026
----------------------------------------
Gradient Boosting Regressor mean score : 0.8702604681545726
----------------------------------------
Support Vector Regressor mean score : 0.8635486796875711
----------------------------------------
k-Nearest Neighbors Regressor mean score : 0.8794878901397076
----------------------------------------


In [14]:
# we perform grid search with the top 3 regressors: Random Forest, Gradient Boosting and k-NN

# import required libraries
from sklearn.model_selection import GridSearchCV

In [15]:
# Hyperparameter grids

## Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],              # Number of trees in the forest
    'max_depth': [10, 20, 30, None],              # Control overfitting by limiting depth
    'min_samples_split': [2, 5, 10],              # Minimum samples to split an internal node
    'min_samples_leaf': [1, 2, 4],                # Minimum samples in each leaf
    'max_features': ['auto', 'sqrt', 'log2'],     # Number of features to consider for splits
    'bootstrap': [True, False]                    # Use bootstrapped samples or not
}

## Gradient Boosting
gb_param_grid = {
    'n_estimators': [100, 200, 300],               # Number of boosting stages
    'learning_rate': [0.01, 0.05, 0.1, 0.2],      # Controls contribution of each tree
    'max_depth': [3, 5, 7, 9],                    # Depth of each tree
    'min_samples_split': [2, 5, 10],              # Minimum samples to split an internal node
    'min_samples_leaf': [1, 2, 4],                # Minimum samples in each leaf
    'subsample': [0.6, 0.8, 1.0],                 # Fraction of samples for each tree
    'max_features': ['auto', 'sqrt', 'log2']      # Number of features for each tree
}

# k-NN
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],             # Try a range of neighbors
    'weights': ['uniform', 'distance'],           # Weight by distance or keep uniform
    'metric': ['euclidean', 'manhattan', 'minkowski']  # Different distance metrics
}

In [16]:
# Gridsearch
from sklearn.model_selection import GridSearchCV

## Random Forest
rf_grid_search = GridSearchCV(estimator = RandomForestRegressor(), param_grid = rf_param_grid, scoring = 'r2', cv=5)
rf_grid_search.fit(X_train_scaled,y_train)

# Best parameters and scores
rf_best_params = rf_grid_search.best_params_
rf_best_score = rf_grid_search.best_score_


1080 fits failed out of a total of 3240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1080 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/tiagovhp/Ironhack/Final_Project/env/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tiagovhp/Ironhack/Final_Project/env/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/tiagovhp/Ironhack/Final_Project/env/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/tiagovhp/Ironhack/Final_Project/env/lib/python3.1

In [17]:
## Gradient Boosting
gb_grid_search = GridSearchCV(estimator = GradientBoostingRegressor(), param_grid = gb_param_grid, scoring = 'r2', cv=5)
gb_grid_search.fit(X_train_scaled,y_train)

# Best parameters and scores
gb_best_params = gb_grid_search.best_params_
gb_best_score = gb_grid_search.best_score_

6480 fits failed out of a total of 19440.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6480 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/tiagovhp/Ironhack/Final_Project/env/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tiagovhp/Ironhack/Final_Project/env/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/tiagovhp/Ironhack/Final_Project/env/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/tiagovhp/Ironhack/Final_Project/env/lib/python3.

In [18]:
## k-NN
knn_grid_search = GridSearchCV(estimator = KNeighborsRegressor(), param_grid = knn_param_grid, scoring = 'r2', cv=5)
knn_grid_search.fit(X_train_scaled,y_train)

# Best parameters and scores
knn_best_params = knn_grid_search.best_params_
knn_best_score = knn_grid_search.best_score_

In [19]:
# Print the results
print(f'Random Forest Regressor best Score: {rf_best_score}')
print(f'Random Forest Regressor best hyperparameters: {rf_best_params}')
print('-' * 40)
print(f'Gradient Boosting Regressor best Score: {gb_best_score}')
print(f'Gradient Boosting Regressor best hyperparameters: {gb_best_params}')
print('-' * 40)
print(f'k-NN Regressor best Score: {knn_best_score}')
print(f'k-NN Regressor best hyperparameters: {knn_best_params }')

Random Forest Regressor best Score: 0.8901641690480148
Random Forest Regressor best hyperparameters: {'bootstrap': True, 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
----------------------------------------
Gradient Boosting Regressor best Score: 0.8920426142078405
Gradient Boosting Regressor best hyperparameters: {'learning_rate': 0.05, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300, 'subsample': 1.0}
----------------------------------------
k-NN Regressor best Score: 0.8806962512918262
k-NN Regressor best hyperparameters: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'uniform'}


# Model Test & Evaluate

In [20]:
'''we choose the model with best performance:
    model: Gradient Boosting Regressor
    hyperparameters:'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200, 'subsample': 1.0
'''

# Instantiate the final model
trained_model = GradientBoostingRegressor(learning_rate = 0.1, max_depth= 5, max_features = 'log2', min_samples_leaf = 2, min_samples_split = 5, n_estimators = 200, subsample = 1.0)

# Train the model in the training data
trained_model.fit(X_train_scaled,y_train)



In [21]:
# Test on test set
y_pred = trained_model.predict(X_test_scaled)

# Evaluate the model : performance on test set
mse = mean_squared_error(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

# Print the resuls:
print('Performance metrics of Trained model:')
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'r2 score: {r2}')

Performance metrics of Trained model:
Mean Squared Error: 0.0073741312312610476
Mean Absolute Error: 0.05913409391995698
r2 score: 0.9050962182314624


# Saving and Exporting the Model

In [22]:
import pickle

# Save the model
pickle.dump(trained_model, open("trained_model.pkl","wb"))

# Save the scaler ->  to be used when making predictions on new data
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)