In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV, HalvingRandomSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error,mean_absolute_percentage_error, r2_score

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from xgboost import XGBRegressor, XGBClassifier
from sklearn.neural_network import MLPRegressor
import warnings
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.preprocessing import FunctionTransformer
import joblib

warnings.filterwarnings('ignore')

### 3.1 Data Preprocessing

# Models Training

In [15]:
# Load training set
X_train = pd.read_csv('Data_n_model/X_train.csv')
y_train = pd.read_csv('Data_n_model/y_train.csv')

# Load testing set
X_test = pd.read_csv('Data_n_model/X_test.csv')
y_test = pd.read_csv('Data_n_model/y_test.csv')

In [23]:
# Metrics
def print_metrics(model_name, grid=None):
    train_pred = model_name.predict(X_train)
    test_pred = model_name.predict(X_test)
    print(f'Training Set\nRMSE: {np.sqrt(mean_squared_error(y_train, train_pred))}')
    print(f'MAE: {mean_absolute_error(y_train, train_pred)}')
    print(f'MAPE: {mean_absolute_percentage_error(y_train, train_pred)}')
    print(f'R2: {r2_score(y_train, train_pred)}\n\nTesting Set')
    print(f'RMSE: {np.sqrt(mean_squared_error(y_test, test_pred))}')
    print(f'MAE: {mean_absolute_error(y_test, test_pred)}')
    print(f'MAPE: {mean_absolute_percentage_error(y_test, test_pred)}')
    print(f'R2: {r2_score(y_test, test_pred)}')
    if grid:
        print(f'\n\nBest Estimator: {grid.best_estimator_}')
        print(f'Best Parameters: {grid.best_params_}')


## 1. Linear Regression

### Training

In [4]:
# Linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)
# Save the tuned model
joblib.dump(lr, 'r.pkl')

# Predictions
y_pred_train_slr = lr.predict(X_train)

### Model evaluation and Feature importance 

In [24]:
print_metrics(lr)

Training Set
RMSE: 90.79211205697973
MAE: 50.89764255574441
MAPE: 0.5843002561141697
R2: 0.2050113097803734

Testing Set
RMSE: 93.33888468844106
MAE: 52.69570060021923
MAPE: 0.5875019384299192
R2: 0.20296438308110387


## 2. RandomForest Model

### Training

In [17]:
# Linear regression model
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Predictions
y_pred_train_srf = rf.predict(X_train)

### Hyper-parameter Tuning

In [30]:
# Hyperparameter tuning for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 500, 1000],
    'max_depth': [None, 10, 20],
}
rf_grid = HalvingRandomSearchCV(rf, rf_param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2, random_state=42)
rf_grid.fit(X_train, y_train)
best_rf = rf_grid.best_estimator_
joblib.dump(best_rf, 'best_rf.pkl')

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 7
min_resources_: 20
max_resources_: 14801
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 12
n_resources: 20
Fitting 10 folds for each of 12 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 4
n_resources: 60
Fitting 10 folds for each of 4 candidates, totalling 40 fits
----------
iter: 2
n_candidates: 2
n_resources: 180
Fitting 10 folds for each of 2 candidates, totalling 20 fits


['best_rf.pkl']

In [25]:
print_metrics(best_rf, rf_grid)

Training Set
RMSE: 64.93426631469589
MAE: 34.564103087842874
MAPE: 0.3780695315297801
R2: 0.5933576673789547

Testing Set
RMSE: 80.71557183420876
MAE: 41.351539363275265
MAPE: 0.4240007320258586
R2: 0.4039713026420939


Best Estimator: RandomForestRegressor(max_depth=10, n_estimators=1000, random_state=42)
Best Parameters: {'n_estimators': 1000, 'min_samples_split': 2, 'max_depth': 10}


## 3. Gradient Boosting

### Training

In [28]:
# Gradient Boosting Regressor
gbm = GradientBoostingRegressor(random_state=1)
gbm.fit(X_train, y_train)

# Predictions
y_pred_train_sgbm = gbm.predict(X_train)

### Hyperparameter Tuning

In [31]:
# Hyperparameter tuning for Gradient Boosting
gbm_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}
gbm_grid = HalvingRandomSearchCV(gbm, gbm_param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2, random_state=42)
gbm_grid.fit(X_train, y_train)
best_gbm = gbm_grid.best_estimator_
joblib.dump(best_gbm, 'best_gbm.pkl')

print("\nTesting Metrics - Hyper-tuned model:")
print_metrics(best_gbm, gbm_grid)

n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 7
min_resources_: 20
max_resources_: 14801
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 8
n_resources: 20
Fitting 10 folds for each of 8 candidates, totalling 80 fits
----------
iter: 1
n_candidates: 3
n_resources: 60
Fitting 10 folds for each of 3 candidates, totalling 30 fits

Testing Metrics - Hyper-tuned model:
Training Set
RMSE: 89.82012984294454
MAE: 48.3058250713096
MAPE: 0.5504144012980455
R2: 0.22194182652784344

Testing Set
RMSE: 93.06992007902366
MAE: 50.22039974565596
MAPE: 0.5501204069603829
R2: 0.20755122819589866


Best Estimator: GradientBoostingRegressor(learning_rate=0.01, n_estimators=200, random_state=1)
Best Parameters: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.01}


## 4. Support Vector Machines

### Training

In [33]:
# Support Vector Classification
svm = SVR(kernel='linear')
svm.fit(X_train, y_train)

# Predcition
y_pred_train_ssvm = svm.predict(X_train)

### Hyper-parameter tuning

In [None]:
# Hyperparameter tuning for SVM
svm_param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
}
svm_grid = RandomizedSearchCV(svm, svm_param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2, random_state=42)
svm_grid.fit(X_train, y_train)
best_svm = svm_grid.best_estimator_
joblib.dump(best_svm, 'best_svm.pkl')

print_metrics(best_svm, svm_grid)

### Feature importance

In [None]:
print("Training Metrics - Standard model:")
print_metrics(y_train, y_pred_train_ssvm)

print("\nTraining Metrics - Hyper-tuned model:")
print_metrics(y_train, y_pred_train_bsvm)

## 5. Logistic Model: Dependent is continuous so not applicable

### Training

### Training Logistic Regression
logit = LogisticRegression(max_iter=1000, random_state=1)
logit.fit(X_train, y_train)

y_pred_train_slogit = logit.predict(X_train)


## 6. Decision Tree Regressor


### Training

In [None]:
dt = DecisionTreeRegressor(random_state=1)
dt.fit(X_train, y_train)

# Predictions
y_pred_train_sdt = dt.predict(X_train)

### Hyperparameter Tuning

In [None]:
dt_param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
dt_grid = RandomizedSearchCV(dt, dt_param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2, random_state=42)
dt_grid.fit(X_train, y_train)
best_dt = dt_grid.best_estimator_
joblib.dump(best_dt, 'best_dt.pkl')

# Predictions
y_pred_train_bdt = best_dt.predict(X_train)

### Model evaluation and Feature Importance

In [None]:
# print metrics
print("Training Metrics - Standard model:")
print_metrics(y_train, y_pred_train_sdt)

print("\nTraining Metrics - Hyper-tuned model:")
print_metrics(y_train, y_pred_train_bdt)

In [None]:
# Plot feature importances: Standard model
importances = dt.feature_importances_
indices = np.argsort(importances)[::-1]
features = X_train.columns  # Assuming df has columns attribute

plt.figure(figsize=(12, 6))
plt.title("Feature importances: Standard model")
plt.bar(range(X_train.shape[1]), importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), [features[i] for i in indices], rotation=90)
plt.tight_layout()
plt.savefig('feature_importances_standard_dt.png', dpi=300)
plt.show()

# Plot feature importances: Hyper-tuned model
importances = best_dt.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(12, 6))
plt.title("Feature importances: Hyper-tuned model")
plt.bar(range(X_train.shape[1]), importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), [features[i] for i in indices], rotation=90)
plt.tight_layout()
plt.savefig('feature_importances_hyper_dt.png', dpi=300)
plt.show()

## 7. Extra Trees Regressor

### Training

In [None]:
# Training Extra Trees Regressor
et = ExtraTreesRegressor(random_state=1)
et.fit(X_train, y_train)

# Predictions
y_pred_train_set = et.predict(X_train)

### Hyperparameter Tuning for Extra Trees

In [None]:
et_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
et_grid = RandomizedSearchCV(et, et_param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2, random_state=42)
et_grid.fit(X_train, y_train)
best_et = et_grid.best_estimator_
joblib.dump(best_et, 'best_et.pkl')

# Predictions
y_pred_train_bet = best_et.predict(X_train)

### Feature Importance

In [None]:
# Model Metrics
print("Training Metrics - Standard model:")
print_metrics(y_train, y_pred_train_set)

print("\nTraining Metrics - Hyper-tuned model:")
print_metrics(y_train, y_pred_train_bet)

In [None]:
# Plot feature importances: Standard model
importances = et.feature_importances_
indices = np.argsort(importances)[::-1]
features = X_train.columns  # Assuming df has columns attribute

plt.figure(figsize=(12, 6))
plt.title("Feature importances: Standard model")
plt.bar(range(X_train.shape[1]), importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), [features[i] for i in indices], rotation=90)
plt.tight_layout()
plt.savefig('feature_importances_standard_et.png', dpi=300)
plt.show()

# Plot feature importances: Hyper-tuned model
importances = best_et.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(12, 6))
plt.title("Feature importances: Hyper-tuned model")
plt.bar(range(X_train.shape[1]), importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), [features[i] for i in indices], rotation=90)
plt.tight_layout()
plt.savefig('feature_importances_hyper_et.png', dpi=300)
plt.show()

## 8. Neural Network

### Training

In [None]:
nn = MLPRegressor(random_state=1)
nn.fit(X_train, y_train)

# Predictions
y_pred_train_snn = nn.predict(X_train)

### Hyperparameter Tuning for Neural Network

In [None]:
# parameters
nn_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam']
}
nn_grid = RandomizedSearchCV(nn, nn_param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2, random_state=42)
nn_grid.fit(X_train, y_train)
best_nn = nn_grid.best_estimator_
joblib.dump(best_nn, 'best_nn.pkl')

# Predictions
y_pred_train_bnn = best_nn.predict(X_train)

### Training Metrics

In [None]:
# Print Training Metrics
print("Training Metrics - Standard model:")
print_metrics(y_train, y_pred_train_snn)

print("\nTraining Metrics - Hyper-tuned model:")
print_metrics(y_train, y_pred_train_bnn)

## 9. XGBOOST

### Training

In [None]:
# Training Extra Trees Regressor
y_train_xgb = pd.cut(y_train, bins=10, labels=False)

xgb = XGBClassifier(objective = 'binary:logistic', seed = 1, nthread=4, use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train_xgb)

# Predictions
y_pred_train_sxgb = xgb.predict(X_train)

### Hyperparameter Tuning for XG Boost

In [None]:
# Define the parameter grid
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}
xgb_grid = RandomizedSearchCV(xgb, xgb_param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2, random_state=42)
xgb_grid.fit(X_train, y_train_xgb)
best_xgb = xgb_grid.best_estimator_
joblib.dump(best_xgb, 'best_xgb.pkl')

# Predictions
y_pred_train_bxgb = best_et.predict(X_train)

### Feature Importance

In [None]:
# Model Metrics
print("Training Metrics - Standard model:")
print_metrics(y_train_xgb, y_pred_train_sxgb)

print("\nTraining Metrics - Hyper-tuned model:")
print_metrics(y_train_xgb, y_pred_train_bxgb)

In [None]:
# Plot feature importances: Standard model
importances = xgb.feature_importances_
indices = np.argsort(importances)[::-1]
features = X_train.columns  # Assuming df has columns attribute

plt.figure(figsize=(12, 6))
plt.title("Feature importances: Standard model")
plt.bar(range(X_train.shape[1]), importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), [features[i] for i in indices], rotation=90)
plt.tight_layout()
plt.savefig('feature_importances_standard_et.png', dpi=300)
plt.show()

# Plot feature importances: Hyper-tuned model
importances = best_xgb.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(12, 6))
plt.title("Feature importances: Hyper-tuned model")
plt.bar(range(X_train.shape[1]), importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), [features[i] for i in indices], rotation=90)
plt.tight_layout()
plt.savefig('feature_importances_hyper_et.png', dpi=300)
plt.show()