# Gradient Boosting Model training and tuning

## Importing necessary libraries

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import PolynomialFeatures
from scipy.stats import randint
import numpy as np
import joblib

## Load preprocessed subset of Selects data

In [17]:
# Load the Selects subset
data_path = "../data/subset_selected_parties_model.csv"
df = pd.read_csv(data_path)

## Model tuning and training

In [18]:
# Extract features and target
X = df.drop(['vdn1b'], axis=1)
y = df['vdn1b']

In [19]:
# Store original feature names
original_feature_names = X.columns.tolist()

# Polynomial Features
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X)

# Get polynomial feature names
poly_feature_names = poly.get_feature_names_out(original_feature_names)

# Interaction Feature: Age * Income
X_poly_interaction = np.hstack([X_poly, (X['age'].values * X['income'].values).reshape(-1, 1)])

# Create final feature names list
final_feature_names = poly_feature_names.tolist() + ['age_income_interaction']

In [20]:
# Combine SMOTE and ENN for oversampling and cleaning
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_poly_interaction, y)

# Further handle any class imbalance with random undersampling
undersample = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = undersample.fit_resample(X_resampled, y_resampled)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [21]:
# Define the parameter grid for hyperparameter tuning
param_dist = {
    'n_estimators': randint(100, 500),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 10, 20],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': ['sqrt', 'log2']
}

# Initialize the model
gb_model = GradientBoostingClassifier(random_state=42)

# Initialize RandomizedSearchCV with StratifiedKFold
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(estimator=gb_model, param_distributions=param_dist, n_iter=100, cv=stratified_kfold, n_jobs=-1, random_state=42, verbose=2)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_leaf=19, min_samples_split=12, n_estimators=187; total time=   3.3s
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_leaf=19, min_samples_split=12, n_estimators=187; total time=   3.4s
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_leaf=19, min_samples_split=12, n_estimators=187; total time=   3.4s
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_leaf=19, min_samples_split=12, n_estimators=187; total time=   3.3s
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_leaf=19, min_samples_split=12, n_estimators=187; total time=   3.3s
[CV] END learning_rate=0.1, max_depth=20, max_features=sqrt, min_samples_leaf=15, min_samples_split=12, n_estimators=171; total time=   8.3s
[CV] END learning_rate=0.1, max_depth=20, max_features=sqrt, min_samples_leaf=15, min_sampl

In [22]:
# Train the model with the best parameters
best_model_gb = GradientBoostingClassifier(**best_params, random_state=42)
best_model_gb.fit(X_train, y_train)

# Make predictions
best_predictions = best_model_gb.predict(X_test)

# Evaluate the model
classification_report_best = classification_report(y_test, best_predictions)

# Display the results
print("Best Parameters:", best_params)
print("\nClassification Report:\n", classification_report_best)

# Perform cross-validation
cv_scores = cross_val_score(best_model_gb, X_resampled, y_resampled, cv=stratified_kfold, scoring='f1_weighted')
print("\nCross-Validation F1 Weighted Scores:", cv_scores)
print("Mean CV F1 Weighted Score:", cv_scores.mean())

Best Parameters: {'learning_rate': 0.01, 'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 18, 'n_estimators': 198}

Classification Report:
               precision    recall  f1-score   support

         CVP       0.57      0.67      0.62        36
         EVP       0.56      0.50      0.53        30
         FDP       0.64      0.55      0.59        38
         GLP       0.67      0.53      0.59        34
         GPS       0.42      0.37      0.39        30
          SP       0.60      0.72      0.66        29
         SVP       0.41      0.52      0.46        27

    accuracy                           0.55       224
   macro avg       0.55      0.55      0.55       224
weighted avg       0.56      0.55      0.55       224


Cross-Validation F1 Weighted Scores: [0.57171306 0.56574771 0.54226914 0.55185958 0.52899797]
Mean CV F1 Weighted Score: 0.552117492067031


In [23]:
print("Final features used by the model:", final_feature_names)

Final features used by the model: ['sex', 'age', 'educ', 'income', 'religion', 'sg1', 'sg9', 'sc1', 'sc7b', 'pi1', 'pm3', 'vp1', 'pid1', 'sex^2', 'sex age', 'sex educ', 'sex income', 'sex religion', 'sex sg1', 'sex sg9', 'sex sc1', 'sex sc7b', 'sex pi1', 'sex pm3', 'sex vp1', 'sex pid1', 'age^2', 'age educ', 'age income', 'age religion', 'age sg1', 'age sg9', 'age sc1', 'age sc7b', 'age pi1', 'age pm3', 'age vp1', 'age pid1', 'educ^2', 'educ income', 'educ religion', 'educ sg1', 'educ sg9', 'educ sc1', 'educ sc7b', 'educ pi1', 'educ pm3', 'educ vp1', 'educ pid1', 'income^2', 'income religion', 'income sg1', 'income sg9', 'income sc1', 'income sc7b', 'income pi1', 'income pm3', 'income vp1', 'income pid1', 'religion^2', 'religion sg1', 'religion sg9', 'religion sc1', 'religion sc7b', 'religion pi1', 'religion pm3', 'religion vp1', 'religion pid1', 'sg1^2', 'sg1 sg9', 'sg1 sc1', 'sg1 sc7b', 'sg1 pi1', 'sg1 pm3', 'sg1 vp1', 'sg1 pid1', 'sg9^2', 'sg9 sc1', 'sg9 sc7b', 'sg9 pi1', 'sg9 pm3',

In [24]:
# Save the model and feature names
joblib.dump(best_model_gb, '../data/models/best_gb_model.pkl')
joblib.dump(final_feature_names, '../data/models/feature_names.pkl')

['../data/models/feature_names.pkl']