In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import PolynomialFeatures
from scipy.stats import randint
import numpy as np
import joblib


# Load the dataset
data_path = "/Users/ThaddaeusBraun/Desktop/Data Science in Public Policy/Project/Model/pythonProject/subset_parties_model.csv"
subset = pd.read_csv(data_path)

# Extract features and target
X = subset.drop(['vdn1b'], axis=1)
y = subset['vdn1b']

# Polynomial Features
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X)

# Interaction Feature: Age * Income
X_poly_interaction = np.hstack([X_poly, (X['age'].values * X['income'].values).reshape(-1, 1)])

# Combine SMOTE and ENN for oversampling and cleaning
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_poly_interaction, y)

# Further handle any class imbalance with random undersampling
undersample = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = undersample.fit_resample(X_resampled, y_resampled)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_dist = {
    'n_estimators': randint(100, 500),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 10, 20],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': ['sqrt', 'log2']
}

# Initialize the model
gb_model = GradientBoostingClassifier(random_state=42)

# Initialize RandomizedSearchCV with StratifiedKFold
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(estimator=gb_model, param_distributions=param_dist, n_iter=100, cv=stratified_kfold, n_jobs=-1, random_state=42, verbose=2)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_

# Train the model with the best parameters
best_model_gb = GradientBoostingClassifier(**best_params, random_state=42)
best_model_gb.fit(X_train, y_train)

# Make predictions
best_predictions = best_model_gb.predict(X_test)

# Evaluate the model
classification_report_best = classification_report(y_test, best_predictions)

# Display the results
print("Best Parameters:", best_params)
print("\nClassification Report:\n", classification_report_best)

# Perform cross-validation
cv_scores = cross_val_score(best_model_gb, X_resampled, y_resampled, cv=stratified_kfold, scoring='f1_weighted')
print("\nCross-Validation F1 Weighted Scores:", cv_scores)
print("Mean CV F1 Weighted Score:", cv_scores.mean())

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'learning_rate': 0.05, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 15, 'min_samples_split': 3, 'n_estimators': 421}

Classification Report:
               precision    recall  f1-score   support

         CSP       0.71      0.81      0.75        36
         CVP       0.65      0.47      0.55        55
         EVP       0.72      0.73      0.73        45
         FDP       0.45      0.48      0.47        42
         GLP       0.45      0.58      0.51        38
         PdA       0.71      0.67      0.69        52
          SP       0.74      0.76      0.75        49
         SVP       0.56      0.54      0.55        50

    accuracy                           0.62       367
   macro avg       0.62      0.63      0.62       367
weighted avg       0.63      0.62      0.62       367

Cross-Validation F1 Weighted Scores: [0.62070375 0.68565948 0.65431078 0.67118738 0.68925202]
Mean CV F1 Weigh

In [9]:
model_path = "/Users/ThaddaeusBraun/Desktop/Data Science in Public Policy/Project/Model/pythonProject/best_rf_model.joblib"
joblib.dump(best_model_gb, model_path)
print(f"Model saved to {model_path}")

Model saved to /Users/ThaddaeusBraun/Desktop/Data Science in Public Policy/Project/Model/pythonProject/best_rf_model.joblib
