In [25]:
import joblib
import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.utils.class_weight import compute_class_weight

# Load the dataset
data_path = "/Users/ThaddaeusBraun/Desktop/Data Science in Public Policy/Project/Model/pythonProject/subset.csv"
subset = pd.read_csv(data_path)
subset = subset[subset["vdn1b"]!= "PSA"]

# Split the data into features and target
X = subset.drop(['vdn1b'], axis=1) 
y = subset['vdn1b']

# Combine SMOTE and ENN for oversampling and cleaning
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

# Further handle any class imbalance with random undersampling
undersample = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = undersample.fit_resample(X_resampled, y_resampled)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_resampled), y=y_resampled)
class_weights = dict(zip(np.unique(y_resampled), class_weights))

# Define the parameter grid for hyperparameter tuning
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': ['sqrt', 'log2']
}

# Initialize the model
rf_model = RandomForestClassifier(random_state=42, class_weight=class_weights)

# Initialize RandomizedSearchCV with StratifiedKFold
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=100, cv=stratified_kfold, n_jobs=-1, random_state=42, verbose=2)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_

# Train the model with the best parameters
best_model = RandomForestClassifier(**best_params, random_state=42, class_weight=class_weights)
best_model.fit(X_train, y_train)

# Make predictions
best_predictions = best_model.predict(X_test)

# Evaluate the model
classification_report_best = classification_report(y_test, best_predictions)

# Display the results
print("Best Parameters:", best_params)
print("\nClassification Report:\n", classification_report_best)

# Perform cross-validation
cv_scores = cross_val_score(best_model, X_resampled, y_resampled, cv=stratified_kfold, scoring='f1_weighted')
print("\nCross-Validation F1 Weighted Scores:", cv_scores)
print("Mean CV F1 Weighted Score:", cv_scores.mean())

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 267}

Classification Report:
               precision    recall  f1-score   support

         CSP       0.71      0.87      0.78        47
         CVP       0.59      0.48      0.53        60
         EVP       0.65      0.51      0.57        51
         FDP       0.61      0.34      0.44        41
         GLP       0.57      0.72      0.64        43
         LPS       0.60      0.77      0.67        47
         LdU       0.73      0.89      0.80        46
         PdA       0.65      0.62      0.64        45
          SP       0.64      0.64      0.64        45
         SVP       0.50      0.47      0.48        43

    accuracy                           0.63       468
   macro avg       0.63      0.63      0.62       468
weighted avg       0.63      0.63      0.62       468

Cross-Validation F1 Weight

In [26]:
# Persist the fitted classifier to disk so it can be reloaded without retraining
model_path = "/Users/ThaddaeusBraun/Desktop/Data Science in Public Policy/Project/Model/pythonProject/best_rf_model.joblib"
joblib.dump(value=best_model, filename=model_path)
print(f"Model saved to {model_path}")

Model saved to /Users/ThaddaeusBraun/Desktop/Data Science in Public Policy/Project/Model/pythonProject/best_rf_model.joblib
