In [1]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm.notebook import tqdm

In [2]:
import contextlib
import joblib

@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """
    Context manager to display a tqdm progress bar for joblib parallel jobs.
    """
    class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)

        def __call__(self, *args, **kwargs):
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    old_batch_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
    try:
        yield
    finally:
        joblib.parallel.BatchCompletionCallBack = old_batch_callback
        tqdm_object.close()

### 1. Load feature vectors

In [3]:
print("Loading data from pickle files...")

# Load the datasets from the Pickle files
train_df = pd.read_pickle('../data/extracted/train.pkl')
print(f"Loaded 'train.pkl' with {len(train_df)} rows.")

test_df = pd.read_pickle('../data/extracted/test.pkl')
print(f"Loaded 'test.pkl' with {len(test_df)} rows.")

# --- Verify one of the loaded DataFrames ---
print("\nVerifying the first 5 rows of the loaded training data:")
train_df.head()

Loading data from pickle files...
Loaded 'train.pkl' with 4152 rows.
Loaded 'test.pkl' with 1039 rows.

Verifying the first 5 rows of the loaded training data:


Unnamed: 0,user_id,occupation_code,category,aggregated_words,fv
5127,265383481,8,"Process, Plant and Machine Operatives","[abandoned, abilities, able, able, able, able,...","[0.12392794012115328, 0.19688847030447434, 0.1..."
3607,22364420,5,Skilled Trades Occupations,"[abandoned, abiding, ability, ability, able, a...","[0.07217657331213086, 0.1431269316529759, 0.06..."
1689,16797684,2,Professional Occupations,"[ability, ability, able, able, absolutely, acc...","[0.3008244936371708, 0.16110806083995646, -0.2..."
4942,14871013,3,Associate Professional and Technical Occupations,"[abandon, abilities, ability, ability, able, a...","[0.04091567286601898, 0.14582023397535762, -0...."
4317,75687820,6,"Caring, Leisure and Other Service Occupations","[ability, ability, ability, absolutely, abuse,...","[-0.024895935193045577, 0.12986074412173476, -..."


In [27]:
class_names = sorted(test_df['occupation_code'].unique().tolist())
class_names = [str(item) for item in class_names]

print(class_names)
print("Successfully loaded classes from file.")

['1', '2', '3', '4', '5', '6', '7', '8', '9']
Successfully loaded classes from file.


### 2. Preparing Data for Scikit-learn

In [5]:
X_train = np.vstack(train_df['fv'].values)
X_test = np.vstack(test_df['fv'].values)

y_train = train_df['occupation_code'].values - 1
y_test = test_df['occupation_code'].values - 1

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Class names for reports: {class_names}")

Shape of X_train: (4152, 50)
Shape of y_train: (4152,)
Class names for reports: ['Managers, Directors and Senior Officials', 'Professional Occupations', 'Associate Professional and Technical Occupations', 'Administrative and Secretarial Occupations', 'Skilled Trades Occupations', 'Caring, Leisure and Other Service Occupations', 'Sales and Customer Service Occupations', 'Process, Plant and Machine Operatives', 'Elementary Occupations']


### 3. Training and validation (with grid search CV)

In [6]:
def run_grid_search(model_name, pipeline, param_grid, X_train, y_train):
    """
    Takes a pipeline, parameters, and data, runs GridSearchCV, evaluates on the
    validation set, and prints a formatted report.
    """
    print(f"\n========== Starting GridSearchCV for: {model_name} ==========")
    start_time = time.time()
    
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=5, # 5-fold cross-validation
        scoring='f1_macro',
        n_jobs=24, # Use all available cores
        verbose=0
    )
    
    if isinstance(param_grid, list):
        # If it's a list of dictionaries, sum the combinations from each dict
        n_combinations = sum(
            np.prod([len(v) for v in d.values()]) for d in param_grid
        )
    else:
        # If it's a single dictionary, calculate combinations directly
        n_combinations = np.prod([len(v) for v in param_grid.values()])
    
    total_fits = n_combinations * grid_search.cv
    
    print(f"Running GridSearchCV for {model_name} using {total_fits} configurations...")
    with tqdm_joblib(tqdm(desc=f"{model_name} Grid Search", total=total_fits)):
        grid_search.fit(X_train, y_train)
    
    end_time = time.time()
    
    # Evaluate the best model found on the separate validation set
    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_train)

    weighted_f1 = f1_score(y_train, y_pred, average='weighted', labels=class_names)
    accuracy = accuracy_score(y_train, y_pred)
    
    print(f"\n--- {model_name} Results ---")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"F1 macro score on training set: {grid_search.best_score_:.4f}")
    print(f"Weighted F1 score on training set: {weighted_f1:.4f}")
    print(f"Total Search Time: {end_time - start_time:.2f} seconds")
    print("=" * 60)

    model_scores = {
        'accuracy': accuracy,
        'f1_macro': grid_search.best_score_,
        'f1_weighted': weighted_f1,
    }
    
    # Return the validation score for final comparison
    return best_model, model_scores

In [7]:
scores = {}
best_models = {}

#### Logistic Regression (Linear Baseline)

In [8]:
# --- Logistic Regression (Baseline) ---
pipeline_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42))
])

param_grid_lr = [
    # Configuration for the 'saga' solver
    {
        'scaler': [StandardScaler(), MinMaxScaler(), 'passthrough'],
        'classifier__solver': ['saga'],
        'classifier__penalty': ['l1', 'l2', 'elasticnet'],
        'classifier__l1_ratio': np.linspace(0.1, 0.9, 3),
        'classifier__max_iter': [1000, 2500]
    },
    # Configuration for 'lbfgs' solver
    {
        'scaler': [StandardScaler(), MinMaxScaler(), 'passthrough'],
        'classifier__solver': ['lbfgs'],
        'classifier__penalty': ['l2'],
        'classifier__max_iter': [1000, 2500]
    }
]

best_models['Logistic Regression'], scores['Logistic Regression'] = run_grid_search(
    'Logistic Regression', pipeline_lr, param_grid_lr, X_train, y_train
)


Running GridSearchCV for Logistic Regression using 300 configurations...


Logistic Regression Grid Search:   0%|          | 0/300 [00:00<?, ?it/s]




--- Logistic Regression Results ---
Best Parameters: {'classifier__l1_ratio': np.float64(0.1), 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga', 'scaler': StandardScaler()}
Best Cross-Validation Accuracy (on train set): 0.2844
Total Search Time: 50.81 seconds


#### Random Forest

In [9]:
# --- Random Forest ---
pipeline_rf = Pipeline([
    ('scaler', 'passthrough'), # RF is not sensitive to feature scaling
    ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1))
])

param_grid_rf = {
    # Test a standard number of trees vs. a larger number
    'classifier__n_estimators': [100, 300, 500, 1000],

    # Test a constrained tree depth vs. a deeper one and a fully grown one
    'classifier__max_depth': [10, 20, 40, None],

    # Test the two most common strategies for feature selection
    'classifier__max_features': ['sqrt', 'log2'],

    # Control overfitting by setting a minimum leaf size
    'classifier__min_samples_leaf': [1, 2, 4],
    
    # Check if balancing class weights helps (crucial for imbalanced data)
    'classifier__class_weight': [None, 'balanced_subsample']
}


best_models['Random Forest'], scores['Random Forest'] = run_grid_search(
    'Random Forest', pipeline_rf, param_grid_rf, X_train, y_train
)


Running GridSearchCV for Random Forest using 960 configurations...


Random Forest Grid Search:   0%|          | 0/960 [00:00<?, ?it/s]




--- Random Forest Results ---
Best Parameters: {'classifier__class_weight': 'balanced_subsample', 'classifier__max_depth': 20, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 4, 'classifier__n_estimators': 300}
Best Cross-Validation Accuracy (on train set): 0.3204
Total Search Time: 225.08 seconds


#### XGBoost

In [10]:
# --- XGBoost ---
pipeline_xgb = Pipeline([
    ('scaler', 'passthrough'),
    ('classifier', XGBClassifier(objective='multi:softprob', random_state=42))
])

param_grid_xgb = {
    # Number of boosting rounds. Often tuned first with a fixed learning rate.
    'classifier__n_estimators': [100, 200, 300, 400],

    # Step size shrinkage to prevent overfitting.
    'classifier__learning_rate': [0.01, 0.05, 0.1],

    # Maximum depth of a tree. Controls model complexity.
    'classifier__max_depth': [3, 5, 7],

    # L1 regularization term on weights.
    'classifier__reg_alpha': [0, 0.1, 1],
    
    # L2 regularization term on weights.
    'classifier__reg_lambda': [0.1, 1, 10],
}

best_models['XGBoost'], scores['XGBoost'] = run_grid_search(
    'XGBoost', pipeline_xgb, param_grid_xgb, X_train, y_train
)


Running GridSearchCV for XGBoost using 1620 configurations...


XGBoost Grid Search:   0%|          | 0/1620 [00:00<?, ?it/s]




--- XGBoost Results ---
Best Parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 7, 'classifier__n_estimators': 400, 'classifier__reg_alpha': 0, 'classifier__reg_lambda': 1}
Best Cross-Validation Accuracy (on train set): 0.3281
Total Search Time: 1473.32 seconds


### SVM

In [11]:
# --- D) Support Vector Machine (SVM) ---
pipeline_svm = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=42, probability=True))
])

param_grid_svm = [
    # --- Configuration for the RBF (Radial Basis Function) kernel ---12
    {
        'scaler': [StandardScaler(), MinMaxScaler()],
        'classifier__kernel': ['rbf'],
        'classifier__C': np.logspace(-2, 2, 5),      # [0.01, 0.1, 1, 10, 100]
        'classifier__gamma': np.logspace(-2, 2, 5),
        'classifier__class_weight': [None, 'balanced']
    },
    
    # --- Configuration for the Linear kernel ---
    {
        'scaler': [StandardScaler(), MinMaxScaler()],
        'classifier__kernel': ['linear'],
        'classifier__C': np.logspace(-2, 2, 5),
        'classifier__class_weight': [None, 'balanced']
    },
    
    # --- Configuration for the Polynomial kernel ---
    {
        'scaler': [StandardScaler(), MinMaxScaler()],
        'classifier__kernel': ['poly'],
        'classifier__C': np.logspace(-2, 2, 5),
        'classifier__degree': [2, 3, 4],
        'classifier__class_weight': [None, 'balanced']
    }
]

best_models['SVM'], scores['SVM'] = run_grid_search(
    'SVM', pipeline_svm, param_grid_svm, X_train, y_train
)


Running GridSearchCV for SVM using 900 configurations...


SVM Grid Search:   0%|          | 0/900 [00:00<?, ?it/s]


--- SVM Results ---
Best Parameters: {'classifier__C': np.float64(100.0), 'classifier__class_weight': None, 'classifier__gamma': np.float64(0.01), 'classifier__kernel': 'rbf', 'scaler': StandardScaler()}
Best Cross-Validation Accuracy (on train set): 0.3535
Total Search Time: 1108.29 seconds


#### MLP

In [12]:
# --- Multi-Layer Perceptron (MLP) ---
pipeline_mlp = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', MLPClassifier(random_state=42,
                                 early_stopping=True, # Enable early stopping
                                 n_iter_no_change=10, # How many iterations with no improvement to wait
                                 max_iter=1000)) # A higher max_iter for convergence
])

param_grid_mlp = [
    # --- Configuration for the 'adam' solver ---
    {
        'scaler': [StandardScaler(), MinMaxScaler()],
        'classifier__solver': ['adam'],
        'classifier__activation': ['relu', 'tanh'],
        # Test wider, deeper, and mixed architectures
        'classifier__hidden_layer_sizes': [(64,), (128,), (256,), (64, 128), (128, 64), (128, 256), (256, 128), (64, 128, 64), (128, 256, 128)],
        'classifier__alpha': [0.0001, 0.001, 0.01], # L2 regularization
        'classifier__learning_rate_init': [0.001, 0.01, 0.1]
    },
    
    # --- Configuration for the 'sgd' solver ---
    {
        'scaler': [StandardScaler(), MinMaxScaler()],
        'classifier__solver': ['sgd'],
        'classifier__activation': ['relu', 'tanh'],
        'classifier__hidden_layer_sizes': [(64,), (128,), (256,), (64, 128), (128, 64), (128, 256), (256, 128), (64, 128, 64), (128, 256, 128)],
        'classifier__alpha': [0.0001, 0.001, 0.01],
        'classifier__learning_rate': ['constant', 'adaptive'], # Test learning rate schedules
        'classifier__learning_rate_init': [0.001, 0.01, 0.1],
        'classifier__momentum': [0.9, 0.95, 0.99] # Key parameter for sgd
    }
]

best_models['MLP'], scores['MLP'] = run_grid_search(
    'MLP (Neural Network)', pipeline_mlp, param_grid_mlp, X_train, y_train
)


Running GridSearchCV for MLP (Neural Network) using 11340 configurations...


MLP (Neural Network) Grid Search:   0%|          | 0/11340 [00:00<?, ?it/s]


--- MLP (Neural Network) Results ---
Best Parameters: {'classifier__activation': 'relu', 'classifier__alpha': 0.01, 'classifier__hidden_layer_sizes': (256,), 'classifier__learning_rate': 'adaptive', 'classifier__learning_rate_init': 0.1, 'classifier__momentum': 0.9, 'classifier__solver': 'sgd', 'scaler': StandardScaler()}
Best Cross-Validation Accuracy (on train set): 0.3545
Total Search Time: 927.98 seconds


### Summary

In [32]:
print("\n\n========== FINAL MODEL COMPARISON SUMMARY ==========")
f1_macro_summary = pd.DataFrame.from_dict(
    scores,
    orient='index', 
    columns=['f1-macro']
).sort_values(by='f1-macro', ascending=False)

print(scores)
print("=" * 52)



                     f1-macro
MLP                  0.354503
SVM                  0.353541
XGBoost              0.328089
Random Forest        0.320359
Logistic Regression  0.284426


### 4. Evaluating all the models

In [30]:
from matplotlib import pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
import os
print("========== FINAL MODEL EVALUATION ON TEST SET ==========")

# Store performance metrics for a final summary table
final_scores = {}

cm_output_dir = '../data/output/cm'
os.makedirs(cm_output_dir, exist_ok=True)

for model_name, model in best_models.items():
    print(f"\n--- Test Set Performance for: {model_name} ---")
    
    # Get predictions on the test set
    y_pred_test = model.predict(X_test)
    
    # Calculate and store scores
    test_accuracy = accuracy_score(y_test, y_pred_test)
    test_f1_macro = f1_score(y_test, y_pred_test, average='macro', labels=class_names, zero_division=0)
    test_f1_weighted = f1_score(y_test, y_pred_test, average='weighted', labels=class_names, zero_division=0)

    final_scores[model_name] = {'Accuracy': test_accuracy, 'F1 Macro': test_f1_macro, 'F1 Weighted': test_f1_weighted}
    
    # Print Classification Report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_test, target_names=class_names, zero_division=0))

    # Create the ConfusionMatrixDisplay object
    disp = ConfusionMatrixDisplay.from_predictions(
        y_test,
        y_pred_test,
        display_labels=class_names,
    )
    
    image_path = os.path.join(cm_output_dir, f"cm_{model_name.replace(' ', '_').lower()}.png")
    
    plt.savefig(image_path, bbox_inches='tight')
    print(f"Confusion Matrix saved to: {image_path}")
    
    # 5. Close the plot to prevent it from displaying inline if you don't want it to
    plt.close()
    print("-" * 55)

# --- 5. Final Summary Table ---
print("=" * 55)
print("\n\n========== FINAL MODEL PERFORMANCE SUMMARY ==========")
summary_df = pd.DataFrame.from_dict(final_scores, orient='index')
summary_df = summary_df.sort_values(by='Accuracy', ascending=False)
print(summary_df)
print("=" * 55)


--- Test Set Performance for: Logistic Regression ---
Test Accuracy: 0.4495
F1 Macro Score: 0.2844

Classification Report:
              precision    recall  f1-score   support

           1       0.36      0.20      0.25       101
           2       0.48      0.75      0.59       359
           3       0.35      0.25      0.29       215
           4       0.17      0.03      0.04        40
           5       0.48      0.47      0.47       174
           6       0.41      0.44      0.42        64
           7       0.14      0.08      0.10        13
           8       0.57      0.27      0.37        44
           9       0.20      0.03      0.06        29

    accuracy                           0.45      1039
   macro avg       0.35      0.28      0.29      1039
weighted avg       0.42      0.45      0.41      1039

Confusion Matrix saved to: ../data/output/cm/cm_logistic_regression.png
-------------------------------------------------------

--- Test Set Performance for: Random Fores

### 5. Save Models and Scores

In [31]:
import os

output_dir = '../data/output/models'
os.makedirs(output_dir, exist_ok=True)
print(f"Model files will be saved in the '{output_dir}' directory.")

results_for_csv = []

for model_name, model_object in best_models.items():
    # Create a descriptive filename for the model
    # e.g., "Logistic Regression" -> "logistic_regression_model.joblib"
    filename = f"{model_name.replace(' ', '_').lower()}.joblib"
    filepath = os.path.join(output_dir, filename)
    
    # a) Save the actual model object to its own file
    joblib.dump(model_object, filepath)
    print(f"- Saved '{model_name}' to '{filepath}'")
    
    # b) Prepare the data for the CSV row
    model_result = {
        'Model Name': model_name,
        'Accuracy': final_scores[model_name]['Accuracy'],
        'F1 Macro': final_scores[model_name]['F1 Macro'],
        'F1 Weighted': final_scores[model_name]['F1 Weighted'],
        'File Path': filepath
    }
    results_for_csv.append(model_result)

# 3. Create a pandas DataFrame from the results
results_df = pd.DataFrame(results_for_csv)

csv_output_path = '../data/output/results_summary.csv'
results_df.to_csv(csv_output_path, index=False)

print(f"\nSuccessfully created the summary CSV file at '{csv_output_path}'")
print("\nFinal Summary CSV:")
print(results_df)

Model files will be saved in the '../data/output/models' directory.
- Saved 'Logistic Regression' to '../data/output/models/logistic_regression.joblib'
- Saved 'Random Forest' to '../data/output/models/random_forest.joblib'
- Saved 'XGBoost' to '../data/output/models/xgboost.joblib'
- Saved 'SVM' to '../data/output/models/svm.joblib'
- Saved 'MLP' to '../data/output/models/mlp.joblib'

Successfully created the summary CSV file at '../data/output/results_summary.csv'

Final Summary CSV:
            Model Name  Test Accuracy  F1 Macro Score  \
0  Logistic Regression       0.449471        0.284426   
1        Random Forest       0.453321        0.320359   
2              XGBoost       0.483157        0.328089   
3                  SVM       0.476420        0.353541   
4                  MLP       0.472570        0.354503   

                                          File Path  
0  ../data/output/models/logistic_regression.joblib  
1        ../data/output/models/random_forest.joblib  
2   