## SVM

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score

In [2]:
# Read the salary table and discard the 'Unnamed: 0' column in one step
# (presumably a stray index column from an earlier to_csv - TODO confirm).
salary_data_as_num = pd.read_csv('salary_data_as_num.csv').drop(columns=['Unnamed: 0'])

# Keep only rows that have a 'q24' value: 'q24' is the target, every other
# column is a feature.
data_clean = salary_data_as_num.dropna(subset=['q24'])
y = data_clean['q24']
X = data_clean.drop(columns=['q24'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [3]:
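# Hyper-parameter search, kept for reference only (disabled). Re-enabling it requires
# `from sklearn.model_selection import GridSearchCV`, which the import cell above does not include.
# svm_model = SVC(random_state=42, probability=True)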

# Define the parameter grid to search
# param_grid = {
#     'C': [0.1, 1, 10, 100],
#     'gamma': [1, 0.1, 0.01, 0.001],
#     'kernel': ['rbf']
# }

# grid_search = GridSearchCV(svm_model, param_grid, refit=True, verbose=2, cv=3)
# grid_search.fit(X_train, y_train)

# # Best parameter after tuning
# print("Best parameters found: ", grid_search.best_params_)
# print("Best score: ", grid_search.best_score_)

# best_grid_model = grid_search.best_estimator_

In [4]:
def bin_salary_categories(y):
    """Collapse fine-grained salary labels into four coarse buckets.

    Args:
        y: Iterable of salary category labels (e.g. ``'$0-999'``,
            ``'10,000-14,999'``).

    Returns:
        A ``numpy`` array with one bucket label per element of ``y``, in the
        same order. The possible values are ``"0-9,999"``, ``"10,000-99,999"``,
        ``"100,000-500,000"`` and ``"> $500,000"``. Any label that is not
        listed in one of the first three bands is assigned ``"> $500,000"``.
    """
    # (bucket label, fine-grained labels belonging to it), checked in order.
    bands = (
        ("0-9,999", (
            '$0-999', '1,000-1,999', '2,000-2,999', '3,000-3,999',
            '4,000-4,999', '5,000-7,499', '7,500-9,999',
        )),
        ("10,000-99,999", (
            '10,000-14,999', '15,000-19,999', '20,000-24,999', '25,000-29,999',
            '30,000-39,999', '40,000-49,999', '50,000-59,999', '60,000-69,999',
            '70,000-79,999', '80,000-89,999', '90,000-99,999',
        )),
        ("100,000-500,000", (
            '100,000-124,999', '125,000-149,999', '150,000-199,999',
            '200,000-249,999', '250,000-299,999', '300,000-500,000',
        )),
    )

    def to_bucket(label):
        for bucket, members in bands:
            if label in members:
                return bucket
        # Catch-all: everything not matched above lands in the top bucket.
        return "> $500,000"

    return np.array([to_bucket(label) for label in y])

# Coarse salary buckets for the training and the test targets.
y_train_binned, y_test_binned = (
    bin_salary_categories(split_labels) for split_labels in (y_train, y_test)
)

# One label encoder per target granularity, each fitted on the training labels only.
original_encoder, binned_encoder = LabelEncoder(), LabelEncoder()
y_train_encoded = original_encoder.fit_transform(y_train)
y_train_binned_encoded = binned_encoder.fit_transform(y_train_binned)

# Two RBF-kernel SVMs with identical hyper-parameters: one on the fine-grained
# labels, one on the binned labels. probability=True is needed because the
# evaluation calls predict_proba. fit() returns the fitted estimator itself.
original_model = SVC(
    kernel='rbf', C=1, gamma=0.01, probability=True, random_state=42
).fit(X_train, y_train_encoded)

binned_model = SVC(
    kernel='rbf', C=1, gamma=0.01, probability=True, random_state=42
).fit(X_train, y_train_binned_encoded)

def _mean_one_vs_rest_auc(model, encoder, X_test, labels):
    """Return the unweighted mean of the per-class one-vs-rest ROC-AUC scores.

    ``labels`` are encoded with ``encoder.transform``; column ``i`` of
    ``model.predict_proba(X_test)`` is then scored against the binary
    indicator ``encoded_label == i``.
    """
    y_true = np.asarray(encoder.transform(labels))
    y_prob = model.predict_proba(X_test)
    # Assumes probability column i belongs to encoded class i, i.e. the model
    # was fitted on this encoder's output - confirm if used with other models.
    per_class_auc = [
        roc_auc_score((y_true == class_idx).astype(int), y_prob[:, class_idx])
        for class_idx in range(y_prob.shape[1])
    ]
    return np.mean(per_class_auc)


def evaluate_model(original_model, binned_model, X_test, y_test, y_test_binned, original_encoder, binned_encoder):
    """Print the mean one-vs-rest ROC-AUC of the un-binned and the binned model.

    Args:
        original_model: Classifier exposing ``predict_proba``, evaluated
            against ``y_test``.
        binned_model: Classifier exposing ``predict_proba``, evaluated against
            ``y_test_binned``.
        X_test: Feature matrix passed to both models.
        y_test: Fine-grained test labels (un-encoded).
        y_test_binned: Binned test labels (un-encoded).
        original_encoder: Encoder whose ``transform`` maps ``y_test`` to the
            integer codes ``original_model`` was trained on.
        binned_encoder: Encoder whose ``transform`` maps ``y_test_binned`` to
            the integer codes ``binned_model`` was trained on.

    Returns:
        None. Two lines are printed, one score per model.

    Raises:
        Errors from the collaborators propagate unchanged. With scikit-learn,
        ``LabelEncoder.transform`` raises ``ValueError`` for labels unseen
        during fitting, and ``roc_auc_score`` raises ``ValueError`` when a
        class indicator contains only one class.
    """
    # Only predict_proba is needed for ROC-AUC; the hard predictions that used
    # to be computed here were never read and have been removed.
    average_auc = _mean_one_vs_rest_auc(original_model, original_encoder, X_test, y_test)
    print("ROC-AUC score without bin: ", average_auc)

    average_auc_binned = _mean_one_vs_rest_auc(binned_model, binned_encoder, X_test, y_test_binned)
    print("ROC-AUC score with bin: ", average_auc_binned)

In [5]:
# Score both classifiers on the held-out split; arguments are passed by name
# so the two model/encoder/label triples cannot be mixed up.
evaluate_model(
    original_model=original_model,
    binned_model=binned_model,
    X_test=X_test,
    y_test=y_test,
    y_test_binned=y_test_binned,
    original_encoder=original_encoder,
    binned_encoder=binned_encoder,
)

ROC-AUC score without bin:  0.7381606018189144
ROC-AUC score with bin:  0.7699812606662338
