In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import math

### IMPORTING PREPROCESSED DATA

In [2]:
%run Selected_Questions_Combined.ipynb
dataframes = preprocessed_data()
salary_data = dataframes["all_questions_dataframe"]
salary_data_as_num = dataframes["selected_numeric_questions"]
salary_data_selected_questions = dataframes["selected_questions_dataframe"]

### FURTHER PREPROCESSING

In [3]:
def binning_categories(c):
    """Collapse a fine-grained salary-range label into one of four coarse bins.

    Parameters
    ----------
    c : object
        A salary-range label such as ``'1,000-1,999'``. Matching is exact, so
        the label must be spelled precisely as listed below (note that only
        the lowest range carries a leading ``$``).

    Returns
    -------
    str
        ``"0-9,999"``, ``"10,000-99,999"`` or ``"100,000-500,000"`` when ``c``
        is one of the listed labels; ``"> $500,000"`` for every other value.
    """
    coarse_bins = (
        ("0-9,999", ['$0-999', '1,000-1,999', '2,000-2,999', '3,000-3,999',
                     '4,000-4,999', '5,000-7,499', '7,500-9,999']),
        ("10,000-99,999", ['10,000-14,999', '15,000-19,999', '20,000-24,999',
                           '25,000-29,999', '30,000-39,999', '40,000-49,999',
                           '50,000-59,999', '60,000-69,999', '70,000-79,999',
                           '80,000-89,999', '90,000-99,999']),
        ("100,000-500,000", ['100,000-124,999', '125,000-149,999', '150,000-199,999',
                             '200,000-249,999', '250,000-299,999', '300,000-500,000']),
    )

    for coarse_label, fine_labels in coarse_bins:
        if c in fine_labels:
            return coarse_label

    # Every value that is not listed above ends up in the top bin.
    return "> $500,000"

# Logistic Regression

In [4]:
# LabelEncoder is imported from its public home, sklearn.preprocessing
# (sklearn.calibration only happens to import it for internal use).
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import make_scorer

# Flip to True to re-run the slow hyperparameter search further down.
RUN_GRID_SEARCH = False

# target variable: bin the raw "q24" answers into the coarse categories
y = salary_data["q24"]
y = y.apply(binning_categories)

# dataset splitting
x_dev, x_test, y_dev, y_test = train_test_split(salary_data_as_num, y, test_size = 0.2, random_state = 42)

# encoding the y variable
# The encoder is fitted on the development labels only. Its integer codes are
# kept in their own variables instead of being discarded; the models below are
# still fitted on the string labels (scikit-learn classifiers accept them), so
# predictions and classes_ stay as readable bin names.
l_enc = LabelEncoder()
y_dev_encoded = l_enc.fit_transform(y_dev)
y_test_encoded = l_enc.transform(y_test)

# baseline model
# multi_class is left at its default: 'auto' was already the default, and the
# parameter is deprecated in recent scikit-learn releases.
baseline = LogisticRegression(random_state = 42, max_iter=10000)
baseline.fit(x_dev, y_dev)
y_pred = baseline.predict_proba(x_test)
print("Baseline Model:")
print("Baseline score: ", accuracy_score(y_test, baseline.predict(x_test)))
print("Number of iterations: ", baseline.n_iter_)
print("AUC score: ", roc_auc_score(y_test, y_pred, multi_class='ovr'))

# tuning for solver, penalty, and regularization strength

## parameters
param_grid = dict(solver = ['newton-cg', 'lbfgs', 'sag'],
                  penalty = ["l2"], 
                  C = [100, 10, 1.0, 0.1, 0.01],
                  max_iter = [10000])

## tuning
# Guarded by a flag rather than hidden inside a string literal, so the search
# code stays syntax-checked and can be re-enabled without editing it.
if RUN_GRID_SEARCH:
    grid = GridSearchCV(estimator = baseline, param_grid = param_grid, cv = 5, verbose=2)
    grid_result = grid.fit(x_dev, y_dev)
    print("Hyperparameter tuning for solver, penalty, and regularization strength:")
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# Best parameters (from an earlier run): {'C': 0.1, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'newton-cg'}


# testing the best model
best_model = LogisticRegression(random_state = 42, max_iter=10000, C = 0.1, penalty = "l2", solver = "newton-cg")
best_model.fit(x_dev, y_dev)
y_pred = best_model.predict_proba(x_test)
# lr_roc is read by lrBT_eval_metrics() in a later cell.
lr_roc = roc_auc_score(y_test, y_pred, multi_class='ovr')
print("Best model:")
print("Best model accuracy: ", accuracy_score(y_test, best_model.predict(x_test)))
print("AUC score: ", lr_roc)

Baseline Model:
Baseline score:  0.7614770459081837
Number of iterations:  [1669]
AUC score:  0.9143399460543309
Best model:
Best model accuracy:  0.7609780439121756
AUC score:  0.9145558764332691


In [5]:
def lrBT_eval_metrics():
    """Return the logistic-regression evaluation metric as a one-entry dict.

    Returns
    -------
    dict
        Maps "Logistic Regression Classifier" to the current value of the
        notebook-level variable ``lr_roc``.
    """
    metrics = {}
    metrics["Logistic Regression Classifier"] = lr_roc
    return metrics