In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import math

### IMPORTING PREPROCESSED DATA

In [2]:
%run Selected_Questions_Combined.ipynb
dataframes = preprocessed_data()
salary_data = dataframes["all_questions_dataframe"]
salary_data_as_num = dataframes["selected_numeric_questions"]
salary_data_selected_questions = dataframes["selected_questions_dataframe"]

# Logistic Regression

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split
# LabelEncoder's public home is sklearn.preprocessing; it was previously pulled
# from sklearn.calibration, where it is only reachable as a side effect of that
# module's own imports.
from sklearn.preprocessing import LabelEncoder

# target variable encoding
# NOTE(review): assumes salary_data_as_num is row-aligned with salary_data and
# does not itself contain the q24 target -- TODO confirm in the preprocessing notebook.
y = LabelEncoder().fit_transform(salary_data["q24"])

# dataset splitting: 80/20 into dev/test, then 80/20 of dev into train/validation
x_dev, x_test, y_dev, y_test = train_test_split(salary_data_as_num, y, test_size = 0.2, random_state = 42)
x_train, x_val, y_train, y_val = train_test_split(x_dev, y_dev, test_size = 0.2, random_state = 42)

# baseline model
# multi_class is intentionally not passed: 'auto' was already the default and
# the parameter is deprecated in recent scikit-learn releases.
baseline = LogisticRegression(random_state = 42, max_iter=10000)
baseline.fit(x_train, y_train)
y_pred = baseline.predict_proba(x_val)  # per-class probabilities, needed for the multiclass AUC
print("Baseline Model:")
print("Baseline score: ", accuracy_score(y_val, baseline.predict(x_val)))
print("Number of iterations: ", baseline.n_iter_)
print("AUC score: ", roc_auc_score(y_val, y_pred, multi_class='ovr'))

# tuning for solver, penalty, and regularization strength
# (kept commented out so the cell re-runs quickly; uncomment to repeat the search)

## parameters
# param_grid = dict(solver = ['newton-cg', 'lbfgs', 'sag'],
#                   penalty = ["l2"], 
#                   C = [100, 10, 1.0, 0.1, 0.01],
#                   max_iter = [10000])

## tuning
# grid = GridSearchCV(estimator = baseline, param_grid = param_grid, cv = 5, verbose=2)
# grid_result = grid.fit(x_train, y_train)
# print("Hyperparameter tuning for solver, penalty, and regularization strength:")
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Recorded result of the grid search above:
# Best: 0.229100 using {'C': 0.01, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'newton-cg'}
# Best model:
# Best model accuracy:  0.2079207920792079
# AUC score:  0.7034885198279999
# NOTE(review): the accuracy/AUC recorded here do not match what this cell
# currently prints for the same hyperparameters; presumably they come from an
# earlier version of the data -- re-run the search to confirm the chosen values.

# evaluating the tuned model on the validation split
# (x_test / y_test stay held out; they are not used in this cell)
best_model = LogisticRegression(random_state = 42, max_iter=10000, C = 0.01, penalty = "l2", solver = "newton-cg")
best_model.fit(x_train, y_train)
y_pred = best_model.predict_proba(x_val)
# lr_roc is read later by lr_eval_metrics()
lr_roc = roc_auc_score(y_val, y_pred, multi_class='ovr')
print("Best model:")
print("Best model accuracy: ", accuracy_score(y_val, best_model.predict(x_val)))
print("AUC score: ", lr_roc)

Baseline Model:
Baseline score:  0.5349344978165939
Number of iterations:  [3162]
AUC score:  0.8161497655753551
Best model:
Best model accuracy:  0.535870243293824
AUC score:  0.8284838050347924


In [4]:
def lr_eval_metrics():
    """Return the logistic-regression evaluation metric as a one-entry dict.

    The single key is the label "Logistic Regression Classifier"; its value is
    whatever the module-level name ``lr_roc`` holds at call time.
    """
    metrics = {}
    metrics["Logistic Regression Classifier"] = lr_roc
    return metrics