In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
warnings.filterwarnings('ignore')

# Raw Adult-income table used for fitting. `skipinitialspace=True` drops the
# blank that follows each comma so string fields carry no leading space.
adult = pd.read_csv(
    "data/adult_training.csv",
    sep=",",
    skipinitialspace=True,
    dtype=None,  # let pandas infer each column's dtype
    # nrows=10000,  # uncomment to work on a subset
)

# NOTE(review): this reads the *training* CSV a second time, so `adult_test`
# holds exactly the same rows as `adult` and anything reported on it is not a
# held-out evaluation. Confirm whether a separate test file was intended.
adult_test = pd.read_csv(
    "data/adult_training.csv",
    sep=",",
    skipinitialspace=True,
    dtype=None,
    # nrows=10000,
)

from sklearn.model_selection import train_test_split
from __future__ import division
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


In [2]:
def adult_preprocess_balanced(adult, n_per_class=7500, random_state=0):
    """Build a class-balanced, one-hot encoded design matrix from an adult income frame.

    Rows containing the '?' placeholder in any column are dropped, every
    categorical column is expanded with ``pd.get_dummies``, and the same
    number of rows is sampled (without replacement) from each income class
    before the result is shuffled.

    Parameters
    ----------
    adult : pandas.DataFrame
        Raw table. Must contain an ``income`` column with a ``'>50K'``
        category, so that the encoded frame has an ``'income_>50K'`` column.
    n_per_class : int, default 7500
        Number of rows drawn from each income class.
    random_state : int or None, default 0
        Seed used for both per-class draws and for the final shuffle.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix with every ``income_*`` dummy column removed.
    Y : numpy.ndarray
        0/1 labels taken from the ``'income_>50K'`` column, row-aligned with X.

    Raises
    ------
    KeyError
        If the encoded frame has no ``'income_>50K'`` column.
    ValueError
        Raised by ``DataFrame.sample`` when a class has fewer than
        ``n_per_class`` rows.
    """
    # remove rows with '?'s
    adult = adult[(adult != '?').all(axis=1)]

    # convert categorical data into one-hot; the dtype is pinned because the
    # get_dummies default changed from uint8 to bool in pandas 2.0
    adult_one_hot = pd.get_dummies(adult, dtype=np.uint8)

    is_over_50k = adult_one_hot['income_>50K'] == 1
    adult_over_50k = adult_one_hot[is_over_50k].sample(
        n=n_per_class, random_state=random_state)
    adult_under_50k = adult_one_hot[~is_over_50k].sample(
        n=n_per_class, random_state=random_state)

    # seed the shuffle as well so repeated runs yield the same row order
    adult_clean = pd.concat([adult_over_50k, adult_under_50k])
    adult_clean = adult_clean.sample(frac=1, random_state=random_state)

    # split into inputs and targets; target dummies are selected by name so
    # the label cannot leak into X when `income` is not the last column
    income_columns = [
        col for col in adult_clean.columns if str(col).startswith('income_')
    ]
    X = adult_clean.drop(columns=income_columns).values
    Y = adult_clean.loc[:, 'income_>50K'].values

    return X, Y

In [3]:
def adult_preprocess_unbalanced(adult):
    """One-hot encode an adult income frame without rebalancing the classes.

    Rows containing the '?' placeholder in any column are dropped and every
    categorical column is expanded with ``pd.get_dummies``. Row order of the
    surviving rows is preserved.

    Parameters
    ----------
    adult : pandas.DataFrame
        Raw table. Must contain an ``income`` column with a ``'>50K'``
        category, so that the encoded frame has an ``'income_>50K'`` column.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix with every ``income_*`` dummy column removed.
    Y : numpy.ndarray
        0/1 labels taken from the ``'income_>50K'`` column, row-aligned with X.

    Raises
    ------
    KeyError
        If the encoded frame has no ``'income_>50K'`` column.
    """
    # remove rows with '?'s
    adult = adult[(adult != '?').all(axis=1)]

    # convert categorical data into one-hot; the dtype is pinned because the
    # get_dummies default changed from uint8 to bool in pandas 2.0
    adult_one_hot = pd.get_dummies(adult, dtype=np.uint8)

    # split into inputs and targets; target dummies are selected by name so
    # the label cannot leak into X when `income` is not the last column
    income_columns = [
        col for col in adult_one_hot.columns if str(col).startswith('income_')
    ]
    X = adult_one_hot.drop(columns=income_columns).values
    Y = adult_one_hot.loc[:, 'income_>50K'].values

    return X, Y

In [4]:
# Encode the training table, then hold out 20% of it for validation.
X, Y = adult_preprocess_unbalanced(adult)
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,  # fixed seed so the split is repeatable
)

# Apply the same encoding to the table loaded as `adult_test`.
X_test, Y_test = adult_preprocess_unbalanced(adult_test)


In [5]:
# Class balance of the encoded labels: count of 1s first, then count of 0s.
print(np.count_nonzero(Y == 1))
print(np.count_nonzero(Y == 0))

7508
22654


In [6]:
# Name the solver explicitly: leaving it at the library default makes the
# fitted model depend on the installed scikit-learn version (the default moved
# from 'liblinear' to 'lbfgs' in 0.22). 'liblinear' is what the old default
# resolved to, so existing results are kept.
classifier = LogisticRegression(solver='liblinear', random_state=0)
classifier.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [7]:
# Probability cut-off applied to the test-set scores: a row is labelled 1 when
# its predicted probability for class 1 is at least this value.
p_thres = 0.4

# Validation predictions use predict()'s own decision rule (no custom cut-off).
Y_pred = classifier.predict(X_val)

# Columns of predict_proba follow classifier.classes_, which is sorted, so with
# 0/1 labels column 1 holds P(Y == 1).
Y_test_proba = classifier.predict_proba(X_test)[:, 1]

# Threshold in one vectorised step; the float dtype keeps labels as 0.0 / 1.0.
Y_test_pred = (Y_test_proba >= p_thres).astype(float)

print(Y_test_pred)

[1. 0. 0. ... 0. 0. 1.]


In [8]:
def print_metrics(Y_true, Y_pred):
    """Print a 2x2 confusion matrix and derived rates for binary 0/1 labels.

    Parameters
    ----------
    Y_true : array-like
        Ground-truth labels (0 or 1).
    Y_pred : array-like
        Predicted labels (0 or 1), same length as ``Y_true``.

    Returns
    -------
    None
        Everything is written to stdout: the confusion matrix, correct and
        misclassified counts, accuracy, error rate, sensitivity
        (TP / actual positives), specificity (TN / actual negatives),
        precision (TP / predicted positives) and the false-positive rate
        (FP / actual negatives). A rate whose denominator is zero is printed
        as ``nan``.
    """
    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the
    # inputs; without it the matrix can be 1x1 and the indexing below fails.
    cm_test = confusion_matrix(y_true=Y_true, y_pred=Y_pred, labels=[0, 1])

    total = cm_test.sum()
    # diagonal holds the correctly classified counts
    correct = cm_test.trace()

    def ratio(numerator, denominator):
        """Float division that yields nan for an empty denominator."""
        if not denominator:
            return float('nan')
        return float(numerator) / denominator

    acc = ratio(correct, total)

    actual_neg = cm_test[0].sum()
    actual_pos = cm_test[1].sum()
    predicted_pos = cm_test[:, 1].sum()

    print("Confusion Matrix:\n")
    print("      predicted class:")
    print("          0\t1")
    print("        _____________")
    print("true  0| {}\t{}".format(cm_test[0,0], cm_test[0,1]))
    print("class 1| {}\t{}".format(cm_test[1,0], cm_test[1,1]))
    print("")
    print("Correct: \t{}".format(correct))
    print("Misclassified: \t{}".format(total-correct))
    print("Accuracy: \t{:.2f}%".format(acc*100))
    print("Error rate: \t{:.2f}%".format((1-acc)*100))
    print("Sensitivity: \t{:.2f}% (true positive)".format(ratio(cm_test[1,1]*100, actual_pos)))
    print("Specificity: \t{:.2f}% (true negative)".format(ratio(cm_test[0,0]*100, actual_neg)))
    print("Precision: \t{:.2f}% (positive predict value)".format(ratio(100*cm_test[1,1], predicted_pos)))
    print("False Pos: \t{:.2f}%".format(ratio(100*cm_test[0,1], actual_neg)))

In [9]:
# Report metrics for the thresholded predictions on the `adult_test` table.
# NOTE(review): `adult_test` is read from "data/adult_training.csv", so these
# figures include the rows the classifier was fitted on.
print_metrics(Y_true=Y_test, Y_pred=Y_test_pred)

Confusion Matrix:

      predicted class:
          0	1
        _____________
true  0| 21387	1267
class 1| 5268	2240

Correct: 	23627
Misclassified: 	6535
Accuracy: 	78.33%
Error rate: 	21.67%
Sensitivity: 	29.83% (true positive)
Specificity: 	94.41% (true negative)
Precision: 	63.87% (positive predict value)
False Pos: 	5.59%
