In [151]:
import numpy as np
import pandas as pd
import sklearn
import random
import scipy
import sklearn
import math

In [152]:
features = ['age',  #: continuous.
            'workclass',
            #: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
            'fnlwgt',  #: continuous.
            'education',
            #: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
            'education-num',  #: continuous.
            'marital-status',
            #: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
            'occupation',
            #: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
            'relationship',  #: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
            'race',  #: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
            'sex',  #: Female, Male.
            'capital-gain',  #: continuous.
            'capital-loss',  #: continuous.
            'hours-per-week',  #: continuous.
            'native-country',
            #: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.]
            'target']
category = ['workclass','education','marital-status','occupation','relationship','race','sex', 'native-country']

In [153]:
train_data = pd.read_csv('adult-new.data',header = None, names=features)
train_target = train_data.target
train_data = train_data[features[:-1]]

test_data = pd.read_csv('adult-new.test',header = None, names=features)
test_target = test_data.target
test_data = test_data[features[:-1]]


In [154]:
train_data_dummies = pd.get_dummies(train_data,columns = category)
test_data_dummies = pd.get_dummies(test_data,columns = category)

In [155]:
for col in train_data_dummies.columns:
    if col not in test_data_dummies.columns:
        test_data_dummies[col] = 0

In [156]:
test_female_dummies = test_data_dummies[test_data_dummies['sex_ Female']==1]
test_male_dummies = test_data_dummies[test_data_dummies['sex_ Male']==1]
female = list(test_female_dummies.index.values)
male = list(test_male_dummies.index.values)

In [157]:
train_target = train_target.replace(' >50K',1)
train_target = train_target.replace(' <=50K',0)
test_target = test_target.replace(' >50K',1)
test_target = test_target.replace(' <=50K',0)
test_female_target = test_target[female]
test_male_target = test_target[male]

In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [53]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [59]:
models = {'LR':LogisticRegression(),
         'RF':RandomForestClassifier(n_estimators=150),
         'GB':GradientBoostingClassifier()}

n_features = int(math.sqrt(train_data_dummies.shape[1]))

parameter_choice = {'LR':{'C':[0.001,0.01,0.1,1,10,100]},
         'RF':{'max_features':[n_features-5,n_features,n_features+5,n_features+10]},
         'GB':{'max_depth':[3,5,7]}}

grid_result = {}

10


In [60]:
for model in models:
    grid = GridSearchCV(models[model], parameter_choice[model],return_train_score=True)
    grid.fit(train_data_dummies,train_target)

    grid_result[model] = {'best_params_':grid.best_params_,
                            'best_score_':grid.best_score_,
                            'best_estimator_':grid.best_estimator_,
                            'cv_results':pd.DataFrame(grid.cv_results_)}

LR's raw data score is: 0.7978563311937594
RF's raw data score is: 0.8547649027978256
GB's raw data score is: 0.8715641411504561


In [51]:
# cross_val_score(RandomForestClassifier(n_estimators=150),train_data_dummies,train_target,cv = 5)

array([0.85766928, 0.85613389, 0.85168125, 0.84641376, 0.85194287])

In [12]:
# cross_val_score(LogisticRegression(),train_data_dummies,train_target,cv = 5)

array([0.79947797, 0.79487179, 0.8003992 , 0.79649823, 0.79941637])

In [13]:
# cross_val_score(RidgeClassifier(),train_data_dummies,train_target,cv = 5)

array([0.83847689, 0.84507907, 0.83939813, 0.83228383, 0.84533866])

In [175]:
for model in grid_result:
    print(model,'train score',1-grid_result[model]['best_estimator_'].score(train_data_dummies,train_target))
    print(model,'test score',1-grid_result[model]['best_estimator_'].score(test_data_dummies,test_target))

LR train score 0.20214366880624057
LR test score 0.20170751182359803
RF train score 6.142317496393712e-05
RF test score 0.13832074196916655
GB train score 0.10282239488959188
GB test score 0.1315644002211166


In [63]:
grid_result['RF']['best_estimator_'].score(test_data_dummies,test_target)

0.8616792580308335

In [64]:
grid_result['LR']['best_estimator_'].score(test_data_dummies,test_target)

0.798292488176402

In [165]:
def rate(classifier,testdata,testtarget):
    prediction = classifier.predict(testdata)
    wholeset = set(range(len(testdata)))
#     print(wholeset)
    predict_positive = {i for i in np.nonzero(prediction)[0]}
    real_positive = {i for i in  np.nonzero(testtarget)[0]}
    true_positive_rate = len(predict_positive & real_positive)/len(real_positive)
    
    predict_negative = wholeset-predict_positive
    real_negative = wholeset - real_positive
    
    false_positive_rate = len(real_negative&predict_positive)/len(real_negative)
#     print('False Positive Rate of:',false_positive_rate)
    false_negative_rate = 1- true_positive_rate
#     print('False Negative Rate:',false_negative_rate)
    return(false_positive_rate,false_negative_rate)
    

In [166]:
rate(grid_result['GB']['best_estimator_'],test_data_dummies,test_target)

(0.048101673101673105, 0.4011431540659912)

In [177]:
for model in grid_result:
    print('female_data',model,'False Positive, False Negative',rate(grid_result[model]['best_estimator_'],test_female_dummies,test_female_target))

female_data LR False Positive, False Negative (0.031237070748862224, 0.7408056042031523)
female_data RF False Positive, False Negative (0.019031857674803476, 0.46760070052539404)
female_data GB False Positive, False Negative (0.01779064956557716, 0.4623467600700525)


In [170]:
for model in grid_result:
    print('male_data',model,'False Positive, False Negative',rate(grid_result[model]['best_estimator_'],test_male_dummies,test_male_target))

male_data LR False Positive, False Negative (0.036193735193471965, 0.7428309945088469)
male_data RF False Positive, False Negative (0.07962621742563833, 0.3929225137278829)
male_data GB False Positive, False Negative (0.06738615425111871, 0.3904820012202562)
