In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy.linalg
import scipy 
import scipy.linalg
from scipy import stats
from sklearn.model_selection import KFold
import importlib

In [2]:
#import sys
#sys.path.insert(0, './utils')
#import evaluate_utils
from utils import evaluate_utils

  from numpy.core.umath_tests import inner1d


In [3]:
from sklearn.linear_model import LogisticRegression

In [6]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *

In [7]:
import json

In [8]:
from utils.fair_var import *

In [11]:
def get_corr(x,y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    ``np.corrcoef`` yields the 2x2 correlation matrix of the two inputs;
    the off-diagonal entry is the coefficient between them.
    """
    corr_matrix = np.corrcoef(x, y)
    return corr_matrix[0][1]

In [12]:
def trans_proba(x0):
    """Map raw scores to probabilities with a clipped logistic sigmoid.

    The input is first limited to the range [-400, 400] (upper bound
    applied first, then the lower bound) so that ``np.exp`` never
    overflows, and is then passed through ``1 / (1 + exp(-x))``.
    Works element-wise on scalars and arrays.
    """
    bounded = np.maximum(np.minimum(x0, +400), -400)
    denominator = 1. + np.exp(-bounded)
    return 1./denominator

In [13]:
def do_fair_logestic_regression(df_train, df_test, feature_list, protect_list, outcome, lbd, hps):
    """Fit a logistic regression on "fair" features and score both splits.

    Every column of ``feature_list`` that is not in ``protect_list`` is
    passed through ``gen_latent_nonparam_regula`` (once on the train frame,
    once on the test frame) to obtain a ``<column>_fair`` feature. An
    L2-penalised ``LogisticRegression`` is fitted on the train fair features
    and its decision values are converted with ``trans_proba``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Must contain every column in ``feature_list`` plus ``outcome``.
    feature_list : list of str
        Feature column names; should include the protected ones.
    protect_list : list of str
        Names of the protected columns.
    outcome : str
        Name of the outcome column (cast to int for fitting).
    lbd : float
        Lambda factor forwarded to ``gen_latent_nonparam_regula``.
    hps : dict
        ``hps["C"]`` is used as the ``C`` argument of ``LogisticRegression``.

    Returns
    -------
    tuple
        ``(scores_train, scores_test)``: the ``trans_proba`` scores on the
        train and on the test data.
    """
    ## non-protected columns, in feature_list order, and their fair names
    plain_columns = [name for name in feature_list if name not in protect_list]
    fair_feature_list = [name + "_fair" for name in plain_columns]

    ## create the fair features, split by split
    fair_train = pd.DataFrame()
    fair_test = pd.DataFrame()
    for name, fair_name in zip(plain_columns, fair_feature_list):
        fair_train[fair_name] = gen_latent_nonparam_regula(df_train[feature_list], protect_list, name, lbd)
        fair_test[fair_name] = gen_latent_nonparam_regula(df_test[feature_list], protect_list, name, lbd)

    ## carry the protected columns and then the outcome along (positional copy)
    for name in list(protect_list) + [outcome]:
        fair_train[name] = df_train[name].values
        fair_test[name] = df_test[name].values

    ## logistic regression on the train data
    X_train = fair_train[fair_feature_list].values
    Y_train = fair_train[outcome].astype('int').values

    model = LogisticRegression(penalty='l2', C=hps["C"], solver='newton-cg', max_iter=200)
    model.fit(X_train, Y_train)

    X_test = fair_test[fair_feature_list].values

    ## raw decision values first, then squash them through trans_proba
    raw_train, raw_test = [model.decision_function(X) for X in (X_train, X_test)]
    return trans_proba(raw_train), trans_proba(raw_test)

In [14]:
def cross_validation(data_path, test_folder, feature_list, protect_list, outcome, lbd, hps, n_fold, result_name):
    df_res_test = pd.DataFrame()
    
    for i in range(n_fold):
        path_train = data_path+"train/trans_train-%d.csv"%(i+1)
        path_test = None
        if(test_folder == "valid"): 
            path_test = data_path+"valid/trans_valid-%d.csv"%(i+1)
        elif(test_folder == "test"):                
            path_test = data_path+"test/trans_test-%d.csv"%(i+1)
        else:
            raise Exception("Unkonwn test folder = %s"%(test_folder))
        
        df_i_test = pd.DataFrame()
        
        df_train = pd.read_csv(path_train)
        df_test = pd.read_csv(path_test)
            
        score_train, score_test = do_fair_logestic_regression(df_train, df_test, feature_list, protect_list, outcome, lbd, hps)    
        
        protect_train = df_train[protect_list[0]].values
        protect_test = df_test[protect_list[0]].values
        
        ## socre discrimination with nn
        pred_protct_train, pred_protect_test = score_dscr(score_train, protect_train, score_test, protect_test, hps["hps_score_dscr"])
        df_i_test["_my_infer_protect"] = pred_protect_test
        
        ## score discrimination with random fortest
        pred_protct_train, pred_protect_test = score_dscr_rf(score_train, protect_train, score_test, protect_test, hps["hps_score_dscr"])
        df_i_test["_my_infer_protect_rf"] = pred_protect_test        
        
        ## score discrimination with naive bayes ber
        pred_protct_train, pred_protect_test = score_dscr_nb_ber(score_train, protect_train, score_test, protect_test, hps["hps_score_dscr"])
        df_i_test["_my_infer_protect_nb_ber"] = pred_protect_test 
        
        if(False):
            print(df_train.shape, df_test.shape)
            print(score_train.shape, score_test.shape)
            print(pred_protct_train.shape, pred_protect_test.shape)
        
        df_i_test[outcome] = df_test[outcome].values
        df_i_test[result_name] = score_test
        for column in protect_list:
            df_i_test[column] = df_test[column].values    
        df_res_test = df_res_test.append(df_i_test, ignore_index = True)
        
    return df_res_test

In [15]:
## use a discriminator (MLP) to infer the protected feature from the predicted scores

def score_dscr(x_train, y_train, x_test, y_test, hps):
    """Fit an MLP discriminator on the train scores and predict on both splits.

    Parameters
    ----------
    x_train, x_test : numpy.ndarray
        Scores; each is reshaped to a single-column matrix before use.
    y_train, y_test : array-like
        Labels to recover. ``y_test`` is only used for the optional
        accuracy report.
    hps : dict
        ``hps["hidden"]`` gives the hidden layer sizes; a truthy
        ``hps["verbos"]`` prints the train/test accuracy.

    Returns
    -------
    tuple
        ``(pred_train, pred_test)``: the predicted labels for each split.
    """
    rows_train, rows_test = x_train.shape[0], x_test.shape[0]

    discriminator = MLPClassifier(
        hidden_layer_sizes=hps["hidden"],
        activation="relu",
        solver="adam",
        max_iter=800,
    )
    column_train = x_train.reshape(rows_train, 1)
    discriminator.fit(column_train, y_train)

    pred_train = discriminator.predict(column_train)
    pred_test = discriminator.predict(x_test.reshape(rows_test, 1))

    if hps["verbos"]:
        accuracies = (accuracy_score(y_train, pred_train), accuracy_score(y_test, pred_test))
        print("NN disriminator acc on train/test %4f/%4f"%accuracies)

    return pred_train, pred_test

In [16]:
## use a discriminator (RF) to infer the protected feature from the predicted scores

def score_dscr_rf(x_train, y_train, x_test, y_test, hps):
    """Fit a random-forest discriminator on the train scores and predict on both splits.

    Parameters
    ----------
    x_train, x_test : numpy.ndarray
        Scores; each is reshaped to a single-column matrix before use.
    y_train, y_test : array-like
        Labels to recover. ``y_test`` is only used for the optional
        accuracy report.
    hps : dict
        ``hps["rf_nes"]`` is the number of estimators, ``hps["rf_maxd"]``
        the maximum depth; a truthy ``hps["verbos"]`` prints the
        train/test accuracy.

    Returns
    -------
    tuple
        ``(pred_train, pred_test)``: the predicted labels for each split.
    """
    rows_train, rows_test = x_train.shape[0], x_test.shape[0]

    forest = RandomForestClassifier(n_estimators=hps["rf_nes"], max_depth=hps["rf_maxd"])
    column_train = x_train.reshape(rows_train, 1)
    forest.fit(column_train, y_train)

    pred_train = forest.predict(column_train)
    pred_test = forest.predict(x_test.reshape(rows_test, 1))

    if hps["verbos"]:
        accuracies = (accuracy_score(y_train, pred_train), accuracy_score(y_test, pred_test))
        print("RF disriminator acc on train/test %4f/%4f"%accuracies)

    return pred_train, pred_test

In [17]:
from sklearn.naive_bayes import BernoulliNB

def score_dscr_nb_ber(x_train, y_train, x_test, y_test, hps):
    """Fit a Bernoulli naive Bayes discriminator on the train scores and predict on both splits.

    The classifier is built with ``binarize=0.5``, i.e. the score is
    thresholded at 0.5 before it is used as a binary feature.

    Parameters
    ----------
    x_train, x_test : numpy.ndarray
        Scores; each is reshaped to a single-column matrix before use.
    y_train, y_test : array-like
        Labels to recover. ``y_test`` is only used for the optional
        accuracy report.
    hps : dict
        A truthy ``hps["verbos"]`` prints the train/test accuracy.

    Returns
    -------
    tuple
        ``(pred_train, pred_test)``: the predicted labels for each split.
    """
    rows_train, rows_test = x_train.shape[0], x_test.shape[0]

    naive_bayes = BernoulliNB(binarize=0.5)
    column_train = x_train.reshape(rows_train, 1)
    naive_bayes.fit(column_train, y_train)

    pred_train = naive_bayes.predict(column_train)
    pred_test = naive_bayes.predict(x_test.reshape(rows_test, 1))

    if hps["verbos"]:
        accuracies = (accuracy_score(y_train, pred_train), accuracy_score(y_test, pred_test))
        print("BernoulliNB disriminator acc on train/test %4f/%4f"%accuracies)

    return pred_train, pred_test
    
    

In [18]:
def evaluate_single_lbd(data_path, test_folder, feature_list, protect_list, outcome, lbd, hps, n_loop, save_score=False, save_path=None, n_fold=5):
    """Cross-validate one lambda value and evaluate the pooled scores.

    Parameters
    ----------
    data_path, test_folder, feature_list, protect_list, outcome, lbd, hps
        Forwarded to ``cross_validation``. ``hps["nbins"]`` is additionally
        passed to ``evaluate_utils.do_evaluate_score``.
    n_loop
        Unused; kept so existing callers keep working.
    save_score : bool
        When true, the pooled per-sample frame is written as CSV.
    save_path : str or None
        Destination of that CSV; required when ``save_score`` is true.
    n_fold : int
        Number of folds handed to ``cross_validation`` (default 5, the
        value that used to be hard-coded).

    Returns
    -------
    The value returned by ``evaluate_utils.do_evaluate_score`` for the
    ``"pred_score"`` column.

    Raises
    ------
    ValueError
        If ``save_score`` is true but ``save_path`` is None.
    """
    ## to_csv(None) only returns the CSV text, so nothing would be saved;
    ## fail before the expensive cross-validation rather than silently after it
    if(save_score and save_path is None):
        raise ValueError("save_path must be given when save_score is True")

    df_res_test = cross_validation(data_path, test_folder, feature_list, protect_list, outcome, lbd, hps, n_fold, "pred_score")
    res_test = evaluate_utils.do_evaluate_score(df_res_test, "pred_score", outcome, protect_list, hps["nbins"])
    if(save_score):
        df_res_test.to_csv(save_path)
    return res_test

In [19]:
## dataset location and column layout used by the cells below
data_path = "./dataset/adult/"
outcome = "y"
protect_list = ["s"]
## feature columns x0 .. x102, followed by the protected column "s"
feature_list = list(map("x{}".format, range(103)))
feature_list.append("s")

In [20]:
## hyperparameter tune: coarse sweep of the logistic-regression C on the
## "valid" folds (disabled; flip the guard to re-run)
if(False):
    lbd = 1.0
    n_loop = -1
    hps_score_dscr = {"hidden":(64,64,64), "rf_nes":50, "rf_maxd":3, "verbos":False}
    hps = {'C':-1, "hps_score_dscr":hps_score_dscr, "nbins":10}
    c_list = [1e-4,1e-2,1e0,1e2,1e4,1e6,1e8,1e10,1e20]
    ## metrics reported per row, in the order of the header line
    report_keys = ("acc_y", "acc_p", "acc_p_rf", "corr", "f1_y", "f1_p")
    print("C, acc_y, acc_p, acc_p_rf, corr, f1_y, f1_p")
    for c in c_list:
        hps["C"] = c
        res_test = evaluate_single_lbd(data_path, "valid", feature_list, protect_list, outcome, lbd, hps, n_loop)
        print("%4e %4f %4f %4f %4f %4f %4f"%((c,) + tuple(res_test[key] for key in report_keys)))

In [21]:
## fine tuning: denser sweep of the logistic-regression C on the "valid"
## folds (disabled; flip the guard to re-run)
if(False):
    lbd = 1.0
    n_loop = -1
    hps_score_dscr = {"hidden":(64,64,64), "rf_nes":50, "rf_maxd":3, "verbos":False}
    hps = {'C':-1, "hps_score_dscr":hps_score_dscr, "nbins":10}

    c_list = [1e-1,2e-1,5e-1,1e0,2e0,5e0,1e1,2e1,5e1,1e2,2e2,5e2,1e3]
    ## metrics reported per row, in the order of the header line
    report_keys = ("acc_y", "acc_p", "acc_p_rf", "corr", "f1_y", "f1_p")
    print("C, acc_y, acc_p, acc_p_rf, corr, f1_y, f1_p")
    for c in c_list:
        hps["C"] = c
        res_test = evaluate_single_lbd(data_path, "valid", feature_list, protect_list, outcome, lbd, hps, n_loop)
        print("%4e %4f %4f %4f %4f %4f %4f"%((c,) + tuple(res_test[key] for key in report_keys)))
## recorded result of an earlier run (no acc_p_rf column at that time):
## C, acc_y, acc_p, corr, f1_y, f1_p
## 1.000000e+04 0.850290 0.673705 -0.286733 0.668623 0.000000

In [22]:
## test on optimal hyper parameters (C = 2, lbd = 0) using the "test" folds
importlib.reload(evaluate_utils)
n_loop = -1
lbd = 0.0
hps_score_dscr = {"hidden":(64,64,64), "rf_nes":50, "rf_maxd":3, "verbos":False}
hps = {'C':2e0, "hps_score_dscr":hps_score_dscr, "nbins":10}
res_test = evaluate_single_lbd(data_path, "test", feature_list, protect_list, outcome, lbd, hps, n_loop)
summary_values = tuple(res_test[key] for key in ("acc_y", "acc_p", "acc_p_rf", "corr"))
print(("acc_y = %4f, acc_protect = %4f/%4f, corr = %4f")%summary_values)

acc_y = 0.843665, acc_protect = 0.673081/0.673772, corr = -0.100952


In [None]:
import os
import shutil

## sweep lambda over 0.00, 0.05, ..., 1.00 on the "test" folds, saving the
## per-sample scores of every run and collecting the metrics per key
importlib.reload(evaluate_utils)
evaluate_result = {}
lbd_list = np.arange(0.0,1.05,0.05)

## start from an empty output directory; done in Python instead of a shell
## "rm -rf; mkdir" so it is portable and a failure raises instead of being ignored
shutil.rmtree("./pred_scores_model0", ignore_errors=True)
os.makedirs("./pred_scores_model0")

## the hyper-parameters do not depend on lambda, so build them once
hps_score_dscr = {"hidden":(64,64,64), "rf_nes":50, "rf_maxd":3, "verbos":False}
hps = {'C':2e0, "hps_score_dscr":hps_score_dscr, "nbins":10}
n_loop = -1

for i, lbd in enumerate(lbd_list):
    save_path = "./pred_scores_model0/pred_%d.csv"%(i)

    res_test = evaluate_single_lbd(data_path, "test", feature_list, protect_list, outcome, lbd, hps, n_loop, save_score=True, save_path=save_path)
    print(("lbd = %4f, acc_y = %4f, acc_protect = %4f/%4f, corr = %4f")%(lbd, res_test["acc_y"], res_test["acc_p"], res_test["acc_p_rf"], res_test["corr"]))

    ## one list per metric (plus "lbd"), with one entry per lambda value
    res_test["lbd"] = lbd
    for key in res_test:
        evaluate_result.setdefault(key, []).append(res_test[key])

In [None]:
## persist the metrics collected in evaluate_result as JSON text
serialized_result = json.dumps(evaluate_result)
with open('adult_model0_evaluate.txt', 'w') as outfile:
    outfile.write(serialized_result)