In [1]:
def warn(*args, **kwargs):
    pass

from imblearn.over_sampling import SMOTE
import warnings
warnings.warn = warn
from scipy.stats import randint as sp_randint
import seaborn as sns
import scipy.stats as stats
import pandas as pd
import numpy as np
from random import choices
from collections import Counter
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier 
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
synthetic = pd.read_csv("synthetic.csv")
synthetic.pop('gender')
synthetic.head()

Unnamed: 0,admit,iq,sat
0,False,92.472816,959.728157
1,False,77.055728,862.557282
2,False,96.986655,1046.866552
3,False,104.129996,1105.299959
4,False,101.215587,1047.155871


In [3]:
synthetic['admit'] = [0 if x == False else 1 for x in synthetic['admit']]
synthetic.describe()

Unnamed: 0,admit,iq,sat
count,5000.0,5000.0,5000.0
mean,0.404,100.037458,1045.088979
std,0.490747,19.909603,201.439475
min,0.0,23.912771,271.127706
25%,0.0,86.664101,908.881888
50%,0.0,99.936357,1040.152307
75%,1.0,113.993465,1185.707223
max,1.0,159.906606,1688.765366


In [4]:
def add_Fx(df_p,df_n,ppc, npc):
    """
    
    Add sensitive feature {0,1} to the dataframe
    
    parameter:
    - df_p   : dataframe for Y = 1
    - df_n   : dataframe for Y = 0
    - ppc    : percentage of S = 1 in Y = 0
    - npc    : percentage of S = 1 in Y = 0
    
    output:
    - X      : 8-dim array of independent variable
    - y      : 1-dim array of target variable
    
    """
    
    
    df_p.loc[:, 'Feature_X'] = choices([0,1],cum_weights = [ppc,100], k = len(df_p))
    df_n.loc[:, 'Feature_X'] = choices([0,1],cum_weights = [npc,100], k = len(df_n))
    dall = df_p.append(df_n)   
    
#     print("Distribution of Sensitive Attribute in Y = 1: {}".format(Counter(df_p['Feature_X'])))
#     print("Distribution of Sensitive Attribute in Y = 0: {}".format(Counter(df_n['Feature_X'])))
#     print("Distribution of Y (total): {}".format(Counter(dall['Class'])))
#     print("Distribution of Sensitive Attribute in Y (total): {}".format(Counter(dall['Feature_X'])))
    y = dall.pop("admit").values
    X = dall.values
    #print("Dimension of X after adding sensitive attribute: {}".format(X.shape))
    #print("Shape of y: {}".format(y.shape))
    return X, y 

def df_subsample_pos_class(all_df, rpc = 0, subsample = True):
    
    """
    
    Add sensitive feature {0,1} to the dataframe
    
    parameter:
    - all_df : original dataframe
    - rpc    : requested positive percentage to subsample
    
    output:
    - df_p      : dataframe for Y = 1 after subsampling
    - df_n      : dataframe for Y = 0
    
    """
    
    if subsample :
        rp = rpc/100 
        df_p = all_df[all_df['admit']==1].copy()
        df_n = all_df[all_df['admit']==0].copy()
        np = len(df_p)
        nn = len(df_n)
        perc_p = np/(np+nn)
    
        if rp > perc_p:
            print('Requested positive percentage (pcpc) is too high.',perc_p)
            return df_p, df_n
    
        np_dash = rp/(1-rp)* nn
        df_p = df_p.sample(int(np_dash+0.5))
    
        #print(np,nn,'--',np_dash)
        return df_p, df_n
    else:
        df_p = all_df[all_df['admit']==1].copy()
        df_n = all_df[all_df['admit']==0].copy()
        return df_p,df_n

def oversample_minority_smote_adasyn(X_N,Y_N,X_P,Y_P):

    #oversample (SMOTE) minority for both y=0 and y=1
    X_N_smote, Y_N_smote = SMOTE(sampling_strategy='minority',n_jobs=-1,random_state=0).fit_resample(X_N, Y_N)
    X_P_smote, Y_P_smote = SMOTE(sampling_strategy='minority',n_jobs=-1,random_state=0).fit_resample(X_P, Y_P)
    X_N_smote['sex'] = Y_N_smote
    X_N_smote['admit'] = np.zeros(Y_N_smote.shape)
    X_P_smote['sex'] = Y_P_smote
    X_P_smote['admit'] = np.ones(Y_P_smote.shape)
    train_smote = pd.concat([X_N_smote,X_P_smote])

    #get xtrain and ytrain for SMOTE
    y_train_smote = train_smote['admit']
    train_smote.pop('admit')
    Xtrain_smote = train_smote
    
    return Xtrain_smote,y_train_smote
    
def df_count_feat_val_match(df1, f1, v1, f2, v2):
    return len (df1[(df1[f1]==int(v1)) & (df1[f2]==int(v2))])

def optimize_model(clf,param,Xtrain,Xtest,y_train,y_test,scoring,cv):
    grid = RandomizedSearchCV(clf,param,cv=cv,scoring=scoring,n_jobs=-1,n_iter=50)
    grid.fit(Xtrain,y_train)
#     print("Classifier: {}".format(clf.__class__.__name__))
#     print("Best parameter: {}".format(grid.best_params_))
#     print("Best {}: {}".format(str(scoring),grid.best_score_))
#     print("-"*30)
    y_pred = grid.predict(Xtest)
    return y_pred,grid.best_params_

## return best param
## return best score

def underestimation_score(y_true,y_pred,SA):
    """
    
    parameter:
    - y_true : ground truth for prediction outcomes
    - y_pred : predicted outcomes
    - SA     : sensitive attributes
    
    output:
    - us_0: underestimation for S = 0
    - us_1: underestimation for S = 1
    
    """
    mydict = {}
    mydict['actual'] = y_true
    mydict['predicted'] = y_pred
    mydict['sex'] = SA
    us = pd.DataFrame(mydict)

    P_dash_FX0 = df_count_feat_val_match(us, 'predicted', 1, 'sex',0)
    P_FX0 = df_count_feat_val_match(us, 'actual', 1, 'sex',0)
    Bias_FX0 = P_dash_FX0/P_FX0
    
    if P_FX0 == 0:
        print("Divsion by zero detected!")
               
    return Bias_FX0

def evaluate_model(y_true,y_pred,SA):
    """
    
    parameter:
    - y_true : ground truth for prediction outcomes
    - y_pred : predicted labels
    - y_prob : predicted probability
    - SA     : sensitive attributes in the test data
    
    output:
    - accuracy : accuracy scores
    - rocauc   : roc auc scores
    - us_0     : underestimation score for S = 0
    - us_1     : underestimation score for S = 1
    
    """
    Counter
    accuracy = balanced_accuracy_score(y_true,y_pred)
    b = underestimation_score(y_true,y_pred,SA)

    
    ## add counter for SA
    ## add counter for class imbalance
    ## add TP TN etc
    
    return accuracy,b


    
def preprocess_data(X,y):
    """
    Numerical features are scaled using MinMaxScaler, while categorical features one-hot-encoded.
    
    parameter:
    - X    : 8-dim array of independent variable
    - y    : 1-dim array of target variable
    
    output:
    - Xtrain      : 8-dim array containing independent variable in the train test
    - Xtest       : 8-dim array containing independent variable in the test set
    - y_train     : 1-dim array of target variable in the train set
    - y_test      : 1-dim array of target variable in the test set
    
    """
    
    minority_in_Pos = 0
    
    while not minority_in_Pos:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle = True, stratify = y)
        temp = pd.DataFrame({'Feature_X': X_test[:,2], 'admit': y_test}, columns=['Feature_X', 'admit'])
#         print(df_count_feat_val_match(temp, 'admit', 1, 'Feature_X',0) )
        minority_in_Pos = df_count_feat_val_match(temp, 'admit', 1, 'Feature_X',0)    

    minmax = MinMaxScaler()
    xtrain_num = minmax.fit_transform(X_train[:,0:2])
    xtest_num = minmax.transform(X_test[:,0:2])
    
    Xtrain = np.hstack((xtrain_num,X_train[:,2].reshape(-1,1)))
    Xtest = np.hstack((xtest_num,X_test[:,2].reshape(-1,1)))

#     print("Shape of train data: {}".format(Xtrain.shape))
#     print("Shape of test data : {}".format(Xtest.shape))
    
    return Xtrain,Xtest,y_train,y_test

def US_scorer(clf, X_tst, y_tst):
    y_pred = clf.predict(X_tst)
    Fx_tst = [int(i) for i in X_tst[:,-1]]
    US_s = underestimation_score(y_tst,y_pred,Fx_tst)
    try:
        US = 1/(abs(1-US_s))
    except:
        US= 1/(abs((1-US_s))+0.0001)
    return US

from sklearn.metrics import balanced_accuracy_score

def custom_loss_function(clf,X_tst,y_test):
    y_pred = clf.predict(X_tst)
    Fx_tst = [int(i) for i in X_tst[:,-1]]
    
    US_s = underestimation_score(y_test,y_pred,Fx_tst)
    try:
        US = 1/(abs(1-US_s))
    except:
#         print("smoothing added")
        US= 1/(abs((1-US_s))+0.0001)

    accuracy = balanced_accuracy_score(y_test,y_pred)
    
    weight_ACC = 0.5
    
    return (weight_ACC*accuracy) + ((1-weight_ACC) * US)

In [5]:
models = MLPClassifier(random_state=0,max_iter=20000) 
params = {'hidden_layer_sizes': sp_randint(1, 200),\
          'activation':['tanh','relu','logistic'],'solver':['adam','sgd'],\
                        'alpha':stats.reciprocal(a=1e-7,b=1e2), 'learning_rate': ['constant','adaptive']}

In [6]:
counts = synthetic['admit'].value_counts().to_dict()
print(counts)
MCPc = counts[1]/(counts[0]+counts[0])*100
print("Percent of positive labels (Y = 1): {0:.2f}%".format(MCPc))

{0: 2980, 1: 2020}
Percent of positive labels (Y = 1): 33.89%


In [None]:
class_incidence = [30]
feature_incidence = [50]
heatmap_DF = {}
result = []
result_1 = []
result_2 = []
result_3 = []

best_model = {}


temp_heatmap = pd.DataFrame(columns=feature_incidence)
temp_heatmap_1 = pd.DataFrame(columns=feature_incidence)
temp_heatmap_2 = pd.DataFrame(columns=feature_incidence)
temp_heatmap_3 = pd.DataFrame(columns=feature_incidence)

acc_heatmap = pd.DataFrame(columns=feature_incidence)
acc_heatmap_1 = pd.DataFrame(columns=feature_incidence)
acc_heatmap_2 = pd.DataFrame(columns=feature_incidence)
acc_heatmap_3 = pd.DataFrame(columns=feature_incidence)

for class_imb in class_incidence:
    accuracies = []
    biases = []
    accuracies_1 = []
    biases_1 = []
    accuracies_2 = []
    biases_2 = []
    accuracies_3 = []
    biases_3 = []
    for feature_imb in feature_incidence:
        print((class_imb,feature_imb))
            
        df_p,df_n = df_subsample_pos_class(synthetic,class_imb)
        X,y = add_Fx(df_p,df_n,feature_imb,40)
        
        acc = []
        bias = []
        acc_1 = []
        bias_1 = []
        acc_2 = []
        bias_2 = []
        acc_3 = []
        bias_3 = [] 
        
        for n in range(20):
        #progress bar
            print(">> {}".format(n+1))
            
            for xx in range(20):
                    try:
                        Xtrain,Xtest,y_train,y_test = preprocess_data(X,y)
                        y_pred,b_ = optimize_model(models,params,Xtrain,Xtest,y_train,y_test,'balanced_accuracy',10)
                        break
                    except:
                        print("{} - Division by zero detected!",format(xx))
                        pass   
            
            
            #dataset description
            train = pd.DataFrame(np.hstack((Xtrain,np.array(y_train).reshape(-1,1))), columns=['IQ', 'SAT','sex','admit'])
            NF = train[(train['sex']==0) & (train['admit']==0)]
            NM = train[(train['sex']==1) & (train['admit']==0)]
            PF = train[(train['sex']==0) & (train['admit']==1)]
            PM = train[(train['sex']==1) & (train['admit']==1)]
            # split dataset by classes
            N = pd.concat([NF.iloc[:,0:8],NM.iloc[:,0:8]])
            Y_N = N['sex']
            N.pop('sex')
            X_N = N
            P = pd.concat([PF.iloc[:,0:8],PM.iloc[:,0:8]])
            Y_P = P['sex']
            P.pop('sex')
            X_P = P
            
            
            #SMOTE
            Xtrain_smote,y_train_smote = oversample_minority_smote_adasyn(X_N,Y_N,X_P,Y_P)
            y_pred_1,_ = optimize_model(models,params,Xtrain_smote,Xtest,y_train_smote,y_test,'balanced_accuracy',10)
            print("SMOTE done!")
            
            #Counterfactual-1
            to_be_sampled = abs(PM.shape[0]-PF.shape[0])
            sampled_NF = NF.sample(n=to_be_sampled, random_state=1)
            sampled_NF['admit'] = np.ones(to_be_sampled)
            counterfactual_1 = pd.concat([sampled_NF,PF,PM,NF,NM])
            y_train_counterfactual_1 = counterfactual_1['admit']
            counterfactual_1.pop('admit')
            Xtrain_counterfactual_1 = counterfactual_1
            y_pred_2,_ = optimize_model(models,params,Xtrain_counterfactual_1,Xtest,y_train_counterfactual_1,y_test,'balanced_accuracy',10)

                                        
            ## counterfactual - 2
            sampled_PM = PM.sample(n=to_be_sampled, random_state=1)
            sampled_PM['sex'] = np.zeros(to_be_sampled)
            sampled_PM['admit'] = np.ones(to_be_sampled)
            counterfactual_2 = pd.concat([sampled_PM,PF,PM,NF,NM])
            y_train_counterfactual_2 = counterfactual_2['admit']
            counterfactual_2.pop('admit')
            Xtrain_counterfactual_2 = counterfactual_2
            y_pred_3,_ = optimize_model(models,params,Xtrain_counterfactual_2,Xtest,y_train_counterfactual_2,y_test,'balanced_accuracy',10)
                           
            accuracy,bi = evaluate_model(y_test,y_pred,Xtest[:,2].ravel()) 
            accuracy_1,bi_1 = evaluate_model(y_test,y_pred_1,Xtest[:,2].ravel()) 
            accuracy_2,bi_2 = evaluate_model(y_test,y_pred_2,Xtest[:,2].ravel())
            accuracy_3,bi_3 = evaluate_model(y_test,y_pred_3,Xtest[:,2].ravel())
            
            print("Accuracy:")
            print(accuracy,accuracy_1,accuracy_2,accuracy_3) 
            print("Underestimation:")
            print(bi,bi_1,bi_2,bi_3)
                                        
                                        
                                        
            acc.append(accuracy)
            bias.append(bi)
            acc_1.append(accuracy_1)
            bias_1.append(bi_1)
            acc_2.append(accuracy_2)
            bias_2.append(bi_2)
            acc_3.append(accuracy_3)
            bias_3.append(bi_3)            
            
        accuracies.append(np.median(acc))
        biases.append(np.median(bias))
        accuracies_1.append(np.median(acc_1))
        biases_1.append(np.median(bias_1))
        accuracies_2.append(np.median(acc_2))
        biases_2.append(np.median(bias_2))
        accuracies_3.append(np.median(acc_3))
        biases_3.append(np.median(bias_3))
        result.append([class_imb,feature_imb,np.median(acc),np.median(bias)])
        result_1.append([class_imb,feature_imb,np.median(acc_1),np.median(bias_1)])
        result_2.append([class_imb,feature_imb,np.median(acc_2),np.median(bias_2)])
        result_3.append([class_imb,feature_imb,np.median(acc_3),np.median(bias_3)])

    temp_heatmap.loc[class_imb] = biases
    temp_heatmap_1.loc[class_imb] = biases_1
    temp_heatmap_2.loc[class_imb] = biases_2 
    temp_heatmap_3.loc[class_imb] = biases_3
    acc_heatmap.loc[class_imb] = accuracies
    acc_heatmap_1.loc[class_imb] = accuracies_1
    acc_heatmap_2.loc[class_imb] = accuracies_2 
    acc_heatmap_3.loc[class_imb] = accuracies_3
heatmap_DF["Test Data"] = temp_heatmap
heatmap_DF["SMOTE"] = temp_heatmap_1
heatmap_DF["Counterfactual 1"] = temp_heatmap_2
heatmap_DF["Counterfactual 2"] = temp_heatmap_3

(30, 50)
>> 1
SMOTE done!
Accuracy:
0.6614029201977916 0.6516883760958035 0.667936169902417 0.658603789547384
Underestimation:
0.8216216216216217 0.5513513513513514 0.8540540540540541 0.7891891891891892
>> 2
SMOTE done!
Accuracy:
0.6658780285018306 0.6520472015986697 0.6709176305847688 0.6757749026357629
Underestimation:
0.825136612021858 0.5245901639344263 0.8743169398907104 0.9180327868852459
>> 3
SMOTE done!
Accuracy:
0.667530667911373 0.6645375381069767 0.6751870706127748 0.6764867190804732
Underestimation:
0.7675675675675676 0.5459459459459459 0.7837837837837838 0.7675675675675676
>> 4
SMOTE done!
Accuracy:
0.6710766223726242 0.6507358840089269 0.6703297985617807 0.6641728780430882
Underestimation:
0.7377049180327869 0.5409836065573771 0.726775956284153 0.7049180327868853
>> 5
SMOTE done!
Accuracy:
0.6593564479192496 0.6487127499744738 0.6672068497746401 0.6517058797788702
Underestimation:
0.847953216374269 0.5614035087719298 0.9707602339181286 0.8538011695906432
>> 6
SMOTE done!


In [None]:
class_incidence = list(range(10, int(MCPc/10)*10+1, 5)) 
feature_incidence = list(range(10,51,5))
heatmap_DF = {}
result = []
result_2 = []
best_model = {}


temp_heatmap = pd.DataFrame(columns=feature_incidence)
temp_heatmap_2 = pd.DataFrame(columns=feature_incidence)

for class_imb in class_incidence:
    accuracies = []
    biases = []
    accuracies_2 = []
    biases_2 = []
    for feature_imb in feature_incidence:
        print((class_imb,feature_imb))
            
        df_p,df_n = df_subsample_pos_class(synthetic,class_imb)
        X,y = add_Fx(df_p,df_n,feature_imb,40)
        
        acc = []
        bias = []
        acc_2 = []
        bias_2 = []
            
        for n in range(5):
        #progress bar
            print(">> {}".format(n+1))
            
            for xx in range(20):
                    try:
                        Xtrain,Xtest,y_train,y_test = preprocess_data(X,y)
                        y_pred,bestparam1 = optimize_model(models,params,Xtrain,Xtest,y_train,y_test,custom_loss_function,10)
                        y_pred_2,bestparam2 = optimize_model(models,params,Xtrain,Xtest,y_train,y_test,US_scorer,10)
                        break
                    except:
                        print("{} - Division by zero detected!",format(xx))
                        pass   
                    

            accuracy,bi = evaluate_model(y_test,y_pred,Xtest[:,2].ravel()) 
            accuracy_2,bi_2 = evaluate_model(y_test,y_pred_2,Xtest[:,2].ravel())
            print("custom - {}".format(bestparam1))
            print("acc - {}".format(accuracy))
            print("bias - {}".format(bi))
            print("unders - {}".format(bestparam2))
            print("acc - {}".format(accuracy_2))
            print("bias - {}".format(bi_2))
            
            acc.append(accuracy)
            bias.append(bi)
            acc_2.append(accuracy_2)
            bias_2.append(bi_2)
            
            
        accuracies.append(np.median(acc))
        biases.append(np.median(bias))
        accuracies_2.append(np.median(acc_2))
        biases_2.append(np.median(bias_2))
        result.append([class_imb,feature_imb,np.median(acc),np.median(bias)])
        result_2.append([class_imb,feature_imb,np.median(acc_2),np.median(bias_2)])

    temp_heatmap.loc[class_imb] = biases
    temp_heatmap2.loc[class_imb] = biases_2
heatmap_DF["custom"] = temp_heatmap
heatmap_DF["underestimation"] = temp_heatmap_2