### Try Feature Selection with Lasso

In [1]:
import os
os.chdir("..")
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from utils.data import Dataset
from utils.data import create_adult_dataset, create_bank_dataset
from utils.data import create_communities_dataset, create_compas_dataset
from utils.data import create_german_dataset, create_titanic_dataset

In [3]:
from utils.generator import gen_complete_random
from utils.completer import complete_by_mean_col, complete_by_mean_col_v2
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE

In [4]:
from sklearn.linear_model import LassoCV

In [None]:
def metrics(targetScore, protectedScore):
    """
    Score a feature so candidates can be ranked for selection.

    NOTE(review): this function is unfinished. It only takes the magnitude
    of both inputs and then falls off the end, so it always returns ``None``.
    The intended combination of the two scores is not defined in this cell.

    Parameters
    ----------
    targetScore : number
        Score of the feature with respect to the target; only its absolute
        value is used.
    protectedScore : number
        Score of the feature with respect to the protected attribute; only
        its absolute value is used.
    """
    # Sign is irrelevant for ranking, so keep magnitudes only.
    targetScore, protectedScore = abs(targetScore), abs(protectedScore)
    

In [5]:
def newBias(data, A=1, B=1):
    """
    Weighted gap in false-positive and false-negative rates between two groups.

    Parameters
    ----------
    data : sequence of 8 numbers
        Two flattened 2x2 confusion matrices back to back: positions 0-3
        belong to the first group, positions 4-7 to the second. Within each
        group the layout is read as (TN, FP, FN, TP), which is what
        ``confusion_matrix(...).ravel()`` yields for binary labels.
    A : number, default 1
        Weight applied to the absolute false-positive-rate difference.
    B : number, default 1
        Weight applied to the absolute false-negative-rate difference.

    Returns
    -------
    number
        ``A * |FPR_first - FPR_second| + B * |FNR_first - FNR_second|``.

    Raises
    ------
    IndexError
        If ``data`` has fewer than 8 entries (e.g. one group contained a
        single class, so its confusion matrix was not 2x2).
    ZeroDivisionError
        If a group has no actual negatives or no actual positives and the
        counts are plain Python ints.
    """
    def error_rate(wrong, right):
        # Share of the `wrong` cell within the (wrong + right) row total.
        return data[wrong] / (data[wrong] + data[right])

    # Evaluated in this order on purpose: callers catch whatever is raised
    # first and report it.
    fpr_first = error_rate(1, 0)
    fnr_first = error_rate(2, 3)
    fpr_second = error_rate(5, 4)
    fnr_second = error_rate(6, 7)

    fpr_gap = abs(fpr_first - fpr_second)
    fnr_gap = abs(fnr_first - fnr_second)
    return A * fpr_gap + B * fnr_gap

In [6]:
def cross_val(data_original: Dataset, data_config, clf_config, complete_function=None, selected_cols=[]):
    """
    Repeatedly train/evaluate a logistic regression and report mean bias and accuracy.

    The procedure runs 10 repetitions of a 20-split ``StratifiedShuffleSplit``.
    For every split the training part is oversampled with SMOTE, standardised,
    and used to fit a ``LogisticRegression``; the test part is scored for
    accuracy and for the group bias computed by ``newBias``.

    Parameters
    ----------
    data_original : Dataset
        Source dataset; read through ``.X``, ``.y``, ``.types``,
        ``.protected_features``, ``.categorical_features``, ``.X_encoders``
        and ``.y_encoder``.
    data_config : mapping
        Must provide ``"target"`` (name of the column used to split the test
        rows into two groups) and ``"A"`` / ``"B"`` (the two values of that
        column that define the groups).
        NOTE(review): despite the key name, the column is looked up among the
        test features after the protected columns are re-attached, so it is
        presumably a protected feature -- confirm against the config files.
    clf_config : mapping
        Must provide ``"max_iter"``, ``"C"`` and ``"tol"`` for
        ``LogisticRegression``.
    complete_function : callable, optional
        If given, each repetition first passes ``data_original`` through
        ``gen_complete_random(..., random_ratio=0.4)`` (presumably injecting
        missing values -- see ``utils.generator``), and the train and test
        parts of every split are then passed through ``complete_function``
        separately. If omitted, ``data_original`` is used unchanged.
    selected_cols : list, optional
        Forwarded to ``gen_complete_random``. The default is a shared mutable
        list; it is not modified in this function.

    Returns
    -------
    tuple
        ``(mean bias, mean accuracy)``: the mean over repetitions of the
        per-repetition means.

    Notes
    -----
    * A split is skipped entirely when ``complete_function`` raises or leaves
      null values behind.
    * When ``newBias`` raises, only the bias for that split is skipped; its
      accuracy is still recorded.
    * If every split of a repetition is skipped, ``np.mean`` of the empty
      list yields ``nan``, which propagates into the returned means.
    * No random seed is set for the splitter or SMOTE, so results vary
      between runs.
    """
    bias = []
    acc = []
    # One resampler and one scaler are reused; both are refit on every split.
    smote = SMOTE()
    scaler = StandardScaler()
    for i in range(10):
        # Regenerate the randomly degraded dataset once per repetition (only
        # when a completion strategy is being evaluated).
        if complete_function: data = gen_complete_random(data_original, random_ratio=0.4, selected_cols=selected_cols)
        else: data = data_original
        print("Running Cross Validation {}".format(i))
        bias_cv = []
        acc_cv = []
        for train_idx, test_idx in StratifiedShuffleSplit(n_splits=20).split(data.X, data.y):
            X_train, X_test = data.X.iloc[train_idx].copy(), data.X.iloc[test_idx].copy()
            Y_train, Y_test = data.y[train_idx], data.y[test_idx]
            # Positional index so later boolean-mask indices line up with Y_test.
            X_train.reset_index(drop=True, inplace=True)
            X_test.reset_index(drop=True, inplace=True)

            # Complete the training part in isolation from the test part.
            if complete_function:
                data_incomplete = Dataset("tmp", X_train, Y_train, types=data.types, 
                    protected_features=data.protected_features, categorical_features=data.categorical_features,
                    encoders=[data.X_encoders, data.y_encoder])
                try:
                    data_complete = complete_function(data_incomplete)
                except Exception as e:
                    print("Error: {}. Skipped".format(e))
                    continue
                if data_complete.X.isnull().sum().sum() > 0:
                    print("Complete function error, skipped")
                    continue
                X_train = data_complete.X.copy()
                Y_train = data_complete.y.copy()
            # The classifier never sees the protected columns.
            X_train.drop(columns=data.protected_features, inplace=True)

            # Complete the test part the same way; protected columns are kept
            # here because they are needed to form the two groups below.
            if complete_function:
                data_incomplete = Dataset("tmp", X_test, Y_test, types=data.types, 
                    protected_features=data.protected_features, categorical_features=data.categorical_features,
                    encoders=[data.X_encoders, data.y_encoder])
                try:
                    data_complete = complete_function(data_incomplete)
                except Exception as e:
                    print("Error: {}. Skipped".format(e))
                    continue
                if data_complete.X.isnull().sum().sum() > 0:
                    print("Complete function error, skipped")
                    continue
                X_test = data_complete.X.copy()
                Y_test = data_complete.y.copy()
            
            # Oversample, then standardise using statistics of the resampled
            # training data only.
            X_train_res, Y_train_res = smote.fit_resample(X_train, Y_train)
            X_scaled = scaler.fit_transform(X_train_res)
            clf = LogisticRegression(max_iter=clf_config["max_iter"], C=clf_config["C"], tol=clf_config["tol"])
            clf.fit(X_scaled, Y_train_res)
            # Scale the non-protected test columns, then re-attach the raw
            # protected columns so rows can be filtered by group.
            X_test_scaled = pd.DataFrame(scaler.transform(X_test.drop(columns=data.protected_features)), columns=X_test.drop(columns=data.protected_features).columns)
            X_test_scaled = pd.concat([X_test_scaled, X_test[data.protected_features]], axis=1)
            # Feature matrices and labels for group A and group B.
            X_test_A = X_test_scaled[X_test_scaled[data_config["target"]] == data_config["A"]].drop(columns=data.protected_features).to_numpy()
            X_test_B = X_test_scaled[X_test_scaled[data_config["target"]] == data_config["B"]].drop(columns=data.protected_features).to_numpy()
            Y_test_A = Y_test[X_test_scaled[X_test_scaled[data_config["target"]] == data_config["A"]].index.tolist()]
            Y_test_B = Y_test[X_test_scaled[X_test_scaled[data_config["target"]] == data_config["B"]].index.tolist()]
            # Flattened confusion matrices; concatenated they form the 8-entry
            # input expected by newBias.
            matrix_A = confusion_matrix(Y_test_A, clf.predict(X_test_A)).ravel().tolist()
            matrix_B = confusion_matrix(Y_test_B, clf.predict(X_test_B)).ravel().tolist()
            try:
                bias_cv.append(newBias(matrix_A+matrix_B))
            except Exception as e:
                # e.g. a group with a single class gives a smaller matrix.
                print("\tError: {}, skipped".format(e))
            # Predictions are passed first and labels second; accuracy is
            # symmetric in its two arguments, so the value is unaffected.
            acc_cv.append(accuracy_score(clf.predict(X_test_scaled.drop(columns=data.protected_features).to_numpy()), Y_test))
        bias.append(np.mean(bias_cv))
        acc.append(np.mean(acc_cv))
    return (np.mean(bias), np.mean(acc))

In [7]:
def drop_na(data: Dataset) -> Dataset:
    """
    Return a copy of ``data`` without the rows that contain missing values.

    Features and labels are joined side by side so that a row is removed from
    both ``X`` and ``y`` together. The result has a fresh 0..n-1 index.

    Parameters
    ----------
    data : Dataset
        Object exposing ``copy()``, a feature frame ``X`` and labels ``y``
        with one label per row of ``X``. The input itself is not modified
        (all writes go to the object returned by ``data.copy()``).

    Returns
    -------
    Dataset
        The copy, with ``X`` restricted to complete rows and ``y`` as a flat
        NumPy array of the matching labels.
    """
    data = data.copy()
    # Put the labels on X's own index: concat(axis=1) aligns on index labels,
    # so a default RangeIndex here would pair labels with the wrong rows (and
    # create spurious NaNs) whenever X is not indexed 0..n-1.
    target = pd.DataFrame(data.y, index=data.X.index, columns=["_TARGET_"])
    complete_rows = (
        pd.concat([data.X, target], axis=1)
        .dropna()
        .reset_index(drop=True)
    )
    data.X = complete_rows.drop(columns=["_TARGET_"]).copy()
    data.y = complete_rows["_TARGET_"].copy().to_numpy().ravel()
    return data

In [8]:
def convert_protected(data: Dataset):
    """
    Label-encode every protected feature column on a copy of ``data``.

    Parameters
    ----------
    data : Dataset
        Object exposing ``copy()``, a feature frame ``X`` and the list
        ``protected_features``.

    Returns
    -------
    tuple
        ``(converted, encoder)`` where ``converted`` is the copy with its
        protected columns replaced by integer codes.

    Notes
    -----
    A single ``LabelEncoder`` is refit for each protected column, so the
    returned encoder only holds the classes of the *last* column processed.
    """
    converted = data.copy()
    label_encoder = LabelEncoder()
    for column in converted.protected_features:
        codes = label_encoder.fit_transform(converted.X[column])
        converted.X[column] = codes
    return converted, label_encoder

In [9]:
def concat(data: Dataset) -> pd.DataFrame:
    """
    Join features and labels of ``data`` into one frame.

    Parameters
    ----------
    data : Dataset
        Object exposing ``copy()``, a feature frame ``X`` and labels ``y``
        with one label per row of ``X``.

    Returns
    -------
    pandas.DataFrame
        All columns of ``X`` followed by a ``"_TARGET_"`` column holding
        ``y``; the frame keeps the index of ``X``.
    """
    data = data.copy()
    # Build the label column on X's index: concat(axis=1) aligns on index
    # labels, so a default RangeIndex would misplace labels whenever X is
    # not indexed 0..n-1.
    target = pd.DataFrame(data.y, index=data.X.index, columns=["_TARGET_"])
    return pd.concat([data.X, target], axis=1)

In [10]:
def compare_corr_lasso(data_fn):
    """
    Compare Pearson correlation with Lasso weights for every non-protected feature.

    Two comparisons are printed: one against the prediction target and one
    against the first protected feature (label-encoded). Each comparison is a
    table with one row per feature and the columns ``Correlation`` and
    ``Lasso Weight``, followed by the R^2 score of the fitted ``LassoCV``
    on the same data it was trained on.

    Parameters
    ----------
    data_fn : callable
        Zero-argument factory returning a ``Dataset`` (e.g.
        ``create_adult_dataset``). Rows with missing values are dropped
        before any statistic is computed.

    Returns
    -------
    None
        Results are printed only.

    Notes
    -----
    * All protected features are excluded from the regressors; the feature
      list is derived from those same regressors so that ``coef_`` and the
      feature names always line up, also with several protected features.
    * The regressors are used as they come from ``data_fn`` -- no scaling is
      applied here, so Lasso weights of features on different scales are not
      directly comparable.
    """
    data = drop_na(data_fn())
    # The fitted encoder is not needed here; only the encoded copy is.
    cdata, _ = convert_protected(data)
    ccdata = concat(cdata)
    print("Dataset: {}".format(data.name))

    protected = data.protected_features[0]
    # Single source of truth for the regressors and their column order.
    X_features = data.X.drop(columns=data.protected_features)
    features = X_features.columns.tolist()
    y_protected = cdata.X[protected]

    # The correlation matrix is expensive on wide datasets; compute it once
    # and slice out the two columns of interest.
    corr_matrix = ccdata.corr()
    correlation_target = corr_matrix.loc[features, "_TARGET_"]
    correlation_protected = corr_matrix.loc[features, protected]

    # lasso result with target
    reg1 = LassoCV(cv=5, max_iter=10000)
    reg1.fit(X_features, data.y)
    result_target = pd.DataFrame(
        {"Correlation": correlation_target.to_numpy(), "Lasso Weight": reg1.coef_},
        index=features,
    )

    # lasso result with protected feature
    reg2 = LassoCV(cv=5, max_iter=10000)
    reg2.fit(X_features, y_protected)
    result_protected = pd.DataFrame(
        {"Correlation": correlation_protected.to_numpy(), "Lasso Weight": reg2.coef_},
        index=features,
    )

    print("\nComparison on target:")
    print(result_target)
    print("Lasso Score: {:.5f}".format(reg1.score(X_features, data.y)))
    print("\nComparison on protected feature:")
    print(result_protected)
    print("Lasso Score: {:.5f}".format(reg2.score(X_features, y_protected)))

In [11]:
# Correlation vs. Lasso weights on the Adult dataset.
compare_corr_lasso(create_adult_dataset)

Dataset: adult

Comparison on target:
                Correlation  Lasso Weight
age                0.242431      0.005318
workclass          0.000955     -0.007416
education          0.078255     -0.004126
education-num      0.334640      0.050853
marital-status    -0.194582     -0.023935
occupation         0.050144      0.001606
relationship      -0.251255     -0.036218
race               0.070927      0.014931
hours-per-week     0.228547      0.004246
Lasso Score: 0.21768

Comparison on protected feature:
                Correlation  Lasso Weight
age                0.082117     -0.002639
workclass          0.072763      0.014454
education         -0.028155     -0.002693
education-num      0.007443     -0.009565
marital-status    -0.120360     -0.006966
occupation         0.062068      0.003807
relationship      -0.585792     -0.167759
race               0.086147      0.007167
hours-per-week     0.230321      0.003602
Lasso Score: 0.36155


In [12]:
# Correlation vs. Lasso weights on the Bank dataset.
compare_corr_lasso(create_bank_dataset)

Dataset: bank

Comparison on target:
           Correlation  Lasso Weight
job           0.040438      0.000000
marital       0.045588      0.000000
education     0.066241      0.000000
default      -0.022419     -0.000000
balance       0.052838      0.000004
housing      -0.139173     -0.000000
loan         -0.068185     -0.000000
contact      -0.148395     -0.000000
day          -0.028348     -0.000000
month        -0.024471     -0.000000
duration      0.394521      0.000413
campaign     -0.073172     -0.000000
pdays         0.103621      0.000000
previous      0.093236      0.000000
poutcome     -0.077840     -0.000000
Lasso Score: 0.15369

Comparison on protected feature:
           Correlation  Lasso Weight
job           0.043744      0.000000
marital       0.380677      0.086795
education     0.089655      0.000000
default       0.008954      0.000000
balance      -0.053570     -0.000008
housing       0.074313      0.000000
loan          0.000707      0.000000
contact      -0.0657

In [13]:
# Correlation vs. Lasso weights on the Communities dataset.
# This dataset has many features, so lift pandas' row limit -- but only for
# this call, so the display settings of other cells are left untouched.
with pd.option_context('display.max_rows', None):
    compare_corr_lasso(create_communities_dataset)

Dataset: communities

Comparison on target:
                       Correlation  Lasso Weight
population                0.303123      0.212268
householdsize             0.005678     -0.000000
agePct12t21               0.082278     -0.000000
agePct12t29               0.167099     -0.150180
agePct16t24               0.111654      0.000000
agePct65up                0.039884      0.000000
numbUrban                 0.293348      0.000000
pctUrban                  0.026095      0.053678
medIncome                -0.404295      0.000000
pctWWage                 -0.271527     -0.069305
pctWFarmSelf             -0.088559      0.058654
pctWInvInc               -0.548007     -0.338188
pctWSocSec                0.094657      0.086555
pctWPubAsst               0.543245      0.087482
pctWRetire               -0.112958     -0.123506
medFamInc                -0.419950      0.119416
perCapInc                -0.346496     -0.000000
whitePerCap              -0.224699      0.003943
blackPerCap              

In [14]:
# Correlation vs. Lasso weights on the COMPAS dataset.
compare_corr_lasso(create_compas_dataset)

Dataset: compas

Comparison on target:
                         Correlation  Lasso Weight
age                        -0.153418     -0.007127
age_cat                     0.036969      0.015404
c_charge_degree            -0.079450     -0.000000
priors_count                0.263480      0.030396
juv_misd_count              0.104788      0.000000
juv_fel_count               0.067886      0.000000
juv_other_count             0.107027      0.000000
days_b_screening_arrest     0.057642      0.003923
sex                         0.099693      0.000000
length_of_stay             -0.016544     -0.000019
Lasso Score: 0.11279

Comparison on protected feature:
                         Correlation  Lasso Weight
age                         0.137085      0.018583
age_cat                    -0.023542     -0.000000
c_charge_degree             0.099344      0.000000
priors_count               -0.206018     -0.062475
juv_misd_count             -0.074672     -0.000000
juv_fel_count              -0.048369   

In [15]:
# Correlation vs. Lasso weights on the German credit dataset.
compare_corr_lasso(create_german_dataset)

Dataset: german

Comparison on target:
                    Correlation  Lasso Weight
Status_account        -0.350847     -0.000000
Duration_month         0.214927      0.005122
Credit_history        -0.228785     -0.000000
Purpose                0.001514     -0.000000
Credit_amount          0.154739      0.000011
Savings_account       -0.178943     -0.000000
Employment_since      -0.116002     -0.000000
Installment_rate       0.072404      0.000000
Personal_status       -0.088184     -0.000000
Debtors_guarantors    -0.025137     -0.000000
Residence_since        0.002967     -0.000000
Property               0.142612      0.000000
Installment_plans     -0.109844     -0.000000
Housing               -0.019315     -0.000000
Number_credits        -0.045732     -0.000000
Job                    0.032735     -0.000000
Num_liable_people     -0.003015     -0.000000
Telephone             -0.036466     -0.000000
Foreign               -0.082079     -0.000000
Lasso Score: 0.04474

Comparison on prote

In [16]:
# Correlation vs. Lasso weights on the Titanic dataset.
compare_corr_lasso(create_titanic_dataset)

Dataset: titanic

Comparison on target:
          Correlation  Lasso Weight
Pclass      -0.356462     -0.219414
Age         -0.082446     -0.008103
SibSp       -0.015523     -0.042031
Parch        0.095265      0.035061
Fare         0.266100      0.000582
Embarked    -0.181979     -0.038274
Lasso Score: 0.19762

Comparison on protected feature:
          Correlation  Lasso Weight
Pclass       0.150826      0.073024
Age          0.099037      0.003833
SibSp       -0.106296      0.000000
Parch       -0.249543     -0.110238
Fare        -0.182457     -0.000641
Embarked     0.109639      0.025642
Lasso Score: 0.10482
