# Import Libs & Datasets

In [27]:
import dowhy, os
import dowhy.datasets
import pandas as pd
import numpy as np
from dowhy import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression

# Data preprocess

In [28]:
def dataread(dataloc):
    data = pd.read_csv(dataloc, index_col= 0)
    df = str_to_int(data)
    return df

In [29]:
def isnull_extract(df):
    # print(df.columns)
    to_be_extracted = []
    for i in df.columns:
        if(df[i].isnull().sum()>0):
            to_be_extracted.append(i)
    # print(to_be_extracted)
    extracted_df = df.drop(to_be_extracted, axis=1)
    return extracted_df

In [30]:
def str_to_int(df):
    df = isnull_extract(df) # first, extract columns that has NaN value
    target_df = df
    # print(target_df)
    for i in target_df.columns:
        # print(i)
        # print(target_df[i].dtype == 'object')
        if (target_df[i].dtype == 'object'):
        #    print(len(target_df[i].unique()))
           target_col = target_df[i]
           dictionary = {}
           for c in range(len(target_col.unique())):
               dictionary[target_col.unique()[c]] = c
        
        #    print(dictionary)

        #    print(target_df[i][0])
           for r in range(len(target_df[i])):
               target_df[i][r] = dictionary[target_df[i][r]]
    target_df = target_df.astype('int')
    return target_df
          
# dataset['different_room_assigned']= dataset['different_room_assigned'].replace(1,True)         

In [31]:
def split(df, random_state = 42):
    X = df.iloc[:,0:-1]
    y= df.iloc[:, -1]
    y= y.astype('int')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=random_state)
    return(X_train, X_test, y_train, y_test)

## - Causal Variables Setting, Causal Loss Function Definition

In [19]:
def treatment_variable_setting(data, treatment = None):
    if not treatment:
        pass
    else:
        if len(data[treatment].unique()) > 2:
            pass
        else:
            data[treatment]= data[treatment].replace(1,True) 
            data[treatment]= data[treatment].replace(0,False)   
            data = data.astype({treatment:'bool'}, copy = False)
    return data

In [20]:
# Causal Loss function
def cm(data, treatment, outcome, common_causes):
    print(data)
    model=CausalModel(
            data = data,
            treatment= treatment,
            outcome= outcome,
            common_causes= common_causes)
    identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
    estimate = model.estimate_effect(identified_estimand,
            method_name="backdoor.propensity_score_weighting",
            method_params={"weighting_scheme":"ips_weight"})
    return estimate.value

## - Logistic Regression with Lowest ATE( = CIF Model)

We make logistic regression model awaring causality and fairness.

$$M^* = argmin_M (E[Y = 1 | T = 1] - E[Y = 1 | T = 0])$$ which will be denoted as
$$M^* = argmin_M (\text{Average Treatment Effect})$$

In [32]:
def lr(dataloc, rep = 100, treatment = None, outcome = None, common_causes = None):
    """Data Read & Declare Variables"""
    # Data Read
    df = dataread(dataloc)

    # Declare Lists needed
    acc_list = []
    ate_list = []
    sum_list = []
    data_list = []
    model_list = []

    """Train Model"""
    for i in range(rep):
        X_train, X_test, y_train, y_test = split(df, random_state = i)
        # Logistic Regression
        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        acc_list.append(lr.score(X_test, y_test)) #, i, X_train, y_train)]

        # ATE
        concat_df = pd.concat([X_train, y_train], axis= 1)
        booled_df = treatment_variable_setting(concat_df, treatment = treatment)
        ate = cm(booled_df, treatment, outcome, common_causes)
        ate_list.append(ate)

        # Data Append
        data_list.append((X_train, y_train))

        # Model Append
        model_list.append(lr)

    """Define Target Functions"""
    # Target 1. Logistic Regression Accuracy - the higher the better
    acc_list = np.asarray(acc_list)

    # Target 2. Causal Inference Average Treatement Effect - the nearer to zero, the less causality
    ate_list = np.asarray(ate_list)

    # Sum of Above
    sum_list = acc_list - abs(ate_list)
    sum_list = np.asarray(sum_list)

    """Pick two models to compare"""
    """1. Plain-Vanilla Logistic Regression (We Call 'PV Model')"""
    # PV Model Index - which has highest Accuracy
    pv_index = np.argmax(acc_list)

    # PV Model
    pv_model = model_list[pv_index]

    # Pick  Metrics
    pv_acc, pv_ate = acc_list[pv_index], ate_list[pv_index]

    """2. Logistic Regression awaring Causality for Fairness (We Call 'CF Model')"""
    # CF Model Index - which has lowest ATE
    cf_index = np.argmin(ate_list)

    # CF Model
    cf_model = model_list[cf_index]

    # Pick  Metrics
    cf_acc, cf_ate = acc_list[cf_index], ate_list[cf_index]

    """Print"""
    # PV Model
    print("---------------------------------------------------------------------------------------------------------------------")
    print(f'PV Model returned with the Accuracy of {pv_acc} and the ATE of {pv_ate}')
    print("---------------------------------------------------------------------------------------------------------------------")

    # CF Model
    print("---------------------------------------------------------------------------------------------------------------------")
    print(f'CF Model returned with the Accuracy of {cf_acc} and the ATE of {cf_ate}')
    print("---------------------------------------------------------------------------------------------------------------------")
    return pv_model, cf_model

## Fairness Metrics from Paper 'Fairness Definitions Explained(Verma et al.)'

Those are notations, metrics and ratios from [paper (Verma et al.)](https://www.ece.ubc.ca/~mjulia/publications/Fairness_Definitions_Explained_2018.pdf)  
The Paper explains 25 definitions of fairness.


#### Notations
- $Y :$ Actual Binary
- $\hat{Y} :$ Predicted Binary
- $G :$ Protected Group(ex. Gender)
- $m :$ Male (0, False)
- $f :$ Female (1, True)

### Metrics from Confusion Matrix
- $TP = P(Y = 1, \hat{Y} = 1)$  
- $FP = P(Y = 0, \hat{Y} = 1)$ a.k.a Type I error
- $FN = P(Y = 1, \hat{Y} = 0)$ a.k.a Type II error
- $TN = P(Y = 0, \hat{Y} = 0)$

### Ratios from Metrics
- $PPV = \frac{TP}{TP+FP} = P(Y=1 | \hat{Y}=1)$ , Positive Predictive Value
- $FDR = \frac{FP}{TP+FP} = P(Y=0 | \hat{Y}=1)$ , False Discovery Rate 
- $FOR = \frac{FN}{TN+FN} = P(Y=1 | \hat{Y}=0)$, False Omission Rate 
- $NPV = \frac{TN}{TN+FN} = P(Y=0 | \hat{Y}=0)$, Negative Predictive Value 
- $TPR = \frac{TP}{TP+FN} = P(\hat{Y}=1 | Y=1)$, True Positive Rate 
- $FPR = \frac{FP}{FP+TN} = P(\hat{Y}=1 | Y=0)$, False Positive Rate 
- $FNR = \frac{FN}{TP+FN} = P(\hat{Y}=0 | Y=1)$, False Negative Rate
- $TNR = \frac{TN}{FP+TN} = P(\hat{Y}=0 | Y=0)$, True Negative Rate 


We will compare CIF Model with plain-vanilla logistic regression model below metrics

#### 1. Type I error
- $FP = P(Y = 0, \hat{Y} = 1)$
#### 2. Type II error
- $FN = P(Y = 1, \hat{Y} = 0)$
#### 3. Group Fairness (3.1.1. in paper)
- $P(\hat{Y} = 1 | G = m) = P(\hat{Y} = 1 | G = f)$

#### 4. Predictive Parity(3.2.1. in paper)
- $P(Y = 1 | \hat{Y} = 1, G = m) = P(Y = 1 | \hat{Y} = 1, G = f)$

#### 5. False Positive Error Rate(3.2.2. in paper)
- $P(\hat{Y} = 1 | Y = 0, G = m) = P(\hat{Y} = 1 | Y = 0, G = f)$

#### 6. Treatment Equality(3.2.7. in paper)
- $\frac{FN}{FP}(m) = \frac{FN}{FP}(f)$


In [22]:
def fairness_metrics(df):
    df_1 = df[df['Sex'] == 1]
    df_0 = df[df['Sex'] == 0]

    # 0. Accuracy 
    acc = len(df[df['Risk'] == df['Pred_y']]) / len(df)

    # 1. Type I Error
    ## higher - less preferred
    def type1(df_n):
        return len(df_n[(df_n['Risk'] == 0) & (df_n['Pred_y'] == 1)]) / len(df_n)

    # 2. Type II Error
    ## higher - more preferred
    def type2(df_n):
        return len(df_n[(df_n['Risk'] == 1) & (df_n['Pred_y'] == 0)]) / len(df_n)
    
    # 3. Group Fairness
    ## higher - more preferred
    def gf(df_n):
        return len(df_n[df_n['Pred_y'] == 0]) / len(df_n)
    
    # 4. Predictive Parity
    ## higher - more precise
    def pp(df_n):
        return len(df_n[(df_n['Risk'] == 1) & (df_n['Pred_y'] == 1)]) / len(df_n[df_n['Pred_y'] == 1])

    # 5. False Positive Error Rate
    ## higher - less preferred
    def fp(df_n):
        return len(df_n[(df_n['Risk'] == 0) & (df_n['Pred_y'] == 1)]) / len(df_n[df_n['Risk'] == 0])
    
    # 6. Treatment Equality
    ## higher - more preferred
    def te(df_n):
        fn = len(df_n[(df_n['Risk'] == 1) & (df_n['Pred_y'] == 0)]) / len(df_n)
        fp = len(df_n[(df_n['Risk'] == 0) & (df_n['Pred_y'] == 1)]) / len(df_n)
        return fn/fp

    return [[type1(df_0), type2(df_0), gf(df_0), pp(df_0), fp(df_0), te(df_0)], [type1(df_1), type2(df_1), gf(df_1), pp(df_1), fp(df_1), te(df_1)]]
    

def show(metrics):
    metrics = pd.DataFrame(metrics)
    diff = metrics.iloc[0, :] - metrics.iloc[1, :]
    # print(diff)
    metrics = metrics.append(diff, ignore_index=True)
    metrics.columns = ['Type I Error', 'Type II Error', 'Group Fairness', 'Predictive Parity', 'False Positive Error Rate', 'Treatment Equality']
    metrics.index = ['Male' , 'Female', 'Diff']
    metrics = metrics.round(2)
    return metrics

## Model Performance Comparison
- Here we will compare the result of 1. Plain-Vanilla Logistic Regression and 2. Logistic Regression awaring Causality

1. Plain-Vanilla Logistic Regression (We Call 'PV')
2. Logistic Regression awaring Causality(for Fairness) (We Call 'CF')


In [23]:
def fairness_comparison(dataloc, models):
    # Data & Model Read
    df = dataread(dataloc)
    pv, cf = models

    # Plain-Vanilla Regression Model
    # X_train, X_test, y_train, y_test = split(df)
    X = df.iloc[:, 0:-1]
    # lr = LogisticRegression()
    # lr.fit(X_train, y_train)
    pred_y = pv.predict(X)
    pred_y = pd.DataFrame(pred_y, columns=["Pred_y"])
    pv_df = pd.concat([df, pred_y], axis= 1)

    reg_acc = len(pv_df[pv_df['Risk'] == pv_df['Pred_y']]) / len(pv_df)
    
    # CF Model
    X = df.iloc[:, 0:-1]
    pred_y = cf.predict(X)
    pred_y = pd.DataFrame(pred_y, columns=["Pred_y"])
    cf_df = pd.concat([df, pred_y], axis= 1)

    cf_acc = len(cf_df[cf_df['Risk'] == cf_df['Pred_y']]) / len(cf_df)

    return pv_df, cf_df

In [33]:
dataloc = './dataset/german/german_credit.csv'
models = lr(dataloc, rep = 50, treatment = 'Sex', outcome = 'Risk', common_causes = 'Age+Job+Housing+Credit amount+Duration+Purpose'.split('+'))
## treatment = 'Sex' 0, False if Men and 1, True if Women
## outcome = 'Risk' 0 if one is credit-riskless, 1 if one has credit-risk
## common_causes = 'Age+Job+Housing+Credit amount+Duration+Purpose'.split('+')
pv_df, cf_df = fairness_comparison(dataloc, models)

1766         6        2     0
911   25   True    1        0           4736        24        2     1

[500 rows x 8 columns]
     Age    Sex  Job  Housing  Credit amount  Duration  Purpose  Risk
736   23   True    3        2          11560        24        3     1
480   23   True    1        0           3573        12        0     0
469   35  False    1        0           4679        24        3     0
908   46   True    1        0           3594        15        3     0
375   37   True    2        2           7685        48        4     1
..   ...    ...  ...      ...            ...       ...      ...   ...
316   38  False    1        0            708        12        2     0
467   32  False    2        0           7238        48        0     0
53    31  False    2        0           3378        18        3     0
843   50  False    2        0           1559        24        4     0
417   23   True    2        2           8471        18        1     0

[500 rows x 8 columns]
     Age    

## Which model is Fairer?

1. Plain-Vanilla Logistic Regression (We Call 'PV')
2. Logistic Regression awaring Causality(for Fairness) (We Call 'CF')

#### 1. Type I error
- $FP = P(Y = 0, \hat{Y} = 1)$
- Higher Type I error means that more people who actually are riskless are predicted as riskful.

#### 2. Type II error
- $FN = P(Y = 1, \hat{Y} = 0)$
- Higher Type II error means that more people who actually are riskful are predicted as riskless.

#### 3. Group Fairness (3.1.1. in paper)
- $P(\hat{Y} = 1 | G = m) = P(\hat{Y} = 1 | G = f)$
- Group Fairness is the possibility of predicted as riskful conditioned on gender.
- Higher Group Fairness means that group(male or female) has bigger possibility of being predicted as riskful.

#### 4. Predictive Parity(3.2.1. in paper)
- $P(Y = 1 | \hat{Y} = 1, G = m) = P(Y = 1 | \hat{Y} = 1, G = f)$
- Predictive Parity is the possibility of actually riskful people is predicted as riskful.
- Higher Predictive Parity means that group(male or female) prediction is more precise(if gap is big, fairness assessment is further needed)

#### 5. False Positive Error Rate(3.2.2. in paper)
- $P(\hat{Y} = 1 | Y = 0, G = m) = P(\hat{Y} = 1 | Y = 0, G = f)$
- False Positive Error Rate is the possibility of people predicted as riskful people were actually riskless.
- Higher False Positive Error Rate means that the group(male or female) has higher possibility of wrongfully discriminated as riskless.

#### 6. Treatment Equality(3.2.7. in paper)
- $\frac{FN}{FP}(m) = \frac{FN}{FP}(f)$
- Higher Treatment Equality means that the group(male or female) has higher Type II error divided by Type I error.


In [34]:
# fairness_metrics(cf_df)
show(fairness_metrics(cf_df))

Unnamed: 0,Type I Error,Type II Error,Group Fairness,Predictive Parity,False Positive Error Rate,Treatment Equality
Male,0.03,0.24,0.93,0.51,0.05,7.26
Female,0.03,0.29,0.91,0.64,0.05,9.1
Diff,0.0,-0.05,0.02,-0.13,-0.0,-1.84


In [35]:
# fairness_metrics(pv_df)
show(fairness_metrics(pv_df))

Unnamed: 0,Type I Error,Type II Error,Group Fairness,Predictive Parity,False Positive Error Rate,Treatment Equality
Male,0.04,0.23,0.91,0.52,0.06,5.3
Female,0.06,0.26,0.85,0.57,0.1,4.1
Diff,-0.02,-0.03,0.06,-0.06,-0.04,1.2
