# PoC on German Credit

In [91]:
%reload_ext autoreload
%autoreload 2

## Data Processing

In [92]:
import pandas as pd

data = pd.read_csv('data/german_credit.csv')
data.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


## Model Training

In [93]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

target_name = 'Risk'

# Encode categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Split the data into features and target variable
X = data.drop(target_name, axis=1)
y = data[target_name]

# Split the data into training and testing sets
X_labeled, X_unlabeled, y_labeled, _ = train_test_split(X, y, test_size=0.7, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)


In [94]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy = {accuracy}')
print(report)


Accuracy = 0.6833333333333333
              precision    recall  f1-score   support

           0       0.67      0.27      0.39        22
           1       0.69      0.92      0.79        38

    accuracy                           0.68        60
   macro avg       0.68      0.60      0.59        60
weighted avg       0.68      0.68      0.64        60



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Fairness Checking (Naive)

In [95]:
import numpy as np

sensitive_attribute = 'Sex'

# Extract the coefficient for the "sex" feature
sex_coefficient = model.coef_[0][X.columns.get_loc(sensitive_attribute)]

# Check the distribution of predictions for different sexes
predicted_proba = model.predict_proba(X_test)[:, 1]
X_test['predicted_proba'] = predicted_proba
X_test['actual'] = y_test

# Group by sex
grouped_by_sex = X_test.groupby(sensitive_attribute).agg({
    'predicted_proba': ['mean'],
    'actual': ['mean']
})

sex_coefficient, grouped_by_sex


(0.6232179716088875,
     predicted_proba    actual
                mean      mean
 Sex                          
 0          0.650737  0.666667
 1          0.763060  0.611111)

## DataMatcher Module Test

In [96]:
from src.matching import NearestNeighborDataMatcher

In [97]:
X_observe = X_train


In [98]:
matching = NearestNeighborDataMatcher(X_labeled=X_observe, X_unlabeled=X_unlabeled).match(n_neighbors=2)

## Fairness Attribution Module Test

In [99]:
from src.attribution import FairnessExplainer

In [100]:
sen_att_name = ["Sex"]

sen_att = [X_observe.columns.get_loc(name) for name in sen_att_name]

In [101]:
explainer = FairnessExplainer(model=model, sen_att=sen_att, priv_val=[1])

In [110]:
explainer.shap_values(
    X=X_observe.values,
    X_baseline=X_unlabeled.values,
    matching=matching,
    sample_size=1000,
    shap_sample_size="auto",
)



array([[[ 0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00],
        ...,
        [ 0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00]],

       [[ 0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00],
        ...,
        [ 0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00]],

       [[-4.80275669e-03, -4.80275669e-03],
        [ 0.00000000e+00,  0.00000000e+00],
        [ 5.49011171e-03,  5.49011171e-03],
        ...,
        [ 1.15397558e-05,  1.15397558e-05],
        [ 2.42158276e-03,  2.42158276e-03],
        [ 5.35125886e-03,  5.35125886e-03]],

       ...,

       [[-5.73690473e-03, -5.73690473e-03],
        [-9.38127064e-03, -9.38127064e-03],
        [-8.44776448e-03, -8.44776448e-03],
        ...,
     