# Classical Implementation
In this notebook I train a classical (and likely biased) machine learning model on the original dataset.

## 1.1 Read in the data and export it as a CSV

In [205]:
import re

# Convert the space-delimited numeric dataset into a comma-separated file.
# Every row is stripped of its leading/trailing spaces first and only then are
# the remaining runs of spaces collapsed to a single comma, so no stray comma
# can be left at either end of a row -- including a last row that has no
# trailing newline.
with open('../german-credit-dataset/german.data-numeric', 'r') as infile:
    data_contents = ''.join(
        re.sub(r'[ ]+', ",", raw_row.rstrip('\n').strip(' ')) + '\n'
        for raw_row in infile
    )

# The input file is already closed by the time the output file is opened.
with open('../german-credit-dataset/german-numeric.csv', 'w') as outfile:
    outfile.write(data_contents)

## 1.2 Add column names to the data

In [206]:
import pandas as pd

# Labels for the 24 feature columns followed by the final 'Score' column.
# NOTE(review): 'A5*' and 'A4????' look like unresolved guesses about which
# original attribute those numeric columns correspond to -- confirm against
# the dataset documentation before relying on them.
column_names = (
    ['A1', 'A2', 'A3', 'A5*', 'A6', 'A7', 'A9', 'A11', 'A12', 'A13']
    + ['A14', 'A16', 'A18', 'A19', 'A20', 'A4????', 'A8']
    + ['A10a', 'A10b', 'A15a', 'A15b', 'A17a', 'A17b', 'A17c']
    + ['Score']
)

# The CSV is read with header=None (no header row), then the labels are attached.
data = pd.read_csv('../german-credit-dataset/german-numeric.csv', header=None)
data.columns = column_names

print('data read in and column names applied')

data read in and column names applied


## 1.3 Encode the age data as Young (0) and Aged (1)

In [207]:
# Binarise the age column A13: 0 = Young (25 or under), 1 = Aged (over 25).
# The comparison is evaluated once against the raw ages, so the result no longer
# depends on the order of two separate in-place assignments.
# The guard makes the cell safe to re-run: once the column holds only 0/1 codes,
# encoding again would compare those codes with 25 and collapse every row to 0.
if not data["A13"].isin([0, 1]).all():
    # Cast back to the column's existing dtype so downstream code sees the same type.
    data["A13"] = (data["A13"] > 25).astype(data["A13"].dtype)

## 1.4 Split the data into features and labels, then into training and test sets

In [208]:
from sklearn.model_selection import train_test_split

# 'Score' (the last of the 25 columns) is the target; the remaining 24 columns are the features.
labels = data['Score']
features = data.drop(columns=['Score'])

# Hold out 30% of the rows for testing. train_test_split shuffles the rows
# before splitting; random_state=0 makes that shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=0,
)


## 2.1 Train a Naive Bayes model

In [209]:
# Fit a Gaussian Naive Bayes classifier on the training split
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training can be chained
nb_classifier = GaussianNB().fit(X_train, y_train)
nb_classifier  # last expression: show the fitted estimator as the cell output

GaussianNB(priors=None, var_smoothing=1e-09)

## 2.2 Evaluate the model

In [210]:
# Predict a class label for every row of the held-out test features
nb_predictions = nb_classifier.predict(X_test)

In [211]:
from sklearn import metrics

# Fraction of test rows whose predicted label matches the true label
print(f'Accuracy of the Naive Bayes classifier {metrics.accuracy_score(y_test, nb_predictions)}\n')

# target_names are applied to the labels in sorted order, so 1 -> 'Good' and 2 -> 'Bad'.
print('Classification report for Naive Bayes (1:Good, 2:Bad):')
report = metrics.classification_report(y_test, nb_predictions, target_names=['Good', 'Bad'])
# Show only the header and the per-class rows. The report separates its sections
# with blank lines, so splitting on them replaces a hard-coded character count
# that would cut at the wrong place as soon as the label widths change.
print('\n\n'.join(report.split('\n\n')[:2]))

Accuracy of the Naive Bayes classifier 0.7266666666666667

Classification report for Naive Bayes (0:died, 1:recovered):
              precision    recall  f1-score   support

        Good       0.85      0.75      0.80       214
         Bad       0.52      0.67      0.59        86

  


# Fairness adjustment

In [212]:
# Inspect the first ten test rows. Each predict_proba row holds one probability
# per class, in the order of nb_classifier.classes_: the first column is class 1
# and the second column is class 2. predict() reports the class whose
# probability is larger.
print(nb_classifier.predict_proba(X_test.head(10)))
print(nb_classifier.predict(X_test.head(10)))

[[3.24721995e-01 6.75278005e-01]
 [1.00000000e+00 3.48465693e-11]
 [9.79243740e-01 2.07562596e-02]
 [7.77353687e-01 2.22646313e-01]
 [2.28606178e-02 9.77139382e-01]
 [4.93996034e-01 5.06003966e-01]
 [6.57806245e-01 3.42193755e-01]
 [8.53883342e-01 1.46116658e-01]
 [9.97970908e-01 2.02909234e-03]
 [5.05790104e-02 9.49420990e-01]]
[2 1 1 1 2 2 1 1 1 2]


In [213]:
# Partition the rows by the binary age code in A13 (0 = young, 1 = aged) and,
# inside each age group, pick out the rows whose Score equals 1.
young_group = data.loc[data['A13'].eq(0)]
young_group_good = young_group.loc[young_group['Score'].eq(1)]
aged_group = data.loc[data['A13'].eq(1)]
aged_group_good = aged_group.loc[aged_group['Score'].eq(1)]

# Print the (rows, columns) shape of each subset, one per line
print(
    young_group.shape,
    young_group_good.shape,
    aged_group.shape,
    aged_group_good.shape,
    sep='\n',
)

(190, 25)
(110, 25)
(810, 25)
(590, 25)


# Evaluation and comparison