# Classical Implementation
In this notebook I have trained a classical (and likely biased) machine learning model on the original dataset

## 1.1 Read in the data and export it as a CSV

In [58]:
import re

with open('../german-credit-dataset/german.data-numeric', 'r') as infile:
    data_contents = infile.read()    
    data_contents = re.sub(r'[ ]+', ",", data_contents)
    data_contents = re.sub(r'^,', "", data_contents)
    data_contents = re.sub(r'\n,', "\n", data_contents)
    data_contents = re.sub(r',\n', "\n", data_contents)
    # data_contents = re.sub(r'^,|\n,|,\n', "\n", data_contents)

    with open('../german-credit-dataset/german-numeric.csv', 'w') as outfile:
        outfile.write(data_contents)

## 1.2 Add column names to the data

In [59]:
import pandas as pd

# data = pd.read_csv('../german-credit-dataset/german.csv')
data = pd.read_csv('../german-credit-dataset/german-numeric.csv', header=None)
data.columns = [
    'A1',
    'A2',
    'A3',
    'A5*',
    'A6',
    'A7',
    'A9',
    'A11',
    'A12',
    'A13',
    'A14',
    'A16',
    'A18',
    'A19',
    'A20',
    'A4????',
    'A8',
    'A10a',
    'A10b',
    'A15a',
    'A15b',
    'A17a',
    'A17b',
    'A17c',
    'Score'
]


print('data read in and column names applied')

data read in and column names applied


## 1.3 Encode the age data as Young (0) and Aged (1)

In [60]:
data.loc[data.A13 <= 25, "A13"] = 0
data.loc[data.A13 > 25, "A13"] = 1

## 1.4 Split the data into Features and labels and into training and testing

In [61]:
from sklearn.model_selection import train_test_split

features = data.iloc[:, :24] # columns 0 to 19
labels = data.iloc[:, 24] # column 20

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=0) # This also shuffles the data


## 2.1 Train a Naive Bayes model

In [62]:
# Import and fit a naive bayes model
from sklearn.naive_bayes import GaussianNB
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## 2.2 Evaluate the model

In [63]:
nb_predictions = nb_classifier.predict(X_test)

In [64]:
from sklearn import metrics

print(f'Accuracy of the Naive Bayes classifier {metrics.accuracy_score(y_test, nb_predictions)}\n')

print(f'Classification report for Naive Bayes (0:died, 1:recovered):')
print(metrics.classification_report(y_test, nb_predictions, target_names=['Good','Bad'])[:166])

Accuracy of the Naive Bayes classifier 0.7266666666666667

Classification report for Naive Bayes (0:died, 1:recovered):
              precision    recall  f1-score   support

        Good       0.85      0.75      0.80       214
         Bad       0.52      0.67      0.59        86

  


# Fairness adjustment

## 1.1 Discrimination Measure
We use the KCDM measure to test the Discrimination level present within the dataset.

In [65]:
young_group = data[data['A13'] == 0]
young_group_good = young_group[young_group['Score'] == 1]
aged_group = data[data['A13'] == 1]
aged_group_good = aged_group[aged_group['Score'] == 1]

print(young_group.shape[0])
print(young_group_good.shape[0])
print(aged_group.shape[0])
print(aged_group_good.shape[0])

discrimination = aged_group_good.shape[0] / aged_group.shape[0] - young_group_good.shape[0] / young_group.shape[0]
print(discrimination)


190
110
810
590
0.14944769330734242


## Apply the CND algorithm

Next we use the naive bayes classifier to rank the instances by the probability of appearing in the positive class

In [66]:
# Identify the two groups, candidates for promotion and candidates for demotion
# these will need identifying along with their index in the real dataset so that their labels can be flipped later

In [83]:
# Make another column and store the predicted score

# print(nb_classifier.predict_proba(X_test.iloc[0:10]))
# print(nb_classifier.predict(X_test.iloc[0:10]))

def nb_predict(row):
    '''
    INPUT: A row from the feature data
    RETURNS: The probability of that row belonging to the positive class
    '''
    a = row.values
    a = a.reshape(1,-1)
    ps = nb_classifier.predict_proba(a)
    return ps[0][0]

# print(rank(X_test.iloc[0]))

X_train['rank scores'] = X_train.apply(nb_predict, axis=1, result_type='expand')

candidates_for_promotion = X_train[X_train.A13==0 and Y_train == 2]
print(candidates_for_promotion.shape)

# Order the groups by the scores generated ae

# Calculate how many swaps we need
swaps_required = ( (young_group.shape[0] * aged_group_good.shape[0]) - (aged_group.shape[0] * young_group_good.shape[0]) ) / (young_group.shape[0] + aged_group.shape[0])

# Make that many swaps
for i in range(int(swaps_required)):
    pass
    # Swap the top object from CP
    # Swap the top object from CD
    # remove both from the lists


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

# Evaluation and comparison