In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Raise pandas' display limits so wide / long frames are shown with more
# columns and rows before being truncated.
for option_name, limit in (('display.max_columns', 500), ('display.max_rows', 100)):
    pd.set_option(option_name, limit)


In [None]:
# Location of the raw credit data, relative to the notebook's working directory.
CREDIT_DATA_PATH = "datasets/german_credit.csv"
df = pd.read_csv(CREDIT_DATA_PATH)

In [None]:
# Collapse the coded column into a 0/1 indicator: 1 where the original code is
# 2 or 5, 0 otherwise. (Later cells treat 1 as female and 0 as male.)
df['Sex & Marital Status'] = df['Sex & Marital Status'].isin([2, 5]).astype(int)

In [None]:
# Split the data on the binary sex indicator (0 -> male, 1 -> female), then
# remove the indicator from all three frames so it is not used as a feature.
# DataFrame.drop returns a new frame, so each result must be assigned back.
df_male = df[df['Sex & Marital Status'] == 0].drop(columns=['Sex & Marital Status'])
df_female = df[df['Sex & Marital Status'] == 1].drop(columns=['Sex & Marital Status'])
df = df.drop(columns=['Sex & Marital Status'])

In [None]:
# Categorical features that get one-hot encoded (shared by all three frames).
CATEGORICAL_COLUMNS = ['Account Balance', 'Payment Status of Previous Credit', 'Purpose', 'Guarantors','Most valuable available asset', 'Type of apartment', 'Occupation', 'Telephone', 'Foreign Worker']

# dtype=int keeps the indicator columns numeric regardless of the pandas
# version's default dummy dtype, so np.array(df) stays a numeric array.
df = pd.get_dummies(df, columns = CATEGORICAL_COLUMNS, dtype = int)

# Encode each subgroup, then align it to the full frame's columns: a category
# that never occurs inside one group would otherwise be missing there and the
# subgroup feature matrix would no longer line up with the fitted weights.
df_male = pd.get_dummies(df_male, columns = CATEGORICAL_COLUMNS, dtype = int).reindex(columns = df.columns, fill_value = 0)
df_female = pd.get_dummies(df_female, columns = CATEGORICAL_COLUMNS, dtype = int).reindex(columns = df.columns, fill_value = 0)

In [None]:
# Preview the first ten rows of the male subset.
df_male.iloc[:10]

In [None]:
# Labels for the design-matrix columns: every column of df followed by a
# trailing 'Intercept' label, with the 'Creditability' target taken out.
data = np.array(df)
cols = [*df.columns, 'Intercept']
cols.remove('Creditability')

In [None]:
# Column 0 of the frame is used as the label vector; the remaining columns plus
# a trailing column of ones (the intercept term) form the design matrix.
data = np.array(df)
intercept_column = np.ones((data.shape[0], 1))
Y = data[:, 0]
X = np.hstack((data[:, 1:], intercept_column))

In [None]:
# Eyeball the design matrix: feature columns followed by the constant column.
print(X)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# X already carries its own constant column, so sklearn's intercept is disabled.
clf = LogisticRegression(fit_intercept = False)
clf.fit(X, Y)

# Sanity check on a single row (index 6), reshaped to the 2-D input sklearn expects.
sample_row = X[6, :].reshape(1, X.shape[1])
print(clf.predict(sample_row))
print(clf.predict_proba(sample_row))

In [None]:
# Accuracy of the fitted classifier on the same data it was trained on.
train_accuracy = clf.score(X, Y)
print(train_accuracy)

In [None]:
def accuracy(preds, actuals):
    """Fraction of matching entries between two 0/1 label arrays.

    Computed as one minus the mean absolute difference, which equals the
    classification accuracy when both arrays hold only 0s and 1s.

    preds   -- array of predicted labels
    actuals -- array of true labels, same length as preds
    """
    total_error = np.sum(abs(preds - actuals))
    n_samples = preds.shape[0]
    return 1 - total_error / n_samples

In [None]:
# Cross-check the hand-written accuracy() on the classifier's own predictions.
preds = clf.predict(X)
manual_accuracy = accuracy(preds, Y)
print(manual_accuracy)

In [None]:
def Pr_Y_given_X(coef,yhat,X):
    """Logistic (sigmoid) of the signed linear score, one value per row of X.

    coef -- 1-D weight vector, one entry per column of X
    yhat -- scalar multiplier applied to the linear score before the sigmoid
            (callers in this notebook pass 1)
    X    -- 2-D feature matrix

    Returns 1 / (1 + exp(-yhat * (X @ coef))).
    """
    signed_score = yhat * (X @ coef)
    return 1 / (1 + np.exp(-signed_score))

In [None]:
#print(1/(1 + np.exp(-1*np.dot(np.reshape(X[6,:],X.shape[1]),np.reshape(coef, coef.shape[1])))))

In [None]:
#print(coef.reshape(coef.shape[1]))

In [None]:
#print(Pr_Y_given_X(coef.reshape(coef.shape[1]),1,X))

In [None]:
def P_Y(actuals):
    """Share of positive entries in a 0/1 label vector.

    actuals -- array-like exposing .shape (ndarray or pandas Series)

    Returns sum(actuals) divided by the number of entries.
    """
    n_positive = np.sum(actuals)
    n_samples = actuals.shape[0]
    return n_positive / n_samples

In [None]:
# Observed rate of positive 'Creditability' labels within each group.
base_rate_male = P_Y(df_male['Creditability'])
base_rate_female = P_Y(df_female['Creditability'])
print("P(Y=1 | Male) = ", base_rate_male)
print("P(Y=1 | Female) = ", base_rate_female)

In [None]:
def P_Yhat_given_Y(preds,actuals):
    """Estimate P(Yhat = 1 | Y = 1): the share of actual positives predicted positive.

    Both inputs are expected to be 0/1 arrays of equal length, so the
    element-wise product counts the rows where both are 1.

    preds   -- array of predicted labels
    actuals -- array of true labels

    Returns the ratio as a float, or NaN when `actuals` contains no positives
    (the conditional probability is undefined in that case).
    """
    n_actual_positive = np.sum(actuals)
    # Guard the 0/0 case explicitly rather than relying on numpy's warning
    # (or a ZeroDivisionError when the sum is a plain Python number).
    if n_actual_positive == 0:
        return np.nan
    return np.sum(np.multiply(preds, actuals)) / n_actual_positive

In [None]:
def P_Y_given_Yhat(preds,actuals):
    """Estimate P(Y = 1 | Yhat = 1): the share of predicted positives that are truly positive.

    Both inputs are expected to be 0/1 arrays of equal length, so the
    element-wise product counts the rows where both are 1.

    preds   -- array of predicted labels
    actuals -- array of true labels

    Returns the ratio as a float, or NaN when nothing is predicted positive
    (the conditional probability is undefined in that case).
    """
    n_predicted_positive = np.sum(preds)
    # Guard the 0/0 case explicitly rather than relying on numpy's warning
    # (or a ZeroDivisionError when the sum is a plain Python number).
    if n_predicted_positive == 0:
        return np.nan
    return np.sum(np.multiply(preds, actuals)) / n_predicted_positive

In [None]:
def _features_and_labels(frame):
    """Split an encoded frame into (X, Y): column 0 is the label vector, the
    remaining columns plus a trailing column of ones form the feature matrix."""
    values = np.array(frame)
    features = np.hstack((values[:, 1:], np.ones((values.shape[0], 1))))
    return features, values[:, 0]

# Score each group with the classifier fitted on the full data set.
X_male, Y_male = _features_and_labels(df_male)
preds_male = clf.predict(X_male)
X_female, Y_female = _features_and_labels(df_female)
preds_female = clf.predict(X_female)

# Share of predicted positives that are truly positive, per group.
print("P(Y = 1 | Yhat = 1, Male) = ", P_Y_given_Yhat(preds_male, Y_male))
print("P(Y = 1 | Yhat = 1, Female) = ", P_Y_given_Yhat(preds_female, Y_female))
# Share of actual positives that are predicted positive, per group.
print("P(Yhat = 1 | Y = 1, Male) = ", P_Yhat_given_Y(preds_male, Y_male))
print("P(Yhat = 1 | Y = 1, Female) = ", P_Yhat_given_Y(preds_female, Y_female))


In [None]:
# Rate of *predicted* positives per group. The inputs are the classifier's
# predictions, not the true labels, so the quantity is P(Yhat = 1 | group).
print("P(Yhat=1 | Male) = ", P_Y(preds_male))
print("P(Yhat=1 | Female) = ", P_Y(preds_female))

In [None]:
def P_X(X):
    """Column-wise average of a 2-D matrix.

    X -- 2-D array with one row per sample

    Returns a 1-D array: each column's sum divided by the number of rows.
    """
    column_totals = np.sum(X, axis = 0)
    n_rows = X.shape[0]
    return column_totals / n_rows

In [None]:
# Per-feature gap between the groups' column averages (female minus male).
female_feature_means = P_X(X_female)
male_feature_means = P_X(X_male)
diff = female_feature_means - male_feature_means

In [None]:
# Row 0: fitted weights; row 1: female-minus-male feature gap. One column per
# entry of `cols`.
coef_rows = np.vstack([clf.coef_, diff])
coef_df = pd.DataFrame(coef_rows, columns = cols)

In [None]:
# Show the weights / gap table (head() defaults to the first five rows).
coef_df.iloc[:5]

In [None]:
# Row 0 of coef_df as a flat weight vector.
fitted_weights = np.array(coef_df.loc[[0]]).ravel()
print(fitted_weights)

# Working copy for experiments; look up a single weight by row label and column.
coef_df_adj = coef_df.copy()
print(coef_df_adj.loc[0, 'Type of apartment_2'])

In [None]:
# Independent copy of the fitted weights (row 0), so the in-place edit below
# does not touch coef_df.
coefs = np.array(coef_df.loc[0])
probs = Pr_Y_given_X(coefs, 1, X_male)
preds = (probs >= 0.5).astype(int)   # threshold probabilities at 0.5
print(coefs)

# Zero out the first weight and compare a few male probabilities before/after.
coefs[0] = 0
print(probs[1:10])
probs = Pr_Y_given_X(coefs, 1, X_male)
print(probs[1:10])

In [None]:
# Sweep an additive shift d in [-6, 6] (step 0.1) on the fitted
# 'Foreign Worker_2' weight. For each shift, record P(Yhat = 1 | Y = 1) on the
# male subset (group 1) and the female subset (group 2), then plot the gap.
x = np.array(range(-60,61))/10
delta = np.array(range(-60,61))/10

# Loop-invariant work hoisted out of the sweep: the fitted weight vector
# (row 0 of coef_df) and the position of the weight being perturbed.
base_coefs = np.array(coef_df.loc[[0]]).flatten()
shift_idx = list(coef_df.columns).index('Foreign Worker_2')

group1 = []
group2 = []
for d in delta:
    coefs = base_coefs.copy()
    coefs[shift_idx] += d

    probs = Pr_Y_given_X(coefs,1,X_male)
    preds = (probs >= 0.5).astype(int)   # threshold probabilities at 0.5
    group1.append(P_Yhat_given_Y(preds, Y_male))

    probs = Pr_Y_given_X(coefs,1,X_female)
    preds = (probs >= 0.5).astype(int)
    group2.append(P_Yhat_given_Y(preds, Y_female))

plt.scatter(x, np.array(group1)-np.array(group2))
plt.ylabel('Difference in opportunity between group 1 and 2')
plt.xlabel('w')
    

In [None]:
# Sweep an additive shift d in [-10, 10] (step 0.1) on the fitted
# 'Foreign Worker_2' weight. For each shift, record P(Y = 1 | Yhat = 1) on the
# male subset (group 1) and the female subset (group 2), then plot the gap.
x = np.array(range(-100,101))/10
delta = np.array(range(-100,101))/10

# Loop-invariant work hoisted out of the sweep: the fitted weight vector
# (row 0 of coef_df) and the position of the weight being perturbed.
base_coefs = np.array(coef_df.loc[[0]]).flatten()
shift_idx = list(coef_df.columns).index('Foreign Worker_2')

group1 = []
group2 = []
for d in delta:
    coefs = base_coefs.copy()
    coefs[shift_idx] += d

    probs = Pr_Y_given_X(coefs,1,X_male)
    preds = (probs >= 0.5).astype(int)   # threshold probabilities at 0.5
    group1.append(P_Y_given_Yhat(preds, Y_male))

    probs = Pr_Y_given_X(coefs,1,X_female)
    preds = (probs >= 0.5).astype(int)
    group2.append(P_Y_given_Yhat(preds, Y_female))

plt.scatter(x, np.array(group1)-np.array(group2))
# Label the axes so the figure can be read on its own.
plt.ylabel('Difference in P(Y = 1 | Yhat = 1) between group 1 and 2')
plt.xlabel('w')
    