In [1]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import normalize
import sklearn.utils
import matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
np.set_printoptions(precision=4)
np.set_printoptions(suppress=True)
torch.set_printoptions(precision=4)
torch.set_printoptions(sci_mode=False)
from imblearn.over_sampling import SMOTE

In [2]:
csv_data = pd.read_csv('application_record.csv')

# Some Preprocessing
csv_data = csv_data.drop_duplicates('ID', keep='last').copy()
csv_data['DAYS_BIRTH'] = round(csv_data['DAYS_BIRTH']/-365,0)
csv_data.rename(columns={'DAYS_BIRTH':'AGE_YEARS'}, inplace=True)
csv_data['DAYS_EMPLOYED'].replace(365243, 0, inplace=True)
csv_data['DAYS_EMPLOYED'] = abs(round(csv_data['DAYS_EMPLOYED']/-365,0))
csv_data.rename(columns={'DAYS_EMPLOYED':'YEARS_EMPLOYED'}, inplace=True)
csv_data.drop('FLAG_MOBIL', axis=1, inplace=True)

# Remove Outlier
high_bound = csv_data['CNT_CHILDREN'].quantile(0.999)
low_bound = csv_data['CNT_CHILDREN'].quantile(0.001)
csv_data = csv_data[(csv_data['CNT_CHILDREN']>=low_bound) & (csv_data['CNT_CHILDREN']<=high_bound)]

high_bound = csv_data['AMT_INCOME_TOTAL'].quantile(0.999)
low_bound = csv_data['AMT_INCOME_TOTAL'].quantile(0.001)
csv_data = csv_data[(csv_data['AMT_INCOME_TOTAL']>=low_bound) & (csv_data['AMT_INCOME_TOTAL']<=high_bound)]

high_bound = csv_data['YEARS_EMPLOYED'].quantile(0.999)
low_bound = csv_data['YEARS_EMPLOYED'].quantile(0.001)
csv_data = csv_data[(csv_data['YEARS_EMPLOYED']>=low_bound) & (csv_data['YEARS_EMPLOYED']<=high_bound)]

high_bound = csv_data['CNT_FAM_MEMBERS'].quantile(0.999)
low_bound = csv_data['CNT_FAM_MEMBERS'].quantile(0.001)
csv_data = csv_data[(csv_data['CNT_FAM_MEMBERS']>=low_bound) & (csv_data['CNT_FAM_MEMBERS']<=high_bound)]

# Using dummy variable to handle categorical data
one_hot = pd.get_dummies(csv_data)

In [3]:
credit_df = pd.read_csv('credit_record.csv')

# Some Preprocessing
credit_df['STATUS'].replace(['C', 'X'],0, inplace=True)
credit_df['STATUS'].replace(['2','3','4','5'],1, inplace=True)
credit_df['STATUS'] = credit_df['STATUS'].astype('int')
credit_df_trans = credit_df.groupby('ID').agg(max).reset_index()
credit_df_trans.drop('MONTHS_BALANCE', axis=1, inplace=True)

In [4]:
# Merge
final_df = pd.merge(one_hot, credit_df_trans, on='ID', how='inner')
final_df.drop('ID', axis=1, inplace=True)

In [5]:
# Normalize & Split to input and label
normalized_df=(final_df-final_df.min())/(final_df.max()-final_df.min())
X = normalized_df[normalized_df.columns[:-1]]
y = normalized_df[normalized_df.columns[-1:]]

In [6]:
# Upsampling responsble to label to solve imbalanced learn
oversample = SMOTE(random_state = 42)
X_over, y_over = oversample.fit_resample(X, y)
normalized_df = pd.concat([X_over, y_over],axis=1)

In [7]:
# Shuffle
normalized_df = sklearn.utils.shuffle(normalized_df, random_state = 1)
normalized_df = normalized_df.reset_index(drop = True)

In [8]:
# Merge & Zero out some colume
ndf_value = normalized_df.values

merge_occupy = False
zero_out = False

if merge_occupy:
    for pairs in [[45, 49], [38, 41], [51, 36], [44, 48]]:
        x, y = pairs 
        ndf_value[:,x] += ndf_value[:, y]
        ndf_value[:, y] = np.zeros(ndf_value[:, y].shape)


if zero_out:
    for col in [40, 42]:
        ndf_value[:, col] = np.zeros(ndf_value[:, col].shape)
        
# Split to training and test set
train_set = ndf_value[:-4000]
test_set = ndf_value[-4000:]

In [9]:
# Remove gender information from input and extract then to label
gender_mask = np.asarray(list(map(lambda x: "GENDER" in x, normalized_df.columns)))
def get_label(data):
    data = data * gender_mask
    data = data[:, ~np.all(np.swapaxes(data,0,1) == 0, axis=1)]
    return data

train_input = train_set * ( 1 -  gender_mask)
test_input = test_set * ( 1 -  gender_mask)
train_label = get_label(train_set)
test_label = get_label(test_set)

In [10]:
class FraudInferDataset(Dataset):
    def __init__(self, inputs, gender):
        self.inputs = torch.tensor(inputs, dtype=torch.float32).to(device)
        self.labels = torch.cat([self.inputs[:,-1:], 1-self.inputs[:,-1:]], dim=1)
        self.inputs = self.inputs[:,:-1]
        self.gender = torch.tensor(np.round(gender), dtype=torch.float32).to(device)
    
    def __len__(self):
        return self.labels.shape[0]
    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx], self.gender[idx]

fraud_train_loader = DataLoader(FraudInferDataset(train_input, train_label), batch_size = 64)
fraud_test_loader = DataLoader(FraudInferDataset(test_input, test_label), batch_size = 64)

In [11]:
class GenderInferDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = torch.tensor(inputs,dtype=torch.float32).to(device)[:,:-1]
        self.labels = torch.tensor(labels,dtype=torch.float32).to(device)

    def __len__(self):
        return self.labels.shape[0]
    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

gender_train_loader = DataLoader(GenderInferDataset(train_input, train_label), batch_size = 64)
gender_test_loader = DataLoader(GenderInferDataset(test_input, test_label), batch_size = 64)

In [12]:
class InferNet(torch.nn.Module):
    def __init__(self, input_dim = 0):
        super(InferNet, self).__init__()
        self.linear1 = torch.nn.Linear(input_dim, 1024)
        self.linear2 = torch.nn.Linear(1024, 1024)
        self.linear3 = torch.nn.Linear(1024, 2)
        self.activation = torch.nn.LeakyReLU()
        self.softmax = torch.nn.Softmax(dim=1)
        self.dropout = torch.nn.Dropout(0.1)

    def forward(self, x):
        x = self.linear1(x)
        x = self.activation(x)
        
        #x = self.dropout(x)
        x = self.linear2(x)
        x = self.activation(x)
        #x = self.dropout(x)
        x = self.linear3(x)
        return torch.nn.functional.normalize(x,p=1)

# Calculate imput dim
inpt_sample,_,_ = FraudInferDataset(train_input, train_label)[0]
input_dim = inpt_sample.shape[0]

# Init Fraud inferer
fraud_infer = InferNet(input_dim).to(device)
fraud_optimizer = torch.optim.SGD(fraud_infer.parameters(), lr=0.01, momentum=0.9)  

# Init Gender Inferer
gender_infer = InferNet(input_dim).to(device)
gender_optimizer = torch.optim.SGD(gender_infer.parameters(), lr=0.01, momentum=0.9) 

# Loss Function
criterion = torch.nn.MSELoss()

In [None]:
# Train Gender Inferer
for epoch in range(200): 

    running_loss = 0.0
    for i, data in enumerate(gender_train_loader, 0):

        inputs, labels = data
        gender_optimizer.zero_grad()
        outputs = gender_infer(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        gender_optimizer.step()

 
        running_loss += loss.item()
        if i % 200 == 199:   
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 200:.8f}')
            running_loss = 0.0

print('Finished Gender Inferer Training')

In [14]:
# Gender Inferer Accuracy Test
correct = 0
total = 0
f_total_wrong = 0
m_total_wrong = 0

with torch.no_grad():

    for data in gender_test_loader:
      
        images, labels = data
        outputs = gender_infer(images)
        _, predicted = torch.max(outputs.data, 1)
        _, real = torch.max(labels.data, 1)

        
        f_total_wrong += ((predicted < real)).sum().item()
        m_total_wrong += ((predicted > real)).sum().item()

        total += labels.size(0)
        correct += (predicted == real).sum().item()

print(f'Accuracy of the network on the 4000 cases: {100 * correct / total} %')
print(f"Number of Wrong Prediction: Actually Famele: {f_total_wrong}, Actully Male: {m_total_wrong}")

Accuracy of the network on the 4000 cases: 98.075 %
Number of Wrong Prediction: Actually Famele: 52, Actully Male: 25


In [None]:
# Train Fraud Inferer
for epoch in range(200): 

    running_loss = 0.0
    for i, data in enumerate(fraud_train_loader, 0):
        inputs, labels,_ = data
        fraud_optimizer.zero_grad()
        outputs = fraud_infer(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        fraud_optimizer.step()

        
        running_loss += loss.item()
        if i % 200 == 199:    
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 200:.8f}')
            running_loss = 0.0

print('Finished Fraud Inferer Training')

In [15]:
# Gender Inferer Accuracy Test
correct = 0
total = 0
f_fp_total = 0 # Female False Positive
f_fn_total = 0
m_fp_total = 0
m_fn_total = 0

with torch.no_grad():

    for data in fraud_test_loader:

        images, labels, gender = data
        outputs = fraud_infer(images)
        _, predicted = torch.max(outputs.data, 1)
        _, real = torch.max(labels.data, 1)
        
        f_fp_total += ((predicted > real) * gender[:, 0]).sum().item()
        f_fn_total += ((predicted < real) * gender[:, 0]).sum().item()
        m_fp_total += ((predicted > real) * gender[:, 1]).sum().item()
        m_fn_total += ((predicted < real) * gender[:, 1]).sum().item()

        total += labels.size(0)
        correct += (predicted == real).sum().item()

print(f'Accuracy of the network on the 4000 cases: {100 * correct / total} %')
print("Female False Positive:", int(f_fp_total))
print("Female False Negative:", int(f_fn_total))
print("Male False Positive:", int(m_fp_total))
print("Male False Negative:", int(m_fn_total))

Accuracy of the network on the 4000 cases: 85.3 %
Female False Positive: 113
Female False Negative: 279
Male False Positive: 32
Male False Negative: 164


In [None]:
# Save Models
if merge_occupy and zero_out:
  torch.save(gender_infer.state_dict(), "./gender_model_merge_and_zero.pts")
  torch.save(fraud_infer.state_dict(), "./fraud_model_merge_and_zero.pts")
else:
  torch.save(gender_infer.state_dict(), "./gender_model_origin.pts")
  torch.save(fraud_infer.state_dict(), "./fraud_model_orogin.pts")

In [13]:
# Load Models
if merge_occupy and zero_out:
  gender_infer.load_state_dict(torch.load("gender_model_merge_and_zero.pts",map_location=torch.device('cpu')))
  fraud_infer.load_state_dict(torch.load("fraud_model_merge_and_zero.pts",map_location=torch.device('cpu')))
else:
  gender_infer.load_state_dict(torch.load("gender_model_origin.pts",map_location=torch.device('cpu')))
  fraud_infer.load_state_dict(torch.load("fraud_model_orogin.pts",map_location=torch.device('cpu')))

gender_infer.to(device).eval()
fraud_infer.to(device).eval()


InferNet(
  (linear1): Linear(in_features=53, out_features=1024, bias=True)
  (linear2): Linear(in_features=1024, out_features=1024, bias=True)
  (linear3): Linear(in_features=1024, out_features=2, bias=True)
  (activation): LeakyReLU(negative_slope=0.01)
  (softmax): Softmax(dim=1)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
# Impact Analysis
x = torch.zeros([input_dim, input_dim]).to(device)
d0_fraud = fraud_infer(x)
d0_gender = gender_infer(x)

for i in range(input_dim):
  x[i][i] = 1.

d1_fraud = fraud_infer(x)
d1_gender = gender_infer(x)
delta_fraud = d1_fraud - d0_fraud
delta_gender = d1_gender - d0_gender

In [None]:

print(f"G(o_i) - G(0) \t F(o_i)  - F(0)  Feature Idx\tFeature")
for feature, dg, df in sorted(list(zip(list(enumerate(normalized_df.columns)), delta_gender[:,0], delta_fraud[:,0])), key = lambda x:x[2]):
  print(f"{dg.tolist():.6f},\t {df.tolist():.6f},\t {feature[0]}\t\t{feature[1]}")