In [1]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import normalize
import sklearn.utils
import matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
np.set_printoptions(precision=4)
np.set_printoptions(suppress=True)
torch.set_printoptions(precision=4)
torch.set_printoptions(sci_mode=False)
from imblearn.over_sampling import SMOTENC
from torch.optim.lr_scheduler import StepLR

In [2]:
csv_data = pd.read_csv('application_record.csv')

# Some Preprocessing
csv_data = csv_data.drop_duplicates('ID', keep='last').copy()
csv_data['DAYS_BIRTH'] = round(csv_data['DAYS_BIRTH']/-365,0)
csv_data.rename(columns={'DAYS_BIRTH':'AGE_YEARS'}, inplace=True)
csv_data['DAYS_EMPLOYED'].replace(365243, 0, inplace=True)
csv_data['DAYS_EMPLOYED'] = abs(round(csv_data['DAYS_EMPLOYED']/-365,0))
csv_data.rename(columns={'DAYS_EMPLOYED':'YEARS_EMPLOYED'}, inplace=True)
csv_data.drop('FLAG_MOBIL', axis=1, inplace=True)

csv_data['CODE_GENDER'].replace("F", 0, inplace=True)
csv_data['CODE_GENDER'].replace("M", 1, inplace=True)
csv_data['FLAG_OWN_CAR'].replace("N", 0, inplace=True)
csv_data['FLAG_OWN_CAR'].replace("Y", 1, inplace=True)
csv_data['FLAG_OWN_REALTY'].replace("N", 0, inplace=True)
csv_data['FLAG_OWN_REALTY'].replace("Y", 1, inplace=True)
csv_data = csv_data.fillna("DNE")

# Remove Outlier
high_bound = csv_data['CNT_CHILDREN'].quantile(0.999)
low_bound = csv_data['CNT_CHILDREN'].quantile(0.001)
csv_data = csv_data[(csv_data['CNT_CHILDREN']>=low_bound) & (csv_data['CNT_CHILDREN']<=high_bound)]

high_bound = csv_data['AMT_INCOME_TOTAL'].quantile(0.999)
low_bound = csv_data['AMT_INCOME_TOTAL'].quantile(0.001)
csv_data = csv_data[(csv_data['AMT_INCOME_TOTAL']>=low_bound) & (csv_data['AMT_INCOME_TOTAL']<=high_bound)]

high_bound = csv_data['YEARS_EMPLOYED'].quantile(0.999)
low_bound = csv_data['YEARS_EMPLOYED'].quantile(0.001)
csv_data = csv_data[(csv_data['YEARS_EMPLOYED']>=low_bound) & (csv_data['YEARS_EMPLOYED']<=high_bound)]

high_bound = csv_data['CNT_FAM_MEMBERS'].quantile(0.999)
low_bound = csv_data['CNT_FAM_MEMBERS'].quantile(0.001)
csv_data = csv_data[(csv_data['CNT_FAM_MEMBERS']>=low_bound) & (csv_data['CNT_FAM_MEMBERS']<=high_bound)]


In [3]:
credit_df = pd.read_csv('credit_record.csv')

# Some Preprocessing
credit_df['STATUS'].replace(['C', 'X', '0'], '0', inplace=True)
credit_df['STATUS'].replace(['2','3','4','5', '1'], '1', inplace=True)
credit_df['STATUS'] = credit_df['STATUS'].astype('int')
credit_df_trans = credit_df.groupby('ID').agg(max).reset_index()
credit_df_trans.drop('MONTHS_BALANCE', axis=1, inplace=True)

In [4]:
merged_df = pd.merge(csv_data, credit_df_trans, on='ID', how='inner')
merged_df.drop('ID', axis=1, inplace=True)

In [5]:
gender_balanced_df = merged_df.drop(merged_df[merged_df['CODE_GENDER'] < 0.5].sample(frac=0.50725, random_state=43).index)

In [6]:
categorical_feactures = ["CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE", "FLAG_WORK_PHONE", "FLAG_PHONE", "FLAG_EMAIL", "OCCUPATION_TYPE"] 
idx_categorical_feactures = list(map(lambda x: list(gender_balanced_df.columns).index(x), categorical_feactures ))

In [7]:
# Upsampling responsble to label to solve imbalanced learn
X = gender_balanced_df[gender_balanced_df.columns[:-1]]
y = gender_balanced_df[gender_balanced_df.columns[-1:]]
oversample = SMOTENC(idx_categorical_feactures, random_state = 44)
X_over, y_over = oversample.fit_resample(X, y)
gender_status_balanced_df = pd.concat([X_over, y_over],axis=1)

In [8]:
# Shuffle & Split in to input and label
gender_status_balanced_df = sklearn.utils.shuffle(gender_status_balanced_df, random_state = 1)
gender_status_balanced_df = gender_status_balanced_df.reset_index(drop = True)

input_df = gender_status_balanced_df[gender_status_balanced_df.columns[:-1]]
label_df = gender_status_balanced_df[gender_status_balanced_df.columns[-1:]]
dummy_input_df = pd.get_dummies(input_df)

# Normalize Input
dummy_input_df=(dummy_input_df-dummy_input_df.min())/(dummy_input_df.max()-dummy_input_df.min())

# Concat
final_df = pd.concat([dummy_input_df, label_df],axis=1)

In [9]:
# Merge & Zero out some colume
df_value = final_df.values

merge_occupy = False

if merge_occupy:
    for pairs in [[43, 35]]:
        x, y = pairs 
        df_value[:,x] += df_value[:, y]
        df_value[:, y] = np.zeros(df_value[:, y].shape)
        
# Split to training and test set
train_set = df_value[:-4000]
test_set = df_value[-4000:]

In [10]:
class InferDataset(Dataset):
    def __init__(self, data):
        self.input = torch.tensor(data[:,1:-1], dtype=torch.float32).to(device)
        self.fraud = torch.tensor(data[:,-1:], dtype=torch.float32).to(device)
        self.gender = torch.tensor(data[:, 0:1], dtype=torch.float32).to(device)
    
    def __len__(self):
        return self.input.shape[0]
    def __getitem__(self, idx):
        return self.input[idx], self.fraud[idx], self.gender[idx]

train_loader = DataLoader(InferDataset(train_set), batch_size = 64)
test_loader  = DataLoader(InferDataset(test_set), batch_size = 64)

In [11]:
class InferNet(torch.nn.Module):
    def __init__(self, input_dim = 0):
        super(InferNet, self).__init__()
        self.linear1 = torch.nn.Linear(input_dim, 1024)
        self.linear2 = torch.nn.Linear(1024, 1024)
        self.linear3 = torch.nn.Linear(1024, 1)
        self.activation = torch.nn.LeakyReLU()
        self.sigmoid = torch.nn.Sigmoid()
        self.dropout = torch.nn.Dropout(0.1)
        torch.nn.init.zeros_(self.linear1.weight)
        torch.nn.init.zeros_(self.linear2.weight)
        torch.nn.init.zeros_(self.linear3.weight)
    
    def forward(self, x):
        x = self.linear1(x)
        x = self.activation(x)
        
        #x = self.dropout(x)
        x = self.linear2(x)
        x = self.activation(x)
        #x = self.dropout(x)
        x = self.linear3(x)
        #return x
        return self.sigmoid(x) #torch.clamp(x, min=0, max=1)

# Calculate imput dim
inpt_sample,_,_ = InferDataset(test_set)[0]
input_dim = inpt_sample.shape[0]

# Init Fraud inferer
fraud_infer = InferNet(input_dim).to(device)
fraud_optimizer = torch.optim.SGD(fraud_infer.parameters(), lr=0.5, momentum=0.9)  
fraud_scheduler = torch.optim.lr_scheduler.StepLR(fraud_optimizer, step_size = 100, gamma = 0.2)
# Init Gender Inferer
gender_infer = InferNet(input_dim).to(device)
gender_optimizer = torch.optim.SGD(gender_infer.parameters(), lr=0.5, momentum=0.9)
gender_scheduler = torch.optim.lr_scheduler.StepLR(gender_optimizer, step_size = 100, gamma = 0.2)
# Loss Function
criterion = torch.nn.BCELoss()

In [None]:
# Train Inferers
for epoch in range(400): 

    fraud_running_loss = 0.0
    gender_running_loss = 0.0
    
    for data in train_loader:
        
        inputs, frauds, genders = data
        
        gender_optimizer.zero_grad()
        gender_outputs = gender_infer(inputs)
        gender_loss = criterion(gender_outputs, genders)
        gender_loss.backward()
        gender_optimizer.step()
        gender_running_loss += gender_loss.item()
    
        
        fraud_optimizer.zero_grad()
        fraud_outputs = fraud_infer(inputs)
        fraud_loss = criterion(fraud_outputs, frauds)
        fraud_loss.backward()
        fraud_optimizer.step()
        fraud_running_loss += fraud_loss.item()
    fraud_scheduler.step()
    gender_scheduler.step()
    print(f'Epoch: {epoch + 1}, gender loss: {gender_running_loss:.16f}, fraud loss: {fraud_running_loss:.16f}')
    

print('Finished Inferer Training')


In [13]:
# Gender Inferer Accuracy Test
correct = 0
total = 0
f_total_wrong = 0
m_total_wrong = 0

with torch.no_grad():

    for data in test_loader:
      
        inputs, _, genders = data
        
        outputs = gender_infer(inputs)
        
        predicted = torch.round(torch.clamp(outputs, min=0, max=1))
        #print(predicted.shape, genders.shape)
        
        f_total_wrong += (torch.logical_and(genders == 0, predicted == 1)).sum().item()
        m_total_wrong += (torch.logical_and(genders == 1, predicted == 0)).sum().item()

        total += genders.size(0)
        correct += (predicted == genders).sum().item()

print(f'Accuracy of the network on the 4000 cases: {100 * correct / total} %')
print(f"Number of Wrong Prediction: Actually Famele: {f_total_wrong}, Actully Male: {m_total_wrong}")

Accuracy of the network on the 4000 cases: 96.175 %
Number of Wrong Prediction: Actually Famele: 81, Actully Male: 72


In [14]:
# Fraud Inferer Accuracy Test
correct = 0
total = 0
f_fp_total = 0 # Female False Positive
f_fn_total = 0
m_fp_total = 0
m_fn_total = 0
f_tp_total = 0 # Female True Positive
f_tn_total = 0
m_tp_total = 0
m_tn_total = 0
with torch.no_grad():

    for data in test_loader:
        
        inputs, frauds, genders = data
        outputs = fraud_infer(inputs)
        predicted = torch.round(torch.clamp(outputs, min=0, max=1))
        f_fp_total += torch.logical_and(genders == 0, predicted > frauds).sum().item() 
        f_fn_total += torch.logical_and(genders == 0, predicted < frauds).sum().item() 
        f_tp_total += torch.logical_and(genders == 0, torch.logical_and(predicted == frauds, frauds == 0)).sum().item() 
        f_tn_total += torch.logical_and(genders == 0, torch.logical_and(predicted == frauds, frauds == 1)).sum().item() 
        
        m_fp_total += torch.logical_and(genders == 1, predicted > frauds).sum().item() 
        m_fn_total += torch.logical_and(genders == 1, predicted < frauds).sum().item() 
        m_tp_total += torch.logical_and(genders == 1, torch.logical_and(predicted == frauds, frauds == 0)).sum().item()
        m_tn_total += torch.logical_and(genders == 1, torch.logical_and(predicted == frauds, frauds == 1)).sum().item()

        total += frauds.size(0)
        correct += (predicted == frauds).sum().item()

print(f'Accuracy of the network on the 4000 cases: {100 * correct / total} %')
print("Female False Positive:", int(f_fp_total))
print("Female False Negative:", int(f_fn_total))
print("Female True Positive:", int(f_tp_total))
print("Female True Negative:", int(f_tn_total))
print("Male False Positive:", int(m_fp_total))
print("Male False Negative:", int(m_fn_total))
print("Male True Positive:", int(m_tp_total))
print("Male True Negative:", int(m_tn_total))

Accuracy of the network on the 4000 cases: 89.575 %
Female False Positive: 111
Female False Negative: 95
Female True Positive: 863
Female True Negative: 798
Male False Positive: 108
Male False Negative: 103
Male True Positive: 909
Male True Negative: 1013


In [None]:
# Save Models
if merge_occupy:
  torch.save(gender_infer.state_dict(), "./gender_model_merge_and_zero.pts")
  torch.save(fraud_infer.state_dict(), "./fraud_model_merge_and_zero.pts")
else:
  torch.save(gender_infer.state_dict(), "./gender_model_origin.pts")
  torch.save(fraud_infer.state_dict(), "./fraud_model_orogin.pts")

In [12]:
# Load Models
if merge_occupy:
  gender_infer.load_state_dict(torch.load("gender_model_merge_and_zero.pts",map_location=torch.device('cpu')))
  fraud_infer.load_state_dict(torch.load("fraud_model_merge_and_zero.pts",map_location=torch.device('cpu')))
else:
  gender_infer.load_state_dict(torch.load("gender_model_origin.pts",map_location=torch.device('cpu')))
  fraud_infer.load_state_dict(torch.load("fraud_model_orogin.pts",map_location=torch.device('cpu')))

gender_infer.to(device).eval()
fraud_infer.to(device).eval()


InferNet(
  (linear1): Linear(in_features=50, out_features=1024, bias=True)
  (linear2): Linear(in_features=1024, out_features=1024, bias=True)
  (linear3): Linear(in_features=1024, out_features=1, bias=True)
  (activation): LeakyReLU(negative_slope=0.01)
  (sigmoid): Sigmoid()
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
# Impact Analysis
x = torch.zeros([input_dim, input_dim]).to(device)
d0_fraud = fraud_infer(x)
d0_gender = gender_infer(x)

for i in range(input_dim):
  x[i][i] = 1.

d1_fraud = fraud_infer(x)
d1_gender = gender_infer(x)
delta_fraud = d1_fraud - d0_fraud
delta_gender = d1_gender - d0_gender

In [None]:

print(f"G(o_i) - G(0) \t F(o_i)  - F(0)  Feature Idx\tFeature")
for feature, dg, df in sorted(list(zip(list(enumerate(final_df.columns[1:-1])), delta_gender, delta_fraud)), key = lambda x:x[2]):
    print(f"{dg.item():.6f},\t {df.item():.8f},\t {feature[0]+1},\t\t{feature[1]}")