<a href="https://colab.research.google.com/github/waltz2u/bd/blob/master/DNN_Classification_Vanilla_for_Credit_Risk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Credit Risk Prediction Using Deep Neural Network

In this notebook, a neural network is implemented from scratch to predict credit risk using the UCI German Credit dataset.

In [0]:
from google.colab import drive
# Mount Google Drive inside the Colab VM at /content/gdrive.
drive.mount('/content/gdrive')

In [4]:
import warnings
import os 

from __future__ import absolute_import, division, print_function, unicode_literals
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd

# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Switch the working directory to the Drive folder that holds the CSV files,
# then read the feature table and the label table (row 0 of each is the header).
os.chdir(r"/content/gdrive/My Drive/models/germancat/")
x_df, y_df = (
    pd.read_csv(csv_name, header=0)
    for csv_name in ("training_features.csv", "training_labels.csv")
)

# Place features and labels side by side in a single frame and preview its tail.
credit_df = pd.concat([x_df, y_df], axis=1)
credit_df.tail()

Unnamed: 0,checkin_acc,duration,credit_history,purpose,amount,saving_acc,present_emp_since,inst_rate,personal_status,other_debtors,residing_since,property,age,inst_plans,housing,num_credits,job,dependents,telephone,foreign_worker,status
695,A11,12,A30,A40,1082,A61,A73,4,A93,A101,4,A123,48,A141,A152,2,A173,1,A191,A201,-1
696,A12,27,A32,A49,3915,A61,A73,4,A93,A101,2,A123,36,A143,A152,1,A173,2,A192,A201,-1
697,A14,9,A32,A46,3832,A65,A75,1,A93,A101,4,A121,64,A143,A152,1,A172,1,A191,A201,1
698,A12,18,A34,A42,1928,A61,A72,2,A93,A101,2,A121,31,A143,A152,2,A172,1,A191,A201,-1
699,A12,36,A33,A49,9857,A62,A74,1,A93,A101,3,A122,31,A143,A152,2,A172,2,A192,A201,1


In [5]:
# One-hot encode every categorical column. An empty prefix/separator names each
# indicator column after the raw category code (A11, A30, ...).
#
# The dtype is pinned to uint8 on purpose: since pandas 2.0 `get_dummies`
# returns bool columns by default, and `DataFrame.describe()` leaves bool
# columns out of its numeric summary. Statistics derived from `describe()`
# would then be missing for every indicator column.
credit_df = pd.get_dummies(credit_df, prefix='', prefix_sep='', dtype='uint8')
credit_df.tail()

Unnamed: 0,duration,amount,inst_rate,residing_since,age,num_credits,dependents,status,A11,A12,A13,A14,A30,A31,A32,A33,A34,A40,A41,A410,A42,A43,A44,A45,A46,A48,A49,A61,A62,A63,A64,A65,A71,A72,A73,A74,A75,A91,A92,A93,A94,A101,A102,A103,A121,A122,A123,A124,A141,A142,A143,A151,A152,A153,A171,A172,A173,A174,A191,A192,A201,A202
695,12,1082,4,4,48,2,1,-1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,1,0
696,27,3915,4,2,36,1,2,-1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,1,0
697,9,3832,1,4,64,1,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,0
698,18,1928,2,2,31,2,1,-1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,0
699,36,9857,1,3,31,2,2,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,1,0


## Data Exploration
Exploring the dataset using plots, histograms, descriptive statistics, etc.

In [22]:
# Number of missing entries in each column (isnull is the alias of isna).
credit_df.isnull().sum()

duration          0
amount            0
inst_rate         0
residing_since    0
age               0
                 ..
A174              0
A191              0
A192              0
A201              0
A202              0
Length: 62, dtype: int64

In [0]:
# Discard every row that contains at least one missing value.
credit_df = credit_df.dropna(axis=0, how="any")

In [0]:
# Draw a reproducible 80% random sample of the rows for training ...
train_dataset = credit_df.sample(random_state=0, frac=0.8)
# ... and keep every row whose index label was not sampled as the test split.
test_dataset = credit_df.drop(index=train_dataset.index)

In [0]:
import seaborn

# Pair-plot only the numeric attributes plus the label. After one-hot encoding
# the frame has 62 columns, most of them 0/1 indicators; plotting all of them
# would request a 62 x 62 grid of axes, which is extremely slow and unreadable.
pairplot_cols = ["duration", "amount", "inst_rate", "residing_since",
                 "age", "num_credits", "dependents", "status"]
seaborn.pairplot(train_dataset[pairplot_cols], diag_kind="kde")

In [9]:
# Summary statistics of the training split, one row per feature. The label
# column ("status") is removed because it must not be normalised.
train_stats = train_dataset.describe().drop(columns="status").transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,560.0,20.869643,12.043265,4.0,12.0,18.0,24.0,60.0
amount,560.0,3286.292857,2868.900771,276.0,1362.5,2281.5,3979.5,18424.0
inst_rate,560.0,2.987500,1.116563,1.0,2.0,3.0,4.0,4.0
residing_since,560.0,2.901786,1.077994,1.0,2.0,3.0,4.0,4.0
age,560.0,36.339286,11.749437,19.0,27.0,34.0,43.0,75.0
...,...,...,...,...,...,...,...,...
A174,560.0,0.150000,0.357391,0.0,0.0,0.0,0.0,1.0
A191,560.0,0.569643,0.495569,0.0,0.0,1.0,1.0,1.0
A192,560.0,0.430357,0.495569,0.0,0.0,0.0,1.0,1.0
A201,560.0,0.964286,0.185743,0.0,1.0,1.0,1.0,1.0


In [0]:
# Detach the label column from each split. `pop` removes "status" from the
# frame in place and returns it, so the frames keep only the features.
y_train, y_test = (split.pop('status') for split in (train_dataset, test_dataset))

In [0]:
def norm(x):
  """Standardise ``x`` with the statistics stored in ``train_stats``.

  Subtracts ``train_stats['mean']`` and divides by ``train_stats['std']``;
  both are looked up from the notebook-level ``train_stats`` table.
  """
  centered = x - train_stats['mean']
  return centered / train_stats['std']

# Both splits are scaled with the statistics of the training split.
X_train, X_test = norm(train_dataset), norm(test_dataset)

## Modeling

Starting here, write your own neural network code. To train, use X_train and y_train; to test, use X_test and y_test. Start by writing the individual functions, then write the main code that calls those functions to train a model on the training data and return theta. Finally, test the model on the test data and compute the accuracy, confusion matrix, precision, and recall. Make changes as needed.

In [0]:
import numpy as np
import math

def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + e^(-z)) of ``z``, element-wise."""
    denominator = 1.0 + np.exp(np.negative(z))
    return 1.0 / denominator

def gradient(theta, grads, learning_rate = 1):
    """Take one gradient-descent step on the flattened parameter vector.

    ``theta`` and ``grads`` hold the weights of both layers (and their
    gradients) flattened into one vector. Unrolling them into per-layer
    matrices, updating each matrix, and flattening again in the same order is
    an element-wise operation, so the update is applied to the flat vectors
    directly. This removes the dependency on notebook-level size variables,
    including ``label_num``, which is never defined.

    Parameters
    ----------
    theta : array-like, shape (n,) or (n, 1)
        Current flattened parameters.
    grads : array-like, shape (n,) or (n, 1)
        Gradient of the cost with respect to ``theta``, same layout.
    learning_rate : float, default 1
        Step size.

    Returns
    -------
    numpy.ndarray, shape (n, 1)
        Updated parameters as a column vector.

    Raises
    ------
    ValueError
        If ``theta`` and ``grads`` do not contain the same number of entries.
    """
    theta_col = np.reshape(np.asarray(theta, dtype=float), (-1, 1))
    grads_col = np.reshape(np.asarray(grads, dtype=float), (-1, 1))

    # Guard against silent broadcasting between mismatched vectors.
    if theta_col.shape != grads_col.shape:
        raise ValueError(
            "theta has %d entries but grads has %d"
            % (theta_col.shape[0], grads_col.shape[0])
        )

    return theta_col - learning_rate * grads_col

def computeCost(X, y, theta, lambd):
    """Regularised binary cross-entropy cost.

    Parameters
    ----------
    X : array-like
        Predicted probabilities (the network output), one per sample.
        Despite the name this is NOT the feature matrix.
    y : array-like
        Target labels, one per sample, in the same order as ``X``.
        The cross-entropy formula expects values in {0, 1}.
    theta : array-like
        Flattened weights of all layers; every entry is L2-penalised.
    lambd : float
        L2 regularisation strength.

    Returns
    -------
    float
        ``-(1/m) * sum(y*log(p) + (1-y)*log(1-p)) + lambd/(2m) * sum(theta^2)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in size or are empty.
    """
    probs = np.asarray(X, dtype=float).reshape(1, -1)
    labels = np.asarray(y, dtype=float).reshape(1, -1)
    if probs.shape != labels.shape:
        raise ValueError(
            "predictions and labels differ in size: %d vs %d"
            % (probs.shape[1], labels.shape[1])
        )
    m = labels.shape[1]
    if m == 0:
        raise ValueError("cannot compute the cost of an empty batch")

    # Keep probabilities strictly inside (0, 1): a saturated sigmoid would
    # otherwise give log(0) = -inf and 0 * -inf = NaN.
    eps = np.finfo(float).eps
    probs = np.clip(probs, eps, 1.0 - eps)

    log_likelihood = labels * np.log(probs) + (1.0 - labels) * np.log(1.0 - probs)
    cross_entropy = (-1.0 / m) * np.sum(log_likelihood)

    # theta already contains the weights of both layers, so summing the squares
    # of the flat vector equals summing over the per-layer matrices.
    regularization_cost = (lambd / (2 * m)) * np.sum(np.square(np.asarray(theta, dtype=float)))

    return float(cross_entropy + regularization_cost)

def computeGrad(X, y, cache, theta, lambd):
    """Back-propagate through the two-layer sigmoid network.

    Returns the gradient of the regularised cross-entropy cost with respect to
    the flattened weights. Layer sizes are taken from ``X`` and the cached
    hidden activations rather than from notebook-level variables.

    Parameters
    ----------
    X : array-like, shape (n_inputs, m)
        Input features, one column per sample.
    y : array-like, m entries
        Target labels in {0, 1}.
    cache : dict
        Must contain ``"A1"`` (hidden activations, shape (n_hidden, m)) and
        ``"A2"`` (output activations, shape (1, m)) from the forward pass.
    theta : array-like, n_hidden * n_inputs + n_hidden entries
        Flattened weights: Theta1 in column-major order followed by Theta2.
    lambd : float
        L2 regularisation strength.

    Returns
    -------
    numpy.ndarray, shape (n_hidden * n_inputs + n_hidden, 1)
        Gradient in the same layout as ``theta``.
    """
    X = np.asarray(X, dtype=float)
    A1 = np.asarray(cache["A1"], dtype=float)
    A2 = np.asarray(cache["A2"], dtype=float)
    y = np.asarray(y, dtype=float).reshape(A2.shape)

    n_inputs, m = X.shape
    n_hidden = A1.shape[0]

    # Unroll theta using the same column-major convention used to flatten it.
    flat = np.asarray(theta, dtype=float).reshape(-1, order="F")
    split = n_inputs * n_hidden
    Theta1 = flat[:split].reshape((n_hidden, n_inputs), order="F")
    Theta2 = flat[split:].reshape((1, n_hidden), order="F")

    # Output layer: sigmoid + cross-entropy gives dJ/dZ2 = A2 - y.
    dZ2 = A2 - y
    dTheta2 = (1.0 / m) * np.matmul(dZ2, A1.T) + (lambd / m) * Theta2

    # Hidden layer is a sigmoid, whose derivative is A1 * (1 - A1).
    # (1 - A1**2 is the derivative of tanh and does not apply here.)
    dZ1 = np.matmul(Theta2.T, dZ2) * A1 * (1.0 - A1)
    dTheta1 = (1.0 / m) * np.matmul(dZ1, X.T) + (lambd / m) * Theta1

    grad = np.concatenate((dTheta1.reshape(-1, 1, order="F"),
                           dTheta2.reshape(-1, 1, order="F")))

    return grad

def predict(X, theta):
    """Classify the samples in ``X`` with the network parameters ``theta``.

    Runs a forward pass and returns a boolean array that is True wherever the
    network output exceeds 0.5.
    """
    probabilities, _ = forward_propagation(X, theta)
    return probabilities > 0.5


def optimizeCost(X, y, theta, maxiter, lambd, learning_rate=None):
    """Train the network with batch gradient descent.

    Each iteration runs a forward pass, evaluates the regularised cost,
    back-propagates, and takes one gradient step. The cost is printed every
    1000 iterations.

    Parameters
    ----------
    X : input features, passed unchanged to ``forward_propagation`` and
        ``computeGrad``.
    y : target labels, passed unchanged to ``computeCost`` and ``computeGrad``.
    theta : initial flattened parameter vector.
    maxiter : int
        Number of gradient-descent iterations.
    lambd : float
        L2 regularisation strength.
    learning_rate : float, optional
        Step size. When omitted, the notebook-level ``learning_rate`` variable
        is used if it exists (the previous behaviour); otherwise 0.01.

    Returns
    -------
    The parameter vector after ``maxiter`` updates.
    """
    if learning_rate is None:
        # Backward compatibility: earlier versions read the step size from a
        # module-level variable instead of taking it as an argument.
        learning_rate = globals().get("learning_rate", 0.01)

    for i in range(maxiter):
        A2, cache = forward_propagation(X, theta)
        cost = computeCost(A2, y, theta, lambd)
        grads = computeGrad(X, y, cache, theta, lambd)

        theta = gradient(theta, grads, learning_rate)

        if i % 1000 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))

    return theta

def forward_propagation(X, theta):
    """Forward pass of the two-layer sigmoid network with a single output unit.

    The layer sizes are derived from the arguments, so the function does not
    depend on notebook-level variables: the input size is the number of rows
    of ``X`` and the hidden size follows from
    ``len(theta) == n_hidden * n_inputs + n_hidden``.

    Parameters
    ----------
    X : array-like, shape (n_inputs, m)
        Input features, one column per sample.
    theta : array-like
        Flattened weights: Theta1 (n_hidden x n_inputs, column-major) followed
        by Theta2 (1 x n_hidden).

    Returns
    -------
    A2 : numpy.ndarray, shape (1, m)
        Output activations.
    cache : dict
        ``{"Z1", "A1", "Z2", "A2"}`` intermediate values for back-propagation.

    Raises
    ------
    ValueError
        If the length of ``theta`` is inconsistent with the rows of ``X``
        (typically because ``X`` was passed as samples x features).
    """
    X = np.asarray(X, dtype=float)
    flat = np.asarray(theta, dtype=float).reshape(-1, order="F")

    n_inputs = X.shape[0]
    n_hidden, leftover = divmod(flat.size, n_inputs + 1)
    if leftover or n_hidden == 0:
        raise ValueError(
            "theta of length %d does not fit an input with %d rows; "
            "X must have shape (n_inputs, n_samples)" % (flat.size, n_inputs)
        )

    split = n_inputs * n_hidden
    Theta1 = flat[:split].reshape((n_hidden, n_inputs), order="F")
    Theta2 = flat[split:].reshape((1, n_hidden), order="F")

    # Each layer adds a constant offset of 1 to its pre-activation; this
    # offset is fixed and is not part of theta.
    Z1 = np.matmul(Theta1, X) + 1
    A1 = sigmoid(Z1)
    Z2 = np.matmul(Theta2, A1) + 1
    A2 = sigmoid(Z2)

    cache = {"Z1": Z1, "A1": A1, "Z2": Z2, "A2": A2}

    return A2, cache

In [0]:
from sklearn import metrics

# The network functions expect one COLUMN per sample, so features are laid out
# as (n_features, n_samples).
X = X_train.values.T.astype(float)
# The raw labels are coded -1 / 1, but the cross-entropy loss needs {0, 1}.
# Map status == 1 to 1 and everything else to 0, as a (1, n_samples) row vector.
Y = (y_train.values == 1).astype(float).reshape(1, -1)

learning_rate = 0.01
input_num = X.shape[0]
hidden_num = input_num + 1
output_num = 1
# gradient() and computeGrad() read the output size under the name `label_num`.
label_num = output_num
lambd = 0.01

# Fix the seed so the random initialisation (and the reported metrics) are
# reproducible across runs.
np.random.seed(0)
Theta1 = np.random.randn(hidden_num, input_num) * 0.01
Theta2 = np.random.randn(output_num, hidden_num) * 0.01
theta = np.concatenate((Theta1.reshape(hidden_num * input_num, 1, order="F"),
                        Theta2.reshape(output_num * hidden_num, 1, order="F")))

# Train on the transposed arrays (X, Y), not on the raw DataFrame / Series.
theta = optimizeCost(X, Y, theta, 10000, lambd)
# Boolean array of shape (1, n_test_samples): True where the output is > 0.5.
pred = predict(X_test.values.T, theta)

In [0]:
# `pred` holds booleans (True where the network output exceeds 0.5). Flatten it
# and map True/False onto the 1 / -1 coding used by `y_test`, so predictions
# and ground truth share one label set and one shape.
y_pred = np.where(np.ravel(pred), 1, -1)
y_true = np.ravel(y_test)

# These figures are computed on the held-out test split, hence "Test".
print('\nTest Accuracy (%): ' + str(np.mean(y_pred == y_true) * 100))
print("Accuracy:", metrics.accuracy_score(y_true, y_pred))
print("Precision:", metrics.precision_score(y_true, y_pred))
print("Recall:", metrics.recall_score(y_true, y_pred))
print("Confusion Matrix:\n")
print(metrics.confusion_matrix(y_true, y_pred))

Now have some fun. Export the result to a CSV file as follows: the first column is a row number starting from 1; the second column is the predicted label. The first row is the header (e.g. "num" and "label"). Submit the CSV to the contest at https://www.scriptedin.com/contests/view/20 via "Add Submission" to see where you are on the leaderboard.