In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler 

In [2]:
from platform import python_version

print(python_version())

3.9.10


### 4.3 Decision Tree

In [3]:
def importdata():
    loan_data = pd.read_csv('Data/merged_data.csv', sep= ',')
    loan_data = loan_data.drop('Unnamed: 0', axis=1)
      
    # Printing the dataswet shape
    print ("Dataset Length: ", len(loan_data))
    print ("Dataset Shape: ", loan_data.shape)
      
    # Printing the dataset obseravtions
    print ("Dataset: ",loan_data.head())
    return loan_data

In [4]:
# Function to standardize features
# ['nr_credit_applications', 'CRG_1.0', 'CRG_2.0', 'CRG_3.0', 'CRG_4.0', 'CRG_5.0', 'CRG_7.0']

def standardization(data, excl_var):
    temp = data.drop(excl_var, axis=1)
    scale_features = temp.columns
    tobe_scaled_df = data[scale_features]
    scaler = StandardScaler().fit(tobe_scaled_df.values)
    scaled = scaler.transform(tobe_scaled_df.values)
    data[scale_features] = scaled
    return data

In [5]:
def oversampling(X, y, method, random_state):
    if method == "random":
        ros = RandomOverSampler(random_state=random_state)
        ros_X_train, ros_y_train = ros.fit_resample(X, y)
        print('Original dataset shape %s' % Counter(y['credit_application']))
        print('Resampled dataset shape %s' % Counter(ros_y_train['credit_application']))
        return ros_X_train, ros_y_train
    
    if method == "smote":
        columns = X.columns
        os = SMOTE(random_state=random_state)
        os_X_train, os_y_train = os.fit_resample(X, y)
        print('Original dataset shape %s' % Counter(y['credit_application']))
        print('Resampled dataset shape %s' % Counter(os_y_train['credit_application']))
        os_data_X = pd.DataFrame(data=os_X_train,columns=columns)
        os_data_y= pd.DataFrame(data=os_y_train,columns=['credit_application'])
        return os_data_X, os_data_y

In [6]:
def dummyit(data, var):
    return pd.get_dummies(data, dummy_na=False, columns=var)

In [7]:
# Function to split the dataset
def splitdataset(data):
  
    # Separating the target variable
    data = data.drop(['client_nr', 'yearmonth2'], axis=1)
    X = data.loc[:, data.columns != 'credit_application']
    y = data.loc[:, data.columns == 'credit_application']
  
    # Splitting the dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split( 
    X, y, test_size = 0.3, random_state = 100)
      
    return X, y, X_train, X_test, y_train, y_test

In [8]:
# Function to perform training with giniIndex.
def train_using_gini(X_train, X_test, y_train):
  
    clf_gini = DecisionTreeClassifier(criterion = "gini",
            random_state = 100,max_depth=3, min_samples_leaf=5)
  
    clf_gini.fit(X_train, y_train)
    return clf_gini

# Function to perform training with entropy.
def train_using_entropy(X_train, X_test, y_train):
  
    clf_entropy = DecisionTreeClassifier(
            criterion = "entropy", random_state = 100,
            max_depth = 3, min_samples_leaf = 5)
  
    clf_entropy.fit(X_train, y_train)
    return clf_entropy
  
# Function to make predictions
def prediction(X_test, clf_object):
  
    # Predicton on test with giniIndex
    y_pred = clf_object.predict(X_test)
    print("Predicted values:")
    print(y_pred)
    return y_pred
      
# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
      
    print("Confusion Matrix: ",
        confusion_matrix(y_test, y_pred))
      
    print ("Accuracy : ",
    accuracy_score(y_test,y_pred)*100)
      
    print("Report : ",
    classification_report(y_test, y_pred))

In [22]:
# Driver code
def main():
      
    # Building Phase
    data = importdata()
    data = dummyit(data, var=['CRG'])
    
    # feature creation
    data['dti'] = data['volume_debit_trx']/data['volume_credit_trx']
    
    X, Y, X_train, X_test, y_train, y_test = splitdataset(data)
    
    # oversampling
    X_train_os, y_train_os = oversampling(X_train, y_train, method='smote', random_state=49) 
    
    # select only previously identified statistically significant columns
    X_train_sub = X_train_os[['nr_debit_trx', 'nr_credit_trx', 'min_balance']]
    X_test_sub = X_test[['nr_debit_trx', 'nr_credit_trx', 'min_balance']]
    clf_gini = train_using_gini(X_train_sub, X_test_sub, y_train_os)
    clf_entropy = train_using_entropy(X_train_sub, X_test_sub, y_train_os)
      
    # Operational Phase
    print("Results Using Gini Index:")
      
    # Prediction using gini
    y_pred_gini = prediction(X_test_sub, clf_gini)
    cal_accuracy(y_test, y_pred_gini)
      
    print("Results Using Entropy:")
    # Prediction using entropy
    y_pred_entropy = prediction(X_test_sub, clf_entropy)
    cal_accuracy(y_test, y_pred_entropy)

In [23]:
main()

Dataset Length:  29996
Dataset Shape:  (29996, 12)
Dataset:     client_nr  total_nr_trx  nr_debit_trx  volume_debit_trx  nr_credit_trx  \
0          1            97            50           6527929             47   
1          1            88            59           3475918             29   
2          1            96            62          31316405             34   
3          1            83            53          18669967             30   
4          1            94            54           2893905             40   

   volume_credit_trx  min_balance  max_balance  CRG  yearmonth2  \
0            7454863     -7914288     25110651  1.0  2014-01-01   
1            1895848     -8448513     25036651  1.0  2014-02-01   
2           20083583    -10347650     18020151  1.0  2014-03-01   
3            1091295    -15385039     13318200  1.0  2014-04-01   
4            2034075    -15682170      2350000  1.0  2014-05-01   

   credit_application  nr_credit_applications  
0                   0    

ValueError: Input X contains infinity or a value too large for dtype('float64').