In [1]:
import os
import json
import pickle
import datetime
import time

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.base import clone

from tqdm import tqdm

import ToolsForAnalysis as tfa

In [2]:
# Wall-clock reference point; the final cells subtract it from time.time()
# to report the notebook's total execution time.
start_time = time.time()

In [3]:
# Hyper-parameter search grids, keyed by estimator class name.
# The keys must match the model names used to index this dict in the training loop.
decision_tree_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': list(range(2, 13, 2)),          # 2, 4, ..., 12
    'max_features': ['sqrt', 'log2'],
}

gradient_boosting_grid = {
    'loss': ['log_loss', 'exponential'],
    'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.1, 0.2, 0.3],
    'n_estimators': list(range(100, 301, 50)),   # 100, 150, ..., 300
    'criterion': ['friedman_mse', 'squared_error'],
    'max_depth': list(range(3, 9)),              # 3, 4, ..., 8
}

models_param_dict = {
    'DecisionTreeClassifier': decision_tree_grid,
    'GradientBoostingClassifier': gradient_boosting_grid,
}

# Objects

In [4]:
# Helper object from the project-local ToolsForAnalysis module. The cells below
# call it for data loading, the train/test split and transformation, parameter
# search / lookup, and exporting the results summary.
tools = tfa.ToolsForAnalysis()

# Parameters

In [5]:
# ---- Locations -------------------------------------------------------------
# Folder the dataset is read from
dataset_path = './Data/'
# Root folder for everything this notebook writes
dir_output_name = './outputs/'
# Per-run results subfolder, named after the moment this cell was executed
start_time_folder_name = f'{datetime.datetime.now():%Y-%m-%d %H-%M-%S}'

# ---- File names ------------------------------------------------------------
# Name of the dataset to load
file_name = 'Loan Approval Prediction'
# Name of the summary-results file
summary_file_name = 'all_results'

# ---- Pre-processing --------------------------------------------------------
# Feature transformation to apply: 'min_max' or 'standardization'
type_of_transformation = 'min_max'
# Column the models learn to predict
target_variable = 'Loan_Status'
# Fraction of the data held out for testing the models
test_size = 0.3

# ---- Model selection -------------------------------------------------------
# True: load previously computed best parameters from disk; False: search for them
best_parameters_ready = True
# Score type used to choose the best model during the parameter search
score_type = 'accuracy'
# Column of the results dataset that is analysed to determine the best model
target_metric_for_best_results = 'Accuracy'

# Best parameters path

In [6]:
# The path is only defined when tuned parameters are loaded from disk.
# Any transformation other than 'standardization' falls back to the min_max file.
if best_parameters_ready:
    transformation_suffix = 'standardization' if type_of_transformation == 'standardization' else 'min_max'
    best_parameters_path = f'{dir_output_name}best_parameters_{transformation_suffix}.json'

# Create output folder

In [7]:
# Create the results root (and any missing parents) the first time the notebook runs
if not os.path.exists(dir_output_name):
    os.makedirs(dir_output_name, exist_ok=True)

# Read data

In [8]:
# Load the dataset called `file_name` from `dataset_path` through the project helper.
# File format and parsing are handled inside ToolsForAnalysis (not visible here);
# the result is used as a pandas DataFrame by the cells below.
data = tools.get_data_by_name(file_name, dataset_path)

In [9]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [10]:
len(data)

614

# Pre-processing

## Drop missing values

In [11]:
# Keep only rows without any missing value and renumber them from 0.
# `data` is rebound to a new frame instead of being mutated with inplace=True,
# so the object returned by the loader is left untouched.
data = data.dropna().reset_index(drop=True)

In [12]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y


## Create dummy variables

In [13]:
# Two-level categorical columns: each collapses to a single 0/1 indicator that
# is stored back under the original column name.
binary_columns = ['Gender', 'Married', 'Education', 'Self_Employed']
# Every column that gets one-hot encoded (Property_Area expands to several columns).
encoded_columns = binary_columns + ['Property_Area', 'Loan_Status']

# Encode from the raw columns before they are dropped.
# The dtype is pinned to uint8: pandas >= 2.0 otherwise returns bool columns,
# whereas older versions (and the outputs recorded in this notebook) use uint8.
dummies = {
    column: pd.get_dummies(data[column], drop_first=True, dtype=np.uint8)
    for column in encoded_columns
}

# Remove the identifier and the raw categorical columns (new frame, no in-place mutation).
data = data.drop(columns=['Loan_ID'] + encoded_columns)

# Re-attach the encoded columns in the same order as before:
# binary indicators, then the Property_Area dummies, then the target last.
for column in binary_columns:
    data[column] = dummies[column]
data = pd.concat([data, dummies['Property_Area']], axis=1)
data['Loan_Status'] = dummies['Loan_Status']

In [14]:
data.head()

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender,Married,Education,Self_Employed,Semiurban,Urban,Loan_Status
0,1,4583,1508.0,128.0,360.0,1.0,1,1,0,0,0,0,0
1,0,3000,0.0,66.0,360.0,1.0,1,1,0,1,0,1,1
2,0,2583,2358.0,120.0,360.0,1.0,1,1,1,0,0,1,1
3,0,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,1,1
4,2,5417,4196.0,267.0,360.0,1.0,1,1,0,1,0,1,1


## Adjust string to int

In [15]:
# Convert every `Dependents` entry with the project helper `convert_to_int`.
# NOTE(review): presumably this maps string counts to integers (see the section
# title); how non-numeric strings are handled is defined in ToolsForAnalysis — confirm there.
data['Dependents'] = data['Dependents'].apply(tools.convert_to_int)

In [16]:
data.head()

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender,Married,Education,Self_Employed,Semiurban,Urban,Loan_Status
0,1,4583,1508.0,128.0,360.0,1.0,1,1,0,0,0,0,0
1,0,3000,0.0,66.0,360.0,1.0,1,1,0,1,0,1,1
2,0,2583,2358.0,120.0,360.0,1.0,1,1,1,0,0,1,1
3,0,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,1,1
4,2,5417,4196.0,267.0,360.0,1.0,1,1,0,1,0,1,1


## Split data (train/test) and transform (standardization/min_max) data 

In [17]:
# Columns that will not be transformed (passed to the split/transform helper as-is)
non_transform_columns = [
    'Dependents',
    'Credit_History',
    # 0/1 indicator columns created in the dummy-variable step
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Semiurban',
    'Urban',
]

In [18]:
# Split into train/test partitions and apply the configured transformation.
# Columns named in `non_transform_columns` are excluded from the transformation.
# NOTE(review): in the previews below, the most frequent Loan_Amount_Term value
# shows as 0.73 in X_train but 0.70 in X_test, which suggests the two partitions
# are scaled with separate statistics — confirm the helper fits on train only.
split_result = tools.split_train_test_and_transform_data(
    data,
    type_of_transformation,
    non_transform_columns,
    target_variable,
    test_size,
)
X_train, X_test, y_train, y_test = split_result

## Show split and transformed data

In [19]:
X_train.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Dependents,Credit_History,Gender,Married,Education,Self_Employed,Semiurban,Urban
0,0.04,0.01,0.17,0.73,2,1.0,1,1,1,0,1,0
1,0.04,0.0,0.18,1.0,0,1.0,1,0,0,0,0,1
2,0.03,0.22,0.15,0.73,0,1.0,0,1,0,0,1,0
3,0.04,0.07,0.15,0.73,0,1.0,1,1,0,0,0,1
4,0.12,0.0,0.25,0.73,1,1.0,1,1,0,1,0,0


In [20]:
X_test.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Dependents,Credit_History,Gender,Married,Education,Self_Employed,Semiurban,Urban
0,0.15,0.05,0.12,0.7,2,1.0,1,1,1,0,0,1
1,0.35,0.0,0.22,0.7,0,1.0,1,0,1,0,1,0
2,0.16,0.13,0.46,0.7,2,1.0,1,1,0,0,0,0
3,0.13,0.05,0.16,1.0,0,0.0,1,1,1,0,0,1
4,0.15,0.06,0.26,0.7,2,1.0,1,1,0,0,0,1


In [21]:
y_train

0      1
1      0
2      1
3      1
4      0
      ..
331    1
332    1
333    1
334    1
335    1
Name: Loan_Status, Length: 336, dtype: uint8

In [22]:
y_test

0      1
1      1
2      1
3      0
4      1
      ..
139    1
140    0
141    1
142    1
143    1
Name: Loan_Status, Length: 144, dtype: uint8

# Fit and test models

In [23]:
# Base estimators with default hyper-parameters; random_state is fixed so that
# repeated fits give the same result. Tuned parameters are applied in the
# training loop, either from the saved JSON file or by the search helper.
dt = DecisionTreeClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)

In [24]:
# Estimators to evaluate, in order
list_of_models = [dt, gb]

# One record of test-set metrics per model
model_metrics = []

best_params_dict = {}

# The experiment folder is the same for every model, so it is created once.
# Defining it before the loop also keeps it available for the CSV export below
# when `list_of_models` is empty.
dir_experiment_name = f'{dir_output_name}{start_time_folder_name}/'
os.makedirs(dir_experiment_name, exist_ok=True)

# Previously computed best parameters live in one JSON file covering all models:
# read it once (not once per model) and close the file handle afterwards.
if best_parameters_ready:
    with open(best_parameters_path) as params_file:
        best_params_dict = json.load(params_file)

for model in tqdm(list_of_models):

    model_name = tools.get_model_name(model)

    if not best_parameters_ready:
        # Search the grid for this model; the helper returns the fitted best model
        parameters = models_param_dict[model_name]
        best_model, best_parameters, best_score, all_results = tools.seek_best_params_for_models(X_train, y_train, model, parameters, score_type)
        best_params_dict[model_name] = best_parameters

        #Print best score
        print('\n')
        print(f'Best train score - {model_name}: {np.round(best_score,4)}')
        print('\n')
    else:
        # Fit a fresh copy of the estimator with the stored best parameters
        best_model = clone(model)
        parameters = tools.read_best_params_for_model(model_name, best_params_dict)
        best_model.set_params(**parameters)
        best_model.fit(X_train, y_train)

    #Save best model
    model_file_name = f'{dir_experiment_name}model_{model_name}.sav'
    with open(model_file_name, 'wb') as model_file:
        pickle.dump(best_model, model_file)

    #Predict
    y_pred = best_model.predict(X_test)

    #Metrics on the held-out test set, rounded to 4 decimals
    model_metrics.append({
        'Model': model_name,
        'Accuracy': np.round(accuracy_score(y_test, y_pred), 4),
        'F1': np.round(f1_score(y_test, y_pred), 4),
        'Precision': np.round(precision_score(y_test, y_pred), 4),
        'Recall': np.round(recall_score(y_test, y_pred), 4),
    })

#Save all metrics in csv file (explicit columns keep the header even with no models)
df_model_results = pd.DataFrame(model_metrics, columns=['Model', 'Accuracy', 'F1', 'Precision', 'Recall'])
df_model_results.to_csv(f'{dir_experiment_name}model_results_{type_of_transformation}.csv', index=False)

#Save best parameters in json file (only when they were searched for in this run)
if not best_parameters_ready:
    with open(f'{dir_output_name}best_parameters_{type_of_transformation}.json', 'w') as params_file:
        json.dump(best_params_dict, params_file)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  6.77it/s]


# Save summary of results

In [25]:
# Hand the per-model metrics of this run to the project helper that maintains the
# summary file. The metric name, transformation type, run timestamp and summary
# file name are passed along; where and how the summary is written is defined in
# ToolsForAnalysis (not visible here).
tools.save_summary_view_of_results(df_model_results, target_metric_for_best_results, type_of_transformation, start_time_folder_name, summary_file_name)

# Total execution time

In [26]:
# Seconds elapsed since the start-time cell ran, reported in minutes (2 decimals)
elapsed_seconds = time.time() - start_time
total_time_in_minutes = np.round(elapsed_seconds / 60, 2)

In [27]:
# Banner, a blank line, then the elapsed time
print(
    '#### TOTAL ELAPSED TIME ####\n',
    f'{total_time_in_minutes} minutes',
    sep='\n',
)

#### TOTAL ELAPSED TIME ####

0.06 minutes
