In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe


In [3]:
df = pd.read_csv('../../datasets/synthesized_loan_data.csv')

In [4]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,1.015491,0,0,60,2263.426739,120.264574,362.034926,0.999618,2,1
1,1,1,2.793550,1,2,299,207.078743,97.333547,185.855127,0.994450,1,1
2,0,0,1.020575,0,1,497,-222.105153,98.957504,361.411489,0.999132,0,1
3,1,1,0.967628,0,0,35,1874.200797,141.121981,360.427013,0.999273,2,1
4,1,1,1.929354,0,0,340,47.080906,322.936778,360.013915,0.979882,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,1,1,2.118012,0,0,248,2693.049737,176.172212,360.665439,1.007047,1,1
3996,0,0,-0.002574,0,0,249,1949.728061,117.277610,361.044262,0.986177,1,1
3997,1,1,0.010152,0,0,300,2882.202092,136.098872,361.722296,0.994603,0,1
3998,0,1,-0.002625,0,0,385,182.631094,191.733185,360.723093,0.989029,1,0


In [5]:
x = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

In [6]:
x.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,1.015491,0,0,60,2263.426739,120.264574,362.034926,0.999618,2
1,1,1,2.79355,1,2,299,207.078743,97.333547,185.855127,0.99445,1
2,0,0,1.020575,0,1,497,-222.105153,98.957504,361.411489,0.999132,0
3,1,1,0.967628,0,0,35,1874.200797,141.121981,360.427013,0.999273,2
4,1,1,1.929354,0,0,340,47.080906,322.936778,360.013915,0.979882,0


In [7]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: Loan_Status, dtype: int64

In [8]:
y[y == 0] = 0
y[ y == 1] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[y == 0] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[ y == 1] = 1


In [9]:
x_train, x_test, y_train, y_test =  train_test_split(x, y, test_size=0.3, random_state=0)

In [10]:
space = {
    'max_depth': hp.quniform('max_depth', 3, 18, 1), 
    'gamma': hp.uniform('gamma', 1, 9), 
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1), 
    'reg_lambda': hp.uniform('reg-lambda', 0, 1), 
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10,  1),
    'n_estimators': 180,
    'seed': 0,
    'use_label_encoder' : False
}

In [11]:
def objective(space): 
    clf = xgb.XGBClassifier(
        n_estimators = space['n_estimators'], 
        max_depth = int(space['max_depth']),
        gamma = space['gamma'],
        reg_alpha = int(space['reg_alpha']),
        min_child_weight = int(space['min_child_weight']),
        colsample_bytree = int(space['colsample_bytree']),
        use_label_encoder= space['use_label_encoder']
    )
    
    evaluation = [(x_train, y_train), (x_test, y_test)]
    
    clf.fit(x_train, y_train, eval_set=evaluation, eval_metric='auc', early_stopping_rounds=10, verbose=False)
    
    pred = clf.predict(x_test)
    
    accuracy = accuracy_score(y_test, pred>0.5)
    
    print('Score: ', accuracy)
    
    return {'loss' : -accuracy, 'status' : STATUS_OK}

In [12]:
trials = Trials()

best_hyperparameters = fmin(
    fn = objective,
    space = space,
    algo = tpe.suggest,
    max_evals = 100,
    trials = trials
)

Score:                                                 
0.7808333333333334                                     
Score:                                                                            
0.7808333333333334                                                                
Score:                                                                            
0.7808333333333334                                                                
Score:                                                                            
0.7808333333333334                                                                
Score:                                                                            
0.7825                                                                            
Score:                                                                            
0.7808333333333334                                                    
Score:                                                                

In [13]:
print(best_hyperparameters)

{'colsample_bytree': 0.8567691170911922, 'gamma': 8.389062414869231, 'max_depth': 12.0, 'min_child_weight': 1.0, 'reg-lambda': 0.4763025600500439, 'reg_alpha': 41.0}


In [14]:
xgb_model = xgb.XGBClassifier(
    colsample_bytree = 0.7,
    gamma = 2.1,
    max_depth = 15, 
    min_child_weight = 7,
    reg_lambda = 0.04,
    reg_alpha = 1,
    use_label_encoder=False,
    objective='binary:logistic',
)

In [15]:
xgb_model.fit(x_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=2.1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=15,
              min_child_weight=7, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=1, reg_lambda=0.04, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [16]:
xg_preds = xgb_model.predict(x_test)


In [17]:
xgb_rmse = np.sqrt(mean_squared_error(y_test, xg_preds))

In [18]:
xgb_rmse

0.2645751311064591

In [19]:
single_instance = {
    'Gender' : 1, 
    'Married' : 1, 
    'Dependents' : 1.02, 
    'Education' : 0, 
    'Self_Employed' : 0,
       'ApplicantIncome': 60, 
       'CoapplicantIncome' : 2263.43, 
       'LoanAmount' : 362.03,
       'Loan_Amount_Term' : 360, 
       'Credit_History' : 1, 
       'Property_Area': 2,
}

In [20]:
def predict_discrete (xgb_model, single_input):
    #Data preparation
    input_df = pd.DataFrame([single_input])
    
    #input_df[numerical_cols] = imputer.transform(input_df[numerical_cols])
    #input_df[numerical_cols] = scaler.transform(input_df[numerical_cols])
    #input_df[encoded_cols] = encoder.transform(input_df[categorical_cols])

    x_input = input_df
    predictions = xgb_model.predict(x_input)[0]
    probability = xgb_model.predict_proba(x_input)[0][list(xgb_model.classes_).index(predictions)]

    return predictions, probability

In [21]:
predict_discrete(xgb_model, single_instance)

(1, 0.99333996)