In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import pickle

In [3]:
a=pd.read_csv("data.csv")

In [4]:
a.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [5]:
df=a

In [6]:
df.rename({'fico':'creditscore'},inplace=True,axis=1)

In [7]:
df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,creditscore,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


```    log.annual.inc - The natural logarithm of the annual income of the applicant```    
```    dti - debt-to-income ratio```    
```    days.with.cr.line - The number of days the applicant has had a credit line.```    
```    revol.bal - The total revolving balance on the credit accounts```    

In [8]:
df1 = pd.get_dummies(df, columns = ['purpose'], drop_first = True)
df1.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,creditscore,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0,1,0,0,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,0,1,0,0,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,0,1,0,0,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0,1,0,0,0,0,0


In [9]:
x = df1.drop(['not.fully.paid'], axis = 1)
y = df1['not.fully.paid']

In [10]:
x.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,creditscore,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,1,0,0,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,1,0,0,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,1,0,0,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,1,0,0,0,0,0


In [11]:
y.head()


0    0
1    0
2    0
3    0
4    0
Name: not.fully.paid, dtype: int64

In [12]:
def Fitmodel(x, y, algo_name, algorithm, params, cv):
    np.random.seed(42)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3,random_state = 95)
    grid = GridSearchCV(algorithm, params, scoring = 'accuracy', n_jobs = -1, cv = cv, verbose = 0)
    model = grid.fit(x_train, y_train)
    pred = model.predict(x_test)
    best_params = model.best_params_
    pickle.dump(model, open(algo_name,'wb'))
    cm = confusion_matrix(pred, y_test)
    print('Algorithm Name : ',algo_name,'\n')
    print('Best Params : ',best_params,'\n')
    print('Percentage of Accuracy Score : {0:.2f} %'.format(100*(accuracy_score(y_test,pred))),'\n')
    print('Classification Report : \n',classification_report(y_test,pred))
    print('Confusion Matrix : \n',cm,'\n')

_**doing oversampling to have same 0 and 1 classes usign SMOTE**_

In [13]:
print('Before Oversampling')
display (df['not.fully.paid'].value_counts())
sm = SMOTE(random_state = 95)
x_res, y_res = sm.fit_resample (x, y)
print('-'*100)
print('After Oversampling')
display (y_res.value_counts())

Before Oversampling


0    8045
1    1533
Name: not.fully.paid, dtype: int64

----------------------------------------------------------------------------------------------------
After Oversampling


0    8045
1    8045
Name: not.fully.paid, dtype: int64

In [43]:
params = {'n_estimators' : [111,222,333,444,555]}
Fitmodel(x_res,y_res,'XG Boost',XGBClassifier(),params,cv = 10)

Algorithm Name :  XG Boost 

Best Params :  {'n_estimators': 333} 

Percentage of Accuracy Score : 87.40 % 

Classification Report : 
               precision    recall  f1-score   support

           0       0.85      0.91      0.88      2411
           1       0.91      0.83      0.87      2416

    accuracy                           0.87      4827
   macro avg       0.88      0.87      0.87      4827
weighted avg       0.88      0.87      0.87      4827

Confusion Matrix : 
 [[2203  400]
 [ 208 2016]] 



In [44]:
np.random.seed(50)
x_train,x_test, y_train,y_test = train_test_split (x_res,y_res,test_size = 0.3,random_state = 95)
xgbc = XGBClassifier(n_estimators = 333)
fit = xgbc.fit (x_train, y_train)
accuracy = fit.score(x_test,y_test)
predict = fit.predict(x_test)
cmatrix = confusion_matrix (predict,y_test)
print ('Accuracy of XGBoost : ', (accuracy))
print ('Percentage of Accuracy Score : {0:.2f} %'.format(100*(accuracy_score(y_test,predict))))
print ('Classification Report:',classification_report(y_test,predict))
print ('Confusion Matrix :\n',cmatrix)

Accuracy of XGBoost :  0.8740418479386782
Percentage of Accuracy Score : 87.40 %
Classification Report:               precision    recall  f1-score   support

           0       0.85      0.91      0.88      2411
           1       0.91      0.83      0.87      2416

    accuracy                           0.87      4827
   macro avg       0.88      0.87      0.87      4827
weighted avg       0.88      0.87      0.87      4827

Confusion Matrix :
 [[2203  400]
 [ 208 2016]]


In [None]:
pickle.dump(fit, open('xggboo','wb'))

In [45]:
importances = xgbc.feature_importances_
indices = np.argsort(importances)[::-1]
print ("Feature Ranking:")
for f in range (x.shape[1]):
    print ("Feature %s (%f)"  %(list (x)[f],importances[indices[f]]))

Feature Ranking:
Feature credit.policy (0.270119)
Feature int.rate (0.110844)
Feature installment (0.088949)
Feature log.annual.inc (0.087295)
Feature dti (0.080519)
Feature creditscore (0.074106)
Feature days.with.cr.line (0.059123)
Feature revol.bal (0.043373)
Feature revol.util (0.037945)
Feature inq.last.6mths (0.025459)
Feature delinq.2yrs (0.024924)
Feature pub.rec (0.015993)
Feature purpose_credit_card (0.014866)
Feature purpose_debt_consolidation (0.014169)
Feature purpose_educational (0.013833)
Feature purpose_home_improvement (0.013075)
Feature purpose_major_purchase (0.012776)
Feature purpose_small_business (0.012631)


In [47]:
model = pickle.load(open('XG Boost','rb'))

In [49]:
pred1 = model.predict (x_test)
print ('Percentage of Accuracy Score : {0:.2f} %'.format(100*(accuracy_score(y_test,pred1))))
print('Params for Best Fitted Model : ',model.best_params_)

Percentage of Accuracy Score : 87.40 %
Params for Best Fitted Model :  {'n_estimators': 333}


In [50]:
fpred = model.predict(x)
print ('Percentage of Accuracy Score for Best Fitted Model of Whole Data: {0:.2f} %'.format(100*(accuracy_score(y,fpred))))

Percentage of Accuracy Score for Best Fitted Model of Whole Data: 94.26 %


In [52]:
# df = df.rename(columns = {'not.fully.paid' : 'Loan Repayment Status'})
# df['Loan Repayment Status'] = df['Loan Repayment Status'].map({0 : 'Paid', 1 : 'Not Paid'})

In [53]:
# final_data = pd.concat([df, fpred_df], axis = 1)
# final_data.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,creditscore,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,Loan Repayment Status,Predicted Loan Repayment Status
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,Paid,Paid
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,Paid,Paid
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,Paid,Paid
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,Paid,Paid
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,Paid,Paid


In [54]:
# final_data


Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,creditscore,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,Loan Repayment Status,Predicted Loan Repayment Status
0,1,debt_consolidation,0.1189,829.10,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,Paid,Paid
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.000000,33623,76.7,0,0,0,Paid,Paid
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.000000,3511,25.6,1,0,0,Paid,Paid
3,1,debt_consolidation,0.1008,162.34,11.350407,8.10,712,2699.958333,33667,73.2,1,0,0,Paid,Paid
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.000000,4740,39.5,0,1,0,Paid,Paid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,all_other,0.1461,344.76,12.180755,10.39,672,10474.000000,215372,82.1,2,0,0,Not Paid,Not Paid
9574,0,all_other,0.1253,257.70,11.141862,0.21,722,4380.000000,184,1.1,5,0,0,Not Paid,Not Paid
9575,0,debt_consolidation,0.1071,97.81,10.596635,13.09,687,3450.041667,10036,82.9,8,0,0,Not Paid,Paid
9576,0,home_improvement,0.1600,351.58,10.819778,19.18,692,1800.000000,0,3.2,5,0,0,Not Paid,Not Paid
