# Loan Prediction using SMOTETomek

# Read Training Dataset

In [None]:
import pandas as pd
# Load the training data; the path is relative to the current working directory.
df_train = pd.read_csv('./Input/training_set.csv')
# Preview the first rows (shown as the notebook cell output).
df_train.head()

# Changing Credit History to object

In [None]:
# Recode the numeric Credit_History flag as a categorical label:
#   1 -> 'Yes', 0 -> 'No', anything else (including missing/NaN) -> None.
# Values are iterated positionally instead of being looked up with
# df_train['Credit_History'][i], so the recode does not depend on df_train
# having a default 0..n-1 index.
credit_labels = {1: 'Yes', 0: 'No'}
crd_hst = [credit_labels.get(flag) for flag in df_train['Credit_History']]
df_train['Credit_History'] = crd_hst

# Replacing Missing values

In [None]:
from definitions import replacer
# `replacer` is a project-local helper; its return value is discarded here, so
# it presumably fills the missing values of df_train in place -- confirm in
# the definitions module.
replacer(df_train)

# Defining X and Y

In [None]:
# Features: every column except the row identifier and the target.
X = df_train.drop(columns=['Loan_ID', 'Loan_Status'])
# Target: selected with a list so it stays a one-column DataFrame, not a Series.
Y = df_train[['Loan_Status']]

In [None]:
# Count the rows per Loan_Status value to check the class balance.
Y.value_counts()

# Data Preprocessing

In [None]:
from definitions import catconsep
# Project-local helper; presumably returns the categorical and the continuous
# column names of X, in that order -- confirm in the definitions module.
cat, con =catconsep(X)

In [None]:
# NOTE(review): assumes `cat` and `con` are lists of column names of X -- confirm
# against catconsep.
# Columns selected by `con` are kept as they are.
X1 = X[con]
# Columns selected by `cat` are expanded into dummy/indicator columns.
X2 = pd.get_dummies(X[cat])
# Join both parts on the row index to form the final feature matrix.
Xnew = X1.join(X2)

In [None]:
# (rows, columns) of the feature matrix after dummy encoding.
Xnew.shape

In [None]:
from sklearn.preprocessing import LabelEncoder
# Turn the Loan_Status labels into integer codes. The fitted encoder is kept in
# `le` so the codes can be mapped back to the original labels later.
le = LabelEncoder()
encoded_target = le.fit_transform(Y.to_numpy().ravel())
# Wrap the 1-D codes in a DataFrame that reuses the target's column name.
Ynew = pd.DataFrame(encoded_target, columns=Y.columns)
Ynew

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for the final evaluation. Stratifying on the original
# labels keeps the class proportions the same in both parts.
xtrain, xtest, ytrain, ytest = train_test_split(
    Xnew,
    Ynew,
    stratify=Y,
    random_state=21,
    test_size=0.2,
)

# Using SMOTETomek on train data

In [None]:
from imblearn.combine import SMOTETomek
# Resample the training split only; xtest / ytest are left untouched.
smtmk = SMOTETomek(random_state=42)
x_res, y_res = smtmk.fit_resample(X=xtrain, y=ytrain)

In [None]:
# Class counts of the training target after resampling.
y_res.value_counts()

# Creating repeated stratified folds (5 splits x 5 repeats) on the resampled train data

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
# 5 stratified splits repeated 5 times gives 25 train/validation partitions.
rskf = RepeatedStratifiedKFold(
    n_repeats=5,
    n_splits=5,
    random_state=42,
)

# Training the model on the folds

In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np
from catboost import CatBoostClassifier

# One fitted model and one validation ROC AUC are collected per fold.
clfs = []
scores = []

# NOTE(review): x_res / y_res were resampled before these folds are drawn, and
# each validation fold is also passed to fit() as eval_set. Both may make the
# fold scores optimistic -- confirm this is intended.
for i, (train_index, test_index) in enumerate(rskf.split(x_res, y_res)):
    # Positional row selection for this fold's train and validation parts.
    x_train = x_res.iloc[train_index]
    y_train = y_res.iloc[train_index]
    x_val = x_res.iloc[test_index]
    y_val = y_res.iloc[test_index]

    cbc = CatBoostClassifier(iterations=1000, verbose=False)
    cbc.fit(x_train, y_train, eval_set=[(x_val, y_val)])
    clfs.append(cbc)

    # Score the fold with the probability from the last predict_proba column.
    preds = cbc.predict_proba(x_val)[:, -1]
    scr = roc_auc_score(y_val, preds)
    scores.append(scr)

    print('******************************')
    print('Fold : ',i , 'ROC AUC : ',round(scr,5))
print(f'\n\nMean score: {np.mean(scores)}')

In [None]:
# Standard deviation of the per-fold ROC AUC scores.
np.std(scores)

In [None]:
# Standard deviation relative to the mean fold score (coefficient of variation).
np.std(scores)/np.mean(scores)

# Evaluating Model based on Test Data

In [None]:
# For every fold model, keep the predict_proba column at index 1 for the
# held-out test rows.
test_prob = [clf.predict_proba(xtest)[:, 1] for clf in clfs]

In [None]:
# Stack the per-model probability vectors and average across models (axis 0),
# leaving one probability per test row.
test_prob = np.mean(np.stack(test_prob), axis=0)

In [None]:
# Turn the averaged probabilities into hard 0/1 predictions with a 0.5 cut-off
# (a probability of exactly 0.5 is labelled 1).
test_pred = [1 if prob >= 0.5 else 0 for prob in test_prob]

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the averaged fold-model probabilities on the held-out test split.
roc_auc_score(ytest, test_prob)

In [None]:
from sklearn.metrics import classification_report
# Text report comparing the thresholded predictions with the true test labels.
print(classification_report(ytest, test_pred))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sb
# Confusion matrix of the thresholded predictions against the true test labels.
cf = confusion_matrix(ytest,test_pred)
# fmt='d' prints the integer counts as they are; the default annotation format
# ('.2g') switches to scientific notation once a count reaches 100.
sb.heatmap(cf, annot=True, fmt='d')