In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## Loading the dataset

In [None]:
# Load the loan dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
dt = pd.read_csv("Loan Prediction Dataset.csv")
# Preview the first rows of the loaded frame.
dt.head()

In [None]:
# (rows, columns) of the loaded frame.
dt.shape

In [None]:
# Summary statistics of the numeric columns.
dt.describe()

In [None]:
# Column dtypes and non-null counts; a quick way to spot columns with gaps.
dt.info()

## Preprocessing the dataset

In [None]:
# Number of missing values per column, before any imputation.
dt.isnull().sum()

In [None]:
# Fill the missing values for numerical columns.
# LoanAmount and Loan_Amount_Term are imputed with the column mean.
dt['LoanAmount'] = dt['LoanAmount'].fillna(dt['LoanAmount'].mean())
dt['Loan_Amount_Term'] = dt['Loan_Amount_Term'].fillna(dt['Loan_Amount_Term'].mean())
# Credit_History is imputed with its most frequent value instead of the mean.
# NOTE(review): this assumes Credit_History is a 0/1 flag, in which case a mean
# would insert a fractional value that matches neither class - confirm with
# dt['Credit_History'].unique().
dt['Credit_History'] = dt['Credit_History'].fillna(dt['Credit_History'].mode()[0])

In [None]:
# Categorical columns: replace every missing entry with that column's most
# frequent value (the first mode when several values tie).
for name in ('Gender', 'Married', 'Dependents', 'Self_Employed'):
    most_common = dt[name].mode()[0]
    dt[name] = dt[name].fillna(most_common)

In [None]:
# Re-count missing values per column to check the result of the imputation cells.
dt.isnull().sum()

## Exploratory Data Analysis

In [None]:
# categorical attributes visualization
sns.countplot(dt['Gender'])

In [None]:
sns.countplot(dt['Married'])

In [None]:
sns.countplot(dt['Dependents'])

In [None]:
sns.countplot(dt['Education'])

In [None]:
sns.countplot(dt['Self_Employed'])

In [None]:
sns.countplot(dt['Property_Area'])

In [None]:
sns.countplot(dt['Loan_Status'])

In [None]:
# numerical attributes visualization
sns.distplot(dt["ApplicantIncome"])

In [None]:
sns.distplot(dt["CoapplicantIncome"])

In [None]:
sns.distplot(dt["LoanAmount"])

In [None]:
sns.distplot(dt['Loan_Amount_Term'])

In [None]:
sns.distplot(dt['Credit_History'])

## Creation of new attributes

In [None]:
# Total income = applicant income + co-applicant income, added element-wise.
dt['Total_Income'] = dt['ApplicantIncome'].add(dt['CoapplicantIncome'])
dt.head()

## Log Transformation

In [None]:
# apply log transformation to the attribute
dt['ApplicantIncomeLog'] = np.log(dt['ApplicantIncome'])
sns.distplot(dt["ApplicantIncomeLog"])

In [None]:
dt['CoapplicantIncomeLog'] = np.log(dt['CoapplicantIncome'])
sns.distplot(dt["CoapplicantIncomeLog"])

In [None]:
dt['LoanAmountLog'] = np.log(dt['LoanAmount'])
sns.distplot(dt["LoanAmountLog"])

In [None]:
dt['Loan_Amount_Term_Log'] = np.log(dt['Loan_Amount_Term'])
sns.distplot(dt["Loan_Amount_Term_Log"])

In [None]:
dt['Total_Income_Log'] = np.log(dt['Total_Income'])
sns.distplot(dt["Total_Income_Log"])

## Correlation Matrix

In [None]:
# Pairwise correlation of the numeric columns only. The frame still holds
# string columns at this point (label encoding happens later), and since
# pandas 2.0 DataFrame.corr() raises on non-numeric data instead of silently
# skipping it. select_dtypes works on old and new pandas alike.
corr = dt.select_dtypes(include='number').corr()
plt.figure(figsize=(15,10))
# Annotated heatmap of the correlation matrix.
sns.heatmap(corr, annot = True, cmap="BuPu")

In [None]:
dt.head()

In [None]:
cols = ['ApplicantIncome', 'CoapplicantIncome', "LoanAmount", "Loan_Amount_Term", "Total_Income", 'Loan_ID', 'CoapplicantIncomeLog']
dt = dt.drop(columns=cols, axis=1)
dt.head()

## Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
cols = ['Gender',"Married","Education",'Self_Employed',"Property_Area","Loan_Status","Dependents"]
le = LabelEncoder()
for col in cols:
    dt[col] = le.fit_transform(dt[col])

In [None]:
dt.head()

## Train-Test Split

In [None]:
# Target vector: the encoded Loan_Status column.
y = dt['Loan_Status']
# Feature matrix: every remaining column.
X = dt.drop(columns='Loan_Status')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test set; random_state=42 makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Model Training

In [None]:
from sklearn.model_selection import cross_val_score
def classify(model, x, y):
    """Fit `model` on a 70/30 hold-out split and report two accuracy figures.

    Prints the hold-out accuracy and the mean 5-fold cross-validation
    accuracy, both as percentages. Nothing is returned.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``score`` (scikit-learn style).
    x : feature matrix.
    y : target vector aligned with ``x``.

    Note: ``model`` is left fitted on the training part of the split;
    ``cross_val_score`` is given the same estimator object and the full data.
    """
    # Split the data that was passed in. The previous version split the
    # notebook-level `X`, silently ignoring the `x` argument.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
    model.fit(x_train, y_train)
    print("Accuracy is", model.score(x_test, y_test)*100)
    score = cross_val_score(model, x, y, cv=5)
    print("Cross validation is",np.mean(score)*100)

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
classify(model, X, y)

In [None]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
classify(model, X, y)

In [None]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
model = RandomForestClassifier()
classify(model, X, y)

In [None]:
model = ExtraTreesClassifier()
classify(model, X, y)

In [None]:
from xgboost import XGBClassifier

In [None]:
model = XGBClassifier()
classify(model, X, y)

In [None]:
from lightgbm import LGBMClassifier

In [None]:
lgb = LGBMClassifier()
classify(lgb, X, y)

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
cb = CatBoostClassifier()
classify(cb, X, y)

## Hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
 #Randomized Search CV
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

In [None]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

In [None]:
rf = RandomForestClassifier()

In [None]:
# Use the random grid to search for best hyperparameters
rf=RandomizedSearchCV(estimator = rf, param_distributions = random_grid,scoring='accuracy', n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = 1)

In [None]:
rf.fit(X,y)

In [None]:
rf.best_score_

In [None]:
rf.best_params_

In [None]:
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

In [None]:
random_grid = {
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

In [None]:
dt = DecisionTreeClassifier()

In [None]:
dt=RandomizedSearchCV(estimator = dt, param_distributions = random_grid,scoring='accuracy', n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = 1)

In [None]:
dt.fit(X,y)

In [None]:
dt.best_score_

In [None]:
dt.best_params_

In [None]:
 #Randomized Search CV
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

In [None]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

In [None]:
ex = ExtraTreesClassifier()

In [None]:
ex=RandomizedSearchCV(estimator = ex, param_distributions = random_grid,scoring='accuracy', n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = 1)

In [None]:
ex.fit(X,y)

In [None]:
ex.best_score_

In [None]:
ex.best_params_

In [None]:
from scipy.stats import uniform, randint

In [None]:
xgb=XGBClassifier()

In [None]:
params = {
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 6), # default 3
    "n_estimators": randint(100, 150), # default 100
    "subsample": uniform(0.6, 0.4)
}

In [None]:
xgb = RandomizedSearchCV(estimator = xgb, param_distributions = params,scoring='accuracy', n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = 1)

In [None]:
xgb.fit(X,y)

In [None]:
xgb.best_score_

In [None]:
xgb.best_params_

In [None]:
grid = {'learning_rate': [0.03, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, 7, 9]}

In [None]:
cb = RandomizedSearchCV(estimator = cb, param_distributions = grid,scoring='accuracy', n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = 1)

In [None]:
cb.fit(X,y)

In [None]:
cb.best_score_

In [None]:
cb.best_params_

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
y_pred = dt.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
sns.heatmap(cm, annot=True,fmt='.2f')

In [None]:
y_pred = rf.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
sns.heatmap(cm, annot=True,fmt='.2f')

In [None]:
y_pred = ex.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
sns.heatmap(cm, annot=True,fmt='.2f')

In [None]:
y_pred = xgb.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
sns.heatmap(cm, annot=True,fmt='.2f')

In [None]:
s