In [None]:
#Importing Libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 

%matplotlib inline

In [None]:
# Loading Dataset: read the bank personal-loan CSV from the notebook's parent directory.
data = pd.read_csv('../Bank_Personal_Loan.csv')
# Bare last expression so the notebook renders the frame as the cell output.
data

### **About the data:**
The data set includes 5000 observations with 14 variables divided into 4 different measurement categories.

-The **binary** category has five variables, including the target variable personal loan, also securities account, CD account, online banking and credit card.

-The **interval** category contains five variables: age, experience, income, CC avg and mortgage.

-The **ordinal** category includes the variables family and education.

-The last category is **nominal** with ID and Zip code.

### **Some information about the Dataset**

In [None]:
# Print the (rows, columns) tuple, then unpack it to report each dimension on its own line.
print("Shape of Data:",data.shape)
r, c = data.shape
print("Number of Rows:",r)
print("Number of Columns:",c)

Number of Rows in Dataset are **5000** & Number of Columns in Dataset are **14**

In [None]:
# List the column labels of the loaded frame.
print("Names of Column:")
print(data.columns)

In [None]:
# data.isnull().sum() gives one missing-value count per column.
print("Number of Null Values:",data.isnull().sum())

In [None]:
# data.duplicated().sum() counts the rows flagged as duplicates.
print("Number of Duplicate Values: ",data.duplicated().sum())

In [None]:
# data.info() writes its summary itself, so it is called bare rather than inside print().
print("Information about the Dataset")
data.info()

So here we can see that the dataset does not have any **duplicate values**, which is a good thing; if there were duplicates, we would have to remove them. We also checked for **null values**: our data does not have any, so we are ready to go further.

In [None]:
# Count plot of the 'Personal Loan' column (one bar per distinct value).
plt.figure()
sns.countplot(x = 'Personal Loan',data = data )
plt.show()

### **Drop columns operation**
- The **ID** column in our dataset holds a unique number for every client.

- The **ZIP Code** column in our dataset holds the ZIP code of the client's city.

So there is no meaningful relation between the ID or ZIP Code columns and any other variable. It is better to drop them so that they do not mislead the analysis.

In [None]:
# Drop the identifier-like columns. Rebinding `data` (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run after the columns are already gone;
# with the in-place drop a second run raised KeyError.
data = data.drop(columns=['ID', 'ZIP Code'], errors='ignore')
data.head()

In [None]:
# Summary statistics per column; the observation below relies on the reported minimum values.
data.describe()

**Observations:**

In the `describe` output above, the **min** value of the **Experience** column is -3, but we know that experience values must be non-negative. We will therefore replace every negative Experience value with the column mean.

In [None]:
# Negative years of experience are invalid, so replace them with the column mean
# (computed over the whole column before any replacement, as before).
experience_mean = data['Experience'].mean()
# Cast first so the float mean can be stored without an incompatible-dtype upcast.
data['Experience'] = data['Experience'].astype(float)
# A single .loc assignment replaces the chained form data['Experience'][mask] = ...,
# which may write to a temporary copy, triggers SettingWithCopyWarning, and has no
# effect at all when pandas copy-on-write is enabled.
data.loc[data['Experience'] < 0, 'Experience'] = experience_mean
data.describe()

**Observations:**
We will convert CCAvg from a monthly average to an annual average, so that it is on the same yearly basis as the Income column.

In [None]:
# Multiply CCAvg by 12 and store the result in a new 'ann_CV' column.
data['ann_CV'] = data['CCAvg'] * 12
# Display the frame so the new column is visible.
data

In [None]:
# 'CCAvg' is superseded by 'ann_CV'. Rebinding `data` with errors='ignore' (rather than
# an in-place drop) keeps this cell re-runnable once the column has already been removed.
data = data.drop(columns='CCAvg', errors='ignore')
data

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `data`.
fig, ax = plt.subplots(figsize = (10, 10))
sns.heatmap(data.corr(),cmap='BuPu',cbar=True,annot=True,linewidths=0.5,ax=ax)
# plt.show must be *called*; the bare attribute `plt.show` only echoes the function object.
plt.show()

## **Observation**
- 'Age' and 'Experience' are correlated with each other.

- 'Income' and 'ann_CV' (the annualised CCAvg) are correlated with each other.

- 'CD Account' has a correlation with 'CreditCard', 'Securities Account', 'Online', 'ann_CV' and 'Income'.

- 'Personal Loan' has a correlation with 'Income', 'ann_CV', 'CD Account', 'Mortgage', and 'Education'.

- 'Mortgage' has a moderate correlation with 'Income'.

- 'Income' influences 'ann_CV', 'Personal Loan', 'CD Account' and 'Mortgage'.

In [None]:
# Age vs. Income, with points coloured by the 'Personal Loan' value.
sns.scatterplot(x = 'Age', y = 'Income', data = data, hue = 'Personal Loan')

**Clients with income more than 100k are more likely to get loan**

In [None]:
# Age vs. ann_CV (CCAvg * 12), with points coloured by the 'Personal Loan' value.
sns.scatterplot(x = 'Age', y = 'ann_CV', data = data, hue = 'Personal Loan')

**Clients with an annual credit card spending average (`ann_CV`) of more than 30 are more likely to get a loan**

In [None]:
# Row counts per 'Experience' value, split by 'Personal Loan'.
sns.countplot(x='Experience', hue = 'Personal Loan', data = data)

In [None]:
# Row counts per 'Family' value, split by 'Personal Loan'.
sns.countplot(x='Family', hue = 'Personal Loan', data = data)

**We can see in the previous two graphs that Family and Experience have little effect on the Personal Loan attribute**

In [None]:
# Row counts per 'CreditCard' value, split by 'Personal Loan'.
sns.countplot(x='CreditCard', hue = 'Personal Loan', data = data)

In [None]:
# Row counts per 'Securities Account' value, split by 'Personal Loan'.
sns.countplot(x='Securities Account', hue = 'Personal Loan', data = data)

In [None]:
# Row counts per 'CD Account' value, split by 'Personal Loan'.
sns.countplot(x='CD Account', hue = 'Personal Loan', data = data)

In [None]:
# Bar plot of 'CD Account' against 'Securities Account', with one bar colour per 'Personal Loan' value.
sns.catplot(x='Securities Account', y = 'CD Account', data = data, kind = 'bar', hue = 'Personal Loan' )

In [None]:
# Bar plot of 'CD Account' against 'CreditCard', with one bar colour per 'Personal Loan' value.
sns.catplot(x='CreditCard', y = 'CD Account', data = data, kind = 'bar', hue = 'Personal Loan' )

**After investigating the previous plots, we will work on all of the data after dropping the 'ID' and 'ZIP Code' columns, because we find that all of the columns affect each other.**

### **Splitting the data**
Training Set and Testing Set in the ratio of 60:40 (`test_size=0.4` in the cell below)

In [None]:
# import module
from sklearn.model_selection import train_test_split

target_column = 'Personal Loan'

# Plain-array copies of the features and of the target (target as a single column).
X = data.drop(target_column, axis = 1).values
y = data[target_column].values.reshape((-1, 1))

# Hold out 40% of the rows for testing (train:test = 60:40); the fixed seed makes
# the split repeatable.
features = data.drop(labels=[target_column], axis=1)
target = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.4, random_state=0)

# Report the shape of each partition followed by its row and column counts.
for label, part in (("Training", X_train), ("Testing", X_test)):
    print(label + " Dataset Shape:", part.shape)
    r, c = part.shape
    print("Rows= ",r )
    print("Columns= ",c)

### **Determine Mutual Information**
Calculate the mutual information (MI) between each variable and the target. The smaller the MI value, the less information we can infer about the target from that feature.

In [None]:
# to obtain the mutual information values
from sklearn.feature_selection import mutual_info_classif

# Score the columns of X_train against y_train.
# NOTE(review): no random_state is passed, so the scores may vary slightly between runs — confirm.
mi = mutual_info_classif(X_train, y_train)
mi

Let's capture the above array in a pandas Series, add the variable names as the index, sort the features by their mutual information value, and make a bar plot.

In [None]:
# to select the features
from sklearn.feature_selection import SelectKBest

# Wrap the scores in a Series labelled with the feature names.
mi = pd.Series(mi)
mi.index = X_train.columns

# Plot the scores from highest to lowest.
ranked_mi = mi.sort_values(ascending=False)
ranked_mi.plot.bar(figsize=(20, 6))
plt.ylabel('Mutual Information')

There are a few features (left of the plot) with higher mutual information values. There are also features with almost zero mutual information(mi) values on the right of the plot.

Once we find the mutual information values, to select features we need to determine a threshold, or cut-off value, above which a feature will be selected.

There are a few ways in which this can be done:

Select top k features, where k is an arbitrary number of features

### **Select top k features based on Mutual Information**
Here we will select the top **5 features** based on their mutual information value

In [None]:
# select features: fit a selector with k=5 using mutual_info_classif as the score function
sel_ = SelectKBest(mutual_info_classif, k=5).fit(X_train, y_train)

# display features
# NOTE(review): only the selected column names are shown here. X_train and X_test are
# not transformed in this cell, so they keep all of their columns.
X_train.columns[sel_.get_support()]

In [None]:
# X_train.shape,X_test.shape
# Print the row and column counts of the training and testing feature frames.
for title, part in (("Train Dataset:", X_train), ("Test Dataset:", X_test)):
    r, c = part.shape
    print(title)
    print("Rows=",r)
    print("Column=",c)

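Note that `SelectKBest` was only fitted above; `X_train` and `X_test` were not transformed, so both still contain all 11 feature columns. With the 60:40 split of the 5000 rows, the Training Dataset has 3000 rows and the Testing Dataset has 2000 rows (see the shapes printed above).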

### **SMOTE**
Here we can see that the data is **unbalanced**: there are far more rows with the label '0' than with the label '1'.

So we need to **balance** the dataset, so that our model is trained equally well on both labels.

In [None]:
# Class counts in the training target before any resampling.
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

In [None]:
# !pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

# Resample the training split only; X_test / y_test are not passed to SMOTE.
sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [None]:
# Shapes of the resampled training features and target.
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

# Class counts in the resampled training target.
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

Now our training dataset has been **balanced**, so we can train our models on it.

In [None]:
# Number of rows in the training data before balancing (the original split).
print(len(X_train))

In [None]:
# Number of rows in the training data after balancing (X_train_res is the resampled
# training set, not the test data).
print(len(X_train_res))

## **Algorithm**
**In our work we will use five kinds of algorithms, to find the algorithm with the highest f1_score**

- LogisticRegression
- SVM
- K-NN
- DecisionTreeClassifier
- RandomForestClassifier

## **1. Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression, fitted on the original (non-resampled) training split.
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# logistic regression object (a second model, separate from `classifier`)
lr = LogisticRegression()

# train the model on the SMOTE-resampled train set
lr.fit(X_train_res, y_train_res)

# predict on the test set, which was not resampled
predictions = lr.predict(X_test)

#import classification report
from sklearn.metrics import confusion_matrix, classification_report

# print classification report
print(classification_report(y_test, predictions))

In [None]:
# Test-set predictions from `classifier` (the model fitted on the non-resampled data).
y_pred = classifier.predict(X_test)
print(y_pred)

In [None]:
# Confusion matrix of `classifier` on the test set.
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Annotated heatmap of the current confusion matrix `cm`.
sns.heatmap(cm,annot=True)
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
# Test accuracy for the current `y_pred`; kept as LR_acc1 for the comparison charts.
LR_acc1 = accuracy_score(y_test, y_pred)
print("Accuracy score for Logistic Regression Model: {:.2f} %".format(LR_acc1*100))

**ROC Curve**

In [None]:
from sklearn.metrics import roc_curve, auc

#---find the predicted probabilities using the test set
probs = classifier.predict_proba(X_test)
# keep the second probability column as the score passed to roc_curve
preds = probs[:,1]

#---find the FPR, TPR, and threshold---
fpr, tpr, threshold = roc_curve(y_test, preds)


In [None]:
# Area under the ROC curve, computed from the FPR/TPR arrays.
roc_auc = auc(fpr, tpr)

import matplotlib.pyplot as plt
# ROC curve in blue, with the AUC shown in the legend.
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
# Dashed red diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate (TPR)')
plt.xlabel('False Positive Rate (FPR)')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc = 'lower right')
plt.show()


**By using SMOTE**

In [None]:
from sklearn.model_selection import GridSearchCV

# Search over the regularisation type and the regularisation strength C = 1..9.
parameters = [{'penalty': ['l1','l2'], 'C': np.arange(1,10) }]

# `classifier` was built with the default 'lbfgs' solver, which does not support the
# 'l1' penalty, so every 'l1' candidate in the grid would fail to fit. Search over a
# fresh estimator that uses 'liblinear', which accepts both 'l1' and 'l2'.
lr_for_search = LogisticRegression(solver = 'liblinear', random_state = 0)

# 10-fold cross-validated search scored by accuracy.
grid_search = GridSearchCV(estimator = lr_for_search,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)

In [None]:
# Run the search on the SMOTE-resampled training data and keep the best score and settings.
grid_search.fit(X_train_res, y_train_res)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

In [None]:
# Report the best score found by the search (as a percentage) and the matching parameters.
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

## **2. SVM**

In [None]:
from sklearn.svm import SVC
# Linear-kernel SVM fitted on the original (non-resampled) training split.
SVMclassifier = SVC(kernel = 'linear', random_state = 0)
SVMclassifier.fit(X_train, y_train)

In [None]:
# Test-set predictions and confusion matrix for SVMclassifier.
y_pred = SVMclassifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Annotated heatmap of the current confusion matrix `cm`.
sns.heatmap(cm,annot=True)
plt.show()


In [None]:
# Test accuracy for the current `y_pred`; kept as SVM_acc1 for the comparison charts.
SVM_acc1 = accuracy_score(y_test, y_pred)
print("Accuracy score for SVM Model: {:.2f} %".format(SVM_acc1*100))

**By using SMOTE**

In [None]:
from sklearn.svm import SVC

# Second linear-kernel SVM, fitted on the SMOTE-resampled training data.
classifier2 = SVC(kernel = 'linear', random_state = 0)
# The fitted model is kept under the name SVM_classifier.
SVM_classifier = classifier2.fit(X_train_res, y_train_res)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Test-set predictions and confusion matrix for the SMOTE-trained SVM.
y_pred = SVM_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Annotated heatmap of the current confusion matrix `cm`.
sns.heatmap(cm,annot=True)
plt.show()

In [None]:
# Test accuracy for the current `y_pred`; kept as SVM_acc2 for the comparison charts.
SVM_acc2 = accuracy_score(y_test, y_pred)
print("Accuracy score for SVM Model: {:.2f} %".format(SVM_acc2*100))

**Hyperparameter Tuning**
- Execution Time : 2 hour (approx)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for C: the integers 1 through 9.
parameters = [{'C': np.arange(1,10) }]

# Despite the `grid_search` name, this is a RandomizedSearchCV built around SVMclassifier,
# scored by accuracy with cv = 10.
# NOTE(review): no random_state or n_iter is passed to the search — confirm this is intended.
grid_search = RandomizedSearchCV(estimator = SVMclassifier,
                           param_distributions = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)

In [None]:
# Run the search on the SMOTE-resampled training data and keep the best score and settings.
grid_search.fit(X_train_res, y_train_res)
SVM_acc_sorte = grid_search.best_score_
best_parameters = grid_search.best_params_

In [None]:
# Report the best score found by the SVM search (as a percentage) and the matching parameters.
print("Best Accuracy of SVM: {:.2f} %".format(SVM_acc_sorte*100))
print("Best Parameters of SVM:", best_parameters)

**The hyperparameter search is time-consuming; if you have the time, try the code above, as it may improve your accuracy.**

## **3. K-NN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours model (Minkowski metric with p = 2), fitted on the original training split.
KNNclassifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
KNNclassifier.fit(X_train, y_train)

In [None]:
# Test-set predictions and confusion matrix for KNNclassifier.
y_pred = KNNclassifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Annotated heatmap of the current confusion matrix `cm`.
sns.heatmap(cm,annot=True)
plt.show()


In [None]:
# Test accuracy for the current `y_pred`; kept as KNN_acc1 for the comparison charts.
KNN_acc1 = accuracy_score(y_test, y_pred)
print("Best Accuracy of K-NN: {:.2f} %".format(KNN_acc1*100))

**By using SMOTE**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Same K-NN configuration as before, fitted on the SMOTE-resampled training data.
KNN_classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
KNN_classifier.fit(X_train_res, y_train_res)

In [None]:
# Test-set predictions from the SMOTE-trained K-NN model, displayed as the cell output.
y_pred = KNN_classifier.predict(X_test)
y_pred

In [None]:
# Confusion matrix for the current `y_pred` against the test labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Annotated heatmap of the current confusion matrix `cm`.
sns.heatmap(cm,annot=True)
plt.show()

In [None]:
# Test accuracy for the current `y_pred`; kept as KNN_acc2 for the comparison charts.
KNN_acc2 = accuracy_score(y_test, y_pred)
print("Best Accuracy of K-NN: {:.2f} %".format(KNN_acc2*100))

**Hyperparameter Tuning**

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate neighbour counts: the integers 1 through 9.
parameters = [{ 'n_neighbors' :  np.arange(1,10)  }]

# Grid search around KNN_classifier, scored by accuracy with cv = 10.
grid_search = GridSearchCV(estimator = KNN_classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)

In [None]:
# Run the search on the SMOTE-resampled training data and keep the best score and settings.
grid_search.fit(X_train_res, y_train_res)
KNN_acc3 = grid_search.best_score_
best_parameters3 = grid_search.best_params_

In [None]:
# Report the best score found by the K-NN search (as a percentage) and the matching parameters.
print("Best Accuracy of KNN after Hyperparameter tuning: {:.2f} %".format(KNN_acc3*100))
print("Best Parameters of KNN:", best_parameters3)

## **4. Decision Tree Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Entropy-criterion decision tree fitted on the original (non-resampled) training split.
DTclassifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
DTclassifier.fit(X_train, y_train)

In [None]:
# Test-set predictions and confusion matrix for DTclassifier.
y_pred = DTclassifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Predict with DTclassifier (not `classifier`, which is the logistic regression model)
# so that DT_acc1 really is the decision tree's test accuracy.
y_pred = DTclassifier.predict(X_test)
DT_acc1 = accuracy_score(y_test, y_pred)
print(f"Accuracy score for Decision Tree: {DT_acc1*100}")

**By using SMOTE**

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Same decision-tree configuration, fitted on the SMOTE-resampled training data.
DT_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
DT_classifier.fit(X_train_res, y_train_res)

In [None]:
# Test-set predictions and confusion matrix for the SMOTE-trained decision tree.
y_pred = DT_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Annotated heatmap of the current confusion matrix `cm`.
sns.heatmap(cm,annot=True)
plt.show()

In [None]:
# Test accuracy for the current `y_pred`; kept as DT_acc2 for the comparison charts.
DT_acc2 = accuracy_score(y_test, y_pred)
print("Accuracy score for Decision Tree: {:.2f} %".format(DT_acc2*100))

**Hyperparameter Tuning**

In [None]:
from sklearn.model_selection import GridSearchCV

# An empty grid ([{}]) evaluates only the estimator's current settings, so nothing is
# actually tuned. Search over the split criterion, tree depth and minimum leaf size.
parameters = [{'criterion': ['gini', 'entropy'],
               'max_depth': [3, 5, 10, 20, None],
               'min_samples_leaf': [1, 5, 10]}]

# Grid search around DT_classifier, scored by accuracy with cv = 10.
grid_search = GridSearchCV(estimator = DT_classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)

In [None]:
# Run the search on the SMOTE-resampled training data and keep the best score and settings.
grid_search.fit(X_train_res, y_train_res)
DT_acc3 = grid_search.best_score_
best_parameters5 = grid_search.best_params_

In [None]:
# Report the best score found by the decision-tree search (as a percentage) and the matching parameters.
print("Best Accuracy of Decision Tree Classifier: {:.2f} %".format(DT_acc3*100))
print("Best Parameters of Decision Tree Classifier:", best_parameters5)

## **5. Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 10-tree, entropy-criterion random forest fitted on the original (non-resampled) training split.
RFclassifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
RFclassifier.fit(X_train, y_train)

In [None]:
# Test-set predictions and confusion matrix for RFclassifier.
y_pred = RFclassifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)


In [None]:
# Test accuracy for the current `y_pred`; kept as RF_acc1 for the comparison charts.
RF_acc1 = accuracy_score(y_test, y_pred)
print(f"Random Forest Classification accuracy: {RF_acc1*100}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Same random-forest configuration, fitted on the SMOTE-resampled training data.
RF_classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
RF_classifier.fit(X_train_res, y_train_res)

In [None]:
# Test-set predictions and confusion matrix for the SMOTE-trained random forest.
y_pred = RF_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Annotated heatmap of the current confusion matrix `cm`.
sns.heatmap(cm,annot=True)
plt.show()

In [None]:
# Test accuracy for the current `y_pred`; kept as RF_acc2 for the comparison charts.
RF_acc2 = accuracy_score(y_test, y_pred)
print("Accuracy score for Random Forest: {:.2f} %".format(RF_acc2*100))

**Hyperparameter Tuning**

In [None]:
from sklearn.model_selection import GridSearchCV

# 4 x 4 grid over the number of trees and the maximum tree depth.
parameters = [{'n_estimators' : [10, 50, 100, 200], 'max_depth' : [3, 10, 20, 40]}]

# Grid search around RF_classifier, scored by accuracy with cv = 10.
grid_search = GridSearchCV(estimator = RF_classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)

In [None]:
# Run the search on the SMOTE-resampled training data and keep the best score and settings.
grid_search.fit(X_train_res, y_train_res)
RF_acc3 = grid_search.best_score_
best_parameters6 = grid_search.best_params_

In [None]:
# Report the best score found by the random-forest search (as a percentage) and the matching parameters.
print("Best Accuracy of Random Forest with hyperparameter tuning: {:.2f} %".format(RF_acc3*100))
print("Best Parameters of Random Forest:", best_parameters6)

In [None]:
# Test accuracies of the five models trained on the original (unbalanced) split,
# paired position-by-position with their display names.
mylist2 = ["Logistic Regression", "SVM", "K-NN", "DTC", "RFC"]
mylist = [LR_acc1, SVM_acc1, KNN_acc1, DT_acc1, RF_acc1]

plt.rcParams['figure.figsize']=22,10
sns.set_style("darkgrid")
ax = sns.barplot(x=mylist2, y=mylist, palette = "coolwarm", saturation =1.5)
plt.title("Accuracy of different Classification Models with Unbalance Data", fontsize = 20)
plt.xlabel("Classification Models", fontsize = 20 )
plt.ylabel("Accuracy", fontsize = 20)
plt.xticks(fontsize = 11, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 13)

# Write each bar's height as a percentage, centred just above the bar.
for bar in ax.patches:
    bar_width = bar.get_width()
    bar_height = bar.get_height()
    left, bottom = bar.get_xy()
    label_position = (left + bar_width/2, bottom + bar_height*1.02)
    ax.annotate(f'{bar_height:.2%}', label_position, ha='center', fontsize = 'x-large')
plt.show()

**But in these models we can see type II errors.**

**What is a type II error?** - A type II error is a statistical term used in the context of hypothesis testing. It describes the error that occurs when one fails to reject a null hypothesis that is actually false. A type II error produces a false negative, also known as an error of omission.

So here we use SMOTE to reduce the type II errors, and we also use hyperparameter tuning for **more** accuracy.

In [None]:
# Test accuracies of the five models trained on the SMOTE-resampled data.
# The logistic regression trained on the resampled data is `lr`. Its accuracy was never
# stored, and the chart previously showed LR_acc1 (the model trained on unbalanced data)
# under a "with SMOTE" title, so compute it here.
LR_acc2 = accuracy_score(y_test, lr.predict(X_test))

mylist2 = ["Logistic Regression", "SVM", "K-NN", "DTC", "RFC"]
mylist = [LR_acc2, SVM_acc2, KNN_acc2, DT_acc2, RF_acc2]

plt.rcParams['figure.figsize']=22,10
sns.set_style("darkgrid")
ax = sns.barplot(x=mylist2, y=mylist, palette = "coolwarm", saturation =1.5)
plt.xlabel("Classification Models", fontsize = 20 )
plt.ylabel("Accuracy", fontsize = 20)
plt.title("Accuracy of different Classification Models with SMOTE ", fontsize = 20)
plt.xticks(fontsize = 11, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 13)

# Write each bar's height as a percentage, centred just above the bar.
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    left, bottom = bar.get_xy()
    ax.annotate(f'{bar_height:.2%}', (left + bar_width/2, bottom + bar_height*1.02), ha='center', fontsize = 'x-large')
plt.show()

In [None]:
# Scores of the five models after SMOTE and hyperparameter tuning.
# Logistic regression now uses `best_accuracy`, the best score of its grid search,
# instead of LR_acc1 (the untuned model trained on unbalanced data).
# NOTE: the SVM bar still uses SVM_acc2 (SMOTE, untuned) because the SVM search is
# slow and treated as optional; swap in SVM_acc_sorte if that search was run.
# NOTE: the tuned values are cross-validation scores on the resampled training data,
# whereas SVM_acc2 is a test-set accuracy, so the bars are not strictly comparable.
mylist2 = ["Logistic Regression", "SVM", "K-NN", "DTC", "RFC"]
mylist = [best_accuracy, SVM_acc2, KNN_acc3, DT_acc3, RF_acc3]

plt.rcParams['figure.figsize']=22,10
sns.set_style("darkgrid")
ax = sns.barplot(x=mylist2, y=mylist, palette = "coolwarm", saturation =1.5)
plt.xlabel("Classification Models", fontsize = 20 )
plt.ylabel("Accuracy", fontsize = 20)
plt.title("Accuracy of different Classification Models with SMOTE and Hyperparameter Tuning", fontsize = 20)
plt.xticks(fontsize = 11, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 13)

# Write each bar's height as a percentage, centred just above the bar.
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    left, bottom = bar.get_xy()
    ax.annotate(f'{bar_height:.2%}', (left + bar_width/2, bottom + bar_height*1.02), ha='center', fontsize = 'x-large')
plt.show()

## **Conclusion**
By observing the first plot above, we can conclude that **SVM**, **Decision Tree Classifier** and **Random Forest Classifier** are the best algorithms for the **unbalanced data**.

By observing the second plot above, we can conclude that **Decision Tree Classifier** and **Random Forest Classifier** are the best algorithms with **SMOTE**.

By observing the third plot above, we can conclude that **K-NN**, **Decision Tree Classifier** and **Random Forest Classifier** are the best algorithms with **SMOTE and hyperparameter tuning**.