## Project Guide

1. **[Import packages](#import_packages)**
2. **[Preprocessing](#Preprocessing)**
    1. **[Null Values Treatment](#null_values)**
    2. **[Upper Lower Capping](#capping)**

3. **[EDA](#EDA)**
4. **[Model Building](#Model_Building)**
5. **[Model Evaluation](#Model_Evaluation)**
6. **[Conclusion](#Conclusion)**

<a id='import_packages'></a>
## Data Loading And Packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the raw records. The explicit keyword arguments equal the pandas
# defaults (comma separator, no column promoted to the index).
data = pd.read_csv("BankCreditCard.csv", sep=',', index_col=None)
data.head()

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the numeric columns.
# pandas has no DataFrame.summary(); describe() is the equivalent call.
data.describe()

<a id='Preprocessing'></a>
## Preprocessing

## Checking For NAs <a id='null_values'></a>

In [None]:
# Check which variables contain NA values

drop_columns = []

def get_na(data):
    """Show how many NA values each column of ``data`` contains.

    Columns without any NA are ignored. When at least one column has NAs,
    the per-column counts are drawn as a bar chart in ascending order;
    otherwise a short message is printed. Nothing is returned.
    """
    na_counts = data.isnull().sum()
    na_counts = na_counts[na_counts > 0]

    # Guard clause: nothing to plot when every column is complete.
    if na_counts.empty:
        print("No column have NA values")
        return

    na_counts.sort_values().plot.bar(figsize=(15,4))
get_na(data)

## Removing Variables That Have Significant NA Values

In [None]:
# Names of columns to drop because of excessive NAs; left empty here.
# NOTE(review): nothing in the visible notebook reads this list - confirm it is still needed.
drop_columns = []

## Impute Values for NAs

In [None]:
# fillna() with neither a value nor a method raises ValueError, and
# fillna(method=...) is deprecated in recent pandas, so use the dedicated
# ffill()/bfill() methods instead.
# Forward-fill first, then back-fill so that leading NAs (which have no
# earlier row to copy from) are filled as well.
# NOTE(review): filling is done down the rows (default axis); the original
# call asked for axis=1 (across columns) - confirm which is intended.
data.ffill(inplace=True)
data.bfill(inplace=True)

## Splitting x (Independent Variables) and y (Dependent Variable)

In [None]:
# The target is the column named "y"; every other column is a predictor.
y = data["y"]
# drop() expects column labels - passing the Series itself would try to
# drop columns named after the target's values.
x = data.drop(columns=["y"])

## Categorical and Numerical Variables Splitting

In [None]:
# Object-dtype (string) columns are treated as categorical ...
data_categorical = x.select_dtypes(include='object')
# ... and everything that is left over is treated as numeric.
data_numeric = x.drop(columns=data_categorical.columns)

<a id='capping'></a>
## Outlier Treatment

In [None]:
# Upper capping and lower capping for numerical attributes

def set_caping(column, capping):
    """Cap outliers of a numeric column at mean +/- 3 standard deviations.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to cap. The input Series is not modified.
    capping : str
        Which side(s) to cap: "both", "upper" or "lower".

    Returns
    -------
    pandas.Series
        A capped copy of ``column``. For an unrecognised ``capping`` value a
        help message is printed and ``column`` is returned unchanged.
    """
    mean = column.mean() # Mean of the column
    std = column.std() # Standard deviation of the column

    UCL = mean + 3 * std  # upper control limit
    LCL = mean - 3 * std  # lower control limit

    # clip() only touches the values outside the limits, unlike a boolean-mask
    # assignment on the whole frame, which would overwrite entire rows.
    if capping == "both":
        return column.clip(lower=LCL, upper=UCL)
    if capping == "upper":
        return column.clip(upper=UCL)
    if capping == "lower":
        return column.clip(lower=LCL)

    print("Please enter proper value of capping parameter. \n Possible values:\tboth\tupper\tlower")
    return column

# data["age"] = set_caping(data["age"], capping = "both")
# data.boxplot(column ="age")

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every predictor to zero mean and unit variance.
# NOTE(review): StandardScaler needs numeric input - confirm x holds no
# un-encoded categorical columns at this point.
scaler = StandardScaler()
scaler.fit(x)
# Rebuild a DataFrame with x's own labels (the original referenced an
# undefined `df`); keeping the index preserves row alignment with y.
x_scaled = pd.DataFrame(scaler.transform(x), columns=x.columns, index=x.index)

## One Hot Encoding for Categorical Variables

In [None]:
# Append dummy columns for the categorical predictors; drop_first=True drops
# one level per variable to avoid perfectly collinear dummies.
# `data_categorical` is the frame built in the categorical/numerical split
# above (the original referenced an undefined `cat_data`).
data = pd.concat([data, pd.get_dummies(data_categorical, drop_first=True)], axis=1)

<a id='EDA'></a>
## EDA - Exploratory Data Analysis

## Correlation Plot For Numerical Data

In [None]:
# Remove columns that are strongly correlated, to avoid multicollinearity issues


## Checking for Normal Distribution

In [None]:
# NOTE(review): `original_data` and its `SalePrice` column are not defined
# anywhere in the visible notebook - confirm the intended frame and column.

# Original price
sale_price = original_data.SalePrice
# With square-root transformation
sqrt_price = np.sqrt(original_data.SalePrice)
# With log transformation
log_price = np.log(original_data.SalePrice)

# Compare all candidate transformations to pick the most normal-looking one.
# Only matplotlib.pyplot is imported (as plt); plt.rcParams is the same
# settings object as matplotlib.rcParams.
plt.rcParams['figure.figsize'] = (12.0, 6.0)
prices = pd.DataFrame({"Sale Price":sale_price,"Log Sale Price ":log_price, "SQRT Price":sqrt_price })
prices.hist()
plt.tight_layout()

<a id='Feature_Engineering'></a>
## Feature Engineering

<a id='Model_Building'></a>
## Model Building Process

In [None]:
# Divide the dataset into train (70%) and test (30%) partitions
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=0,  # fixed seed keeps the split reproducible
)

# Report the shape of every partition. The values are passed positionally so
# print's default single-space separator yields the same layout as before.
print(
    "\nx_train ", x_train.shape,
    "\nx_test ", x_test.shape,
    "\ny_train ", y_train.shape,
    "\ny_test ", y_test.shape,
)

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression

# Build the estimator, fit it on the training split (fit() returns the
# estimator itself), then predict the held-out split.
model = LinearRegression()
model = model.fit(x_train, y_train)
predicted = model.predict(x_test)

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
# confusion_matrix is used below but was never imported in this notebook.
from sklearn.metrics import confusion_matrix

# NOTE(review): x_train_res / y_train_res are not defined in the visible
# notebook - presumably the SMOTE-resampled training split; confirm.
logistic_model = LogisticRegression(solver='lbfgs').fit(x_train_res,y_train_res)
logistic_predict = logistic_model.predict(x_test)
# Rows are the true classes, columns the predicted classes.
logistic_conusion_matrix = confusion_matrix(y_test,logistic_predict)
logistic_conusion_matrix

In [None]:
# Decision Tree

# For Classification
from sklearn.tree import DecisionTreeClassifier
# Shallow entropy-based tree; max_depth=3 limits how far it can grow.
tree = DecisionTreeClassifier(criterion="entropy", max_depth=3, min_samples_split=2, min_samples_leaf=1)
# The training split is named x_train (lower case) in this notebook.
model = tree.fit(x_train,y_train)
predicted = model.predict(x_test)

# For Regression

In [None]:
# Random Forest
# n_estimators is the number of trees in the forest
# (the default is 100 since scikit-learn 0.22; it was 10 before).

# For Classification
from sklearn.ensemble import RandomForestClassifier
# confusion_matrix is used below but was never imported in this notebook.
from sklearn.metrics import confusion_matrix

# NOTE(review): x_train_res / y_train_res are not defined in the visible
# notebook - presumably the SMOTE-resampled training split; confirm.
rf_model = RandomForestClassifier(n_estimators = 100, criterion='gini')
rf_model = rf_model.fit(x_train_res, y_train_res)
rf_model_predict = rf_model.predict(x_test)
# Rows are the true classes, columns the predicted classes.
rf_conusion_matrix = confusion_matrix(y_test,rf_model_predict)
rf_conusion_matrix

# For Regression

In [None]:
# KNN - K Nearest Neighbours
from sklearn.neighbors import KNeighborsClassifier
# confusion_matrix is used below but was never imported in this notebook.
from sklearn.metrics import confusion_matrix

# NOTE(review): x_train_res / y_train_res are not defined in the visible
# notebook - presumably the SMOTE-resampled training split; confirm.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model = knn_model.fit(x_train_res,y_train_res)
knn_predict = knn_model.predict(x_test)
# Rows are the true classes, columns the predicted classes.
knn_conusion_matrix = confusion_matrix(y_test,knn_predict)
knn_conusion_matrix

# Choosing a K value with the elbow method: refit for each candidate K and
# record the share of misclassified test rows.
error_rate = []
k_list = [3,7,9,11,13,15,17,19,21,23,25,27,29]
for i in k_list:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train_res,y_train_res)
    pred_i = knn.predict(x_test)
    error_rate.append(np.mean(pred_i != y_test))

# Plot the test error rate against K; the "elbow" marks a reasonable K
plt.figure(figsize=(10,6))
plt.plot(k_list,error_rate,color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

In [None]:
# SVM - Support Vector Machine
# For Classification
from sklearn import svm
model = svm.SVC(kernel='rbf')
# The training split is named x_train (lower case) in this notebook.
model = model.fit(x_train,y_train)
predicted = model.predict(x_test)

# For Regression
# sklearn has no `svmr` module; the support vector regressor is svm.SVR.
# Note that `model` and `predicted` are overwritten by the regression run.
model = svm.SVR(kernel='rbf')
model = model.fit(x_train,y_train)
predicted = model.predict(x_test)

In [None]:
# ADABOOST - Adaptive Boosting

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# confusion_matrix is used below but was never imported in this notebook.
from sklearn.metrics import confusion_matrix

# Boost 400 depth-3 decision trees with a very small learning rate.
adaboost_classifier = AdaBoostClassifier( DecisionTreeClassifier(max_depth=3), n_estimators=400, learning_rate=.0001)
# NOTE(review): x_train_res / y_train_res are not defined in the visible
# notebook - presumably the SMOTE-resampled training split; confirm.
adaboost_model = adaboost_classifier.fit(x_train_res,y_train_res)
adaboost_predict = adaboost_model.predict(x_test)
# Rows are the true classes, columns the predicted classes.
adaboost__conusion_matrix = confusion_matrix(y_test,adaboost_predict)
adaboost__conusion_matrix

In [None]:
# XGBoost - Extreme Gradient Boosting


<a id='Model_Evaluation'></a>
## Evaluating Model

In [None]:
# Collect the evaluation metrics of every model. Each list receives one entry
# per report_generate() call, so position i across the lists describes model i.
report_comments, report_Accuracy, report_Precision, report_Recall , report_F1_Score= [],[],[],[],[]


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def report_generate(conf_mat,comment):
    """Derive metrics from a 2x2 confusion matrix and append them to the report lists.

    Parameters
    ----------
    conf_mat : 2x2 nested sequence or array
        Binary confusion matrix with rows = true class and columns =
        predicted class (the layout sklearn's confusion_matrix returns), i.e.
        [[TN, FP], [FN, TP]] with class 1 as the positive class.
    comment : str
        Label that identifies the model in the report.

    Returns
    -------
    None
        Results are appended to report_comments, report_Accuracy,
        report_Precision, report_Recall and report_F1_Score. A metric whose
        denominator is zero is reported as 0.0.
    """
    tn, fp = conf_mat[0][0], conf_mat[0][1]
    fn, tp = conf_mat[1][0], conf_mat[1][1]

    # Append comment into list
    report_comments.append(comment)

    # Accuracy: share of all predictions that are correct
    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
    report_Accuracy.append(accuracy)

    # Precision: share of predicted positives that are truly positive
    precision = _safe_ratio(tp, tp + fp)
    report_Precision.append(precision)

    # Recall: share of actual positives that were found
    recall = _safe_ratio(tp, tp + fn)
    report_Recall.append(recall)

    # F1 score: harmonic mean of precision and recall
    f1_score = _safe_ratio(2 * precision * recall, precision + recall)
    report_F1_Score.append(f1_score)

In [None]:
# Assemble all collected results into one DataFrame
def show_report():
    """Return a DataFrame with one row per reported model.

    The columns are built from the module-level report lists, in the order
    Comment, Accuracy, Precision, Recall, F1 Score.
    """
    columns = (
        ('Comment', report_comments),
        ('Accuracy', report_Accuracy),
        ('Precision', report_Precision),
        ('Recall', report_Recall),
        ('F1 Score', report_F1_Score),
    )
    return pd.DataFrame.from_dict(dict(columns))

In [None]:
# Pass every generated confusion matrix to the report builder
report_generate(logistic_conusion_matrix, comment='After SMOTE Logistic Regression')
# knn_conusion_matrix comes from the model fitted with n_neighbors=5, so the
# row is labelled K = 5 (the label previously said K = 3).
report_generate(knn_conusion_matrix, comment='After SMOTE KNN for K = 5')
report_generate(rf_conusion_matrix, comment='After SMOTE Random Forest')

show_report()

<a id='Conclusion'></a>
## Conclusion

In [None]:
Based on the models and evaluation metrics above, we can say that the Random Forest model gives the best results for the given problem.