In [None]:
import zipfile
import os

def extract_zip(zip_file_path, output_folder):
    """Unpack every member of a zip archive into a folder.

    Args:
        zip_file_path: Path of the ``.zip`` archive to read.
        output_folder: Directory that receives the extracted members.

    Returns:
        None.
    """
    archive = zipfile.ZipFile(zip_file_path, mode="r")
    try:
        archive.extractall(path=output_folder)
    finally:
        # Always release the file handle, even if extraction fails.
        archive.close()

# Archive location and extraction target. Both are absolute paths under
# /content (presumably a Colab runtime) -- adjust when running elsewhere.
zip_file_path = '/content/bank-marketing-uci.zip'
output_folder = '/content/Bank Marketing'
extract_zip(zip_file_path, output_folder)

In [None]:
import os
import json, pickle
import numpy as np

# importing libraries for data structuring and analysis (visualization)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# suppress library warnings to keep cell output readable (note: this also hides deprecation warnings)
import warnings
warnings.filterwarnings("ignore")

# importing scaler and encoders
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

# importing randomised-search cross-validation and train/test split (for data separation)
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Importing Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

# importing performance metrics (NOTE: plot_roc_curve was removed in scikit-learn 1.2; the import below fails on newer releases)
from sklearn.metrics import accuracy_score, precision_recall_curve,classification_report,roc_curve
from sklearn.metrics import confusion_matrix, plot_roc_curve, roc_auc_score

# to export models and json file for production
import json,pickle
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import mutual_info_classif, chi2, f_classif, VarianceThreshold
#from skfeature.function.similarity_based import fisher_score

# importing SMOTE to resample data(balancing the data)
from imblearn.over_sampling import SMOTE

# setting to display max columns 
pd.set_option("display.max_columns",None)

In [None]:
# Load the semicolon-separated CSV; `data` keeps the frame as loaded.
# NOTE(review): the archive is extracted into `output_folder`, but this reads
# "bank.csv" from the working directory -- confirm the file is really there.
data= pd.read_csv("bank.csv",sep=";")
print(data.shape)
# Only the last expression of a cell is rendered, so this head(2) shows nothing.
data.head(2)
# Work on a copy so `data` stays untouched.
df= data.copy()
df.head()  # not rendered either (not the last expression)
df.info()  # prints its summary directly
df.describe()

In [None]:
# Quick data-quality checks: duplicate rows, missing values, target balance.
# Only the last expression (the value counts of `y`) is rendered by the cell;
# the first two results are computed and discarded.
df.duplicated().sum()
df.isnull().sum()
df["y"].value_counts()

In [None]:
def get_counts(data):
    """Draw a bar chart of how often each distinct value occurs.

    Args:
        data: pandas Series (anything exposing ``value_counts()``) to count.

    Returns:
        The matplotlib Axes holding the bar chart, so the caller can keep
        decorating the figure (e.g. set an x label) before it is rendered.
    """
    plt.figure(figsize=(15, 5))
    ax = data.value_counts().plot(kind="bar", cmap="summer")
    # Label the Axes we return instead of relying on pyplot's implicit
    # "current axes" state.
    ax.set_ylabel("Counts", fontsize=10)
    # No plt.show() here: the original call sat after `return` and never ran,
    # and showing the figure now would detach the callers' later xlabel calls.
    return ax

In [None]:
# Bucket ages into three groups and plot how many clients fall in each.
# include_lowest=True keeps clients aged exactly 18 in the "young" bucket;
# with the default right-closed bins (18, 35] they would silently become NaN.
# A named variable is used instead of `_`, which IPython reserves for the
# previous cell output.
age_groups = pd.cut(
    df["age"],
    bins=[18, 35, 60, 90],
    labels=["young", "mid-age", "old"],
    include_lowest=True,
)
get_counts(age_groups)
plt.xlabel("Age", fontsize=13)

In [None]:
# Frequency of each job category; the x label is set on the Axes that
# get_counts returns.
get_counts(df["job"]).set_xlabel("Job", fontsize=13)

In [None]:
# Frequency of each marital status (axis label typo "Merital" corrected).
get_counts(df["marital"])
plt.xlabel("Marital Status", fontsize=13)

In [None]:
# Frequency of each education level; the x label is set on the Axes that
# get_counts returns.
get_counts(df["education"]).set_xlabel("Education", fontsize=13)

In [None]:
# Frequency of the "default" flag; the x label is set on the Axes that
# get_counts returns.
get_counts(df["default"]).set_xlabel("Credit in default", fontsize=13)

In [None]:
def get_distribution(data):
    """Show the density curve and the box plot of a numeric Series side by side.

    Args:
        data: numeric pandas Series to visualise.

    Returns:
        Whatever ``plt.show()`` returns (None in practice), as before.
    """
    fig, (ax_density, ax_box) = plt.subplots(1, 2, figsize=(16, 4))
    # kdeplot is the supported replacement for the deprecated
    # distplot(..., hist=False).
    sns.kdeplot(x=data, ax=ax_density, color="r")
    # Pass the Series as x= explicitly: a bare positional argument is read as
    # `data` by newer seaborn releases, which changes the plot orientation.
    sns.boxplot(x=data, ax=ax_box, color="g")
    return plt.show()

In [None]:
# Density and box plot of client age.
get_distribution(df["age"])

In [None]:
# Density and box plot of the `balance` column.
get_distribution(df["balance"])

In [None]:
# Density and box plot of the `previous` column.
get_distribution(df["previous"])

In [None]:
# One count plot per remaining column, split by the target `y`.
# The target itself and the listed numeric columns are excluded first.
excluded_cols = ['y','age','campaign','previous', 'balance', 'duration','pdays']
plotted_cols = df.drop(columns=excluded_cols).columns
for fig_num, predictor in enumerate(plotted_cols):
    # A separate figure per column so the plots do not overwrite each other.
    plt.figure(fig_num)
    sns.countplot(data=df, x=predictor, hue='y')

In [None]:
# Mean client age per outcome class.
# Select "age" BEFORE aggregating: calling .mean() on the whole grouped frame
# tries to average the string columns too, which raises on pandas >= 2.0.
mean_age_by_outcome = df.groupby("y")["age"].mean()
print(mean_age_by_outcome)
mean_age_by_outcome.plot(kind="bar", cmap="summer")

In [None]:
# Mean age for every (job, outcome) combination: jobs as rows, `y` as columns.
df.pivot_table(values = "age", columns = "y",index = "job",aggfunc="mean")

In [None]:
# Age of each client per job, coloured by outcome `y`; a small jitter
# spreads overlapping points horizontally.
sns.catplot(data = df, x = "job", y = "age",
            hue = "y", jitter = 0.1, aspect = 2.3,height = 5)

In [None]:
# Age of each client per marital status, coloured by outcome `y`.
sns.catplot(data = df, x = "marital", y = "age",
            hue = "y", jitter = 0.1, aspect = 1.5,height = 5)

In [None]:
# Mean age for each housing-loan / personal-loan combination
# (as_index=False keeps the group keys as ordinary columns).
df.groupby(["housing","loan"],as_index=False)["age"].mean()

In [None]:
# Pairwise relationships between all columns except the target `y`.
sns.pairplot(data = df.drop("y",axis = 1))

In [None]:
# Number of clients per (marital status, outcome) pair as a bar chart.
df.groupby(["marital","y"])["age"].count().plot(kind = "bar",cmap = "rainbow" )

In [None]:
# Separate working copy for preprocessing so `df` stays as explored above.
df1 = df.copy()

In [None]:
# Remove the contact-date columns from the modelling frame.
df1 = df1.drop(columns=["day", "month"])

In [None]:
# Treat the literal string "unknown" as a missing value in every column.
# A single vectorised replace leaves the integer columns as integers; the
# per-column np.where(..., np.nan, ...) loop silently cast them to float.
df1 = df1.replace("unknown", np.nan)
df1.isna().sum()

In [None]:
# Percentage of rows with a missing `poutcome`, computed from the data rather
# than from a hard-coded count that goes stale when the input changes.
float(df1["poutcome"].isna().mean() * 100)

In [None]:
# Percentage of rows with a missing `contact`, computed from the data rather
# than from a hard-coded count that goes stale when the input changes.
float(df1["contact"].isna().mean() * 100)

In [None]:
# Remove `poutcome` (presumably because of its share of missing values).
df1 = df1.drop(columns="poutcome")

In [None]:
# Remove `contact` (presumably because of its share of missing values).
df1 = df1.drop(columns="contact")

In [None]:
# Forward-fill the gaps in `job` and `education`.
# The result is assigned back: fillna(method=..., inplace=True) on a column
# selection is deprecated and does not update df1 under pandas copy-on-write.
# NOTE(review): a gap in the very first row has no previous value and stays NaN.
df1["job"] = df1["job"].ffill()
df1["education"] = df1["education"].ffill()

In [None]:
# Integer-encode the low-cardinality categorical columns.
# The result is assigned back to df1: replace(inplace=True) on a column
# selection does not update the frame under pandas copy-on-write.
# infer_objects() turns the resulting object columns into numeric dtypes on
# pandas versions where replace() no longer downcasts by itself.
yes_no = {'no':0, 'yes':1}
encodings = {
    "education": {'primary':0, 'secondary':1, 'tertiary':2},
    "default": yes_no,
    "marital": {'single':0, 'married':1, 'divorced':2},
    "housing": yes_no,
    "loan": yes_no,
}
for column, mapping in encodings.items():
    df1[column] = df1[column].replace(mapping).infer_objects()

In [None]:
# One-hot encode `job`.
ohe = OneHotEncoder()
# .toarray() replaces the `.A` shortcut, which recent SciPy releases removed.
job_encoded = ohe.fit_transform(df1[["job"]]).toarray()
# Name the new columns from ohe.categories_[0], which is in the same order as
# the encoded columns. Series.unique() returns order of first appearance, so
# using it attached the wrong job name to most indicator columns.
df1[list(ohe.categories_[0])] = job_encoded
df1 = df1.drop(columns="job")
df1.head(1)

In [None]:
# Encode the target `y` as integers; `label` keeps the fitted encoder so the
# mapping can be inverted later.
label = LabelEncoder().fit(df1["y"])
df1["y"] = label.transform(df1["y"])
df1.head(1)

In [None]:
# Variance inflation factor of each non-job feature, plotted in ascending order.
# `_` is kept because the next cell reads the same feature subset from it.
_ = df1[['age', 'marital', 'education', 'default', 'balance', 'housing', 'loan',
       'duration', 'campaign', 'pdays', 'previous']]
feature_matrix = _.to_numpy()
vif_lst = [
    variance_inflation_factor(feature_matrix, col_idx)
    for col_idx in range(feature_matrix.shape[1])
]
s1 = pd.Series(vif_lst, index=_.columns)
s1.sort_values().plot(kind="barh", cmap="summer")

In [None]:
# Flag features whose variance exceeds the threshold (0.0 removes only
# constant columns). `_` is the feature subset selected in the VIF cell.
var_th = VarianceThreshold(threshold=0.0)
var_th.fit(_)
s = pd.Series(var_th.get_support(), index=_.columns)
s

In [None]:
# Chi-squared score and p-value of each listed feature against the target.
# (chi2 expects non-negative feature values; the columns used here are
# integer codes and counts.)
_temp = df1[['marital', 'education', 'default', 'housing', 'loan',
        'campaign', 'previous']]
chi, p_val = chi2(_temp,df1["y"])
# Round both statistics to two decimals for display.
s = pd.DataFrame({"Chi2":np.around(chi,2), "P_val": np.around(p_val,2)}, index = _temp.columns)
s

In [None]:
# ANOVA F-statistic and p-value of each listed numeric feature against the target.
_temp = df1[['age', 'balance', 'duration', 'pdays']]
f_val,p_val = f_classif(_temp,df1["y"])
# Round both statistics to two decimals for display.
pd.DataFrame({"F_Val": np.around(f_val,2), "P_val": np.around(p_val,2)},index = _temp.columns)

In [None]:
# Remove `marital` from the modelling frame (presumably based on the feature
# scores computed above -- confirm).
df1 = df1.drop(columns="marital")

In [None]:
# Separate features and target.
x = df1.drop("y", axis=1)
y = df1["y"]

# Oversample the minority class until it reaches 75% of the majority class.
# random_state makes the synthetic samples reproducible across runs.
# NOTE(review): if sm_x / sm_y are split into train and test afterwards,
# synthetic points derived from training rows end up in the test fold.
sm = SMOTE(sampling_strategy=0.75, random_state=42)
sm_x, sm_y = sm.fit_resample(x, y)
print(f"First we have the value counts:\n{y.value_counts()}\n\nAfter OverSampling now we have value counts:\n{sm_y.value_counts()}")

In [None]:
# Split the ORIGINAL data first so the test fold contains only real,
# untouched observations with the natural class balance.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)
# Oversample the training fold only. Resampling before the split lets
# synthetic points interpolated from training rows leak into the test fold
# and inflates every test metric.
x_train, y_train = SMOTE(sampling_strategy=0.75, random_state=42).fit_resample(x_train, y_train)

In [None]:
# Fit the scaler on the training features only, then scale them.
std = StandardScaler().fit(x_train)
arr = std.transform(x_train)
# Wrap the scaled array back into a frame with the original column names.
std_x = pd.DataFrame(arr, columns=x_train.columns)
std_x.head()

In [None]:
# Scale the test features with the statistics learned from the training set
# (transform only, no refit).
arr1 = std.transform(x_test)
std_x_te = pd.DataFrame(arr1, columns = x_test.columns)
std_x_te.head()

In [None]:
# Baseline comparison: fit each classifier with default hyper-parameters and
# report accuracy on the scaled training and test sets.
# The stochastic estimators are seeded so the comparison is reproducible.
lst = [("LogisticRegression", LogisticRegression()),
       ("KNN Classifier", KNeighborsClassifier()),
       ("RandomForest", RandomForestClassifier(random_state=42)),
       ("AdaBoost", AdaBoostClassifier(random_state=42)),
       ("XGBoost", XGBClassifier(random_state=42))]
for name, model in lst:
    model.fit(std_x, y_train)
    # Training accuracy (a large gap to the test accuracy indicates overfitting).
    y1 = model.predict(std_x)
    accuracy = accuracy_score(y_train, y1)
    # Test accuracy.
    y2 = model.predict(std_x_te)
    acc_te = accuracy_score(y_test, y2)
    print(f"For {name}::\nThe Training Accuracy is: {accuracy}\nThe Testing Accuracy is: {acc_te}")
    print("--"*40)

In [None]:
# Randomised hyper-parameter search for XGBoost with 5-fold cross-validation.
estimator = XGBClassifier()
# Candidate values per hyper-parameter; the np.arange ranges exclude their
# upper bound.
parameters = {"n_estimators": [50,100,150,200,250,300,350,400],
             "max_depth": np.arange(2,10),
             "learning_rate": np.arange(0.01,0.1,0.02),
             'subsample': np.arange(0.5, 1.0, 0.1),
             'colsample_bytree': np.arange(0.4, 1.0, 0.1),
             'colsample_bylevel': np.arange(0.4, 1.0, 0.1)}
# n_iter is not set, so the library default number of sampled candidates is
# used; random_state fixes which candidates are drawn.
rscv = RandomizedSearchCV(estimator, parameters, cv = 5, random_state= 42)
rscv.fit(std_x,y_train)
rscv.best_params_

In [None]:
# Final XGBoost model, fitted on the scaled training data.
# NOTE(review): the hyper-parameters are hard-coded rather than read from
# rscv.best_params_ -- presumably copied from an earlier search run; confirm
# they still match the current search result.
xgb = XGBClassifier(subsample = 0.7, n_estimators = 150,
                    max_depth = 8, learning_rate = 0.09,
                    colsample_bytree = 0.7, colsample_bylevel = 0.8)
xgb.fit(std_x,y_train)

In [None]:
# Evaluate the final model on the training set; each metric is computed
# right before it is printed.
y_pr_train = xgb.predict(std_x)

con_mat = confusion_matrix(y_train, y_pr_train)
print("Confusion Matrix:\n", con_mat)
print("\n")

acc_train = accuracy_score(y_train, y_pr_train)
print("The accuracy of the model:", (acc_train)*100)
print("\n")

class_re = classification_report(y_train, y_pr_train)
print("The classification report:\n", class_re)

In [None]:
# Evaluate the final model on the held-out test set; each metric is computed
# right before it is printed.
y_pr_test = xgb.predict(std_x_te)

con_mat1 = confusion_matrix(y_test, y_pr_test)
print("Confusion Matrix:\n", con_mat1)
print("\n")

acc_test = accuracy_score(y_test, y_pr_test)
print("The accuracy of the model:", (acc_test)*100)
print("\n")

class_re1 = classification_report(y_test, y_pr_test)
print("The classification report:\n", class_re1)

In [None]:
# Feature importances of the fitted model, labelled with the feature column
# names and plotted in ascending order.
s = pd.Series(xgb.feature_importances_,sm_x.columns)
s.sort_values().plot(kind = "barh",cmap = "summer")

In [None]:
# ROC curve and AUC of the final model on the test set.
y_prob = xgb.predict_proba(std_x_te)
# Keep the second probability column (the positive class) as the score.
y_pr   = y_prob[:,1]
auc_score = roc_auc_score(y_test, y_pr)
fpr,tpr,threshold = roc_curve(y_test,y_pr)
plt.plot(fpr,tpr, "r")
# Print the AUC in the lower-right corner and shade the area under the curve.
plt.text(1, 0.02, 'AUC = %0.3f' % auc_score, ha='right', fontsize=12,weight='bold', color='green')
plt.fill_between(fpr, tpr, facecolor='orange', alpha=0.3)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC - AUC CURVE")
plt.show()

In [None]:
# Persist the fitted scaler and the final model to the working directory.
# The file names are part of the contract with whatever loads them, so the
# spelling is left as is. Pickle files must only be loaded from trusted sources.
with open("Standerd_scaling.pkl", "wb") as f:
    pickle.dump(std, f)
with open("xgb_model.pkl", "wb") as f:
    pickle.dump(xgb, f)

In [None]:
def plot_history(history):
    """Plot training/validation loss and accuracy curves of a Keras run.

    Args:
        history: object whose ``history`` attribute is a dict with the keys
            'loss', 'val_loss', 'accuracy' and 'val_accuracy', each holding
            one value per epoch.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

    ax1.plot(history.history['loss'], label='loss')
    ax1.plot(history.history['val_loss'], label='val_loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Binary crossentropy')
    ax1.grid(True)
    # Without legend() the label= arguments above are never displayed and the
    # two curves cannot be told apart.
    ax1.legend()

    ax2.plot(history.history['accuracy'], label='accuracy')
    ax2.plot(history.history['val_accuracy'], label='val_accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')
    ax2.grid(True)
    ax2.legend()

    plt.show()

In [None]:
# Feed-forward binary classifier: two ReLU hidden layers (30 and 50 units),
# each followed by 20% dropout, and a sigmoid output unit.
# NOTE(review): `tf` is not imported in any visible cell -- confirm TensorFlow
# is imported before this cell runs.
# NOTE(review): input_shape=(14,) is hard-coded; verify it matches the number
# of feature columns actually passed to fit().
nn_model = tf.keras.Sequential([
      tf.keras.layers.Dense(30, activation='relu', input_shape=(14,)),
      tf.keras.layers.Dropout(0.2),
      tf.keras.layers.Dense(50, activation='relu'),
      tf.keras.layers.Dropout(0.2),
      tf.keras.layers.Dense(1, activation='sigmoid')
  ])

# Adam with learning rate 0.01, binary cross-entropy loss, accuracy metric.
nn_model.compile(optimizer=tf.keras.optimizers.Adam(0.01), loss='binary_crossentropy',
                  metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the network defined above.
nn_model.summary()

In [None]:
# Train for 100 epochs, holding out 20% of the training data for validation.
# NOTE(review): X_train, X_valid and y_valid are not defined in any visible
# cell (the earlier cells create x_train / std_x / std_x_te / y_test) --
# confirm where they come from; on a fresh kernel this cell raises NameError.
history=nn_model.fit(X_train,y_train,epochs=100,validation_split=0.2,verbose=0)
y_pred=nn_model.predict(X_valid)
# Threshold the sigmoid outputs at 0.5 and flatten to a 1-D label vector.
y_pred = (y_pred > 0.5).astype(int).reshape(-1,)
print(classification_report(y_valid, y_pred))

In [None]:
def train_model(X_train, y_train, num_nodes, dropout_prob, lr, epochs):
    """Build, compile and fit a two-hidden-layer binary classifier.

    Args:
        X_train: 2-D feature matrix (samples x features).
        y_train: binary targets aligned with ``X_train``.
        num_nodes: number of units in each of the two hidden Dense layers.
        dropout_prob: dropout rate applied after each hidden layer.
        lr: learning rate passed to the Adam optimizer.
        epochs: number of training epochs.

    Returns:
        Tuple ``(nn_model, history)``: the fitted Keras model and the object
        returned by ``fit``.

    NOTE(review): relies on a module-level ``tf`` (TensorFlow) that no visible
    cell imports -- confirm it is imported before this function is called.
    """
    # Take the input width from the data instead of hard-coding 14, so the
    # network always matches the feature matrix it is trained on.
    n_features = np.shape(X_train)[1]

    nn_model = tf.keras.Sequential([
        tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(n_features,)),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes, activation='relu'),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss='binary_crossentropy',
                     metrics=['accuracy'])
    # 20% of the training data is held out by Keras for validation curves.
    history = nn_model.fit(
        X_train, y_train, epochs=epochs, validation_split=0.2, verbose=0
    )

    return nn_model, history

In [None]:
# Exhaustive search over hidden-layer width, dropout rate and learning rate;
# the model with the lowest loss on (X_valid, y_valid) is kept.
# NOTE(review): X_train, X_valid and y_valid are not defined in any visible
# cell -- confirm where they come from.
least_val_loss = float('inf')
least_loss_model = None
epochs=100
for num_nodes in [16, 32, 64]:
  for dropout_prob in[0, 0.2]:
    for lr in [0.01, 0.005, 0.001]:
        print(f"{num_nodes} nodes, dropout {dropout_prob}, lr {lr}")
        model, history = train_model(X_train, y_train, num_nodes, dropout_prob, lr, epochs)
        plot_history(history)
        # evaluate() returns [loss, accuracy] for this compile setup; index 0 is the loss.
        val_loss = model.evaluate(X_valid, y_valid)[0]
        if val_loss < least_val_loss:
          least_val_loss = val_loss
          least_loss_model = model

In [None]:
# Classification report of the selected network.
# NOTE(review): this scores the model on the same (X_valid, y_valid) data that
# was used to select it, so the figures are optimistically biased.
y_pred = least_loss_model.predict(X_valid)
# Threshold the sigmoid outputs at 0.5 and flatten to a 1-D label vector.
y_pred = (y_pred > 0.5).astype(int).reshape(-1,)
print(classification_report(y_valid, y_pred))
least_loss_model.summary()
least_loss_model.get_config()