### Import libraries

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_palette(sns.color_palette("viridis"))

### Loading data

#### We are not going to use test.csv, because its rows are taken from train.csv, so evaluating on it would cause data leakage

In [59]:
# Read the semicolon-delimited training file from the Kaggle input directory.
train_csv_path = '/kaggle/input/banking-dataset-marketing-targets/train.csv'
train_df = pd.read_csv(train_csv_path, sep=';')
print(f"Dataset shape - {train_df.shape}")
train_df.head()

In [60]:
# Give the label column a descriptive name; later cells refer to it as 'target'.
column_renames = {'y': 'target'}
train_df = train_df.rename(columns=column_renames)
train_df

## Prepare data for visualization

Copy training dataframe for manipulations during EDA

In [61]:
# Independent (deep) copy, so the EDA transformations below never touch train_df.
df = train_df.copy(deep=True)
df

#### Transform binary category features to int

In [62]:
# Collect the yes/no columns so that they can be mapped to 1/0 afterwards.
# The values must be exactly {'yes', 'no'}: a column that merely has two
# distinct values (e.g. two other labels) would otherwise be picked up too and
# turned entirely into NaN by a {'yes': 1, 'no': 0} mapping.
binary_columns = []
for column in df.select_dtypes('object').columns:
    values = df[column].unique()
    if set(values) == {'yes', 'no'}:
        binary_columns.append(column)
    # Print the distinct values of every categorical column for a quick overview.
    print(f"Column - {column} ", values, end='\n\n')

In [63]:
# Encode every collected yes/no column as 1/0, then show the result.
yes_no_codes = {'yes': 1, 'no': 0}
for name in binary_columns:
    df[name] = df[name].map(yes_no_codes)

df.loc[:, binary_columns]

##  EDA

### Target feature

The target variable is highly imbalanced, which will hurt model performance; this can be mitigated with the SMOTE oversampling technique

In [64]:
# Class balance of the target (0 = no, 1 = yes after the binary encoding).
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df, x="target", hue="target", multiple="dodge", binwidth=1.3, ax=ax)
plt.show()

### Education

Most clients have secondary education

Let's see how education impacts our target variable

In [65]:
# Number of clients per education level, most frequent level first.
education_order = df['education'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='education', order=education_order, ax=ax)
ax.set_title('Education')
plt.show()

Although most clients have secondary education, clients with tertiary education have the highest average target value.

So, in theory this can impact output of the model

In [66]:
# Per-education mean (share of positive targets) and sum (count of positive
# targets), each sorted from highest to lowest.
education_target = df.groupby('education', as_index=False)['target']
education_target_mean = education_target.mean().sort_values('target', ascending=False)
education_target_sum = education_target.sum().sort_values('target', ascending=False)

In [67]:
# Three side-by-side views of the target by education:
# raw distribution, mean target and summed target.
fig, (ax_dist, ax_mean, ax_sum) = plt.subplots(1, 3, figsize=(16, 10))

sns.histplot(
    data=df, x="education", hue="target",
    multiple="dodge", binwidth=1.1, shrink=.8, ax=ax_dist,
)
ax_dist.set_title('Distribution of target value by education')

sns.barplot(data=education_target_mean, x='education', y='target', ax=ax_mean)
ax_mean.set_xlabel("education")
ax_mean.set_ylabel("Mean target")
ax_mean.set_title("Mean target value by education")

sns.barplot(data=education_target_sum, x='education', y='target', ax=ax_sum)
ax_sum.set_xlabel("Education")
ax_sum.set_ylabel("Sum target")
ax_sum.set_title("Sum target value by education")

plt.show()

Also, clients who have secondary education and a loan subscribe to a term deposit more often on average

This can also impact the results of model

In [68]:
# Per-education mean and sum of the (1/0 encoded) loan and default flags,
# each sorted from highest to lowest.
education_group = df.groupby('education', as_index=False)
loan_by_education = education_group['loan']
default_by_education = education_group['default']

education_loan_mean = loan_by_education.mean().sort_values(by='loan', ascending=False)
education_loan_sum = loan_by_education.sum().sort_values(by='loan', ascending=False)

education_default_mean = default_by_education.mean().sort_values(by='default', ascending=False)
education_default_sum = default_by_education.sum().sort_values(by='default', ascending=False)

education_loan_mean

In [69]:
# 3x2 grid: the top row shows the raw loan/default distributions per education
# level, the two rows below show the mean and sum aggregates.
fig, axes = plt.subplots(3, 2, figsize=(16, 12))

count_panels = [
    ("loan", 'Distribution of loan by education'),
    ("default", 'Distribution of default by education'),
]
for ax, (hue_column, panel_title) in zip(axes[0], count_panels):
    sns.histplot(
        data=df, x="education", hue=hue_column,
        multiple="dodge", binwidth=1.1, shrink=.8, ax=ax,
    )
    ax.set_title(panel_title)

# (aggregated frame, value column, y label, title), in row-major panel order.
bar_panels = [
    (education_loan_mean, 'loan', "Mean loan", "Mean loan by education"),
    (education_loan_sum, 'loan', "Sum loan", "Sum loan by education"),
    (education_default_mean, 'default', "Mean default", "Mean default by education"),
    (education_default_sum, 'default', "Sum default", "Sum default by education"),
]
for ax, (frame, value_column, y_label, panel_title) in zip(axes[1:].ravel(), bar_panels):
    sns.barplot(data=frame, x='education', y=value_column, ax=ax)
    ax.set_xlabel("education")
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

### Job

Most of the clients have a blue-collar, management or technician job

In [70]:
# Number of clients per job, most frequent job first.
job_order = df['job'].value_counts().index
fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(
    data=df,
    x='job',
    order=job_order,
    palette=sns.color_palette('viridis'),
    ax=ax,
)
ax.set_title('job')
plt.show()

But, as we can see, students and retired clients subscribed to the deposit more often on average

In [71]:
# NOTE: sns.set(...) changes seaborn/matplotlib settings globally, so it also
# affects the figures drawn by the cells that follow.
sns.set(rc={'figure.figsize':(16,18)})
x, y = 'job', 'target'

# Share of each target value within every job, expressed as a percentage.
df1 = (
    df.groupby(x)[y]
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

g = sns.catplot(data=df1, x=x, y='percent', hue=y, kind='bar')
g.figure.set_figheight(8)
g.figure.set_figwidth(16)
g.ax.set_ylim(0, 100)
g.ax.set_xlabel('Job')
g.ax.set_ylabel('% of target variable')
g.ax.set_title("% of target variable by job")

# Write the percentage at the top-left corner of every bar.
for bar in g.ax.patches:
    bar_height = bar.get_height()
    label = str(bar_height.round(2)) + '%'
    g.ax.text(bar.get_x(), bar_height, label)




We have no high correlation between features

In [72]:
# Correlate the numeric columns only. At this point df still holds string
# columns (job, marital, education, ...) and DataFrame.corr() raises on them
# in pandas >= 2.0; select_dtypes works the same way on older versions too.
plt.figure(figsize=(12,10))
correlation_matrix = df.select_dtypes('number').corr()
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation between numeric features')
plt.show()

### Visualise variables with PCA

In [73]:
# Column dtypes and non-null counts of the training frame (not the EDA copy).
train_df.info()

In [74]:
# Separate copy for the PCA visualisation: numeric columns listed below get
# standardised, the remaining string columns get ordinal-encoded.
pca_df = df.copy(deep=True)
columns_to_transform = ['age', 'balance', 'day', 'duration', 'pdays']
object_col = pca_df.select_dtypes(include='object').columns
object_col

In [75]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Replace whole columns instead of writing through `.loc[:, cols] = ...`:
# the .loc form tries to write the float results into the existing int64 /
# object columns in place, which emits an incompatible-dtype warning on recent
# pandas and leaves the encoded categorical columns with dtype object.
pca_df[columns_to_transform] = StandardScaler().fit_transform(pca_df[columns_to_transform])
pca_df[object_col] = OrdinalEncoder().fit_transform(pca_df[object_col])
pca_df

In [76]:
from sklearn.decomposition import PCA

# Every column except the last is a feature; the last column is used as y.
feature_count = pca_df.shape[1] - 1
x = pca_df.iloc[:, :feature_count]
y = pca_df.iloc[:, feature_count]

# Project the features onto their first two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)

In [77]:
# Wrap the two projected components in a labelled frame.
component_names = ['principal_component_1', 'principal_component_2']
principalDf = pd.DataFrame(data=principalComponents, columns=component_names)

In [78]:
# Display the two principal components computed for every row.
principalDf

In [79]:
# Put the target next to the two components (column-wise concat, aligned on the index).
target_column = pca_df[['target']]
finalDf = pd.concat([principalDf, target_column], axis='columns')

Most of the subscribed clients fall inside the visible cloud of points

But it could be because we have imbalanced data

In [80]:
# Scatter of the two principal components, coloured by target.
ax = sns.scatterplot(
    data=finalDf,
    x='principal_component_1',
    y='principal_component_2',
    hue='target',
)
ax.set_title('2 component PCA')
plt.show()

## Encoding and Scaling Features

In [81]:
# Start again from the raw training frame; the EDA encodings above are dropped.
df = train_df.copy(deep=True)
df.shape

In [82]:
# Summary statistics, transposed so that each feature is one row.
df.describe().transpose()

In [83]:
# One-hot encode the categorical columns, dropping the first level of each to
# avoid perfectly collinear dummies. The dummy columns are appended after the
# columns that are not encoded.
categorical_columns = [
    'job', 'marital', 'education', 'default', 'housing',
    'month', 'loan', 'contact', 'poutcome',
]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df.head()

In [84]:
# Encode the label as 0/1.
target_codes = {'no': 0, 'yes': 1}
df['target'] = df['target'].map(target_codes)
df['target']

In [85]:
# Class counts after encoding the label.
df['target'].value_counts()

In [86]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Separate the label from the features BEFORE reading the feature names.
# get_dummies appends the dummy columns after the untouched ones, so 'target'
# is not the last column: taking `df.columns[:-1]` kept 'target' as a feature
# name, dropped the last dummy name, and shifted every dummy label by one.
target = df['target']
features = df.drop('target', axis=1)
columns = features.columns

# NOTE(review): the scaler is fitted on all rows, before the train/test split,
# so test-set statistics leak into the scaling; consider fitting on the
# training split only.
scaled_values = scaler.fit_transform(features)

# Pass the names as a flat Index (not wrapped in a list, which would create a
# one-level MultiIndex) and keep the original row index.
df = pd.DataFrame(scaled_values, columns=columns, index=features.index)
df.head()

In [87]:
# Class counts of the separated label series.
target.value_counts()

### Splitting data into train and test datasets

In [88]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for the final evaluation. The target is heavily
# imbalanced, so stratify on it to keep the same class ratio in both parts;
# a purely random split can leave the test set with a skewed share of
# positive rows.
X_train, X_test, y_train, y_test = train_test_split(
    df, target, test_size=0.25, random_state=20, stratify=target
)

In [89]:
# Keep only the first level of the column index on both splits, so that the
# feature names are plain labels.
for split_frame in (X_train, X_test):
    split_frame.columns = split_frame.columns.get_level_values(0)

### Testing models

In [90]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline

# Shuffled K-fold splitter with a fixed seed, reused by the learning-curve cells.
kfold = KFold(shuffle=True , random_state=12)

We need to balance classes before testing and learning.

I will use SMOTE to oversample the minority class

In [91]:
# Oversample the minority class of the training split only; X_test / y_test
# are left untouched. The seed is fixed because SMOTE generates synthetic rows
# at random, and without it every later score changes from run to run.
oversample = SMOTE(random_state=20)

X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train)

#### Function for testing models

In [96]:
def plot_learning_curve(
    estimator,
    title,
    X,
    y,
    axes=None,
    ylim=None,
    cv=None,
    n_jobs=None,
    train_sizes=np.linspace(0.1, 1.0, 5),
):
    """
    Generate 3 plots: the test and training learning curve, the training
    samples vs fit times curve, the fit times vs score curve.

    Parameters
    ----------
    estimator : estimator instance
        An estimator instance implementing `fit` and `predict` methods which
        will be cloned for each validation.

    title : str
        Title for the chart.

    X : array-like of shape (n_samples, n_features)
        Training vector, where ``n_samples`` is the number of samples and
        ``n_features`` is the number of features.

    y : array-like of shape (n_samples) or (n_samples, n_features)
        Target relative to ``X`` for classification or regression;
        None for unsupervised learning.

    axes : array-like of shape (3,), default=None
        Axes to use for plotting the curves. When None, a new 1x3 figure
        of size (20, 5) is created.

    ylim : tuple of shape (2,), default=None
        Defines minimum and maximum y-values plotted, e.g. (ymin, ymax).

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

          - None, to use the default 5-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, default=None
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like of shape (n_ticks,)
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the ``dtype`` is float, it is regarded
        as a fraction of the maximum size of the training set (that is
        determined by the selected validation method), i.e. it has to be within
        (0, 1]. Otherwise it is interpreted as absolute sizes of the training
        sets. Note that for classification the number of samples usually have
        to be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so the caller can chain e.g. ``.show()``.
    """
    # Imported here so the function does not depend on an import made in a
    # later notebook cell having been executed first.
    from sklearn.model_selection import learning_curve

    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    # return_times=True additionally yields fit and score times; the score
    # times (last element) are not plotted and therefore discarded.
    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        return_times=True,
    )
    # Aggregate over the cross-validation folds (axis 1) for each training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve (mean score with a +/- one standard deviation band)
    axes[0].grid()
    axes[0].fill_between(
        train_sizes,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.1,
        color="r",
    )
    axes[0].fill_between(
        train_sizes,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.1,
        color="g",
    )
    axes[0].plot(
        train_sizes, train_scores_mean, "o-", color="r", label="Training score"
    )
    axes[0].plot(
        train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score"
    )
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, "o-")
    axes[1].fill_between(
        train_sizes,
        fit_times_mean - fit_times_std,
        fit_times_mean + fit_times_std,
        alpha=0.1,
    )
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    axes[2].grid()
    axes[2].plot(fit_times_mean, test_scores_mean, "o-")
    axes[2].fill_between(
        fit_times_mean,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.1,
    )
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt

In [92]:
from sklearn.model_selection import  learning_curve
from sklearn.metrics import precision_recall_curve, roc_auc_score, f1_score

In [94]:
# Baseline: random forest trained on the oversampled training split and
# evaluated on the untouched test split. The seed is fixed so that the scores
# (and the comparison with LightGBM further down) are reproducible.
classifier = RandomForestClassifier(random_state=20)
classifier.fit(X_train_smote, y_train_smote)

# ROC AUC is computed from the predicted probability of the positive class.
predicted_proba = classifier.predict_proba(X_test)
print(f"ROC AUC score: {roc_auc_score(y_test, predicted_proba[:, 1])}")

pred = classifier.predict(X_test)
print(f"F1 score : {f1_score(y_test, pred)}")
print(classification_report(y_test, pred))

#### Check RandomForest performance with learning curves

In [97]:
# Learning curves of an untrained random forest on the oversampled training data.
estimator = RandomForestClassifier()
plot_learning_curve(
    estimator,
    "Test",
    X_train_smote,
    y_train_smote,
    cv=kfold,
    n_jobs=-1,
)

#### LGBM testing

In [98]:
from lightgbm import LGBMClassifier

# Learning curves first (the helper works on clones of the estimator), then
# fit on the oversampled training split and report on the untouched test split.
estimator = LGBMClassifier()
plot_learning_curve(
    estimator,
    "Test",
    X_train_smote,
    y_train_smote,
    cv=kfold,
    n_jobs=-1,
)
estimator.fit(X_train_smote, y_train_smote)
y_pred = estimator.predict(X_test)
lgbm_report = classification_report(y_test, y_pred)
print(lgbm_report)

#### LGBM has better results, so I will use it

## Tuning LGBMClassifier

In [99]:
from sklearn.model_selection import GridSearchCV

# Step 1 of the tuning: boosting type and tree depth, scored by F1.
kfold = KFold(5, shuffle=True, random_state=47)

params = {
    'boosting_type': ['gbdt', 'dart'],
    'max_depth': range(2,15),
}

estimator = LGBMClassifier()
gr_lgbm = GridSearchCV(
    estimator,
    param_grid=params,
    cv=kfold,
    n_jobs=-1,
    verbose=1,
    scoring='f1',
)
gr_lgbm.fit(X_train_smote, y_train_smote)

lgbm_depth = gr_lgbm.best_params_
print(f"Best depth: {lgbm_depth['max_depth']}")
print(f"Best boosting_type: {lgbm_depth['boosting_type']}")
print(f"Best score: {gr_lgbm.best_score_}")

In [100]:
# Step 2 of the tuning: number of leaves (powers of two from 2 to 2**14).
# NOTE(review): max_depth is hard-coded to 10 here rather than read from
# lgbm_depth; confirm it matches the result of the previous search.
kfold = KFold(5, shuffle=True, random_state=47)

leaf_candidates = [2**x for x in range(1, 15)]
params = {
    "num_leaves": leaf_candidates
}

estimator = LGBMClassifier(max_depth=10)
gr_lgbm = GridSearchCV(
    estimator,
    param_grid=params,
    cv=kfold,
    n_jobs=-1,
    verbose=1,
    scoring='f1',
)
gr_lgbm.fit(X_train_smote, y_train_smote)

lgbm_leaves = gr_lgbm.best_params_
print(f"Best n_leaves: {lgbm_leaves['num_leaves']}")
print(f"Best score: {gr_lgbm.best_score_}")


In [101]:
# Step 3 of the tuning: learning rate (np.linspace yields 50 evenly spaced
# values between 0.009 and 0.1 by default).
# NOTE(review): max_depth and num_leaves are hard-coded rather than read from
# the earlier searches; confirm they match those results.
rate_candidates = np.linspace(0.009, 0.1)
params = {
    "learning_rate": rate_candidates
}

estimator = LGBMClassifier(max_depth=10, num_leaves=256)
gr_lgbm = GridSearchCV(
    estimator,
    param_grid=params,
    cv=kfold,
    n_jobs=-1,
    verbose=1,
    scoring='f1',
)
gr_lgbm.fit(X_train_smote, y_train_smote)

lgbm_rate = gr_lgbm.best_params_
print(f"Best learning_rate: {lgbm_rate['learning_rate']}")
print(f"Best score: {gr_lgbm.best_score_}")

In [102]:
# Learning curves for the tuned configuration.
tuned_settings = dict(max_depth=10, num_leaves=256, learning_rate=0.1, n_estimators=1000)
estimator = LGBMClassifier(**tuned_settings)
plot_learning_curve(
    estimator,
    "Test",
    X_train_smote,
    y_train_smote,
    cv=kfold,
    n_jobs=-1,
)

## Training the model

In [103]:
from lightgbm import early_stopping, log_evaluation

params = {'max_depth':10,'num_leaves':256, 'learning_rate':0.1, 'n_estimators':1000}


def f1_eval(y_true, y_pred):
    """Custom LightGBM eval metric: F1 of the positive class at a 0.5 cut-off.

    'f1' is not one of LightGBM's built-in metric names, so it has to be
    supplied as a callable returning (name, value, is_higher_better).
    Assumes y_pred holds positive-class probabilities, as it does for the
    built-in binary objective.
    """
    return 'f1', f1_score(y_true, (y_pred > 0.5).astype(int)), True


# Early stopping must not look at the test set: choosing the number of
# boosting rounds on X_test and then reporting the score on X_test gives an
# optimistic estimate. Carve a validation part out of the (un-resampled)
# training split instead, and oversample only the part that is fitted on, so
# that the validation rows are all real and none of them is trained on.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=20
)
X_fit_smote, y_fit_smote = SMOTE(random_state=20).fit_resample(X_fit, y_fit)

# metric='None' switches off the default log-loss metric, so that early
# stopping is driven by the F1 metric alone.
estimator = LGBMClassifier(**params, metric='None')

# The `early_stopping_rounds=` and `verbose=` fit arguments were removed in
# LightGBM 4.0; the callbacks below are the supported replacement.
estimator.fit(
    X_fit_smote,
    y_fit_smote,
    eval_set=[(X_val, y_val)],
    eval_metric=f1_eval,
    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=1)],
)

In [104]:
# Evaluate the trained model on the held-out test split.
pred = estimator.predict(X_test)
final_report = classification_report(y_test, pred)
print(final_report)

In [105]:
# F1 of the positive class on the held-out test split.
print(f"Final F1 score: {f1_score(y_test, pred)}")