# Term Deposit Subscription

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

#import machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import xgboost

from sklearn.model_selection import train_test_split #split
from sklearn.metrics import accuracy_score #metrics

#tools for hyperparameters search
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

## Attributes

### Clients Personal Information
1 - age: age of the individual (numeric)  
2 - job: types of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')  
3 - marital: marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)  
4 - education: (categorical: primary, secondary, tertiary and unknown)  
5 - default: has credit in default? (categorical: 'no','yes','unknown')  
6 - housing: has housing loan? (categorical: 'no','yes','unknown')  
7 - loan: has personal loan? (categorical: 'no','yes','unknown')  
8 - balance: Balance of the individual.

### Last Contact Information
9 - contact: contact communication type (categorical: 'cellular','telephone')  
10 - month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')  
11 - day: last contact day (numeric in this file; it is treated as a numeric column in the analysis below, so it is presumably the day of the month rather than a weekday name)  
12 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.  

### Other Attributes:  
13 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)  
14 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; the original dataset description uses 999 for clients who were not previously contacted, but in this file the placeholder value appears to be -1)  
15 - previous: number of contacts performed before this campaign and for this client (numeric)  
16 - poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')  

### Output variable (desired target):  
y - has the client subscribed to a term deposit? (binary: 'yes','no'; this column is named `deposit` in the file used here)  

In [None]:
# Show up to 500 columns when a DataFrame is displayed.
pd.options.display.max_columns = 500

In [None]:
# Load the raw CSV into `df` and work on an independent copy, `bank`,
# so the frame as read from disk remains available.
df = pd.read_csv('../input/bank.csv')
bank = df.copy()
# Preview the first rows.
bank.head()

### Exploratory



In [None]:
# Summary statistics of the columns that `describe()` covers by default.
bank.describe()

In [None]:
# Number of rows in the dataset.
len(bank)

In [None]:
# Column names, dtypes and non-null counts.
bank.info()

In [None]:
# Count of missing values per column.
bank.isnull().sum()

Since there are no missing values in the dataset and all the columns have the same length, let's go ahead and plot some graphs to see the relationships between our attributes. 

In [None]:
# Numerical columns: columns holding numeric values.
num_columns = ['balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

sns.set(style='white', palette='muted', color_codes=True)
colors = ['c', 'yellow', 'r', 'b', 'g', 'c']

fig, axes = plt.subplots(2, 3, sharex=False, sharey=False, figsize=(15, 10))

# One histogram per numeric column, filling the 2x3 grid row by row.
# `sns.histplot` is the replacement for the deprecated
# `sns.distplot(..., kde=False)`.
for ax, num, color in zip(axes.flat, num_columns, colors):
    sns.histplot(bank[num], kde=False, color=color, ax=ax)

The histograms of campaign, pdays, and previous have long tails, which suggests there are outliers in these columns. To determine whether these outliers are noise, we use box plots to get a clearer view of the numerical attributes. 

In [None]:
num_columns = ['balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

colors = ['c', 'yellow', 'r', 'b', 'g', 'c']
fig, axes = plt.subplots(2, 3, sharex=False, sharey=False, figsize=(20, 15))

# One box plot per numeric column. No `hue` is passed: grouping a column by
# its own values would split it into one degenerate box per distinct value
# instead of summarising the whole distribution.
for ax, num, color in zip(axes.flat, num_columns, colors):
    sns.boxplot(data=bank, y=num, color=color, ax=ax)

Since the three columns campaign, pdays, and previous seem to have significant outliers, it is important that we take a better look at these columns. First, let's review the meaning of these columns.  
- campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)  
- pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; the original dataset description uses 999 for clients who were not previously contacted, but in this file the placeholder value appears to be -1)  
- previous: number of contacts performed before this campaign and for this client (numeric)

In [None]:
# Summary statistics for the three columns that showed long tails.
bank[['campaign', 'pdays', 'previous']].describe()

<span style="color:blue"> Most of the values of the campaign attribute fall in the interval [1, 3], which means most customers received 1-3 calls. </span>
   
   Take a look at the percentage of clients whose campaign value is greater than 4. 

In [None]:
# Percentage of clients contacted more than 4 times during this campaign.
len(bank.query('campaign > 4')) / len(bank) * 100

In [None]:
# Percentage of rows whose pdays value is -1.
len(bank.query('pdays == -1')) / len(bank) * 100

In [None]:
# Percentage of clients contacted more than 20 times before this campaign.
# Uses the working copy `bank`, like the neighbouring cells, rather than the
# raw frame `df`.
len(bank[bank['previous'] > 20]) / len(bank) * 100

Roughly 75% of the values in the pdays column are -1. Since pdays is the number of days that have passed since the customer was last contacted, -1 is not a valid day count, and because we are not sure what -1 stands for, we exclude this column from the study.

In [None]:
# campaign: how many clients received each number of calls.
# `value_counts()` is ordered by frequency, so it is sorted by the number of
# calls and its own index is used as the x values; this keeps every
# (calls, clients) pair aligned.
num_clients = bank['campaign'].value_counts().sort_index()
num_calls = num_clients.index.to_numpy()

plt.plot(num_calls, num_clients.to_numpy())
plt.xlabel('Number of calls during this campaign')
plt.ylabel('Number of clients');

In [None]:
# Split the clients by outcome and report each group's share of the dataset.
opened = bank.loc[bank['deposit'] == 'yes']
notOpened = bank.loc[bank['deposit'] == 'no']

percentOpen = round(len(opened) / len(bank) * 100, 2)
percentNotOpen = round(len(notOpened) / len(bank) * 100, 2)

print(
    '{} % of the clients opened a term deposit and {}% of the clients did not open a term deposit.'.format(
        percentOpen, percentNotOpen
    )
)

In [None]:
# Pie chart of the deposit outcome counts; the second slice is pulled out.
bank["deposit"].value_counts().plot(
    kind='pie',
    explode=[0, 0.1],
    autopct='%0.3f%%',
    shadow=True,
    colors=colors,
    fontsize=12,
    startangle=10,
)
plt.show()

In [None]:
# Rename the education levels. The result is assigned back to the column:
# an in-place replace on the selection `bank['education']` is a chained
# assignment and is not guaranteed to update `bank` under pandas
# Copy-on-Write.
bank['education'] = bank['education'].replace(
    {'secondary': 'high school', 'tertiary': 'college', 'primary': 'elementary'}
)
bank.head()

In [None]:
# Job vs. deposit: number of clients per job, one column per outcome.
job = pd.DataFrame()
for answer in ('yes', 'no'):
    job[answer] = bank.loc[bank['deposit'] == answer, 'job'].value_counts()

job.plot(kind='bar', title='Job and deposit')

In [None]:
# Education vs. deposit: number of clients per education level, one column
# per outcome.
edu = pd.DataFrame()
for answer in ('yes', 'no'):
    edu[answer] = bank.loc[bank['deposit'] == answer, 'education'].value_counts()

edu.plot(kind='bar', title='Education and Deposit');

In [None]:
# Summary statistics of the age column, kept as a one-column DataFrame.
age = bank[['age']].describe()
age

In [None]:
# Balance vs. deposit: summary statistics of the balance for each outcome.
balance = pd.DataFrame()
for label, answer in (('balance_yes', 'yes'), ('balance_no', 'no')):
    balance[label] = bank.loc[bank['deposit'] == answer, 'balance'].describe()

# Plot only mean, std, min and max.
balance.drop(index=['count', '25%', '50%', '75%']).plot(kind='bar', title='Balance and Deposit');

In [None]:
# Age vs. deposit: summary statistics of the age for each outcome.
age = pd.DataFrame()
for answer in ('yes', 'no'):
    age[answer] = bank.loc[bank['deposit'] == answer, 'age'].describe()

# Plot only mean, std, min and max.
age.drop(index=['count', '25%', '50%', '75%']).plot(kind='bar', title='Age and Deposit');

In [None]:
# Contacts made during this campaign ('campaign') vs. deposit: summary
# statistics for each outcome.
cam = pd.DataFrame()
for label, answer in (('campaign_yes', 'yes'), ('campaign_no', 'no')):
    cam[label] = bank.loc[bank['deposit'] == answer, 'campaign'].describe()

# Plot only mean, std, min and max.
cam.drop(index=['count', '25%', '50%', '75%']).plot(
    kind='bar', title='Number of contacts made during campaign and Deposits');

In [None]:
# Contacts made before this campaign ('previous') vs. deposit: summary
# statistics for each outcome.
prev = pd.DataFrame()

# The columns are labelled after the `previous` attribute they summarise, so
# the plot legend no longer claims to show `campaign` values.
prev['previous_yes'] = bank.loc[bank['deposit'] == 'yes', 'previous'].describe()
prev['previous_no'] = bank.loc[bank['deposit'] == 'no', 'previous'].describe()

# Plot only mean, std, min and max.
prev.drop(['count', '25%', '50%', '75%']).plot.bar(title = 'Number of contacts made previous to the campaign and Deposits');

In [None]:
def replace_mean(df, column, row, limit):
    """Return the row's value, or a mean substitute when it exceeds ``limit``.

    If ``row[column]`` is less than or equal to ``limit`` it is returned
    unchanged. Otherwise the result is the mean of all values in
    ``df[column]`` that are less than or equal to ``limit``.
    """
    value = row[column]
    if value <= limit:
        return value

    # Average only over the values that respect the limit.
    within_limit = df.loc[df[column] <= limit, column]
    return within_limit.mean()

In [None]:
def encoding(table, column):
    """Map the value stored under ``column`` to 1 for 'yes' and 0 otherwise."""
    if table[column] == 'yes':
        return 1
    return 0



In [None]:

def clean_data(df, outlier_limit=34):
    """Return a model-ready copy of the raw bank-marketing frame.

    The input frame is not modified. The returned frame:

    * has the ``pdays`` column removed;
    * has ``default``, ``housing``, ``loan`` and ``deposit`` encoded as
      1 for ``'yes'`` and 0 for anything else;
    * has ``job``, ``marital``, ``education``, ``contact``, ``month`` and
      ``poutcome`` one-hot encoded (first level of each dropped), with the
      dummy columns appended after the remaining columns;
    * has every ``campaign`` / ``previous`` value above ``outlier_limit``
      replaced by the mean of the values at or below the limit.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the columns listed above.
    outlier_limit : int or float, default 34
        Largest ``campaign`` / ``previous`` value that is kept as is.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, indexed like ``df``.
    """
    # `drop` returns a new frame, so the caller's data is left untouched.
    table = df.drop(columns=['pdays'])

    # Encode the yes/no columns as 1/0 with a vectorised comparison.
    bool_columns = ['default', 'housing', 'loan', 'deposit']
    for bool_col in bool_columns:
        table[bool_col] = (df[bool_col] == 'yes').astype(int)

    # One-hot encode the categorical columns.
    cat_columns = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']
    table = pd.get_dummies(table, columns=cat_columns, prefix_sep='_',
                           drop_first=True, dummy_na=False)

    # Replace out-of-range contact counts by the mean of the in-range values.
    # The mean is computed once per column rather than once per row.
    correct_cols = ['campaign', 'previous']
    for col in correct_cols:
        within_limit = df[col] <= outlier_limit
        in_range_mean = df.loc[within_limit, col].mean()
        table[col] = df[col].where(within_limit, in_range_mean)

    return table

In [None]:
# Build the encoded, model-ready frame from the working copy.
cleaned = clean_data(bank)

In [None]:
# Preview the cleaned frame.
cleaned.head() 

In [None]:
## Machine Learning
# Features: every cleaned column except the target.
X = cleaned.drop('deposit', axis=1)
# Target, kept as a one-column DataFrame.
y = cleaned.loc[:, ['deposit']]

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Row counts of the four pieces of the split.
print(*map(len, (X_train, X_test, y_train, y_test)))

In [None]:
xgb = xgboost.XGBClassifier(n_estimators=100, learning_rate=0.09, gamma=0.11, subsample=0.75,
                           colsample_bytree=1, max_depth=7)
# `y_train` is a one-column DataFrame; flatten it to a 1-D array, as the
# other fits in this notebook do, instead of passing an (n, 1) column vector.
xgb.fit(X_train, y_train.squeeze().values)

# Accuracy on the training set and on the held-out test set.
y_train_preds = xgb.predict(X_train)
y_test_preds = xgb.predict(X_test)
acc_train = accuracy_score(y_train, y_train_preds)
acc_test = accuracy_score(y_test, y_test_preds)

print(acc_train, acc_test)

In [None]:
#train XGBoost model
def get_accuracy(n, learningRate, discountFactor):
    """Train one XGBoost classifier per hyper-parameter pair and tabulate accuracy.

    The i-th model uses ``learningRate[i]`` as ``learning_rate`` and
    ``discountFactor[i]`` as ``gamma``; the sequences are paired by position,
    not combined as a grid. Models are fitted on the module-level
    ``X_train`` / ``y_train`` and scored on both the training data and
    ``X_test`` / ``y_test``.

    Parameters
    ----------
    n : int
        Number of boosting rounds (``n_estimators``) for every model.
    learningRate : sequence of float
        Learning rates, at least as many as ``discountFactor`` values.
    discountFactor : sequence of float
        Values passed to the classifier's ``gamma`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per trained model with the columns 'Learning Rate',
        'Discount Factor', 'Train Accuracy' and 'Test Accuracy'.

    Raises
    ------
    ValueError
        If ``learningRate`` has fewer entries than ``discountFactor``.
    """
    if len(learningRate) < len(discountFactor):
        raise ValueError('learningRate must provide a value for every discountFactor entry')

    columns = ['Learning Rate', 'Discount Factor', 'Train Accuracy', 'Test Accuracy']
    # Collect the rows and build the frame once: `DataFrame.append` was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for rate, gamma in zip(learningRate, discountFactor):
        model = xgboost.XGBClassifier(n_estimators=n, learning_rate=rate, gamma=gamma,
                                      subsample=0.75, colsample_bytree=1, max_depth=7)
        model.fit(X_train, y_train.squeeze().values)

        # Accuracy on the training set and on the held-out test set.
        acc_train = accuracy_score(y_train, model.predict(X_train))
        acc_test = accuracy_score(y_test, model.predict(X_test))
        rows.append({'Learning Rate': rate, 'Discount Factor': gamma,
                     'Train Accuracy': acc_train, 'Test Accuracy': acc_test})
    return pd.DataFrame(rows, columns=columns)


In [None]:
# First sweep: learning rate and gamma take the same value in every pair.
n = 100
learning = np.arange(0.01, 0.2, 0.01)
discount = learning.copy()
first_trial = get_accuracy(n, learning, discount)


In [None]:
# Show the accuracy table of the first sweep.
first_trial

In [None]:
# Row counts of the four pieces of the split.
print(*map(len, (X_train, X_test, y_train, y_test)))

In [None]:
## Accuracy vs. learning rate for the first sweep
# Each pair in that sweep used the same value for the learning rate and the
# discount factor, so a plot against the discount factor would look the same.
for metric in ('Test Accuracy', 'Train Accuracy'):
    plt.plot('Learning Rate', metric, data=first_trial)

plt.title('Accuracy vs Learning Rate')
plt.xlabel('Learning Rate')
plt.ylabel('Accuracy')
plt.legend();

In [None]:
## Second sweep: same learning rates, discount factors in reverse order
n = 100
learning = np.arange(0.01, 0.2, 0.01)
discount = np.arange(0.01, 0.2, 0.01)[::-1]

second_trial = get_accuracy(n, learning, discount)
second_trial

In [None]:
# Row(s) of the first sweep with the highest test accuracy.
first_trial.loc[first_trial['Test Accuracy'].eq(max(first_trial['Test Accuracy']))]

In [None]:
# Row(s) of the second sweep with the highest test accuracy.
second_trial.loc[second_trial['Test Accuracy'].eq(max(second_trial['Test Accuracy']))]

In [None]:
# Row counts of the four pieces of the split.
print(*map(len, (X_train, X_test, y_train, y_test)))

In [None]:
 final = xgboost.XGBClassifier(n_estimators=100, learning_rate=0.09, gamma=0.11, subsample=0.75,
                           colsample_bytree=1, max_depth=7)
final.fit(X_train,y_train.squeeze().values)

In [None]:
# IPython help lookup (trailing `?`); this is not valid in a plain Python script.
xgb.feature_importances_?

In [None]:
# Rank the features by the importance the final model assigns to them,
# highest score first.
headers = ["name", "score"]
values = sorted(
    zip(X_train.columns, final.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
featureImportance = pd.DataFrame(values, columns=headers)

# Bar chart of the ranked scores with the feature names as tick labels.
plt.figure(figsize=(15, 8))
x_pos = np.arange(len(featureImportance))
plt.bar(x_pos, featureImportance['score'])
plt.xticks(x_pos, featureImportance['name'], rotation=90)
plt.title("Features' Importance (XGB)")

plt.show()

In [None]:
# Importance types passed to the booster's `get_score`; only the first one,
# 'weight', is queried here.
importance_types = ['weight', 'gain', 'cover', 'total_gain', 'total_cover']
weight = xgb.get_booster().get_score(importance_type=importance_types[0])

In [None]:
# Preview the cleaned frame again.
cleaned.head()

In [None]:
# Balance statistics restricted to the clients who opened a deposit.
balance = cleaned.loc[cleaned['deposit'] == 1, ['balance', 'deposit']]
balance.describe()