In [None]:
# Load the different packages into python

import numpy as np               # helps with working with large, multi-dimensional arrays and matrices
import pandas as pd              # helps with data manipulation and analysis
import matplotlib.pyplot as plt  # helps with plotting graphs, histogram, bar plot, etc
import seaborn as sns            # helps with statistical data visualization
%matplotlib inline

In [None]:
# Read the Bank Marketing data (a semicolon-delimited CSV file) into a DataFrame.
# NOTE: the location below is a machine-specific absolute path; point it at your own copy of the file before running.

BANK_CSV = 'C:/Users/Surin/Desktop/Project/bank-additional/bank-additional-full.csv'
bank = pd.read_csv(BANK_CSV, sep=';')

In [None]:
# Display the dataset as the cell output (the notebook shows a preview of the DataFrame).
# It contains 41,188 entries and 21 variables.

bank

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) from describe(),
# rounded to the nearest whole number.

bank.describe().round()

In [None]:
# Data type of each column. Columns stored with the 'object' dtype hold the categorical (text) variables;
# the remaining columns are numeric.

bank.dtypes

In [None]:
# Count the rows that contain at least one missing (NaN) value. The dataset has zero missing values.

nans = int(bank.isna().any(axis=1).sum())
print("%d rows have missing values in the data" % nans)

In [None]:
# Remove duplicate rows, keeping the first occurrence of each duplicated row.
# keep=False (used previously) discards EVERY member of a duplicated group - the original row as well as its copies -
# which is how 24 rows were removed (leaving 41,164). Keeping the first occurrence removes only the redundant copies,
# so slightly more rows remain; row counts quoted in later cells were produced with the keep=False behaviour.
# Re-assigning (instead of inplace=True) makes it explicit that `bank` now refers to the de-duplicated frame.

bank = bank.drop_duplicates(keep='first')
bank.shape

In [None]:
# Summary statistics (count, mean, std, etc.) recomputed after the duplicated entries were removed,
# rounded to the nearest whole number.

bank.describe().round()

In [None]:
# Number of distinct values in each character (object-dtype) column.
# Note that the month variable has only ten distinct values, so no data was provided for two months.

cat = bank.select_dtypes(include='object')
cat.nunique()

In [None]:
# Share of each class of the target variable, to see whether the data is imbalanced.
# Almost 89% of the rows belong to the 'no' class, so a trivial model that always predicts 'no'
# would already reach about 89% accuracy.

bank['y'].value_counts().div(len(bank))

In [None]:
# Cross tab of the target variable against job, normalised per row:
# for each outcome of y, the percentage of clients in each job category (every row sums to 100).
# This helps us understand how job relates to the target variable.

pd.crosstab(bank['y'], bank['job'], normalize='index').mul(100).round(2)

In [None]:
# Cross tab of the target variable against marital, normalised per row:
# for each outcome of y, the percentage of clients in each marital status (every row sums to 100).
# This helps us understand how marital relates to the target variable.

pd.crosstab(bank['y'], bank['marital'], normalize='index').mul(100).round(2)

In [None]:
# Cross tab of the target variable against education, normalised per row:
# for each outcome of y, the percentage of clients at each education level (every row sums to 100).
# This helps us understand how education relates to the target variable.

pd.crosstab(bank['y'], bank['education'], normalize='index').mul(100).round(2)

In [None]:
# Cross tab of the target variable against default, normalised per row:
# for each outcome of y, the percentage of clients with each value of default (every row sums to 100).
# This helps us understand how default relates to the target variable.

pd.crosstab(bank['y'], bank['default'], normalize='index').mul(100).round(2)

In [None]:
# Cross tab of the target variable against housing, normalised per row:
# for each outcome of y, the percentage of clients with each value of housing (every row sums to 100).
# This helps us understand how housing relates to the target variable.

pd.crosstab(bank['y'], bank['housing'], normalize='index').mul(100).round(2)

In [None]:
# About half of the people who subscribed to a term deposit have a housing loan and the other half do not,
# so housing gives little information for predicting who will subscribe. We remove the column from the dataset.

bank = bank.drop(columns=['housing'])

In [None]:
# Cross tab of the target variable against loan, normalised per row:
# for each outcome of y, the percentage of clients with each value of loan (every row sums to 100).
# This helps us understand how loan relates to the target variable.

pd.crosstab(bank['y'], bank['loan'], normalize='index').mul(100).round(2)

In [None]:
# Cross tab of the target variable against contact, normalised per row:
# for each outcome of y, the percentage of clients reached by each contact type (every row sums to 100).
# This helps us understand how contact relates to the target variable.

pd.crosstab(bank['y'], bank['contact'], normalize='index').mul(100).round(2)

In [None]:
# Cross tab of the target variable against month, normalised per row:
# for each outcome of y, the percentage of clients last contacted in each month (every row sums to 100).
# This helps us understand how month relates to the target variable.

pd.crosstab(bank['y'], bank['month'], normalize='index').mul(100).round(2)

In [None]:
# We remove the month column because the dataset does not say which year each month belongs to.
# It should be noted that the dataset has no information for the months January and February.

bank = bank.drop(columns=['month'])

In [None]:
# Cross tab of the target variable against day_of_week, normalised per row:
# for each outcome of y, the percentage of clients last contacted on each day (every row sums to 100).
# This helps us understand how day_of_week relates to the target variable.

pd.crosstab(bank['y'], bank['day_of_week'], normalize='index').mul(100).round(2)

In [None]:
# No contact was made on Saturdays or Sundays, and each weekday accounts for roughly 20% of last contacts.
# The weekday of the last contact is therefore not important and we remove this variable.

bank = bank.drop(columns=['day_of_week'])

In [None]:
# 'bankno' holds only the rows where the client did not subscribe to a term deposit (target variable y == 'no').
# The mean duration of a call is 221 seconds.
# The mean number of contacts made with a customer before this campaign is 0.132.

bankno = bank.loc[bank['y'] == 'no']
bankno.describe()

In [None]:
# 'bankyes' holds only the rows where the client subscribed to a term deposit (target variable y == 'yes').
# The mean duration of a call is 553 seconds.
# The mean number of contacts made with a customer before this campaign is 0.493.
# Compared with the 'no' group, subscribers had longer calls and more previous contacts on average.

bankyes = bank.loc[bank['y'] == 'yes']
bankyes.describe()

In [None]:
# Count plot, for clients whose outcome is 'yes', of campaign
# (the number of contacts performed during this campaign for the client).

sns.countplot(data=bankyes, x="campaign")

In [None]:
# Cross tab of the target variable against pdays, normalised per row:
# for each outcome of y, the percentage of clients with each pdays value (every row sums to 100).
# This helps us understand how pdays relates to the target variable.

pd.crosstab(bank['y'], bank['pdays'], normalize='index').mul(100).round(2)

In [None]:
# Count plot, for clients whose outcome is 'yes', of pdays
# (the number of days that passed after the client was last contacted in a previous campaign).

sns.countplot(data=bankyes, x="pdays")

In [None]:
# A pdays value of 999 means the client was not previously contacted.
# That value dominates both for people who subscribed and for people who did not, so we omit this variable from the analysis.

bank = bank.drop(columns=['pdays'])

In [None]:
# Cross tab of the target variable against poutcome, normalised per row:
# for each outcome of y, the percentage of clients with each poutcome value (every row sums to 100).
# This helps us understand how poutcome relates to the target variable.

pd.crosstab(bank['y'], bank['poutcome'], normalize='index').mul(100).round(2)

In [None]:
# The target variable 'y' has two outcomes, 'yes' and 'no'. Here are the counts for each outcome.
# Counts reported for the original run: 36,526 clients did not subscribe and 4,638 did.

bank['y'].value_counts()

In [None]:
# Count plot of each outcome, 'no' and 'yes', of the target variable.
# The column is passed by keyword (x=..., data=...): recent seaborn releases accept only `data` positionally,
# so a Series passed positionally is no longer read as the x variable.

sns.countplot(x='y', data=bank)

In [None]:
# scikit-learn expects numeric input, so every character (object-dtype) column is converted to integers with LabelEncoder.
# Label encoding assigns one integer per distinct value, numbered in sorted order of the values. For example, a variable
# colour with values ['red', 'green', 'blue', 'pink'] is encoded as blue = 0, green = 1, pink = 2, red = 3.
# A fresh encoder is fitted for each column.

from sklearn import preprocessing

object_cols = [col for col in bank.columns if bank[col].dtype == 'object']
for col in object_cols:
    lbl = preprocessing.LabelEncoder()
    bank[col] = lbl.fit_transform(list(bank[col].values))

In [None]:
# Check the changes applied to the dataset by showing its first rows.
# All character variables have been converted to numeric codes.

bank.head()

In [None]:
# Summary statistics (count, mean, std, etc.) for the now fully numeric dataset,
# rounded to the nearest whole number.

bank.describe().round()

In [None]:
# After encoding, the target variable 'y' takes the values 0 and 1: 0 represents 'no' (no subscription was sold
# to the customer) and 1 represents 'yes' (a subscription was sold).
# Here are the counts for each outcome.

bank['y'].value_counts()

In [None]:
# Count plot of each outcome, 0 and 1, of the encoded target variable.
# The column is passed by keyword (x=..., data=...): recent seaborn releases accept only `data` positionally,
# so a Series passed positionally is no longer read as the x variable.

sns.countplot(x='y', data=bank)

In [None]:
# Wrap the bank data in a DataFrame named `df` for the correlation analysis.
# (`bank` is already a DataFrame, as returned by pd.read_csv, so this mainly gives it a second name.)

df = pd.DataFrame(data=bank)

In [None]:
# Pairwise correlation between all columns of the dataset (Pearson, the pandas default).

C = df.corr(method='pearson')

In [None]:
# Show the correlation matrix with one decimal place and a 'coolwarm' background gradient, so that strong positive,
# strong negative, weak and near-zero correlations are easy to tell apart.
# Styler.format is used for the number format because Styler.set_precision is deprecated and absent from newer pandas.
# Positive correlation between the target variable 'y' and duration, and between 'y' and previous.

C.style.background_gradient(cmap='coolwarm').format("{:.1f}")

In [None]:
# Separate the dataset into labels (the 'y' column) and attributes (every other column).

y = bank['y']
X = bank.drop(columns=['y'])

In [None]:
# Look at the last few entries of the label series y.

y.tail()

In [None]:
# Divide the dataset into training and test splits. The training data is used to fit the logistic regression model
# and the test data is used to evaluate its performance.
# 90% of the dataset goes to the training set and the other 10% to the test set.
# random_state fixes the shuffle so that the split, and the metrics computed from it, can be reproduced on a re-run.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [None]:
# Standardise the features.
# The scaler is fitted on the training data only; the test data is then transformed with the training mean and
# standard deviation. Re-fitting on the test set would scale the two sets differently and leak test statistics
# into the evaluation.

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Fit a scikit-learn logistic regression model on the training data and predict labels for the test data.

from sklearn.linear_model import LogisticRegression

logmodel = LogisticRegression().fit(X_train, y_train)
predictions = logmodel.predict(X_test)

In [None]:
# Precision, recall and accuracy of the logistic regression model on the test set.
# Figures from the original run (they change when the split or the preprocessing changes):
# - Precision: a prediction of 'no' (0) was correct 93% of the time; a prediction of 'yes' (1) was correct 69% of the time.
# - Recall: 98% of the clients who did not subscribe ('no') were identified; 40% of those who did subscribe ('yes') were identified.
# - Overall accuracy: 90%.

from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# Random Forest Classifier
# 90% of the dataset goes to the training set and the other 10% to the test set.
# random_state=0 here seeds the shuffling done by train_test_split, so the same rows land in each set on every run.

X2_train, X2_test, y2_train, y2_test = train_test_split(
    X, y, test_size=0.1, random_state=0, shuffle=True
)

In [None]:
# Standardise the features.
# The scaler is fitted on the training data only; the test data is then transformed with the training mean and
# standard deviation. Re-fitting on the test set would scale the two sets differently and leak test statistics
# into the evaluation.

sc2 = StandardScaler()
X2_train = sc2.fit_transform(X2_train)
X2_test = sc2.transform(X2_test)

In [None]:
# Fit a scikit-learn random forest classifier on the training data and predict labels for the test data.
# n_estimators=200 builds 200 trees; averaging over more trees makes the forest's predictions more stable.
# random_state=0 fixes the bootstrap sampling of rows and the sampling of candidate features at each split,
# so the fitted forest and its metrics are the same on every run.

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=200, random_state=0)
rfc.fit(X2_train, y2_train)
pred_rfc = rfc.predict(X2_test)

In [None]:
# Precision, recall and accuracy of the random forest classifier on the test set.
# Figures from the original run (they change when the forest's randomness or the preprocessing changes):
# - Precision: a prediction of 'no' (0) was correct 94% of the time; a prediction of 'yes' (1) was correct 63% of the time.
# - Recall: 96% of the clients who did not subscribe ('no') were identified; 52% of those who did subscribe ('yes') were identified.
# - Overall accuracy: 91%.

print(classification_report(y_true=y2_test, y_pred=pred_rfc))