## The data given in the URL is related to direct marketing campaigns of a banking institution. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required in order to assess whether the product (bank term deposit) would be subscribed ('yes') or not ('no'). Build a model to predict whether a client will subscribe to a term deposit.


### Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('/home/admin1/PycharmProjects/Models in Machine Learning/')
from ipynb.fs.full.ml_library import *

# Read the semicolon-separated CSV file into a pandas DataFrame
bank_data = pd.read_csv('bank.csv', sep=';')
# Preview the first rows of the loaded data
bank_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [2]:
# Count the missing values in each column
bank_data.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [3]:
# Encode the target column 'y' as integers: 'no' -> 0, 'yes' -> 1.
# Series.map replaces the whole column in one step, so the result is a real
# integer column; assigning the codes through .loc can leave it with object
# dtype, which breaks the numeric steps that follow.
# astype(int) raises if any label other than 'no'/'yes' is present instead of
# silently keeping it.
bank_data['y'] = bank_data['y'].map({'no': 0, 'yes': 1}).astype(int)
bank_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,0


#### Checking whether each feature contributes to the target variable

In [4]:
# Compare the mean of every numeric column between the two target classes.
# numeric_only=True skips the text columns, which pandas >= 2.0 refuses to
# average (TypeError) instead of dropping them silently.
bank_data.groupby('y').mean(numeric_only=True)

Unnamed: 0_level_0,age,balance,day,duration,campaign,pdays,previous
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,40.998,1403.21175,15.94875,226.3475,2.86225,36.006,0.47125
1,42.491363,1571.955854,15.658349,552.742802,2.266795,68.639155,1.090211


In [5]:
# Mean of the numeric columns per marital status.
# numeric_only=True skips text columns, which pandas >= 2.0 cannot average.
bank_data.groupby('marital').mean(numeric_only=True)

Unnamed: 0_level_0,age,balance,day,duration,campaign,pdays,previous,y
marital,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
divorced,45.475379,1122.390152,15.753788,279.232955,2.604167,38.827652,0.439394,0.145833
married,43.454415,1463.195567,15.905971,256.528781,2.847336,38.466929,0.519128,0.099035
single,33.927258,1460.414716,16.008361,274.601171,2.751672,43.220736,0.642977,0.139632


In [6]:
# Mean of the numeric columns per education level.
# numeric_only=True skips text columns, which pandas >= 2.0 cannot average.
bank_data.groupby('education').mean(numeric_only=True)

Unnamed: 0_level_0,age,balance,day,duration,campaign,pdays,previous,y
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
primary,46.833333,1411.544248,15.5059,261.70944,2.865782,35.069322,0.460177,0.094395
secondary,40.062446,1196.814397,15.977884,269.863833,2.734172,40.934085,0.528621,0.106245
tertiary,39.645926,1775.423704,16.00963,256.881481,2.901481,39.824444,0.612593,0.142963
unknown,45.299465,1701.245989,15.946524,250.449198,2.486631,41.983957,0.508021,0.101604


In [7]:
# Mean of the numeric columns per job category.
# numeric_only=True skips text columns, which pandas >= 2.0 cannot average.
bank_data.groupby('job').mean(numeric_only=True)

Unnamed: 0_level_0,age,balance,day,duration,campaign,pdays,previous,y
job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
admin.,39.682008,1226.736402,16.324268,234.669456,2.631799,49.993724,0.644351,0.121339
blue-collar,40.156448,1085.161734,15.48203,278.161734,2.846723,41.590909,0.493658,0.072939
entrepreneur,42.011905,1645.125,15.255952,285.47619,2.589286,32.27381,0.428571,0.089286
housemaid,47.339286,2083.803571,15.294643,292.633929,2.5,26.401786,0.357143,0.125
management,40.540764,1766.928793,16.254902,260.536636,2.973168,40.968008,0.54902,0.135191
retired,61.869565,2319.191304,15.556522,285.656522,2.465217,35.073913,0.591304,0.234783
self-employed,41.453552,1392.409836,16.180328,264.125683,3.278689,28.256831,0.590164,0.10929
services,38.570743,1103.956835,15.515588,262.486811,2.822542,36.371703,0.443645,0.091127
student,26.821429,1543.821429,16.392857,248.690476,2.392857,45.714286,0.964286,0.22619
technician,39.470052,1330.996094,16.183594,252.178385,2.731771,39.265625,0.576823,0.108073


In [8]:
# Mean of the numeric columns per outcome of the previous campaign.
# numeric_only=True skips text columns, which pandas >= 2.0 cannot average.
bank_data.groupby('poutcome').mean(numeric_only=True)

Unnamed: 0_level_0,age,balance,day,duration,campaign,pdays,previous,y
poutcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
failure,41.555102,1644.646939,14.395918,254.383673,1.955102,243.167347,2.85102,0.128571
other,39.873096,1424.472081,15.101523,273.832487,2.350254,219.385787,3.385787,0.192893
success,44.170543,1949.410853,14.581395,338.635659,1.736434,163.713178,3.015504,0.643411
unknown,41.083671,1374.862078,16.205938,262.103104,2.964912,-1.0,0.0,0.090958


In [9]:
# Mean of the numeric columns for clients with and without a housing loan.
# numeric_only=True skips text columns, which pandas >= 2.0 cannot average.
bank_data.groupby('housing').mean(numeric_only=True)

Unnamed: 0_level_0,age,balance,day,duration,campaign,pdays,previous,y
housing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
no,43.511723,1595.277268,16.20999,259.29052,2.80632,26.402141,0.46789,0.153415
yes,39.374756,1290.309496,15.689332,267.542399,2.7839,50.013286,0.599844,0.085971


In [10]:
# Mean of the numeric columns for clients with and without a personal loan.
# numeric_only=True skips text columns, which pandas >= 2.0 cannot average.
bank_data.groupby('loan').mean(numeric_only=True)

Unnamed: 0_level_0,age,balance,day,duration,campaign,pdays,previous,y
loan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
no,41.220627,1513.857963,15.932376,264.512794,2.771018,41.088512,0.558486,0.124804
yes,40.890014,917.163531,15.82055,260.904486,2.918958,32.439942,0.454414,0.062229


In [11]:
# Mean of the numeric columns for clients with and without credit in default.
# numeric_only=True skips text columns, which pandas >= 2.0 cannot average.
bank_data.groupby('default').mean(numeric_only=True)

Unnamed: 0_level_0,age,balance,day,duration,campaign,pdays,previous,y
default,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
no,41.194826,1450.550956,15.929584,264.355906,2.79865,40.111136,0.548481,0.115186
yes,39.723684,-208.723684,15.078947,240.881579,2.5,19.618421,0.197368,0.118421


In [12]:
# Discard the columns that are not carried forward as predictors
bank_data.drop(columns=['default', 'balance', 'contact', 'day', 'month'], inplace=True)
bank_data.head()

Unnamed: 0,age,job,marital,education,housing,loan,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,no,79,1,-1,0,unknown,0
1,33,services,married,secondary,yes,yes,220,1,339,4,failure,0
2,35,management,single,tertiary,yes,no,185,1,330,1,failure,0
3,30,management,married,tertiary,yes,yes,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,yes,no,226,1,-1,0,unknown,0


#### Handling categorical data

In [13]:
# Convert every categorical (object-dtype) column to integer codes.
# Categories are numbered in sorted order:
#   - exactly 2 categories -> 0 and 1   (e.g. no / yes)
#   - otherwise            -> 1, 2, 3, ... (number of categories)

# finding categorical columns
categorical_col = [col for col in bank_data.columns if bank_data[col].dtype == 'O']

for col in categorical_col:
    # sorting categories of the column so the numbering is deterministic
    categories = sorted(bank_data[col].unique())
    first_code = 0 if len(categories) == 2 else 1
    codes = {category: code for code, category in enumerate(categories, start=first_code)}
    # Map the whole column in one step: recoding value by value in place can
    # recode a row twice when a category equals an already-assigned code.
    # astype(int) gives the column a numeric dtype for the arithmetic below.
    bank_data[col] = bank_data[col].map(codes).astype(int)
bank_data.head()

Unnamed: 0,age,job,marital,education,housing,loan,duration,campaign,pdays,previous,poutcome,y
0,30,11,2,1,0,0,79,1,-1,0,4,0
1,33,8,2,2,1,1,220,1,339,4,1,0
2,35,5,3,3,1,0,185,1,330,1,1,0
3,30,5,2,3,1,1,199,4,-1,0,4,0
4,59,2,2,2,1,0,226,1,-1,0,4,0


In [14]:
# Min-max scale every column (predictors and the 0/1 target alike) to the
# range 0-1.
# NOTE: the minimum and maximum are taken over the full dataset.
for col in bank_data.columns:
    minimum = bank_data[col].min()
    value_range = bank_data[col].max() - minimum
    if value_range == 0:
        # A constant column would give 0/0 = NaN; map it to 0 instead.
        bank_data[col] = 0.0
    else:
        bank_data[col] = (bank_data[col] - minimum) / value_range

bank_data.head()

Unnamed: 0,age,job,marital,education,housing,loan,duration,campaign,pdays,previous,poutcome,y
0,0.161765,0.909091,0.5,0.0,0.0,0.0,0.024826,0.0,0.0,0.0,1.0,0.0
1,0.205882,0.636364,0.5,0.333333,1.0,1.0,0.0715,0.0,0.389908,0.16,0.0,0.0
2,0.235294,0.363636,1.0,0.666667,1.0,0.0,0.059914,0.0,0.379587,0.04,0.0,0.0
3,0.161765,0.363636,0.5,0.666667,1.0,1.0,0.064548,0.061224,0.0,0.0,1.0,0.0
4,0.588235,0.090909,0.5,0.333333,1.0,0.0,0.073486,0.0,0.0,0.0,1.0,0.0


#### Removing one of the correlated columns

In [15]:
# Remove 'pdays', which this notebook treats as one of a pair of correlated columns
bank_data = bank_data.drop(columns='pdays')

In [16]:
# Summary statistics (count, mean, std, quartiles, min, max) of the remaining columns
bank_data.describe()

Unnamed: 0,age,job,marital,education,housing,loan,duration,campaign,previous,poutcome,y
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,0.326031,0.401017,0.573877,0.410455,0.566025,0.152842,0.086051,0.036605,0.021703,0.853056,0.11524
std,0.155533,0.295974,0.299825,0.249581,0.495676,0.359875,0.086017,0.063465,0.067742,0.330684,0.319347
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.205882,0.090909,0.5,0.333333,0.0,0.0,0.033102,0.0,0.0,1.0,0.0
50%,0.294118,0.363636,0.5,0.333333,1.0,0.0,0.059914,0.020408,0.0,1.0,0.0
75%,0.441176,0.636364,1.0,0.666667,1.0,0.0,0.10758,0.040816,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Splitting dataset into train & test set

In [17]:
# Hold out 30 % of the rows as the test set and train on the other 70 %.
# random_state pins the random sample so the split is reproducible; choosing
# a different random state selects different rows.
train = bank_data.sample(frac=0.7, random_state=3)
test = bank_data.drop(index=train.index)      # every row that was not sampled into train
print('Train set', train.shape, sep='\n')
train.head()

Train set
(3165, 11)


Unnamed: 0,age,job,marital,education,housing,loan,duration,campaign,previous,poutcome,y
3542,0.25,0.636364,0.5,0.333333,1.0,0.0,0.031447,0.0,0.0,1.0,0.0
2687,0.382353,0.818182,0.5,0.333333,0.0,0.0,0.023833,0.0,0.0,1.0,0.0
2053,0.073529,0.818182,1.0,0.333333,1.0,0.0,0.006289,0.0,0.0,1.0,0.0
3966,0.323529,0.818182,0.0,0.333333,0.0,0.0,0.085071,0.020408,0.0,1.0,0.0
1602,0.411765,0.0,0.0,0.333333,1.0,0.0,0.065872,0.061224,0.0,1.0,0.0


In [18]:
# Report the size of the held-out set and preview its first rows
print('Test set', test.shape, sep='\n')
test.head()

Test set
(1356, 11)


Unnamed: 0,age,job,marital,education,housing,loan,duration,campaign,previous,poutcome,y
1,0.205882,0.636364,0.5,0.333333,1.0,1.0,0.0715,0.0,0.16,0.0,0.0
7,0.294118,0.818182,0.5,0.333333,1.0,0.0,0.048659,0.020408,0.0,1.0,0.0
8,0.323529,0.181818,0.5,0.666667,1.0,0.0,0.017544,0.020408,0.0,1.0,0.0
13,0.014706,0.727273,1.0,0.333333,0.0,0.0,0.085071,0.0,0.0,1.0,1.0
14,0.176471,0.090909,0.5,0.333333,1.0,1.0,0.028136,0.0,0.04,0.0,0.0


#### Creating predictors & target variables for both data sets

In [19]:
def create_pred_target(data):
    """Split a dataset into its predictor matrix and target array.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset with a target column named 'y'. Every other column is used
        as a predictor.

    Returns
    -------
    x_values : numpy.ndarray
        2-D array with one row per record and one column per predictor, in
        the original column order.
    y_values : numpy.ndarray
        1-D array holding the values of the 'y' column.

    Raises
    ------
    KeyError
        If `data` has no 'y' column.
    """
    y_values = np.array(data['y'])
    # Drop the target by name so it is excluded wherever it sits (including
    # the first position), and build the matrix in one conversion instead of
    # appending one column at a time.
    x_values = np.array(data.drop(columns='y'))

    return x_values, y_values

# Build the predictor matrix and target array for the train and test sets
train_x_values, train_y_values = create_pred_target(train)
test_x_values, test_y_values = create_pred_target(test)

### Building KNN model

In [20]:
k_number = 10       # total number of neighbours that vote on each prediction


def euclidean_distances(x_val):
    """Return the Euclidean distance from one record to every training record.

    `x_val` is a 1-D array of predictor values; it is broadcast against the
    module-level `train_x_values` matrix, giving one distance per training row.
    """
    return np.sqrt(((x_val - train_x_values) ** 2).sum(axis=1))


def predict_single(x_val):
    """Predict the label of a single record by majority vote of its neighbours.

    Returns True when more than half of the `k_number` nearest training
    records have target 1, otherwise False (an exact tie therefore gives
    False).
    """
    distances = euclidean_distances(x_val)
    # A stable sort keeps equally distant records in training-set order, so
    # the selected neighbours (and the prediction) are reproducible.
    nearest = np.argsort(distances, kind='stable')[:k_number]
    # Targets are 0/1, so their sum is the number of positive neighbours.
    return np.asarray(train_y_values)[nearest].sum() > (k_number / 2)


def predict(x_values):
    """Predict a label for every record (row) of `x_values`.

    Returns a NumPy array with one boolean prediction per row.
    """
    return np.array([predict_single(x_val) for x_val in x_values])

#### Storing predictions for train & test sets

In [21]:
# Predict a label for every record of the train set and of the test set.
# NOTE: each train record is compared against the train set that contains it.
train_predicted_values = predict(train_x_values)
test_predicted_values = predict(test_x_values)

In [22]:
# categorise each record to its category
# NOTE(review): categorise_knn is not defined in this notebook; presumably it
# comes from the star-imported ml_library and converts the boolean votes into
# class labels - confirm against that module.
train_predicted_values = categorise_knn(train_predicted_values)
test_predicted_values = categorise_knn(test_predicted_values)

### Evaluating accuracy against train & test sets

In [23]:
# Accuracy of the predictions on the train set.
# NOTE(review): accuracy_classification is not defined in this notebook;
# presumably it comes from ml_library and returns a percentage - confirm.
accuracy_train_set = accuracy_classification(train_predicted_values, train_y_values)
accuracy_train_set

89.478672985782

In [24]:
# Accuracy of the predictions on the held-out test set.
# NOTE(review): accuracy_classification is not defined in this notebook;
# presumably it comes from ml_library and returns a percentage - confirm.
accuracy_test_set = accuracy_classification(test_predicted_values, test_y_values)
accuracy_test_set

88.49557522123894