In [91]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [92]:
# Reads the data set
data_set = pd.read_csv('Telemarketing Dataset/bank-additional-full.csv', delimiter = ";")
# Prints it out
data_set.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [93]:
# Gets all the non numerical-values.
data_set_obj_columns = data_set.select_dtypes(['object']).columns
data_set[data_set_obj_columns] = data_set[data_set_obj_columns].apply(lambda x: x.astype('category').cat.codes)
data_set.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,3,1,0,0,0,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
1,57,7,1,3,1,0,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
2,37,7,1,3,0,2,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
3,40,0,1,1,0,0,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
4,56,7,1,3,0,0,2,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0


In [94]:
# You have to shuffle it BEFORE extracting the Y values. Otherwise
# you will have mismatched labels.
all_data =  data_set.values
# We get the data_set's last column as the label.
all_labels = data_set['y'].values
# We get the processed 
data_set_processed = data_set.drop(['y'], axis = 1)
data_set_processed.head()
data_set_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 20 columns):
age               41188 non-null int64
job               41188 non-null int8
marital           41188 non-null int8
education         41188 non-null int8
default           41188 non-null int8
housing           41188 non-null int8
loan              41188 non-null int8
contact           41188 non-null int8
month             41188 non-null int8
day_of_week       41188 non-null int8
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null int8
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
dtypes: float64(5), int64(5), int8(10)
memory usage: 3.5 MB


In [132]:
# Convert it to numpy array 
# Split it 2.5K Dev; 2.5K Test; Remaining Train.
# I dokn't think this is going to be good. But it's closer to what Andrew Ng said.
# Ideally we'd get more data than that. But we don't have. 
# Actually, scikit has a function that help us!!
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
training_data, test_data, training_label, test_label = train_test_split(data_set_processed.values, all_labels,  test_size=0.125)
# Let's do the splitting once more to get the dev, and test:
print('Before resplitting', test_data.shape)

# We now split the test into 2 more to get the dev and the label.
test_data, dev_data, test_label, dev_label = train_test_split(test_data, test_label, test_size = 0.5)


print(training_data.shape)
print(test_data.shape)
print(dev_data.shape)

Before resplitting (5149, 20)
(36039, 20)
(2574, 20)
(2575, 20)


In [97]:
# np.random.seed(1)
# # Removes the scientific notation
# np.set_printoptions(suppress = True)
# a = np.random.rand(3,3)
# print(a)
# np.random.shuffle(a)
# print(a)
print(training_data.shape[0])


36039


In [151]:
# Define the Keras Model
model = keras.Sequential([
    keras.layers.Dense(1, activation = tf.nn.sigmoid, input_shape = (20, ))
])

In [154]:
# Compile the model
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', 
              metrics= ['accuracy', 'binary_crossentropy'] )

In [155]:
# Excecutes the model
# Note. You could've specified a validation set by specifying 
# validation_split
# https://www.youtube.com/watch?v=dzoh8cfnvnI&feature=youtu.be
history = model.fit(training_data, training_label, epochs = 10, batch_size = 128,
# By setting verbose 0, 1 or 2 you just say how do you want to 'see' the training progress for each epoch.
# https://stackoverflow.com/questions/47902295/what-is-the-use-of-verbose-in-keras-while-validating-the-model
                   validation_data = (dev_data, dev_label), verbose = 2)


Train on 36039 samples, validate on 2575 samples
Epoch 1/10
 - 1s - loss: 1.8283 - acc: 0.8866 - binary_crossentropy: 1.8283 - val_loss: 1.7088 - val_acc: 0.8940 - val_binary_crossentropy: 1.7088
Epoch 2/10
 - 0s - loss: 1.8283 - acc: 0.8866 - binary_crossentropy: 1.8283 - val_loss: 1.7088 - val_acc: 0.8940 - val_binary_crossentropy: 1.7088
Epoch 3/10
 - 0s - loss: 1.8283 - acc: 0.8866 - binary_crossentropy: 1.8283 - val_loss: 1.7088 - val_acc: 0.8940 - val_binary_crossentropy: 1.7088
Epoch 4/10
 - 0s - loss: 1.8283 - acc: 0.8866 - binary_crossentropy: 1.8283 - val_loss: 1.7088 - val_acc: 0.8940 - val_binary_crossentropy: 1.7088
Epoch 5/10
 - 0s - loss: 1.8283 - acc: 0.8866 - binary_crossentropy: 1.8283 - val_loss: 1.7088 - val_acc: 0.8940 - val_binary_crossentropy: 1.7088
Epoch 6/10
 - 0s - loss: 1.8283 - acc: 0.8866 - binary_crossentropy: 1.8283 - val_loss: 1.7088 - val_acc: 0.8940 - val_binary_crossentropy: 1.7088
Epoch 7/10
 - 0s - loss: 1.8283 - acc: 0.8866 - binary_crossentropy: 

In [161]:
# Let's try to check it out :/ 
test_predictions = model.predict(test_data).flatten()
rango = len(test_label)
test_corr = 0
for i in range(rango):
    cond = test_predictions[i] == test_label[i]
    test_corr += 1 if cond else 0
#     print('Correct' if cond else 'Incorrect')

    
print(f'Total accuracy is {test_corr/rango} from {rango} samples')

Total accuracy is 0.8916083916083916 from 2574 samples
