# Deep learning real-life example (Audio books business)

Customer conversion for an audiobook company: we have data covering 2 years of activity, plus a further 6 months used to check conversion. This is supervised learning, since we have the targets; we want to predict whether a customer will buy again (a binary classification problem: will buy / won't buy).

In [1]:
import numpy as np
import tensorflow as tf
from sklearn import preprocessing

In [2]:
# Load data
# Parse the comma-separated file into one numeric array (rows = samples).
raw_csv_data = np.loadtxt('./data/audiobooks.csv', delimiter=',')

# The last column holds the targets; every column strictly between the first
# and the last is a model input. The first column is discarded (presumably a
# customer ID with no predictive value — confirm against the CSV schema).
targets_all = raw_csv_data[:, -1]
unscaled_inputs_all = raw_csv_data[:, 1:-1]

In [3]:
# Balance dataset
# Keep every row whose target is 1 and at most the same number of rows whose
# target is 0, so that both classes end up with (roughly) equal priors.
# Assumes targets are encoded as 0/1, so their sum is the number of ones.
num_one_targets = int(np.sum(targets_all))

# Row positions of all zero-target samples, in their original file order.
zero_target_indices = np.flatnonzero(targets_all == 0)
# Total number of zero-target rows seen (kept for any later inspection).
zero_targets_counter = zero_target_indices.size

# The first `num_one_targets` zeros are kept; every zero after that is surplus.
# If there are fewer zeros than ones the slice is empty and nothing is removed.
indices_to_remove = zero_target_indices[num_one_targets:].tolist()

unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

In [4]:
# Standardize inputs
# preprocessing.scale standardizes each feature column (by default: centered
# to zero mean and scaled to unit variance).
# NOTE(review): this runs before the train/validation/test split further down,
# so the scaling statistics are computed over all rows, including the ones that
# are later held out. Consider fitting the scaler on the training rows only.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

In [5]:
# Shuffle data
# A fixed seed makes the permutation — and therefore which rows land in the
# train / validation / test sets — identical on every Restart & Run All.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

# One random ordering of the row positions, applied to inputs and targets
# alike so that every sample stays paired with its own target.
shuffled_indices = shuffle_rng.permutation(scaled_inputs.shape[0])

shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

In [6]:
# Train-validation-test split
# 80% of the shuffled rows go to training, 10% to validation, and whatever is
# left (so that no row is lost to integer truncation) goes to the test set.
samples_count = shuffled_inputs.shape[0]

train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - (train_samples_count + validation_samples_count)

# Contiguous, non-overlapping row ranges for the three subsets.
train_rows = slice(0, train_samples_count)
validation_rows = slice(train_samples_count, train_samples_count + validation_samples_count)
test_rows = slice(train_samples_count + validation_samples_count, None)

train_inputs, train_targets = shuffled_inputs[train_rows], shuffled_targets[train_rows]
validation_inputs, validation_targets = shuffled_inputs[validation_rows], shuffled_targets[validation_rows]
test_inputs, test_targets = shuffled_inputs[test_rows], shuffled_targets[test_rows]

In [7]:
# Save datasets
# Each subset is written to its own archive with two named arrays, 'inputs'
# and 'targets'. np.savez appends the '.npz' extension to the given path.
datasets_to_save = (
    ('./data/Audiobooks_data_train', train_inputs, train_targets),
    ('./data/Audiobooks_data_validation', validation_inputs, validation_targets),
    ('./data/Audiobooks_data_test', test_inputs, test_targets),
)
for npz_path, subset_inputs, subset_targets in datasets_to_save:
    np.savez(npz_path, inputs=subset_inputs, targets=subset_targets)

In [11]:
# Load datasets
def _load_split(npz_path):
    """Read one saved subset and return it as ``(inputs, targets)``.

    Args:
        npz_path: Path to an ``.npz`` archive containing the arrays
            ``'inputs'`` and ``'targets'``.

    Returns:
        A tuple ``(inputs, targets)`` where ``inputs`` is cast to
        ``np.float64`` and ``targets`` to ``np.int32``.
    """
    # np.load keeps the archive's file handle open; the context manager closes
    # it once both arrays have been read and copied out by astype().
    with np.load(npz_path) as npz:
        return npz['inputs'].astype(np.float64), npz['targets'].astype(np.int32)


train_inputs, train_targets = _load_split('./data/Audiobooks_data_train.npz')
validation_inputs, validation_targets = _load_split('./data/Audiobooks_data_validation.npz')
test_inputs, test_targets = _load_split('./data/Audiobooks_data_test.npz')

In [12]:
# NN Model
input_size = 10
output_size = 2  # two classes: will buy / won't buy
hidden_layer_size = 200

# Two equally sized ReLU hidden layers followed by a softmax output layer that
# yields one probability per class.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(tf.keras.layers.Dense(output_size, activation='softmax'))

# The sparse variant of the loss takes integer class labels directly, which
# matches the integer targets loaded above (no one-hot encoding needed).
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [21]:
# Training the model
batch_size, max_epochs = 100, 100

# Early stopping watches the validation loss (the callback's default monitor)
# and halts training after 10 epochs without improvement. When it triggers,
# restore_best_weights rolls the model back to the epoch with the best
# validation loss instead of leaving it on weights from 10 epochs later.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

# Keep the returned History object in a variable: it holds the per-epoch
# metrics for later inspection, and the cell no longer echoes its repr.
history = model.fit(
    train_inputs,
    train_targets,
    batch_size=batch_size,
    epochs=max_epochs,
    callbacks=[early_stopping],
    validation_data=(validation_inputs, validation_targets),
    verbose=2
)

Epoch 1/100
36/36 - 0s - loss: 0.2892 - accuracy: 0.8416 - val_loss: 0.3535 - val_accuracy: 0.8143 - 71ms/epoch - 2ms/step
Epoch 2/100
36/36 - 0s - loss: 0.2881 - accuracy: 0.8371 - val_loss: 0.3472 - val_accuracy: 0.8054 - 55ms/epoch - 2ms/step
Epoch 3/100
36/36 - 0s - loss: 0.2942 - accuracy: 0.8340 - val_loss: 0.3449 - val_accuracy: 0.8076 - 55ms/epoch - 2ms/step
Epoch 4/100
36/36 - 0s - loss: 0.2888 - accuracy: 0.8393 - val_loss: 0.3463 - val_accuracy: 0.8121 - 56ms/epoch - 2ms/step
Epoch 5/100
36/36 - 0s - loss: 0.2865 - accuracy: 0.8438 - val_loss: 0.3408 - val_accuracy: 0.8076 - 55ms/epoch - 2ms/step
Epoch 6/100
36/36 - 0s - loss: 0.2872 - accuracy: 0.8421 - val_loss: 0.3412 - val_accuracy: 0.8166 - 58ms/epoch - 2ms/step
Epoch 7/100
36/36 - 0s - loss: 0.2901 - accuracy: 0.8351 - val_loss: 0.3452 - val_accuracy: 0.8121 - 56ms/epoch - 2ms/step
Epoch 8/100
36/36 - 0s - loss: 0.2882 - accuracy: 0.8421 - val_loss: 0.3580 - val_accuracy: 0.8054 - 54ms/epoch - 1ms/step
Epoch 9/100
36/3

<keras.callbacks.History at 0x2333f273cd0>

In [23]:
# Testing the model
# evaluate() returns the loss followed by the metrics given to compile()
# (here only 'accuracy'), computed on the held-out test set.
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)

