**Problem**  
You are given data from an Audiobook app. Each customer in the database has made a purchase at least once. We want to create a machine learning algorithm based on our available data that can predict if a customer will buy again from the Audiobook company.  
There are several variables: Customer ID, Book length in mins_avg (average of all purchases), Book length in minutes_sum (sum of all purchases), Price Paid_avg (average of all purchases), Price paid_sum (sum of all purchases), Review (a Boolean variable), Review (out of 10), Total minutes listened, Completion (from 0 to 1), Support requests (number), and Last visited minus purchase date (in days). The targets are a Boolean variable (0 or 1). We are taking a period of 2 years in our inputs, and the next 6 months as targets. So we are predicting if based on the last 2 years of activity and engagement, a customer will convert in the next 6 months.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing

### Extract the data from the csv

In [2]:
raw_csv_data = np.loadtxt('Audiobooks_data.csv',delimiter=',')

# The inputs are all columns in the csv, except for the first one (customer id) and the last one (target)
unscaled_inputs_all = raw_csv_data[:,1:-1]

# save the targets
targets_all = raw_csv_data[:,-1]

### Balance the dataset

In [3]:
# Count how many targets are 1 (i.e. the customer did convert)
num_one_targets = int(np.sum(targets_all))

# Set a counter for targets that are 0 (i.e. the customer did not convert)
zero_targets_counter = 0

indices_to_remove = []

# Once there are as many 0s as 1s, mark entries where the target is 0.
for i in range(targets_all.shape[0]):
    if targets_all[i] == 0:
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)

# Delete all indices marked "to remove"
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

In [4]:
# standardize the inputs
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

In [5]:
# shuffle the data
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

### Split the dataset into train, validation, and test

In [6]:
# Count the total number of samples
samples_count = shuffled_inputs.shape[0]

# Count the samples in each subset, assuming the 80-10-10 distribution of training, validation, and test
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)

# The 'test' dataset contains all remaining data.
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Create variables that record the inputs and targets for training
train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]

# Create variables that record the inputs and targets for validation
validation_inputs = shuffled_inputs[train_samples_count:train_samples_count+validation_samples_count]
validation_targets = shuffled_targets[train_samples_count:train_samples_count+validation_samples_count]

# Create variables that record the inputs and targets for test
test_inputs = shuffled_inputs[train_samples_count+validation_samples_count:]
test_targets = shuffled_targets[train_samples_count+validation_samples_count:]

# Print the number of 1 targets, the total number of samples, and the proportion data sets
print('Train:', np.sum(train_targets), train_samples_count, np.sum(train_targets) / train_samples_count)
print('Validation:', np.sum(validation_targets), validation_samples_count, np.sum(validation_targets) / validation_samples_count)
print('Test:', np.sum(test_targets), test_samples_count, np.sum(test_targets) / test_samples_count)

Train: 1773.0 3579 0.4953897736797988
Validation: 236.0 447 0.5279642058165548
Test: 228.0 448 0.5089285714285714


In [7]:
# Save the three datasets in *.npz
np.savez('Audiobooks_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Audiobooks_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('Audiobooks_data_test', inputs=test_inputs, targets=test_targets)

In [8]:
# load npz sets
npz = np.load('Audiobooks_data_train.npz')

train_inputs = npz['inputs'].astype(np.float)
train_targets = npz['targets'].astype(np.int)

npz = np.load('Audiobooks_data_validation.npz')
validation_inputs, validation_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)

npz = np.load('Audiobooks_data_test.npz')
test_inputs, test_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  # This is added back by InteractiveShellApp.init_path()
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  # This is added back by InteractiveShellApp.init_path()


### Model

In [9]:
input_size = 10
output_size = 2
hidden_layer_size = 50
    
model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(output_size, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

batch_size = 100
max_epochs = 100

# set early stopping to be a bit tolerant against random validation loss increases
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

model.fit(train_inputs,
          train_targets,
          batch_size=batch_size,
          epochs=max_epochs,
          callbacks=[early_stopping],
          validation_data=(validation_inputs, validation_targets),
          verbose=2
          )

Epoch 1/100
36/36 - 1s - loss: 0.5983 - accuracy: 0.6871 - val_loss: 0.5247 - val_accuracy: 0.7763 - 532ms/epoch - 15ms/step
Epoch 2/100
36/36 - 0s - loss: 0.4811 - accuracy: 0.7684 - val_loss: 0.4528 - val_accuracy: 0.7852 - 56ms/epoch - 2ms/step
Epoch 3/100
36/36 - 0s - loss: 0.4275 - accuracy: 0.7877 - val_loss: 0.4193 - val_accuracy: 0.7987 - 54ms/epoch - 2ms/step
Epoch 4/100
36/36 - 0s - loss: 0.4014 - accuracy: 0.7952 - val_loss: 0.3991 - val_accuracy: 0.8031 - 56ms/epoch - 2ms/step
Epoch 5/100
36/36 - 0s - loss: 0.3811 - accuracy: 0.8013 - val_loss: 0.3750 - val_accuracy: 0.8233 - 54ms/epoch - 1ms/step
Epoch 6/100
36/36 - 0s - loss: 0.3711 - accuracy: 0.8036 - val_loss: 0.3616 - val_accuracy: 0.8210 - 50ms/epoch - 1ms/step
Epoch 7/100
36/36 - 0s - loss: 0.3698 - accuracy: 0.7991 - val_loss: 0.3696 - val_accuracy: 0.8098 - 52ms/epoch - 1ms/step
Epoch 8/100
36/36 - 0s - loss: 0.3598 - accuracy: 0.8061 - val_loss: 0.3530 - val_accuracy: 0.8255 - 51ms/epoch - 1ms/step
Epoch 9/100
36

<keras.callbacks.History at 0x7fac3b14f0d0>

In [10]:
# test the model
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)



In [11]:
print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))


Test loss: 0.34. Test accuracy: 80.36%
