In [None]:
import numpy as np
import pandas as pd

In [None]:
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [None]:
features = ['smart_5_raw', 'smart_187_raw', 'smart_188_raw',
            'smart_197_raw', 'smart_198_raw', 'failure']
train_data = pd.read_csv("jan_feb_backblaze_train.csv").reindex(columns=features)
test_data = pd.read_csv("nov_dec_backblaze_test.csv").reindex(columns=features)
test_data.head()

Unnamed: 0,smart_5_raw,smart_187_raw,smart_188_raw,smart_197_raw,smart_198_raw,failure
0,0.0,2.0,0.0,0.0,0.0,1.0
1,102.0,,,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,5.0,0.0,64.0,64.0,1.0
4,0.0,,,1.0,0.0,1.0


In [None]:
print (train_data.smart_187_raw.value_counts(dropna=False))
print (test_data.smart_187_raw.value_counts(dropna=False))

0.0     1357
NaN      798
1.0       14
2.0        9
4.0        4
12.0       4
7.0        3
6.0        3
3.0        3
18.0       2
8.0        2
36.0       2
31.0       1
24.0       1
30.0       1
21.0       1
9.0        1
39.0       1
Name: smart_187_raw, dtype: int64
0.0     1349
NaN      825
2.0       13
1.0        8
6.0        7
12.0       4
3.0        4
4.0        3
5.0        1
33.0       1
78.0       1
7.0        1
27.0       1
11.0       1
18.0       1
24.0       1
8.0        1
17.0       1
9.0        1
Name: smart_187_raw, dtype: int64


In [None]:
train_data = train_data.fillna(value=-1)
test_data = test_data.fillna(value=-1)

In [None]:
test_data[['failure']] = test_data[['failure']].astype(int)  
test_data.head()


Unnamed: 0,smart_5_raw,smart_187_raw,smart_188_raw,smart_197_raw,smart_198_raw,failure
0,0.0,2.0,0.0,0.0,0.0,1
1,102.0,-1.0,-1.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,1
3,0.0,5.0,0.0,64.0,64.0,1
4,0.0,-1.0,-1.0,1.0,0.0,1


In [None]:
train_data.head()

Unnamed: 0,smart_5_raw,smart_187_raw,smart_188_raw,smart_197_raw,smart_198_raw,failure
0,0,-1.0,-1.0,0,-1.0,0
1,2,-1.0,-1.0,0,-1.0,0
2,0,-1.0,-1.0,0,-1.0,0
3,0,-1.0,-1.0,0,-1.0,0
4,0,0.0,-1.0,0,-1.0,0


In [None]:
x_train = train_data.drop(['failure'], axis=1)
x_test = test_data.drop(['failure'], axis=1)

x_test.head()

Unnamed: 0,smart_5_raw,smart_187_raw,smart_188_raw,smart_197_raw,smart_198_raw
0,0.0,2.0,0.0,0.0,0.0
1,102.0,-1.0,-1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,5.0,0.0,64.0,64.0
4,0.0,-1.0,-1.0,1.0,0.0


In [None]:
y_train = train_data['failure']
y_test = test_data['failure']
y_test.astype('int64')
y_test.head()

0    1
1    1
2    1
3    1
4    1
Name: failure, dtype: int64

In [None]:
def learn_parameters(x, y, mus, scales, optimiser, epochs):
    """
    Set up the class conditional distributions as a MultivariateNormalDiag
    object, and update the trainable variables in a custom training loop.
    """
    @tf.function
    def nll(dist, x_train, y_train):
        log_probs = dist.log_prob(x_train)
        L = len(tf.unique(y_train)[0])
        y_train = tf.one_hot(indices=y_train, depth=L)
        return -tf.reduce_mean(log_probs * y_train)

    @tf.function
    def get_loss_and_grads(dist, x_train, y_train):
        with tf.GradientTape() as tape:
            tape.watch(dist.trainable_variables)
            loss = nll(dist, x_train, y_train)
            grads = tape.gradient(loss, dist.trainable_variables)
        return loss, grads

    nll_loss = []
    mu_values = []
    scales_values = []
    x = tf.cast(np.expand_dims(x, axis=1), tf.float32)
    dist = tfd.MultivariateNormalDiag(loc=mus, scale_diag=scales)
    for epoch in range(epochs):
        loss, grads = get_loss_and_grads(dist, x, y)
        optimiser.apply_gradients(zip(grads, dist.trainable_variables))
        nll_loss.append(loss)
        mu_values.append(mus.numpy())
        scales_values.append(scales.numpy())
    nll_loss, mu_values, scales_values = \
        np.array(nll_loss), np.array(mu_values), np.array(scales_values)
    return (nll_loss, mu_values, scales_values, dist)

In [None]:
mus = tf.Variable([[1., 0., 1., 1., 1.], [1., 0., 1., 1., 1.]])
scales = tf.Variable([[1., 1., 1., 1., 1.], [1., 1., 1., 1., 1.]])
opt = tf.keras.optimizers.Adam(learning_rate=0.005)
epochs = 10000
nlls, mu_arr, scales_arr, class_conditionals = learn_parameters(x_train, y_train, mus, scales, opt, epochs)


In [None]:
def predict_class(prior, class_conditionals, x):
    def predict_fn(myx):
        class_probs = class_conditionals.prob(tf.cast(myx, dtype=tf.float32))
        prior_probs = tf.cast(prior.probs, dtype=tf.float32)
        class_times_prior_probs = class_probs * prior_probs
        Q = tf.reduce_sum(class_times_prior_probs)       # Technically, this step
        P = tf.math.divide(class_times_prior_probs, Q)   # and this one, are not necessary.
        Y = tf.cast(tf.argmax(P), dtype=tf.float64)
        return Y
    y = tf.map_fn(predict_fn, x)
    return y

# Get the class predictions
# Evaluate the model accuracy on the test set
predictions = predict_class(prior, class_conditionals, x_test)
accuracy = accuracy_score(y_test, predictions)
print("Test accuracy: {:.4f}".format(accuracy*100))

Test accuracy: 89.9281


In [211]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(8))
model.add(tf.keras.layers.Dense(1))
model.compile(optimizer='adam', loss='mse')

model.fit(x_train, y_train,  epochs=500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f3b66e78610>

In [204]:
new_input = [0, 2, 0, 100, 0]
new_input = np.array(new_input).reshape(1, -1)
model.predict_classes(new_input)




array([[1]], dtype=int32)

In [212]:
KERAS_MODEL_NAME = "hdd_model.h5"
model.save(KERAS_MODEL_NAME)

In [206]:
import os
def get_file_size(file_path):
    size = os.path.getsize(file_path)
    return size
def convert_bytes(size, unit=None):
    if unit == "KB":
        return print('File size: ' + str(round(size / 1024, 3)) + ' Kilobytes')
    elif unit == "MB":
        return print('File size: ' + str(round(size / (1024 * 1024), 3)) + ' Megabytes')
    else:
        return print('File size: ' + str(size) + ' bytes')

In [207]:
convert_bytes(get_file_size(KERAS_MODEL_NAME), "MB")

File size: 0.024 Megabytes


In [208]:
test_loss, test_acc = model.evaluate(x_test,  y_test, verbose=2)
print('\nTest accuracy:', test_acc)

70/70 - 0s - loss: 11538217039822848.0000 - accuracy: 0.9388

Test accuracy: 0.9388489127159119


In [None]:
TF_LITE_MODEL_FILE_NAME = "hdd_failure.tflite"

In [217]:

converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.experimental_new_converter=True
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
                                       tf.lite.OpsSet.SELECT_TF_OPS]
tflite_model = converter.convert()


INFO:tensorflow:Assets written to: /tmp/tmpj5w5ug_q/assets


INFO:tensorflow:Assets written to: /tmp/tmpj5w5ug_q/assets


In [218]:
tflite_model_name = TF_LITE_MODEL_FILE_NAME
open(tflite_model_name, "wb").write(tflite_model)

1808

In [219]:
convert_bytes(get_file_size(TF_LITE_MODEL_FILE_NAME), "KB")

File size: 1.766 Kilobytes
