In [7]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  47247      0  0:29:40  0:29:40 --:--:-- 70995 0:30:27 159682 54410 0  0:30:43  0:23:15  0:07:28 13402


In [8]:
import os
import re
import shutil
import string
import numpy as np
import mlfoundry as mlf

import pandas as pd
import tensorflow as tf
# import tensorflow_datasets as tfds
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.callbacks import Callback

## Training the model

## Using MlFoundry APIs

In [9]:
# Create the mlfoundry client and start a new run in the 'tensorflow-project'
# project. `mlf_run` is the handle the later cells use to log metrics, the
# exported model and dataset statistics.
mlf_api = mlf.get_client() 
mlf_run = mlf_api.create_run(project_name='tensorflow-project')

2022-04-06 00:42:34.438 INFO    mlfoundry.mlfoundry_api: Run is created with id 354f310fffbc423a9ca63130ecaf26d6 and name run_2022-04-05_19-12-34_utc


### Building the tensorflow model

In [10]:
def custom_standardization(input_data):
    """Normalize raw review text before it is tokenized.

    The text is lowercased, every literal ``<br />`` is swapped for a single
    space, and finally each character listed in ``string.punctuation`` is
    removed.

    Args:
        input_data: string tensor holding the raw text.

    Returns:
        A string tensor with the cleaned text.
    """
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    text = tf.strings.regex_replace(text, punctuation_pattern, '')
    return text


# --- Data loading / training settings ---
seed = 42          # passed to the train/validation split in get_raw_dataset
batch_size = 32
epochs = 10

# --- Vocabulary, sequence and embedding sizes ---
max_features = 10000
sequence_length = 250
embedding_dim = 16

AUTOTUNE = tf.data.AUTOTUNE

# Shared text vectorizer: cleans text with custom_standardization and emits
# integer token ids with an output length of `sequence_length`.
vectorize_layer = TextVectorization(
    output_mode='int',
    output_sequence_length=sequence_length,
    max_tokens=max_features,
    standardize=custom_standardization,
)


def download_data():
    """Download and unpack the IMDB archive, then delete ``train/unsup``.

    The archive is fetched with ``tf.keras.utils.get_file`` into the current
    directory (``cache_dir='.'``, empty ``cache_subdir``). Afterwards the
    ``aclImdb/train/unsup`` folder next to the downloaded file is removed.
    Nothing is returned.
    """
    archive_path = tf.keras.utils.get_file(
        "aclImdb_v1",
        "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        untar=True,
        cache_dir='.',
        cache_subdir='',
    )
    download_root = os.path.dirname(archive_path)
    unsup_dir = os.path.join(download_root, 'aclImdb', 'train', 'unsup')
    shutil.rmtree(unsup_dir)


def vectorize_text(text, label):
    """Turn a (text, label) pair into (token ids, label).

    A trailing axis is appended to ``text`` before it is passed through the
    module-level ``vectorize_layer``; ``label`` is returned unchanged.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

class MetricsLogCallback(Callback):
    """Keras callback that forwards each epoch's metrics to the mlfoundry run."""

    def on_epoch_end(self, epoch, logs=None):
        """Log the metrics gathered for the finished epoch.

        Args:
            epoch: index of the epoch that just ended (unused here).
            logs: mapping of metric name to value supplied by Keras; may be
                None or empty, in which case nothing is logged.
        """
        if not logs:
            # `logs` defaults to None; skip the call instead of handing
            # None / an empty mapping to the tracking client.
            return
        mlf_run.log_metrics(logs)   # logging metrics using mlfoundry run


def get_raw_dataset():
    """Load the raw (un-vectorized) train, validation and test datasets.

    ``aclImdb/train`` is split 80/20 into training and validation subsets using
    the module-level ``seed``; ``aclImdb/test`` is loaded as-is. All three are
    batched with ``batch_size``.

    Returns:
        Tuple ``(raw_train_ds, raw_val_ds, raw_test_ds)``.
    """
    load_text_dir = tf.keras.preprocessing.text_dataset_from_directory

    # Both subsets must share the same split fraction and seed so that they
    # partition the training folder consistently.
    split_options = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

    raw_train_ds = load_text_dir('aclImdb/train', subset='training', **split_options)
    raw_val_ds = load_text_dir('aclImdb/train', subset='validation', **split_options)
    raw_test_ds = load_text_dir('aclImdb/test', batch_size=batch_size)

    return raw_train_ds, raw_val_ds, raw_test_ds


def prep_dataset(raw_train_ds, raw_val_ds, raw_test_ds):
    """Fit the vectorizer on the training text and vectorize all three splits.

    Args:
        raw_train_ds: dataset of (text, label) batches used for training; its
            text is also used to adapt ``vectorize_layer``.
        raw_val_ds: dataset of (text, label) batches used for validation.
        raw_test_ds: dataset of (text, label) batches used for testing.

    Returns:
        Tuple ``(train_ds, val_ds, test_ds)`` of vectorized datasets, each
        cached and prefetched with ``AUTOTUNE``.
    """
    # Make a text-only dataset (without labels), then call adapt. Only the
    # training split is used so the vocabulary is not fitted on val/test text.
    train_text = raw_train_ds.map(lambda x, y: x)
    vectorize_layer.adapt(train_text)

    def _vectorize_and_cache(raw_ds):
        # Vectorize the text, then cache and prefetch the result.
        return raw_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)

    train_ds = _vectorize_and_cache(raw_train_ds)
    val_ds = _vectorize_and_cache(raw_val_ds)
    test_ds = _vectorize_and_cache(raw_test_ds)
    return train_ds, val_ds, test_ds


def build_model(train_ds, val_ds, test_ds):
    """Build, compile and train the sentiment classifier.

    The network is an embedding layer followed by dropout, global average
    pooling, dropout and a single-unit dense layer that outputs a logit.

    Args:
        train_ds: vectorized training dataset passed to ``fit``.
        val_ds: vectorized validation dataset passed to ``fit``.
        test_ds: accepted for interface compatibility; not used here.

    Returns:
        The trained ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        layers.Embedding(max_features + 1, embedding_dim),
        layers.Dropout(0.2),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.2),
        layers.Dense(1)])

    model.summary()

    # The final layer emits logits, hence from_logits=True and an accuracy
    # threshold of 0.0 (logit > 0 is the same as probability > 0.5).
    # `metrics` is passed as a list, which is the documented form of the
    # argument.
    model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
                  optimizer='adam',
                  metrics=[tf.metrics.BinaryAccuracy(threshold=0.0)])

    model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=epochs,
        callbacks=[MetricsLogCallback()])  # forwards epoch metrics to mlfoundry
    return model


def build_exportable_model(model):
    """Wrap a trained model so that it accepts raw strings end to end.

    The returned model chains the shared ``vectorize_layer``, the given
    ``model`` and a sigmoid activation, and is compiled with binary
    cross-entropy (``from_logits=False``), the adam optimizer and accuracy.

    Args:
        model: the trained classifier operating on vectorized text.

    Returns:
        The compiled ``tf.keras.Sequential`` export model.
    """
    stages = [
        vectorize_layer,
        model,
        layers.Activation('sigmoid'),
    ]
    export_model = tf.keras.Sequential(stages)

    probability_loss = losses.BinaryCrossentropy(from_logits=False)
    export_model.compile(loss=probability_loss, optimizer="adam", metrics=['accuracy'])
    return export_model

### Training model

In [11]:
# download_data()   # uncomment this to download the dataset
# NOTE(review): 'aclImdb/train' should only contain the `neg` and `pos`
# folders at this point. If `train/unsup` is still present (download_data()
# removes it), the loader reports 3 classes and the binary model is trained on
# the wrong labels.

# 1) load raw text batches, 2) fit the vectorizer and vectorize each split,
# 3) train the classifier, 4) wrap it so it accepts raw strings for export.
raw_train_ds, raw_val_ds, raw_test_ds = get_raw_dataset()
train_ds, val_ds, test_ds = prep_dataset(raw_train_ds, raw_val_ds, raw_test_ds)
model = build_model(train_ds, val_ds, test_ds)
export_model = build_exportable_model(model)

Found 75000 files belonging to 3 classes.
Using 60000 files for training.
Found 75000 files belonging to 3 classes.
Using 15000 files for validation.
Found 25000 files belonging to 2 classes.
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          160016    
                                                                 
 dropout (Dropout)           (None, None, 16)          0         
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense (Dense)               (None, 1)                 17     

2022-04-06 00:43:00.040 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


Epoch 2/10

2022-04-06 00:43:09.895 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


Epoch 3/10

2022-04-06 00:43:19.683 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


Epoch 4/10

2022-04-06 00:43:29.342 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


Epoch 5/10

2022-04-06 00:43:39.189 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


Epoch 6/10

2022-04-06 00:43:49.094 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


Epoch 7/10

2022-04-06 00:43:59.087 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


Epoch 8/10

2022-04-06 00:44:08.982 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


Epoch 9/10

2022-04-06 00:44:18.777 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


Epoch 10/10

2022-04-06 00:44:28.618 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully




### Logging Model

In [12]:
# Bundle the export model with its (unset) save signatures/options and hand it
# to the mlfoundry run as a TensorFlow model.
model_loggable = dict(
    model=export_model,
    signatures=None,
    options=None,
)

mlf_run.log_model(model_loggable, mlf.ModelFramework.TENSORFLOW)

2022-04-06 00:44:29.207779: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
2022-04-06 00:44:29.869 INFO    tensorflow: Assets written to: /home/rizwan/enthire/mlfoundry-examples/examples/tensorflow/servicefoundry/logdirs/models/assets
2022-04-06 00:44:29.904 INFO    mlfoundry.mlfoundry_run: Model logged Successfully


### Logging Predictions

In [14]:
# Grab one batch of raw test reviews together with their labels.
for text_batch, label_batch in raw_test_ds.take(1):
    X_test, y_test = text_batch.numpy(), label_batch.numpy()

# The export model takes raw strings and returns sigmoid outputs.
y_hat_test = export_model.predict(X_test)

### Logging Dataset Stats

In [15]:
# Flatten the model outputs to a 1-D vector and round them to 0/1 labels.
# reshape(-1) is used instead of reshape(batch_size) so the cell also works
# when the batch holds fewer than `batch_size` rows.
y_hat_test = np.round(y_hat_test.reshape(-1)).astype('int32')

# One row per review: raw text, true label and predicted label.
X_test_df = pd.DataFrame(X_test, columns=['text'])
X_test_df['targets'] = y_test
X_test_df['predictions'] = y_hat_test

# The schema tells mlfoundry which column is the feature, which is the
# prediction and which is the ground truth.
mlf_run.log_dataset_stats(
    X_test_df, 
    data_slice="test",
    data_schema=mlf.Schema(
        feature_column_names=['text'],
        prediction_column_name="predictions",
        actual_column_name="targets"
    ),
    model_type="binary_classification",
)

  self.__compute_whylogs_stats(df[set(data_schema.feature_column_names)])
2022-04-06 01:05:38.086 INFO    whylogs.app.config: No config file loaded
  df[set(data_schema.feature_column_names)],
2022-04-06 01:05:38.118 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully
2022-04-06 01:05:38.120 INFO    mlfoundry.mlfoundry_run: Dataset stats have been successfully computed and logged


WARN: Missing config
