## Try this Notebook in Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truefoundry/mlfoundry-examples/blob/main/examples/tensorflow/sentiment_analysis.ipynb)

## Install dependencies

In [None]:
# Install the pinned scientific stack quietly, then install/upgrade mlfoundry
# within the 0.4.x range. The leading "!" runs each line as a shell command.
! pip install --quiet "numpy>=1.0.0,<2.0.0" "pandas>=1.0.0,<2.0.0" "matplotlib>=3.5.2,<3.6.0" "tensorflow>=2.0.0,<3.0.0" shap==0.40.0
! pip install -U "mlfoundry>=0.4.2,<0.5.0"

## Initialize MLFoundry Client

In [None]:
import mlfoundry as mlf

# Module-level MLFoundry client; later cells use it to create the tracking run.
client = mlf.get_client()

---

In [None]:
import os
import getpass
import urllib.parse
import re
import shutil
import string

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.callbacks import Callback

import mlfoundry as mlf

## Training the model

## Using MLFoundry APIs

In [None]:
# Create the run that the training callback and the model-logging cell write to.
mlf_run = client.create_run(project_name='tensorflow-project')

### Building the TensorFlow model

In [None]:
# Text standardisation used by the vectorisation layer.
def custom_standardization(input_data):
    """Normalise raw review text before it is tokenised.

    The input is lower-cased, every literal ``<br />`` tag is replaced by a
    single space, and every character listed in ``string.punctuation`` is
    deleted.

    Args:
        input_data: String tensor holding the raw text.

    Returns:
        The cleaned string tensor.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_class = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    text = tf.strings.regex_replace(text, punctuation_class, '')
    return text


# Hyper-parameters shared by the data pipeline and the model below.
batch_size = 32          # batch size passed to text_dataset_from_directory
seed = 42                # seed for the training/validation split
max_features = 10000     # vocabulary cap (max_tokens) and basis for the embedding input size
sequence_length = 250    # output_sequence_length of the vectorisation layer
embedding_dim = 16       # width of the embedding vectors
epochs = 10              # number of epochs passed to model.fit
AUTOTUNE = tf.data.AUTOTUNE  # used as the prefetch buffer size

# Turns raw strings into fixed-length integer sequences. Its vocabulary is
# learned later by calling ``adapt`` on the training text.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)


def download_data():
    """Download and unpack the IMDB review archive next to the notebook.

    The archive is fetched with ``tf.keras.utils.get_file`` using the current
    directory as the cache location, and is expected to unpack into an
    ``aclImdb`` folder. The ``train/unsup`` sub-folder is then deleted so that
    only the remaining sub-folders of ``train`` are left (presumably the
    labelled classes -- verify against the archive layout).

    The deletion is skipped when ``train/unsup`` is not present, so calling
    this function again after the folder has already been removed does not
    raise ``FileNotFoundError``.
    """
    url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    dataset = tf.keras.utils.get_file("aclImdb_v1", url, untar=True, cache_dir='.', cache_subdir='')
    dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
    train_dir = os.path.join(dataset_dir, 'train')
    remove_dir = os.path.join(train_dir, 'unsup')
    # Guard the removal: the folder may already be gone from an earlier run.
    if os.path.isdir(remove_dir):
        shutil.rmtree(remove_dir)


def vectorize_text(text, label):
    """Convert a (text, label) pair into (token ids, label).

    A trailing axis is appended to ``text`` before it is passed through the
    module-level ``vectorize_layer``; ``label`` is returned untouched.
    """
    expanded = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(expanded)
    return token_ids, label

class MetricsLogCallback(Callback):
    """Keras callback that forwards per-epoch metrics to the MLFoundry run."""

    def on_epoch_end(self, epoch, logs=None):
        """Log the metrics of the finished epoch to the global ``mlf_run``.

        Args:
            epoch: Zero-based index of the epoch that just ended; used as the
                metric step so each epoch is recorded as its own point rather
                than all epochs sharing the default step.
            logs: Mapping of metric name to value supplied by Keras. Nothing
                is logged when it is ``None`` or empty.
        """
        if not logs:
            return
        mlf_run.log_metrics(logs, step=epoch)   # logging metrics using mlfoundry run


def get_raw_dataset():
    """Load the raw (un-vectorised) text datasets from the ``aclImdb`` folder.

    ``aclImdb/train`` is split 80/20 into training and validation subsets
    using the module-level ``seed``; ``aclImdb/test`` is loaded in full.
    All three datasets are batched with the module-level ``batch_size``.

    Returns:
        A ``(raw_train_ds, raw_val_ds, raw_test_ds)`` tuple.
    """
    load_from_directory = tf.keras.preprocessing.text_dataset_from_directory

    # Both subsets must share the same split fraction and seed so that they
    # partition the training folder consistently.
    split_options = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

    raw_train_ds = load_from_directory('aclImdb/train', subset='training', **split_options)
    raw_val_ds = load_from_directory('aclImdb/train', subset='validation', **split_options)
    raw_test_ds = load_from_directory('aclImdb/test', batch_size=batch_size)

    return raw_train_ds, raw_val_ds, raw_test_ds


def prep_dataset(raw_train_ds, raw_val_ds, raw_test_ds):
    """Fit the vectoriser on the training text and vectorise all three splits.

    The module-level ``vectorize_layer`` is adapted on the text of
    ``raw_train_ds`` only (labels are dropped first). Each raw dataset is then
    mapped through ``vectorize_text``, cached, and prefetched with
    ``AUTOTUNE``.

    Args:
        raw_train_ds: Raw training dataset of (text, label) batches.
        raw_val_ds: Raw validation dataset of (text, label) batches.
        raw_test_ds: Raw test dataset of (text, label) batches.

    Returns:
        A ``(train_ds, val_ds, test_ds)`` tuple of vectorised datasets.
    """
    def _vectorize_and_cache(raw_ds):
        # Vectorise, then cache and prefetch to overlap input work with training.
        return raw_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)

    # Make a text-only dataset (without labels), then call adapt
    train_text = raw_train_ds.map(lambda x, y: x)
    vectorize_layer.adapt(train_text)

    train_ds = _vectorize_and_cache(raw_train_ds)
    val_ds = _vectorize_and_cache(raw_val_ds)
    test_ds = _vectorize_and_cache(raw_test_ds)
    return train_ds, val_ds, test_ds


def build_model(train_ds, val_ds, test_ds):
    """Build, compile, and train the sentiment classifier.

    The network is an embedding layer followed by dropout, global average
    pooling, dropout, and a single-unit dense layer that outputs a logit.
    Per-epoch metrics are sent to MLFoundry through ``MetricsLogCallback``.

    Args:
        train_ds: Vectorised training dataset.
        val_ds: Vectorised validation dataset used during fitting.
        test_ds: Accepted for interface compatibility; not used here.

    Returns:
        The trained Keras model.
    """
    classifier_layers = [
        layers.Embedding(max_features + 1, embedding_dim),
        layers.Dropout(0.2),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.2),
        layers.Dense(1),
    ]
    model = tf.keras.Sequential(classifier_layers)
    model.summary()

    # The model emits raw logits, hence from_logits=True and a 0.0 threshold.
    model.compile(
        loss=losses.BinaryCrossentropy(from_logits=True),
        optimizer='adam',
        metrics=tf.metrics.BinaryAccuracy(threshold=0.0),
    )

    metrics_logger = MetricsLogCallback()
    model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=epochs,
        callbacks=[metrics_logger],
    )
    return model


def build_exportable_model(model):
    """Wrap a trained model so that it accepts raw strings.

    The returned model chains the module-level ``vectorize_layer``, the given
    ``model``, and a sigmoid activation, and is compiled with a
    binary cross-entropy loss that expects probabilities.

    Args:
        model: Trained classifier that outputs logits.

    Returns:
        The compiled end-to-end Keras model.
    """
    pipeline = [
        vectorize_layer,
        model,
        layers.Activation('sigmoid'),
    ]
    export_model = tf.keras.Sequential(pipeline)

    # Outputs are now probabilities, so the loss must not treat them as logits.
    probability_loss = losses.BinaryCrossentropy(from_logits=False)
    export_model.compile(loss=probability_loss, optimizer="adam", metrics=['accuracy'])
    return export_model

In [None]:
# Fetch and unpack the dataset into ./aclImdb before building the datasets.
download_data()

### Training the model

In [None]:
# Full pipeline: load raw text, vectorise it, train the classifier, then wrap
# it into a model that takes raw strings and returns probabilities.
raw_train_ds, raw_val_ds, raw_test_ds = get_raw_dataset()
train_ds, val_ds, test_ds = prep_dataset(raw_train_ds, raw_val_ds, raw_test_ds)
model = build_model(train_ds, val_ds, test_ds)
export_model = build_exportable_model(model)

### Logging the model

In [None]:
# Bundle the exportable model with its save-time settings.
# NOTE(review): 'signatures' and 'options' are presumably forwarded to the
# TensorFlow save call by mlfoundry -- confirm against the mlfoundry docs.
model_loggable = {
    'model': export_model,
    'signatures': None,
    'options': None
}

# Store the model under the current run, tagged as a TensorFlow model.
mlf_run.log_model(
    name="sentiment-classifier",
    model=model_loggable,
    framework=mlf.ModelFramework.TENSORFLOW,
    description="example tensorflow model"
)